distillation_data_download.sh
#!/bin/bash
######################### Batch Headers #########################
#SBATCH --partition=copy # use partition `gpu` for GPU nodes
#SBATCH --account=pawsey1018 # IMPORTANT: use your own project and the -gpu suffix
#SBATCH --nodes=1 # NOTE: this needs to match Lightning's `Trainer(num_nodes=...)`
#SBATCH --ntasks-per-node=1 # NOTE: this needs to be `1` on SLURM clusters when using Lightning's `ddp_spawn` strategy; otherwise, set it to match Lightning's `Trainer(devices=...)`
#SBATCH --time=0-48:00:00 # time limit for the job (up to 48 hours)
#SBATCH --job-name=distillation_data_download # job name
#SBATCH --output=J-%x.%j.out # output log file
#SBATCH --error=J-%x.%j.err # error log file
#################################################################
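# Optional fail-fast guard (an added sketch, not part of the original script):
# abort on the first failed command and on unset variables. Drop `-e` if
# `awsv2 --install` exits nonzero when the CLI is already installed.
set -euo pipefail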
# Load required modules
module load pawseyenv/2024.05
module load singularity/4.1.0-slurm
# Ensure AWS CLI is installed
awsv2 --install
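# Sanity check (a minimal added sketch, assuming `awsv2` is the site's AWS CLI
# v2 wrapper): abort early if the wrapper is not on PATH.
if ! command -v awsv2 >/dev/null 2>&1; then
    echo "awsv2 not found; load the AWS CLI module for your site" >&2
    exit 1
fi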
# Define paths
AFDB_URL="https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_cif_v4.tar"
OUTPUT_DIR="./data/afdb_data"
MMCIF_OUTPUT_DIR="$OUTPUT_DIR/unfiltered_train_mmcifs"
MSA_OUTPUT_DIR="$OUTPUT_DIR/data_caches/train"
mkdir -p "$MMCIF_OUTPUT_DIR"
mkdir -p "$MSA_OUTPUT_DIR"
# Run download commands
bash -c "
wget -O $OUTPUT_DIR/afdb_swissprot_cif_v4.tar $AFDB_URL \
&& tar -xvf $OUTPUT_DIR/afdb_swissprot_cif_v4.tar -C $MMCIF_OUTPUT_DIR \
&& awsv2 s3 cp s3://openfold/pdb/ $MSA_OUTPUT_DIR --recursive --no-sign-request
"
# Inform user of task completion
echo "Task completed for job: $SLURM_JOB_NAME"