-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup_databases.sh
executable file
·310 lines (255 loc) · 8.66 KB
/
setup_databases.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/bin/bash -e
# Modified version of colabfold database setup script for UoD cluster
# This is intended to be qsubbed to an A40 node so the mmseqs2 indexes
# are built on the same nodes as used for execution, then distributed
# to the remaining nodes
#
# The comment lines below that begin with the hash-dollar prefix are
# Grid Engine directives read at submission time; they must stay exactly
# as written.
#$ -j y
#$ -o colabfold_db_logs/$JOB_NAME.$JOB_ID
#$ -jc long
#$ -adds l_hard gpu 1
#$ -adds l_hard cuda.0.name 'NVIDIA A40'
# NOTE(review): errexit is enabled only through the '-e' on the shebang
# line, so it is not applied if the script is started as 'bash <script>'
# - confirm how the scheduler launches the job.

# ColabFold database release; selects the versioned directory in db_dir.
VERSION="1.5.2"
# Current A40 nodes are gpu-34 to gpu-52
# NODES is a plain newline-separated string (not an array) of node numbers;
# distribute_db turns each number into gpu-<n>.compute.dundee.ac.uk.
NODES=$(seq 34 52)
# Only the dbadmin account may run the setup.
if [[ "$USER" != "dbadmin" ]]; then
echo "Please submit this script as the dbadmin user"
exit 1
fi
# Activate the conda environment used for the mmseqs calls; CONDA_PREFIX
# is also used by create_wrapper to build the remote rsync path.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate mmseqs2
# Record the environment and execution host in the job log.
echo "CONDA_PREFIX=$CONDA_PREFIX"
echo "HOSTNAME=$HOSTNAME"
# Connections per server passed to aria2c by downloadFile.
ARIA_NUM_CONN=8
# Destination of the versioned ColabFold databases (download_colabfold_db).
db_dir="/opt/colabfold/${VERSION}"
# Destination of the Discoba database (download_discoba_db).
discoba_dir="/opt/colabfold/discoba"
######################################################################
#
# downloadFile
#
# Downloads a URL with aria2c, overwriting any existing file of the
# same name.
#
# Required params:
# URL: URL to download
# OUTPUT: Output path; its directory part is handed to aria2c as the
#         download directory (-d) and its last component as the
#         file name (-o), so a bare file name downloads to the cwd
#
# Globals read:
# ARIA_NUM_CONN: value for aria2c --max-connection-per-server
#
# Returns: 0 on success. On failure an error is written to stderr and
#          1 is returned, which aborts the calling (sub)shell when
#          errexit is active.
#
######################################################################
downloadFile() {
  local url="$1"
  local output="$2"
  local filename dir

  filename=$(basename -- "${output}")
  dir=$(dirname -- "${output}")

  # Testing aria2c inside 'if' keeps errexit from firing before we can
  # report the failure, without toggling the caller's shell options.
  if aria2c --max-connection-per-server="$ARIA_NUM_CONN" \
    --allow-overwrite=true -o "$filename" -d "$dir" "$url"; then
    return 0
  fi

  printf 'Could not download %s to %s\n' "$url" "$output" >&2
  return 1
}
######################################################################
#
# download_colabfold_db
#
# Downloads and indexes the ColabFold databases under $db_dir.
# Five independent jobs run in parallel as background subshells:
# UniRef30, the ColabFold environmental database, the PDB70 sequence
# database, the PDB70 hhsuite a3m files and a copy of the local PDB
# mmCIF mirror. Each job is skipped when its *_READY marker file is
# present, and only creates the marker after all of its steps have
# succeeded, so a failed run can be resubmitted.
#
# Required args: None
#
# Globals read:
# db_dir: directory the databases are built in
#
# Side effects: the working directory is changed to $db_dir
#
# Returns: 0 if every job succeeded, 1 if any job failed
#
######################################################################
download_colabfold_db() {
  local -a job_pids=()
  local pid
  local failed=0

  mkdir -p "$db_dir" || return 1
  cd "$db_dir" || return 1

  # 'set -e' at the top of each group only affects that background
  # subshell; it makes the job stop at its first failing step so the
  # READY marker is never written for a half-built database.
  {
    set -e
    if [ ! -f UNIREF30_READY ]; then
      downloadFile "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2202.tar.gz" "uniref30_2202.tar.gz"
      tar xzvf "uniref30_2202.tar.gz"
      mmseqs tsv2exprofiledb "uniref30_2202" "uniref30_2202_db"
      mmseqs createindex "uniref30_2202_db" tmp1 --remove-tmp-files 1
      if [ -e uniref30_2202_db_mapping ]; then
        ln -sf uniref30_2202_db_mapping uniref30_2202_db.idx_mapping
      fi
      if [ -e uniref30_2202_db_taxonomy ]; then
        ln -sf uniref30_2202_db_taxonomy uniref30_2202_db.idx_taxonomy
      fi
      rm -r tmp1
      rm -f uniref30_2202.tar.gz
      touch UNIREF30_READY
    fi
  } &
  job_pids+=("$!")

  {
    set -e
    if [ ! -f COLABDB_READY ]; then
      downloadFile "https://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz" "colabfold_envdb_202108.tar.gz"
      tar xzvf "colabfold_envdb_202108.tar.gz"
      mmseqs tsv2exprofiledb "colabfold_envdb_202108" "colabfold_envdb_202108_db"
      mmseqs createindex "colabfold_envdb_202108_db" tmp2 --remove-tmp-files 1
      rm -r tmp2
      rm -f colabfold_envdb_202108.tar.gz
      touch COLABDB_READY
    fi
  } &
  job_pids+=("$!")

  {
    set -e
    if [ ! -f PDB_READY ]; then
      downloadFile "https://wwwuser.gwdg.de/~compbiol/colabfold/pdb70_220313.fasta.gz" "pdb70_220313.fasta.gz"
      mmseqs createdb pdb70_220313.fasta.gz pdb70_220313
      mmseqs createindex pdb70_220313 tmp3 --remove-tmp-files 1
      rm -r tmp3
      rm -f pdb70_220313.fasta.gz
      touch PDB_READY
    fi
  } &
  job_pids+=("$!")

  {
    set -e
    if [ ! -f PDB70_READY ]; then
      downloadFile "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb70_from_mmcif_220313.tar.gz" "pdb70_from_mmcif_220313.tar.gz"
      # Only the a3m data/index pair is extracted from the archive.
      tar xzvf pdb70_from_mmcif_220313.tar.gz pdb70_a3m.ffdata pdb70_a3m.ffindex
      rm -f pdb70_from_mmcif_220313.tar.gz
      touch PDB70_READY
    fi
  } &
  job_pids+=("$!")

  {
    set -e
    if [ ! -f PDB_MMCIF_READY ]; then
      mkdir -p pdb/divided
      mkdir -p pdb/obsolete
      rsync -av --delete /cluster/gjb_lab/db/NOBACK/mirrors/pdb/data/structures/divided/mmCIF pdb/divided
      rsync -av --delete /cluster/gjb_lab/db/NOBACK/mirrors/pdb/data/structures/obsolete/mmCIF pdb/obsolete
      touch PDB_MMCIF_READY
    fi
  } &
  job_pids+=("$!")

  # A bare 'wait' always reports success, so wait on each job by PID
  # to pick up its exit status.
  for pid in "${job_pids[@]}"; do
    if ! wait "$pid"; then
      failed=1
    fi
  done

  if (( failed )); then
    echo "One or more ColabFold database jobs failed" >&2
    return 1
  fi
  return 0
}
######################################################################
#
# download_discoba_db
#
# Downloads Discoba-specific database (https://doi.org/10.5281/zenodo.5563073)
# from Wheeler (https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0259871)
# into $discoba_dir and builds the mmseqs database and index from it.
# Nothing is done when the DISCOBA_READY marker file already exists;
# the marker is only created after every step has succeeded.
#
# Required arguments: None
#
# Globals read:
# discoba_dir: directory the database is built in
#
# Side effects: the working directory is changed to $discoba_dir
#
# Returns: 0 on success (or when already built), 1 if any step fails
#
######################################################################
download_discoba_db() {
  mkdir -p "$discoba_dir" || return 1
  cd "$discoba_dir" || return 1

  if [ -f DISCOBA_READY ]; then
    return 0
  fi

  downloadFile "https://zenodo.org/record/5682928/files/discoba.fasta.gz?download=1" "discoba.fasta.gz" || return 1
  mmseqs createdb discoba.fasta.gz discoba || return 1
  mmseqs createindex discoba tmp4 --remove-tmp-files 1 || return 1
  rm -r tmp4 || return 1
  rm -f discoba.fasta.gz || return 1
  touch DISCOBA_READY
}
######################################################################
#
# create_wrapper
#
# Generates a shell script for qsubbing each rsync job.
# Requirement to activate conda environment makes this
# too complex for qsub -b...
#
# The generated script is pinned to the target node, activates the
# mmseqs2 conda environment, pulls /opt/colabfold from the source node
# with rsync (using the rsync binary under the current CONDA_PREFIX on
# the remote side) and then normalises permissions to 0755 for
# directories and 0644 for files.
#
# Required parameters:
# source_node: hostname to sync from
# target_node: hostname to sync to
# hold: jid to submit hold_jid for (optional; omitted or empty
#       means the job is not held)
#
# Globals read: TMPDIR (falls back to /tmp), HOME, CONDA_PREFIX
#
# Returns:
# path to wrapper script on stdout; non-zero status if the script
# could not be written
#
######################################################################
create_wrapper() {
  local source_node=$1
  local target_node=$2
  local hold=${3:-}
  local extra_args=""
  local script

  if [[ -n "$hold" ]]; then
    extra_args="## -hold_jid $hold"
  fi

  script="${TMPDIR:-/tmp}/sync_${target_node}.sh"

  # SGE directives are written with a '##' placeholder rather than the
  # real hash-dollar prefix so that qsubbing this script does not pick
  # them up as its own directives; sed swaps the prefix in while the
  # wrapper is written. The pattern is anchored so that only a leading
  # placeholder is rewritten.
  sed 's/^##/#$/' > "$script" <<EOF || return 1
#!/bin/env bash
## -mods l_hard hostname ${target_node}
## -N colabfold_mirror
## -j y
## -o $HOME/colabfold_db_logs/\$JOB_NAME.\$JOB_ID
$extra_args
source ~/miniconda3/etc/profile.d/conda.sh
conda activate mmseqs2
rsync -e 'ssh -oStrictHostKeyChecking=no' --rsync-path=$CONDA_PREFIX/bin/rsync --delete -av $source_node:/opt/colabfold/ /opt/colabfold
find /opt/colabfold -type d -exec chmod 0755 {} \;
find /opt/colabfold -type f -exec chmod 0644 {} \;
EOF

  printf '%s\n' "$script"
}
######################################################################
#
# distribute_db
#
# Shares database across appropriate GPU nodes via qsubbed rsync jobs
# The database is pulled onto two nodes for each source node, so
# the initial transfer is submitted to run immediately, but subsequent
# dependent jobs which require a previous transfer to complete are
# submitted with '-hold_jid' so they will not run until the database has
# mirrored to their source node
#
# With targets t0, t1, t2, ... the resulting tree is:
#   this host -> t0, t1;  t0 -> t2, t3;  t1 -> t4, t5;  ...
#
# Globals read:
# NODES: node numbers, either a whitespace-separated string or an array
#
# Required args: None
# Returns: None
#
######################################################################
distribute_db() {
  local origin cur_node node nodename status
  local source_node target_node hold script reply job_id
  local idx=0 src_idx
  local -a candidates=() live_nodes=() bad_nodes=() targets=()
  local -A node_jobs=() # maps node name to job id, for determining jid to hold

  # We need to exclude the current host since we already have the database
  # and are going to use this as a starting point...
  origin=$(hostname -s)
  cur_node=${origin#gpu-}

  # Word splitting is intentional: NODES holds whitespace-separated numbers.
  # shellcheck disable=SC2206
  candidates=( ${NODES[*]} )

  # Nodes which are down go last on the list, so the jobs assigned to
  # them hold until the nodes return, rather than having live nodes
  # waiting on a job to complete on a node which is down. Node numbers
  # are compared as whole words so that excluding one id can never
  # alter another id that merely contains the same digits.
  for node in "${candidates[@]}"; do
    if [[ "$node" == "$cur_node" ]]; then
      continue
    fi
    nodename=$(printf "gpu-%s.compute.dundee.ac.uk" "$node")
    # qhost will return '-' in NLOAD field if a node is offline or uncommunicative...
    status=$(qhost -h "$nodename" | tail -n +4 | awk '{print $7}')
    if [[ "$status" == '-' ]]; then
      echo "Warning: $nodename is down..."
      bad_nodes+=("$node")
    else
      live_nodes+=("$node")
    fi
  done
  # 'bad nodes' are appended at the end to ensure they are not
  # dependencies for other jobs
  targets=("${live_nodes[@]}" "${bad_nodes[@]}")

  source_node=$origin
  for node in "${targets[@]}"; do
    target_node=$(printf "gpu-%s.compute.dundee.ac.uk" "$node")
    # The first two targets pull from this host; after that every
    # earlier target in turn becomes the source for the next two.
    if (( idx >= 2 )); then
      src_idx=$(( idx / 2 - 1 ))
      source_node=$(printf "gpu-%s.compute.dundee.ac.uk" "${targets[src_idx]}")
    fi
    echo "$source_node -> $target_node"
    # Empty for this host, which has no mirror job to wait for.
    hold=${node_jobs[$source_node]:-}
    script=$(create_wrapper "$source_node" "$target_node" "$hold")
    reply=$(qsub "$script")
    # The job id is the third word of qsub's reply; splitting is intentional.
    # shellcheck disable=SC2086
    job_id=$(echo $reply | cut -f3 -d' ')
    echo "job_id=$job_id"
    node_jobs[$target_node]=$job_id
    idx=$(( idx + 1 ))
  done
}
# Entry point: build the ColabFold databases, then the Discoba database,
# and finally submit the jobs which mirror them to the other GPU nodes.
main() {
  download_colabfold_db
  download_discoba_db
  distribute_db
}

main "$@"