Skip to content

Commit acc9dd2

Browse files
committed
add training and testing scripts
1 parent 19a75f1 commit acc9dd2

File tree

3 files changed

+50
-0
lines changed

3 files changed

+50
-0
lines changed

tools/scripts/dist_train.sh

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Run train.py through torch.distributed.launch with NGPUS processes per node.
#
# Usage: dist_train.sh NGPUS [train.py args...]
#   NGPUS  value passed to --nproc_per_node
# Every argument after NGPUS is forwarded to train.py.

set -x

# Abort with a usage message instead of launching with an empty
# --nproc_per_node when the GPU count is missing.
NGPUS=${1:?"usage: dist_train.sh NGPUS [train.py args...]"}

# Drop NGPUS so "$@" holds only the train.py arguments. Forwarding "$@"
# directly (rather than flattening it into a string and re-splitting it)
# keeps arguments containing spaces or glob characters intact, and avoids
# the bash-only ${@:2} slice.
shift

python -m torch.distributed.launch --nproc_per_node="${NGPUS}" train.py --launcher pytorch "$@"

tools/scripts/slurm_test_single.sh

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Run test.py under srun as one task with one GPU.
#
# Usage: slurm_test_single.sh PARTITION [test.py args...]
#   PARTITION  partition name passed to `srun -p`
# Every argument after PARTITION is forwarded to test.py.
#
# Environment:
#   SRUN_ARGS  optional extra srun options (whitespace-separated)

set -x

# Abort with a usage message when the partition is missing; otherwise
# `srun -p` would consume the next option as its value.
PARTITION=${1:?"usage: slurm_test_single.sh PARTITION [test.py args...]"}

# Drop PARTITION so "$@" holds only the test.py arguments. Forwarding "$@"
# directly keeps arguments containing spaces or glob characters intact.
shift

GPUS=1
GPUS_PER_NODE=1
JOB_NAME=eval
SRUN_ARGS=${SRUN_ARGS:-""}

# SRUN_ARGS is deliberately unquoted: it carries zero or more srun options
# in a single string and must be split into separate words.
# shellcheck disable=SC2086
srun -p "${PARTITION}" \
    --job-name="${JOB_NAME}" \
    --gres=gpu:"${GPUS_PER_NODE}" \
    --ntasks="${GPUS}" \
    --ntasks-per-node="${GPUS_PER_NODE}" \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u test.py "$@"

tools/scripts/slurm_train.sh

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Run train.py under srun with the slurm launcher.
#
# Usage: slurm_train.sh PARTITION JOB_NAME GPUS [train.py args...]
#   PARTITION  partition name passed to `srun -p`
#   JOB_NAME   value passed to --job-name
#   GPUS       value passed to --ntasks
# Every argument after GPUS is forwarded to train.py.
#
# Environment:
#   GPUS_PER_NODE  value for --gres=gpu: and --ntasks-per-node (default 8)
#   CPUS_PER_TASK  value for --cpus-per-task (default 5)
#   SRUN_ARGS      optional extra srun options (whitespace-separated)

set -x

# Abort with a usage message when a required positional is missing, rather
# than handing srun empty or shifted option values.
PARTITION=${1:?"usage: slurm_train.sh PARTITION JOB_NAME GPUS [train.py args...]"}
JOB_NAME=${2:?"usage: slurm_train.sh PARTITION JOB_NAME GPUS [train.py args...]"}
GPUS=${3:?"usage: slurm_train.sh PARTITION JOB_NAME GPUS [train.py args...]"}

# Drop the three positionals so "$@" holds only the train.py arguments.
# Forwarding "$@" directly keeps arguments containing spaces or glob
# characters intact.
shift 3

GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}

# SRUN_ARGS is deliberately unquoted: it carries zero or more srun options
# in a single string and must be split into separate words.
# shellcheck disable=SC2086
srun -p "${PARTITION}" \
    --job-name="${JOB_NAME}" \
    --gres=gpu:"${GPUS_PER_NODE}" \
    --ntasks="${GPUS}" \
    --ntasks-per-node="${GPUS_PER_NODE}" \
    --cpus-per-task="${CPUS_PER_TASK}" \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u train.py --launcher slurm "$@"

0 commit comments

Comments
 (0)