3 files changed, 50 insertions(+), 0 deletions(-)
#!/usr/bin/env bash

set -x

# Single-node multi-GPU training with torch.distributed.launch.
# $1      number of GPUs to use on this node
# $2...   remaining arguments, forwarded verbatim to train.py
NGPUS=$1
PY_ARGS=${@:2}

python -m torch.distributed.launch --nproc_per_node=${NGPUS} train.py --launcher pytorch ${PY_ARGS}
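A minimal usage sketch for the launcher above (the script filename and the --cfg_file flag are assumptions; the arguments train.py actually accepts depend on the repository):

    # train on 4 GPUs of the current node; everything after the GPU count goes to train.py
    bash dist_train.sh 4 --cfg_file cfgs/example.yaml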
#!/usr/bin/env bash

set -x

# Single-GPU evaluation on a Slurm cluster.
# $1      Slurm partition to submit to
# $2...   remaining arguments, forwarded verbatim to test.py
PARTITION=$1
GPUS=1
GPUS_PER_NODE=1
PY_ARGS=${@:2}
JOB_NAME=eval
SRUN_ARGS=${SRUN_ARGS:-""}

srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u test.py ${PY_ARGS}
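A minimal usage sketch for the evaluation script above (the script filename, the --ckpt flag, and the node name are assumptions); extra srun options can be passed through the SRUN_ARGS environment variable:

    # evaluate a checkpoint on one GPU of the "gpu" partition, pinned to a specific node
    SRUN_ARGS="--nodelist=node01" bash slurm_eval.sh gpu --ckpt output/checkpoint.pth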
#!/usr/bin/env bash

set -x

# Multi-GPU training on a Slurm cluster.
# $1      Slurm partition to submit to
# $2      job name
# $3      total number of GPUs (Slurm tasks) to request
# $4...   remaining arguments, forwarded verbatim to train.py
PARTITION=$1
JOB_NAME=$2
GPUS=$3
PY_ARGS=${@:4}

# Defaults below can be overridden from the environment.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}

srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u train.py --launcher slurm ${PY_ARGS}
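A minimal usage sketch for the training script above (the script filename, the job name, and the --cfg_file flag are assumptions); GPUS_PER_NODE, CPUS_PER_TASK, and SRUN_ARGS can all be overridden from the environment:

    # 16-GPU training job on the "gpu" partition, spread over two nodes at 8 GPUs each
    GPUS_PER_NODE=8 bash slurm_train.sh gpu train_job 16 --cfg_file cfgs/example.yaml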