1
- image : gitlab-master. nvidia.com/dl/dgx/ pytorch:20.12 -py3-devel
1
# Default container image used by jobs that do not override it.
image: nvcr.io/nvidia/pytorch:23.04-py3

# Pipeline stages; the cleanup stage is declared after the test stage.
stages: [test, cleanup]
6
+
7
# Shared pipeline variables. The &VARS anchor lets individual jobs merge this
# mapping into their own `variables:` section.
variables: &VARS
  SELENE_ADLR_CI_PATH: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron'
  DATA_DIR: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data'
  PYTORCH_IMAGE: 'gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov'
  PYTHON_VIRTUAL_ENV: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate'
  # Can specify levels
  TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: 'L0'
  # Can specify levels
  TESTS_TO_RUN_AFTER_MERGING: 'L0'
  TESTS_TO_RUN_ON_THIS_COMMIT: 'unit_tests'
  # https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/
  TEST_REGEX_ON_THIS_COMMIT: 'NONE'
  # Set to true for new tests to copy the logs for creating golden truth file
  DISPLAY_OUTPUT: 'True'
17
+
18
# Runs tests/unit_tests under torchrun (8 processes) with coverage measured
# for megatron/core. Only created for merge request pipelines.
unit_tests:
  stage: test
  tags:
    - docker_local_runner
  only:
    - merge_requests
  script:
    - pip install pytest-cov
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  # Pattern matched against the job log to extract the total coverage figure.
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    expire_in: 30 days
    paths:
      - coverage
32
+
33
# Hidden template job for the "resume from checkpoint" functional tests.
# It prepares a per-pipeline run directory under SELENE_ADLR_CI_PATH, submits
# the model's sbatch script, waits for the Slurm job via jobwait.sh, and then
# runs test_resume_checkpoint_pipeline.py with pytest.
# Concrete jobs supply RUN_MODEL, TP_SIZE, PP_SIZE, NUM_NODES and TEST_LEVEL.
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-resume-launcher-script
    - echo "Running selene resume from checkpoint test. "
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    # Create the run directories and empty them in case they already exist.
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    # NOTE(review): --export appears after the script path, so sbatch presumably
    # passes it to the script as an argument rather than treating it as an
    # sbatch option — confirm before relying on the variable list.
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
    # The job id is the fourth word of the "Submitted batch job <id>" line.
    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    # The leading backslashes keep YAML from reading "[" as a flow sequence;
    # the shell sees a plain "[ ... ]" test.
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
                "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
                "---------------------------------------------------\n"
                "$(scontrol show job=${SLURM_JOBID})\n"
                "---------------------------------------------------\n"
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1)
    - echo "Slurm job state $SLURM_STATE"
    - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
    - source $PYTHON_VIRTUAL_ENV
    - cmd="pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py"
    # Exit non-zero on pytest failure so the job is reported as failed instead
    # of only printing a message.
    - if $cmd; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs"; exit 1; fi
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false
84
+
85
# Hidden template job for the training functional tests.
# It prepares a per-pipeline run directory under SELENE_ADLR_CI_PATH, submits
# the model's sbatch script, waits for the Slurm job via jobwait.sh, dumps the
# result logs, and (when USE_TE is not 1) compares metrics against the
# expected-results JSON with pytest.
# Concrete jobs supply RUN_MODEL, TP_SIZE, PP_SIZE, NUM_NODES, MAX_STEPS and
# TEST_LEVEL, and optionally USE_TE, VP_SIZE, MBS and GBS.
.selene_test_launcher: &selene-test-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - echo "$CI_MERGE_REQUEST_APPROVED"
    - pwd
    - export BUILD_DIR=`pwd`
    - RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
    - if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
    # Export the variable itself; "export $RUN_NAME" would export a variable
    # named after the run-name value instead.
    - export RUN_NAME
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
    - export MBS GBS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    # Create the run directories and empty them in case they already exist.
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    # NOTE(review): --export appears after the script path, so sbatch presumably
    # passes it to the script as an argument rather than treating it as an
    # sbatch option — confirm before relying on the variable list.
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS`
    # The job id is the fourth word of the "Submitted batch job <id>" line.
    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    # The leading backslashes keep YAML from reading "[" as a flow sequence;
    # the shell sees a plain "[ ... ]" test.
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
                "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
                "---------------------------------------------------\n"
                "$(scontrol show job=${SLURM_JOBID})\n"
                "---------------------------------------------------\n"
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - echo "Slurm log dump start ------------------------------------------------------------"
    - cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - echo "Slurm log dump end --------------------------------------------------------------"
    # Run the completion check and test its status in the same command: a "$?"
    # read in a separate script entry cannot be trusted to hold this status.
    - if ! python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
    - source $PYTHON_VIRTUAL_ENV
    - |
      if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
        python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
      fi
    # The ground-truth comparison is skipped when USE_TE is 1. On pytest
    # failure the job exits non-zero so it is reported as failed.
    - |
      if [[ $USE_TE -ne 1 ]]; then
        echo "Checking against ground truth file"
        export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
        cmd="pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py"
        if $cmd; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs"; exit 1; fi
      fi
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false
151
+
152
# gpt3 training test with USE_TE=1: TP_SIZE=2, PP_SIZE=2, 1 node, 50 steps.
train.te_gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'gpt3'
    USE_TE: '1'
    TP_SIZE: '2'
    PP_SIZE: '2'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '50:00'
    TEST_LEVEL: 'L0'
164
+
165
# gpt3 training test with USE_TE=0: TP_SIZE=4, PP_SIZE=1, 1 node, 50 steps.
train.gpt3.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'gpt3'
    USE_TE: '0'
    TP_SIZE: '4'
    PP_SIZE: '1'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
177
+
178
# gpt3 training test with USE_TE=0: TP_SIZE=2, PP_SIZE=2, 1 node, 50 steps.
train.gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'gpt3'
    USE_TE: '0'
    TP_SIZE: '2'
    PP_SIZE: '2'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
190
+
191
# gpt3 training test with USE_TE=0: TP_SIZE=1, PP_SIZE=2, 1 node, 50 steps.
train.gpt3.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'gpt3'
    USE_TE: '0'
    TP_SIZE: '1'
    PP_SIZE: '2'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
203
+
204
# gpt3 training test with USE_TE=0: TP_SIZE=1, PP_SIZE=4, VP_SIZE=1, 1 node,
# 50 steps.
train.gpt3.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'gpt3'
    USE_TE: '0'
    TP_SIZE: '1'
    PP_SIZE: '4'
    VP_SIZE: '1'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
217
+
218
# gpt3 resume-from-checkpoint test: TP_SIZE=1, PP_SIZE=2, 1 node.
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'gpt3'
    TP_SIZE: '1'
    PP_SIZE: '2'
    NUM_NODES: '1'
    TIME_LIMIT: '30:00'
    TEST_LEVEL: 'L0'
228
+
229
# bert training test: TP_SIZE=4, PP_SIZE=1, 1 node, 50 steps.
train.bert.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'bert'
    TP_SIZE: '4'
    PP_SIZE: '1'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
240
+
241
# bert training test: TP_SIZE=2, PP_SIZE=2, 1 node, 50 steps.
train.bert.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'bert'
    TP_SIZE: '2'
    PP_SIZE: '2'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
252
+
253
# bert training test: TP_SIZE=1, PP_SIZE=2, 1 node, 50 steps.
train.bert.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'bert'
    TP_SIZE: '1'
    PP_SIZE: '2'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
264
+
265
# bert training test: TP_SIZE=1, PP_SIZE=4, VP_SIZE=2, 1 node, 50 steps.
train.bert.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'bert'
    TP_SIZE: '1'
    PP_SIZE: '4'
    VP_SIZE: '2'
    NUM_NODES: '1'
    MAX_STEPS: '50'
    TIME_LIMIT: '20:00'
    TEST_LEVEL: 'L0'
277
+
278
# bert resume-from-checkpoint test: TP_SIZE=1, PP_SIZE=2, 1 node.
resume.checkpoint.bert.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    RUN_MODEL: 'bert'
    TP_SIZE: '1'
    PP_SIZE: '2'
    NUM_NODES: '1'
    TIME_LIMIT: '30:00'
    TEST_LEVEL: 'L0'
288
+
289
# Removes directories under SELENE_ADLR_CI_PATH whose ctime is more than 20
# days old, skipping any path that contains "data". Runs on every pipeline and
# is allowed to fail.
cleanup.selene:
  tags:
    - ssh_selene_runner
  stage: cleanup
  variables:
    <<: [*VARS]
  script:
    - set +e
    # ${VAR:?} aborts the script when the path is unset or empty, so the glob
    # can never expand to "/*" and feed the filesystem root to rm -rf.
    - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH:?}/* -type d -ctime +20 | grep -v data | wc -l`
    # NUL-delimited pipeline so directory names containing whitespace are
    # passed to rm intact; -r skips rm entirely when nothing matches.
    - find ${SELENE_ADLR_CI_PATH:?}/* -type d -ctime +20 -print0 | grep -zv data | xargs -0 -r rm -rf
    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
  allow_failure: true
  rules:
    - when: always
0 commit comments