Skip to content

Commit 9e72d57

Browse files
committed
Merge remote-tracking branch 'upstream/main' into rebase
2 parents f01c9e6 + 3316e81 commit 9e72d57

File tree

356 files changed

+34963
-8051
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

356 files changed

+34963
-8051
lines changed

.coveragerc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# coverage.py configuration used by the unit-test run.

[run]
# $LOCAL_RANK is substituted from the environment, giving one data file per
# value of LOCAL_RANK (presumably set per process by torchrun - confirm).
data_file = .coverage_$LOCAL_RANK

[html]
# Directory the HTML coverage report is written to.
directory = coverage

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,9 @@ dist/
1212
# Editor swap and backup files
*.swp
*~

# AML workspace config file
*config.json

# Coverage data files (named .coverage_<suffix> by .coveragerc)
.coverage_*

# Slurm-prefixed files and log directories (presumably job output - confirm)
slurm*
logs

.gitlab-ci.yml

Lines changed: 299 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,302 @@
1-
image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel
1+
# Container image used by jobs that do not set their own.
image: "nvcr.io/nvidia/pytorch:23.04-py3"

# Pipeline stages, in the order they run.
stages: [test, cleanup]
6+
7+
# Shared variable defaults. Jobs merge them into their own `variables`
# mapping through the VARS anchor.
variables: &VARS
  SELENE_ADLR_CI_PATH: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron'
  DATA_DIR: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data'
  PYTORCH_IMAGE: 'gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov'
  PYTHON_VIRTUAL_ENV: '/lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate'
  # Can specify levels
  TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: 'L0'
  # Can specify levels
  TESTS_TO_RUN_AFTER_MERGING: 'L0'
  TESTS_TO_RUN_ON_THIS_COMMIT: 'unit_tests'
  # Regex matched against the job name, e.g. /.*gpt3.*/
  # Syntax: https://github.com/google/re2/wiki/Syntax
  TEST_REGEX_ON_THIS_COMMIT: 'NONE'
  # Set to "True" for new tests to copy the logs for creating the golden
  # truth file (the launcher script compares against the exact string "True").
  DISPLAY_OUTPUT: 'True'
17+
18+
# Unit tests for merge requests: installs pytest-cov, runs tests/unit_tests
# under torchrun with 8 processes, and keeps the `coverage` directory as an
# artifact for 30 days.
unit_tests:
  stage: test
  tags: [docker_local_runner]
  only: [merge_requests]
  script:
    - pip install pytest-cov
    # Folded scalar: the three lines below are joined into one command line.
    - >-
      torchrun --nproc_per_node=8 -m pytest
      --cov-report=term --cov-report=html --cov=megatron/core
      tests/unit_tests
  # Pulls the total percentage out of the terminal coverage report.
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    expire_in: 30 days
    paths:
      - coverage
32+
33+
# Hidden job template for the resume-from-checkpoint test on Selene. It
# prepares a per-pipeline run directory, submits the model's resume sbatch
# script, runs jobwait.sh on the resulting job id, checks the Slurm job
# state, and then runs the resume pytest check. Jobs that merge this template
# supply RUN_MODEL, TP_SIZE, PP_SIZE and NUM_NODES through `variables`.
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-resume-launcher-script
    - echo "Running selene resume from checkpoint test. "
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    # Create the run directories and clear any existing contents.
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    # NOTE(review): --export comes after the script path, where sbatch hands it
    # to the script as an argument instead of parsing it as an option - confirm
    # this placement is intended.
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
    # The job id is the fourth word of sbatch's "Submitted batch job <id>" line.
    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    # The leading backslash keeps YAML from reading "[" as a flow sequence;
    # the continuation lines fold into this single command.
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      "---------------------------------------------------\n"
      "$(scontrol show job=${SLURM_JOBID})\n"
      "---------------------------------------------------\n"
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1)
    - echo "Slurm job state $SLURM_STATE"
    - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
    - source $PYTHON_VIRTUAL_ENV
    - cmd="pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py"
    # Exit non-zero when pytest fails so the job is marked failed rather than
    # only logging the failure.
    - if $cmd; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs"; exit 1; fi
    - echo "Completed the job"
  # Run when the job's TEST_LEVEL or name matches the per-commit selectors, on
  # the default branch for the post-merge level, or on an approved merge
  # request for the approved-MR level.
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false
84+
85+
# Hidden job template for the distributed training tests on Selene. It
# prepares a per-pipeline run directory, submits the model's sbatch script,
# runs jobwait.sh on the resulting job id, dumps the result logs, checks that
# the Slurm job completed, and (unless USE_TE is 1) runs the pytest comparison
# against the expected-metrics JSON. Jobs that merge this template supply
# RUN_MODEL, TP_SIZE, PP_SIZE, NUM_NODES, MAX_STEPS and optionally USE_TE,
# VP_SIZE, MBS and GBS through `variables`.
.selene_test_launcher: &selene-test-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - echo "$CI_MERGE_REQUEST_APPROVED"
    - pwd
    - export BUILD_DIR=`pwd`
    - RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
    - if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
    # Export the variable itself; `export $RUN_NAME` would export a variable
    # named after the run name's value and leave RUN_NAME unexported.
    - export RUN_NAME
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
    - export MBS GBS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    # Create the run directories and clear any existing contents.
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    # NOTE(review): --export comes after the script path, where sbatch hands it
    # to the script as an argument instead of parsing it as an option - confirm
    # this placement is intended.
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS`
    # The job id is the fourth word of sbatch's "Submitted batch job <id>" line.
    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    # The leading backslash keeps YAML from reading "[" as a flow sequence;
    # the continuation lines fold into this single command.
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      "---------------------------------------------------\n"
      "$(scontrol show job=${SLURM_JOBID})\n"
      "---------------------------------------------------\n"
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - echo "Slurm log dump start ------------------------------------------------------------"
    - cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - echo "Slurm log dump end --------------------------------------------------------------"
    # The check and its failure message live in one command: a `$?` test in a
    # separate script entry never sees the checker's exit status, because a
    # failing entry already ends the job before the next entry runs.
    - |
      if ! python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID; then
        echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."
        exit 1
      fi
    - source $PYTHON_VIRTUAL_ENV
    # With DISPLAY_OUTPUT set to "True", run the tensorboard-log extraction
    # script on the logs directory for this run.
    - |
      if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
        python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
      fi
    # The ground-truth comparison is skipped when USE_TE is 1. A failing
    # pytest run exits non-zero so the job is marked failed rather than only
    # logging the failure.
    - |
      if [[ $USE_TE -ne 1 ]]; then
        echo "Checking against ground truth file"
        export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
        cmd="pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py"
        if $cmd; then echo "Pytest succeded"; else echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs"; exit 1; fi
      fi
    - echo "Completed the job"
  # Run when the job's TEST_LEVEL or name matches the per-commit selectors, on
  # the default branch for the post-merge level, or on an approved merge
  # request for the approved-MR level.
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false
151+
152+
# gpt3 training test via the Selene launcher template:
# TP=2, PP=2, 1 node, 50 steps, USE_TE=1 (level L0).
train.te_gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: gpt3
    USE_TE: "1"
    TP_SIZE: "2"
    PP_SIZE: "2"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "50:00"
    TEST_LEVEL: L0
164+
165+
# gpt3 training test via the Selene launcher template:
# TP=4, PP=1, 1 node, 50 steps, USE_TE=0 (level L0).
train.gpt3.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: gpt3
    USE_TE: "0"
    TP_SIZE: "4"
    PP_SIZE: "1"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
177+
178+
# gpt3 training test via the Selene launcher template:
# TP=2, PP=2, 1 node, 50 steps, USE_TE=0 (level L0).
train.gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: gpt3
    USE_TE: "0"
    TP_SIZE: "2"
    PP_SIZE: "2"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
190+
191+
# gpt3 training test via the Selene launcher template:
# TP=1, PP=2, 1 node, 50 steps, USE_TE=0 (level L0).
train.gpt3.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: gpt3
    USE_TE: "0"
    TP_SIZE: "1"
    PP_SIZE: "2"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
203+
204+
# gpt3 training test via the Selene launcher template:
# TP=1, PP=4, VP=1, 1 node, 50 steps, USE_TE=0 (level L0).
train.gpt3.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: gpt3
    USE_TE: "0"
    TP_SIZE: "1"
    PP_SIZE: "4"
    VP_SIZE: "1"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
217+
218+
# gpt3 resume-from-checkpoint test via the Selene resume launcher template:
# TP=1, PP=2, 1 node (level L0).
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    RUN_MODEL: gpt3
    TP_SIZE: "1"
    PP_SIZE: "2"
    NUM_NODES: "1"
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0
228+
229+
# bert training test via the Selene launcher template:
# TP=4, PP=1, 1 node, 50 steps (level L0).
train.bert.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: bert
    TP_SIZE: "4"
    PP_SIZE: "1"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
240+
241+
# bert training test via the Selene launcher template:
# TP=2, PP=2, 1 node, 50 steps (level L0).
train.bert.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: bert
    TP_SIZE: "2"
    PP_SIZE: "2"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
252+
253+
# bert training test via the Selene launcher template:
# TP=1, PP=2, 1 node, 50 steps (level L0).
train.bert.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: bert
    TP_SIZE: "1"
    PP_SIZE: "2"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
264+
265+
# bert training test via the Selene launcher template:
# TP=1, PP=4, VP=2, 1 node, 50 steps (level L0).
train.bert.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: *VARS
    RUN_MODEL: bert
    TP_SIZE: "1"
    PP_SIZE: "4"
    VP_SIZE: "2"
    NUM_NODES: "1"
    MAX_STEPS: "50"
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
277+
278+
# bert resume-from-checkpoint test via the Selene resume launcher template:
# TP=1, PP=2, 1 node (level L0).
resume.checkpoint.bert.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: *VARS
    RUN_MODEL: bert
    TP_SIZE: "1"
    PP_SIZE: "2"
    NUM_NODES: "1"
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0
288+
289+
# Deletes directories under SELENE_ADLR_CI_PATH whose ctime is more than 20
# days old, skipping any path that contains "data". Runs in the cleanup stage
# on every pipeline; failures here do not fail the pipeline.
cleanup.selene:
  tags:
    - ssh_selene_runner
  stage: cleanup
  variables:
    <<: [*VARS]
  script:
    # Best-effort: keep going even if an individual command fails.
    - set +e
    # Count what the next command will remove (same find criteria).
    - NUM_CLEANUP=`find "${SELENE_ADLR_CI_PATH:?}"/* -type d -ctime +20 ! -path '*data*' | wc -l`
    # ${VAR:?} stops the script if the path is unset or empty, so the glob can
    # never expand to "/*". NUL-delimited output keeps paths with whitespace
    # intact, and `xargs -r` skips rm when nothing matched.
    - find "${SELENE_ADLR_CI_PATH:?}"/* -type d -ctime +20 ! -path '*data*' -print0 | xargs -0 -r rm -rf
    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
  allow_failure: true
  rules:
    - when: always

0 commit comments

Comments
 (0)