Skip to content

Commit

Permalink
Merge branch 'ko3n1g/tests/repeats' into 'main'
Browse files
tests: Allow running tests multiple times

See merge request ADLR/megatron-lm!1941
  • Loading branch information
ko3n1g committed Aug 19, 2024
2 parents fb155f4 + 49af43e commit 3bb2585
Show file tree
Hide file tree
Showing 44 changed files with 86 additions and 42 deletions.
4 changes: 2 additions & 2 deletions tests/functional_tests/jet_recipes/bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ spec:
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1200
scope: null
artifacts:
/workspace/data/bert_data: text/the_pile/bert_shard00
script: |-
Expand All @@ -32,6 +30,7 @@ spec:
products:
- scope: [mr]
time_limit: [1200]
test_case:
- bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
Expand All @@ -42,6 +41,7 @@ products:
- bert_mr_tp2_pp2_dgx_a100_1N8G
- bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
- scope: [nightly]
time_limit: [12000]
test_case:
- bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
- bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
Expand Down
2 changes: 1 addition & 1 deletion tests/functional_tests/jet_recipes/gpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ products:
- gpt3_mr_tp2_pp2_dgx_a100_1N8G
- scope: [nightly]
platforms: [dgx_a100]
time_limit: [1200]
time_limit: [12000]
test_case:
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
Expand Down
74 changes: 42 additions & 32 deletions tests/functional_tests/shell_test_utils/run_ci_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,45 +34,55 @@ done
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(realpath $SCRIPT_DIR/../../../)

# Training
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh

# Extract settings from params file
TEST_TYPE=$(cat $TRAINING_PARAMS_PATH \
| yq '.TEST_TYPE')
NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO')
SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.SKIP_PYTEST')
N_REPEATS=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.N_REPEATS //1')

for i in $(seq 1 $N_REPEATS);
do
rm -rf $CHECKPOINT_PATH/*
rm -rf $OUTPUT_PATH/*

# Maybe checkpoint resume training
if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
rm -rf $CHECKPOINT_PATH/iter_0000100;
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt;
# Training
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
fi

# Save run results
export PYTHONPATH=$ROOT_DIR
python3 $ROOT_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py \
--logs-dir $TENSORBOARD_PATH \
--output-path ${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH)

# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
export NVTE_ALLOW_NONDETERMINISTIC_ALGO
export LOGS_DIR=$TENSORBOARD_PATH

if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
echo "Running pytest 1st vs 2nd run comparison"
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py

elif [[ "$TEST_TYPE" == "regular" ]]; then
echo "Running pytest checks against golden values"
export EXPECTED_METRICS_FILE=$GOLDEN_VALUES_PATH
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py

else
echo "Test type $TEST_TYPE not yet implemented."

# Maybe checkpoint resume training
if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
rm -rf $CHECKPOINT_PATH/iter_0000100;
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt;
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
fi
fi

# Save run results
export PYTHONPATH=$ROOT_DIR
python3 $ROOT_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py \
--logs-dir $TENSORBOARD_PATH \
--output-path ${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH)

# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
export NVTE_ALLOW_NONDETERMINISTIC_ALGO
export LOGS_DIR=$TENSORBOARD_PATH

if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
echo "Running pytest 1st vs 2nd run comparison"
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py

elif [[ "$TEST_TYPE" == "regular" ]]; then
echo "Running pytest checks against golden values"
export EXPECTED_METRICS_FILE=$GOLDEN_VALUES_PATH
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py

else
echo "Test type $TEST_TYPE not yet implemented."
fi
fi
done


Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ ENV_VARS:
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
NVTE_APPLY_QK_LAYER_SCALING: 1
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ ENV_VARS:
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
NVTE_APPLY_QK_LAYER_SCALING: 1
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 10
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
Expand Down
Loading

0 comments on commit 3bb2585

Please sign in to comment.