
Commit 891d6db

SW publisher authored and Jenkins committed
Megatron-DeepSpeed content for 1.19.0
Signed-off-by: SW publisher <[email protected]>
1 parent 7eb36a1 commit 891d6db

100 files changed, +7401 -1474 lines changed


.github/CODEOWNERS

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+# This is a comment.
+# Code owners are automatically requested for review
+# when someone opens a pull request that modifies code that they own.
+# Code owners are not automatically requested to review draft pull requests.
+#
+# To use a CODEOWNERS file, create a new file called CODEOWNERS
+# in the root, docs/, or .github/ directory of the repository,
+# in the branch where you'd like to add the code owners.
+# Each CODEOWNERS file assigns the code owners for a single branch in the repository.
+# Thus, you can assign different code owners for different branches.
+
+# Each line is a file pattern followed by one or more owners.
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# below owners will be requested for review when someone opens a pull request.
+* @polisettyvarma

CODEOWNERS

Lines changed: 0 additions & 1 deletion
This file was deleted.

README.md

Lines changed: 419 additions & 543 deletions
Large diffs are not rendered by default.

examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh

Lines changed: 26 additions & 11 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 DIR=`pwd`
 ###############################################################################

@@ -119,8 +121,14 @@ MP_SIZE=1
 ## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
 ## to 1 and use the "--no-pipeline-parallel" arg.
 PP_SIZE=1
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    NUM_GPUS_PERNODE=${NUM_GPUS}
+else
+    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
 ###############################################################################
 ### MoE configs

@@ -172,6 +180,7 @@ LOG_INTERVAL=10
 EVAL_ITERS=10
 EVAL_INTERVAL=100
 SAVE_INTERVAL=10000
+EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

 ## Standard deviation for weight initialization
 ## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B

@@ -241,13 +250,17 @@ if [ "${USE_INTERNAL_DATA}" = "true" ]; then
     0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
     0.01359 ${ARX} 0.01588 ${GIT}"
 else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
+    #VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
+    #MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
     # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
     # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-    DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
+    #DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
     # For cluster Azure-WestUS3-A100
     # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
+    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+    DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
 fi
 ###############################################################################
 data_options=" \

@@ -284,6 +297,7 @@ megatron_options=" \
     --min-lr ${MIN_LR} \
     --lr-decay-style cosine \
     --split 98,2,0 \
+    --exit-interval ${EXIT_INTERVAL} \
     --log-interval ${LOG_INTERVAL} \
     --eval-interval ${EVAL_INTERVAL} \
     --eval-iters ${EVAL_ITERS} \

@@ -299,11 +313,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${TENSORBOARD_DIR}"

 if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
     megatron_options="${megatron_options} \
-        --checkpoint-activations"
+        --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi

 if [[ $EP_SIZE -gt 1 ]]; then

@@ -329,12 +344,12 @@ sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
     | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
     | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
     | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-    > ${config_json}
+    > ${config_json}

 deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --pipeline-model-parallel-size ${PP_SIZE}"
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --pipeline-model-parallel-size ${PP_SIZE}"

 # Currently MoE is not compatible with pipeline parallel
 if [[ $EP_SIZE -gt 1 ]]; then

@@ -369,4 +384,4 @@ fi
 run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
 echo ${run_cmd}
 eval ${run_cmd}
-set +x
+set +x
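
The accelerator-count fallback introduced in these scripts is easiest to read on its own. The sketch below mirrors the hunk above and only adds comments plus a final echo for illustration; it assumes a single-node setup whenever `nvidia-smi` is unavailable, since `lspci` only sees local devices.

```
# Sketch of the detection logic added above (comments and echo are illustrative only).
nvidia-smi || count_GPU=0          # count_GPU is assigned only when nvidia-smi fails
if [[ ${count_GPU} == 0 ]]; then
    # No NVIDIA driver present: count local Gaudi devices via lspci.
    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
    NUM_GPUS_PERNODE=${NUM_GPUS}
else
    # NVIDIA path kept from the upstream script (ds_ssh queries every host).
    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( NUM_GPUS / NUM_GPUS_PERNODE ))
echo "devices=${NUM_GPUS} per_node=${NUM_GPUS_PERNODE} nodes=${NUM_NODE}"
```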

examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh

Lines changed: 22 additions & 7 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 DIR=`pwd`
 ###############################################################################

@@ -123,8 +125,14 @@ NO_PP="true"
 ZERO_STAGE=0

 ## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    NUM_GPUS_PERNODE=${NUM_GPUS}
+else
+    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
 DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
 ###############################################################################

@@ -143,6 +151,7 @@ LOG_INTERVAL=10
 EVAL_ITERS=10
 EVAL_INTERVAL=100
 SAVE_INTERVAL=1000
+EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

 ## Standard deviation for weight initialization. Usually larger model needs
 ## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the

@@ -175,13 +184,17 @@ mkdir -p ${LOG_PATH}
 mkdir -p ${TENSORBOARD_PATH}
 mkdir -p ${CHECKPOINT_PATH}

-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
+#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
+#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
 # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
 # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
+#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
 # For cluster Azure-WestUS3-A100
 # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
+BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
 ###############################################################################
 data_options=" \
     --vocab-file ${VOCAB_PATH} \

@@ -211,6 +224,7 @@ megatron_options=" \
     --min-lr ${MIN_LR} \
     --lr-decay-style cosine \
     --split 98,2,0 \
+    --exit-interval ${EXIT_INTERVAL} \
     --log-interval ${LOG_INTERVAL} \
     --eval-interval ${EVAL_INTERVAL} \
     --eval-iters ${EVAL_ITERS} \

@@ -226,11 +240,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${TENSORBOARD_PATH}"

 if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
     megatron_options="${megatron_options} \
-        --checkpoint-activations"
+        --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi

 if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then

@@ -306,4 +321,4 @@ fi
 run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
 echo ${run_cmd}
 eval ${run_cmd}
-set +x
+set +x
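
For context, the two `HL_` variables added to this script are plain environment overrides. A minimal launch sketch, assuming a preprocessed dataset root that contains `gpt2-vocab.json`, `gpt2-merges.txt`, and `meg-gpt2_text_document` (the path below is a placeholder):

```
# Hypothetical invocation of the script above with the new overrides:
# HL_DATA_DIR_ROOT replaces the default /data/bigscience/oscar-en/ dataset root,
# and HL_EXIT_INTERVAL is passed through as --exit-interval, so training stops
# once the iteration count reaches a multiple of 200.
HL_DATA_DIR_ROOT=/path/to/oscar-en \
HL_EXIT_INTERVAL=200 \
bash examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
```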

examples_deepspeed/MoE/readme_evalharness.md

Lines changed: 13 additions & 0 deletions
@@ -166,3 +166,16 @@ Import location: Replace data at selected cell
 4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match

 5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
+
+## Running lm-eval in Mixtral
+In Mixtral, the LM evaluation harness can be triggered directly from the generic run script. To run the tests, take a pre-trained model checkpoint and load it in the evaluation framework through the same training script, adding `HL_RUN_EVAL_HARNESS=1`, the checkpoint path `HL_CHECKPOINTS_DIR`, and the tag `HL_CHECKPOINT_LOAD_TAG` of the saved checkpoint:
+```
+HL_RUN_EVAL_HARNESS=1 \
+HL_CHECKPOINTS_DIR=<dir> \
+HL_CHECKPOINT_LOAD_TAG=global_step1000 \
+HL_TRUST_REMOTE_CODE=1 \
+HL_EVAL_TASKS='wikitext,webqs,winogrande' \
+$MEGATRON_DEEPSPEED_ROOT/scripts/run_mixtral.sh
+```
+Standard model arguments for inference, such as the 3D parallelism config and batch size, are also required. Specify `HL_EVAL_TASKS` to run the tests on a subset of the tasks.
+For tasks not included in the HuggingFace database, pass `HL_TRUST_REMOTE_CODE=1`. Some tasks may need a pre-downloaded dataset; the additional preparation steps can be found in the section above.
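
A small pre-flight sketch for the workflow described above. The paths are placeholders, and the assumption that each tag lives in its own subdirectory of the checkpoints directory follows the usual DeepSpeed checkpoint layout rather than anything stated in this README:

```
# Hypothetical pre-flight check before launching the harness.
CKPT_DIR=/path/to/checkpoints        # placeholder
TAG=global_step1000
# Assumed layout: ${CKPT_DIR}/${TAG}/ holds the saved DeepSpeed shards.
test -d "${CKPT_DIR}/${TAG}" || { echo "checkpoint tag ${TAG} not found"; exit 1; }

HL_RUN_EVAL_HARNESS=1 \
HL_CHECKPOINTS_DIR=${CKPT_DIR} \
HL_CHECKPOINT_LOAD_TAG=${TAG} \
HL_TRUST_REMOTE_CODE=1 \
HL_EVAL_TASKS='wikitext' \
$MEGATRON_DEEPSPEED_ROOT/scripts/run_mixtral.sh
```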

examples_deepspeed/README.md

Lines changed: 2 additions & 32 deletions
@@ -1,33 +1,3 @@
-# Megatron-DeepSpeed Recipes and Scripts
+# examples_deepspeed by microsoft/Megatron-DeepSpeed

-This folder includes various example scripts with DeepSpeed technologies integrated. Below we describe each sub-folder, sorted by last update date.
-
-## Sync with NVIDIA/Megatron-LM (last updated: Jul 2023)
-The ```rebase``` folder includes details about the recent sync with the NVIDIA/Megatron-LM repo (where this repo is forked from). It includes example scripts we used to test after the sync, together with a README documentation about what were tested.
-
-## Data Efficiency (last updated: Feb 2023)
-
-The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for DeepSpeed Data Efficiency Library, together with examples of zero-shot evaluation for GPT models and GLUE finetuning for BERT models. Please refer to the detailed tutorials in data_efficiency/README.MD. Currently this folder includes the newest example scripts for GPT/BERT pretraining/eval/finetuning, both with and without DeepSpeed Data Efficiency Library techniques.
-
-## BERT example (last updated: Dec 2022)
-
-The ```bert_with_pile``` folder includes examples about BERT-style model pre-training (using the public Pile data or user's own data) with DeepSpeed integration. Please refer to the readme in the folder for tutorial.
-
-## Azure (last updated: Nov 2022)
-
-We strongly recommend to start with AzureML recipe in the ```azureml``` folder.
-
-If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder.
-
-## Model Compression (last updated: Aug 2022)
-
-The ```compression``` folder includes examples about layer reduction for task-agnostic compression. Please refer to [this tutorial](https://www.deepspeed.ai/tutorials/model-compression/#11-layer-reduction) about the DeepSpeed Model Compression Library. These recipes are for GPT-style NLG models.
-
-## MoE (last updated: Jun 2022)
-
-Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models and dense models. These recipes are for GPT-style NLG models, and currently this is the only folder with MoE training examples.
-
-## Curriculum Learning (last updated: Oct 2021)
-
-Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models.
-Note that the DeepSpeed Data Efficiency Library above includes a more general curriculum learning support. This legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library above. However, the newer DeepSpeed Data Efficiency Library currently is not compatible with pipeline parallelism. So if you have to use pipeline parallelism, you would need to use this legacy curriculum learning version.
+Tests in examples_deepspeed are not supported on Intel® Gaudi® 2 and Gaudi® 3 AI accelerator. Supported scripts and configurations are mentioned [here](../README.md).

examples_deepspeed/run_deepspeed_example.sh

Lines changed: 13 additions & 7 deletions
@@ -1,8 +1,12 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 set -ex

-BASE_PATH=/vc_data/Megatron-LM/data
-DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
+BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
 DS_CONFIG=ds_config.json

 TP=1

@@ -48,7 +52,7 @@ ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
 ds_args=" --deepspeed-activation-checkpointing ${ds_args}"


-deepspeed pretrain_gpt.py \
+deepspeed ../pretrain_gpt.py \
     --tensor-model-parallel-size $TP \
     --pipeline-model-parallel-size $PP \
     --num-layers $NLAYERS \

@@ -67,8 +71,8 @@ deepspeed pretrain_gpt.py \
     --eval-iters 40 \
     --eval-interval 1000 \
     --data-path $DATA_PATH \
-    --vocab-file $BASE_PATH/gpt2-vocab.json \
-    --merge-file $BASE_PATH/gpt2-merges.txt \
+    --vocab-file $VOCAB_PATH \
+    --merge-file $MERGE_PATH \
     --save-interval 1000 \
     --split 98,2,0 \
     --clip-grad 1.0 \

@@ -78,7 +82,9 @@ deepspeed pretrain_gpt.py \
     --init-method-std 0.006 \
     --fp16 \
     --checkpoint-activations \
+    --recompute-granularity=full \
+    --recompute-method=uniform \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir $OUTPUT_DIR \
     $ds_args \
-    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
-
+    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
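
Because the script now resolves every input from `HL_DATA_DIR_ROOT`, a quick check of the expected files can save a failed launch. A minimal sketch, assuming the default layout shown in the diff (Megatron indexed datasets consist of a `.bin`/`.idx` pair next to the vocab and merges files):

```
# Hypothetical sanity check of the dataset root used by run_deepspeed_example.sh.
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
for f in gpt2-vocab.json gpt2-merges.txt meg-gpt2_text_document.bin meg-gpt2_text_document.idx; do
    [ -f "${BASE_DATA_PATH}/${f}" ] || echo "missing: ${BASE_DATA_PATH}/${f}"
done
```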

examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh

Lines changed: 35 additions & 19 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 dir=`pwd`
 ###############################################################################

@@ -147,8 +149,14 @@ no_pp="true"
 zero_stage=1

 ## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    num_gpus=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    num_gpus_pernode=${num_gpus}
+else
+    num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))

 ## Data parallel size.

@@ -187,21 +195,28 @@ host="${HOSTNAME}"
 seed=1234
 num_workers=0

-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+USE_INTERNAL_DATA="false"
+if [ "${USE_INTERNAL_DATA}" = "true" ]; then
+    data_path="BookCorpusDataset_text_document"
+    if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
+        wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
+    fi
+    if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
+        wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
+    fi
+    vocab_path="gpt2-vocab.json"
+    if [ ! -f "$vocab_path" ]; then
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+    fi
+    merge_path="gpt2-merges.txt"
+    if [ ! -f "$merge_path" ]; then
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+    fi
+else
+    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+    data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
+    vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
+    merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
 fi

 prescale_grad="true"

@@ -282,11 +297,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${tensorboard_path}"

 if [ "${activation_checkpoint}" = "true" ]; then
     megatron_options="${megatron_options} \
-        --checkpoint-activations"
+        --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi

 if [ "${log_optimizer_state}" = "true" ]; then

@@ -338,4 +354,4 @@ if [[ $iteration -gt 0 ]]; then
     ds_ssh "echo $iteration_2 > $iteration_file_2"
 fi

-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
+deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
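
A usage note on the `USE_INTERNAL_DATA` toggle introduced above: the default path now reads a user-supplied corpus from `HL_DATA_DIR_ROOT`, while the original BookCorpus download flow is kept behind the toggle. A brief sketch (the dataset path is a placeholder):

```
# Default (external data): point the script at a preprocessed corpus root.
HL_DATA_DIR_ROOT=/path/to/oscar-en \
bash examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh

# To restore the original BookCorpus download flow, edit the script and set:
# USE_INTERNAL_DATA="true"
```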
