
Commit 891d6db

SW publisher authored and Jenkins committed
Megatron-DeepSpeed content for 1.19.0
Signed-off-by: SW publisher <[email protected]>
1 parent 7eb36a1 commit 891d6db

100 files changed, +7401 -1474 lines changed


.github/CODEOWNERS

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+# This is a comment.
+# Code owners are automatically requested for review
+# when someone opens a pull request that modifies code that they own.
+# Code owners are not automatically requested to review draft pull requests.
+#
+# To use a CODEOWNERS file, create a new file called CODEOWNERS
+# in the root, docs/, or .github/ directory of the repository,
+# in the branch where you'd like to add the code owners.
+# Each CODEOWNERS file assigns the code owners for a single branch in the repository.
+# Thus, you can assign different code owners for different branches.
+
+# Each line is a file pattern followed by one or more owners.
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# below owners will be requested for review when someone opens a pull request.
+* @polisettyvarma

CODEOWNERS

Lines changed: 0 additions & 1 deletion
This file was deleted.

README.md

Lines changed: 419 additions & 543 deletions
Large diffs are not rendered by default.

examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh

Lines changed: 26 additions & 11 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 DIR=`pwd`
 ###############################################################################

@@ -119,8 +121,14 @@ MP_SIZE=1
 ## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
 ## to 1 and use the "--no-pipeline-parallel" arg.
 PP_SIZE=1
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    NUM_GPUS_PERNODE=${NUM_GPUS}
+else
+    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
 ###############################################################################
 ### MoE configs

@@ -172,6 +180,7 @@ LOG_INTERVAL=10
 EVAL_ITERS=10
 EVAL_INTERVAL=100
 SAVE_INTERVAL=10000
+EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

 ## Standard deviation for weight initialization
 ## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B

@@ -241,13 +250,17 @@ if [ "${USE_INTERNAL_DATA}" = "true" ]; then
     0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
     0.01359 ${ARX} 0.01588 ${GIT}"
 else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
+    #VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
+    #MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
     # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
     # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-    DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
+    #DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
     # For cluster Azure-WestUS3-A100
     # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
+    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+    DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
 fi
 ###############################################################################
 data_options=" \

@@ -284,6 +297,7 @@ megatron_options=" \
     --min-lr ${MIN_LR} \
     --lr-decay-style cosine \
     --split 98,2,0 \
+    --exit-interval ${EXIT_INTERVAL} \
     --log-interval ${LOG_INTERVAL} \
     --eval-interval ${EVAL_INTERVAL} \
     --eval-iters ${EVAL_ITERS} \

@@ -299,11 +313,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${TENSORBOARD_DIR}"

 if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
     megatron_options="${megatron_options} \
-        --checkpoint-activations"
+        --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi

 if [[ $EP_SIZE -gt 1 ]]; then

@@ -329,12 +344,12 @@ sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
     | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
     | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
     | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-    > ${config_json}
+    > ${config_json}

 deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --pipeline-model-parallel-size ${PP_SIZE}"
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --pipeline-model-parallel-size ${PP_SIZE}"

 # Currently MoE is not compatible with pipeline parallel
 if [[ $EP_SIZE -gt 1 ]]; then

@@ -369,4 +384,4 @@ fi
 run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
 echo ${run_cmd}
 eval ${run_cmd}
-set +x
+set +x
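
The accelerator-count fallback introduced in these scripts is easiest to read on its own. The sketch below mirrors the hunk above and only adds comments plus a final echo for illustration; it assumes a single-node setup whenever `nvidia-smi` is unavailable, since `lspci` only sees local devices.

```
# Sketch of the detection logic added above (comments and echo are illustrative only).
nvidia-smi || count_GPU=0          # count_GPU is assigned only when nvidia-smi fails
if [[ ${count_GPU} == 0 ]]; then
    # No NVIDIA driver present: count local Gaudi devices via lspci.
    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
    NUM_GPUS_PERNODE=${NUM_GPUS}
else
    # NVIDIA path kept from the upstream script (ds_ssh queries every host).
    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
fi
NUM_NODE=$(( NUM_GPUS / NUM_GPUS_PERNODE ))
echo "devices=${NUM_GPUS} per_node=${NUM_GPUS_PERNODE} nodes=${NUM_NODE}"
```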

examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh

Lines changed: 22 additions & 7 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 DIR=`pwd`
 ###############################################################################

@@ -123,8 +125,14 @@ NO_PP="true"
 ZERO_STAGE=0

 ## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    NUM_GPUS=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    NUM_GPUS_PERNODE=${NUM_GPUS}
+else
+    NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
 DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
 ###############################################################################

@@ -143,6 +151,7 @@ LOG_INTERVAL=10
 EVAL_ITERS=10
 EVAL_INTERVAL=100
 SAVE_INTERVAL=1000
+EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

 ## Standard deviation for weight initialization. Usually larger model needs
 ## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the

@@ -175,13 +184,17 @@ mkdir -p ${LOG_PATH}
 mkdir -p ${TENSORBOARD_PATH}
 mkdir -p ${CHECKPOINT_PATH}

-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
+#VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
+#MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
 # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
 # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
+#DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
 # For cluster Azure-WestUS3-A100
 # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
+BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
+DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
 ###############################################################################
 data_options=" \
     --vocab-file ${VOCAB_PATH} \

@@ -211,6 +224,7 @@ megatron_options=" \
     --min-lr ${MIN_LR} \
     --lr-decay-style cosine \
     --split 98,2,0 \
+    --exit-interval ${EXIT_INTERVAL} \
     --log-interval ${LOG_INTERVAL} \
     --eval-interval ${EVAL_INTERVAL} \
     --eval-iters ${EVAL_ITERS} \

@@ -226,11 +240,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${TENSORBOARD_PATH}"

 if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
     megatron_options="${megatron_options} \
-        --checkpoint-activations"
+        --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi

 if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then

@@ -306,4 +321,4 @@ fi
 run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
 echo ${run_cmd}
 eval ${run_cmd}
-set +x
+set +x
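
For context, the two `HL_` variables added to this script are plain environment overrides. A minimal launch sketch, assuming a preprocessed dataset root that contains `gpt2-vocab.json`, `gpt2-merges.txt`, and `meg-gpt2_text_document` (the path below is a placeholder):

```
# Hypothetical invocation of the script above with the new overrides:
# HL_DATA_DIR_ROOT replaces the default /data/bigscience/oscar-en/ dataset root,
# and HL_EXIT_INTERVAL is passed through as --exit-interval, so training stops
# once the iteration count reaches a multiple of 200.
HL_DATA_DIR_ROOT=/path/to/oscar-en \
HL_EXIT_INTERVAL=200 \
bash examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
```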

examples_deepspeed/MoE/readme_evalharness.md

Lines changed: 13 additions & 0 deletions
@@ -166,3 +166,16 @@ Import location: Replace data at selected cell
 4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match

 5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
+
+## Running lm-eval in Mixtral
+In Mixtral, the LM evaluation harness can be triggered directly from the generic run script. To run the tests, take a pre-trained model checkpoint and load it in the evaluation framework through the same training script, adding `HL_RUN_EVAL_HARNESS=1`, the checkpoint path `HL_CHECKPOINTS_DIR`, and the tag `HL_CHECKPOINT_LOAD_TAG` of the saved checkpoint:
+```
+HL_RUN_EVAL_HARNESS=1 \
+HL_CHECKPOINTS_DIR=<dir> \
+HL_CHECKPOINT_LOAD_TAG=global_step1000 \
+HL_TRUST_REMOTE_CODE=1 \
+HL_EVAL_TASKS='wikitext,webqs,winogrande' \
+$MEGATRON_DEEPSPEED_ROOT/scripts/run_mixtral.sh
+```
+Standard model arguments for inference, such as the 3D parallelism config and batch size, are also required. Specify `HL_EVAL_TASKS` to run the tests on a subset of the tasks.
+For tasks not included in the HuggingFace database, pass `HL_TRUST_REMOTE_CODE=1`. Some tasks may need a pre-downloaded dataset; the additional preparation steps can be found in the section above.
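
A small pre-flight sketch for the workflow described above. The paths are placeholders, and the assumption that each tag lives in its own subdirectory of the checkpoints directory follows the usual DeepSpeed checkpoint layout rather than anything stated in this README:

```
# Hypothetical pre-flight check before launching the harness.
CKPT_DIR=/path/to/checkpoints        # placeholder
TAG=global_step1000
# Assumed layout: ${CKPT_DIR}/${TAG}/ holds the saved DeepSpeed shards.
test -d "${CKPT_DIR}/${TAG}" || { echo "checkpoint tag ${TAG} not found"; exit 1; }

HL_RUN_EVAL_HARNESS=1 \
HL_CHECKPOINTS_DIR=${CKPT_DIR} \
HL_CHECKPOINT_LOAD_TAG=${TAG} \
HL_TRUST_REMOTE_CODE=1 \
HL_EVAL_TASKS='wikitext' \
$MEGATRON_DEEPSPEED_ROOT/scripts/run_mixtral.sh
```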

examples_deepspeed/README.md

Lines changed: 2 additions & 32 deletions
@@ -1,33 +1,3 @@
-# Megatron-DeepSpeed Recipes and Scripts
+# examples_deepspeed by microsoft/Megatron-DeepSpeed

-This folder includes various example scripts with DeepSpeed technologies integrated. Below we describe each sub-folder, sorted by last update date.
-
-## Sync with NVIDIA/Megatron-LM (last updated: Jul 2023)
-The ```rebase``` folder includes details about the recent sync with the NVIDIA/Megatron-LM repo (where this repo is forked from). It includes example scripts we used to test after the sync, together with a README documentation about what were tested.
-
-## Data Efficiency (last updated: Feb 2023)
-
-The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for DeepSpeed Data Efficiency Library, together with examples of zero-shot evaluation for GPT models and GLUE finetuning for BERT models. Please refer to the detailed tutorials in data_efficiency/README.MD. Currently this folder includes the newest example scripts for GPT/BERT pretraining/eval/finetuning, both with and without DeepSpeed Data Efficiency Library techniques.
-
-## BERT example (last updated: Dec 2022)
-
-The ```bert_with_pile``` folder includes examples about BERT-style model pre-training (using the public Pile data or user's own data) with DeepSpeed integration. Please refer to the readme in the folder for tutorial.
-
-## Azure (last updated: Nov 2022)
-
-We strongly recommend to start with AzureML recipe in the ```azureml``` folder.
-
-If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder.
-
-## Model Compression (last updated: Aug 2022)
-
-The ```compression``` folder includes examples about layer reduction for task-agnostic compression. Please refer to [this tutorial](https://www.deepspeed.ai/tutorials/model-compression/#11-layer-reduction) about the DeepSpeed Model Compression Library. These recipes are for GPT-style NLG models.
-
-## MoE (last updated: Jun 2022)
-
-Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models and dense models. These recipes are for GPT-style NLG models, and currently this is the only folder with MoE training examples.
-
-## Curriculum Learning (last updated: Oct 2021)
-
-Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models.
-Note that the DeepSpeed Data Efficiency Library above includes a more general curriculum learning support. This legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library above. However, the newer DeepSpeed Data Efficiency Library currently is not compatible with pipeline parallelism. So if you have to use pipeline parallelism, you would need to use this legacy curriculum learning version.
+Tests in examples_deepspeed are not supported on Intel® Gaudi® 2 and Gaudi® 3 AI accelerator. Supported scripts and configurations are mentioned [here](../README.md).

examples_deepspeed/run_deepspeed_example.sh

Lines changed: 13 additions & 7 deletions
@@ -1,8 +1,12 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 set -ex

-BASE_PATH=/vc_data/Megatron-LM/data
-DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
+BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+DATA_PATH=${BASE_DATA_PATH}/meg-gpt2_text_document
+VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
+MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
 DS_CONFIG=ds_config.json

 TP=1

@@ -48,7 +52,7 @@ ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
 ds_args=" --deepspeed-activation-checkpointing ${ds_args}"


-deepspeed pretrain_gpt.py \
+deepspeed ../pretrain_gpt.py \
     --tensor-model-parallel-size $TP \
     --pipeline-model-parallel-size $PP \
     --num-layers $NLAYERS \

@@ -67,8 +71,8 @@ deepspeed pretrain_gpt.py \
     --eval-iters 40 \
     --eval-interval 1000 \
     --data-path $DATA_PATH \
-    --vocab-file $BASE_PATH/gpt2-vocab.json \
-    --merge-file $BASE_PATH/gpt2-merges.txt \
+    --vocab-file $VOCAB_PATH \
+    --merge-file $MERGE_PATH \
     --save-interval 1000 \
     --split 98,2,0 \
     --clip-grad 1.0 \

@@ -78,7 +82,9 @@ deepspeed pretrain_gpt.py \
     --init-method-std 0.006 \
     --fp16 \
     --checkpoint-activations \
+    --recompute-granularity=full \
+    --recompute-method=uniform \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir $OUTPUT_DIR \
     $ds_args \
-    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
-
+    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
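
Because the script now resolves every input from `HL_DATA_DIR_ROOT`, a quick check of the expected files can save a failed launch. A minimal sketch, assuming the default layout shown in the diff (Megatron indexed datasets consist of a `.bin`/`.idx` pair next to the vocab and merges files):

```
# Hypothetical sanity check of the dataset root used by run_deepspeed_example.sh.
BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
for f in gpt2-vocab.json gpt2-merges.txt meg-gpt2_text_document.bin meg-gpt2_text_document.idx; do
    [ -f "${BASE_DATA_PATH}/${f}" ] || echo "missing: ${BASE_DATA_PATH}/${f}"
done
```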

examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh

Lines changed: 35 additions & 19 deletions
@@ -1,3 +1,5 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
 #!/bin/bash
 dir=`pwd`
 ###############################################################################

@@ -147,8 +149,14 @@ no_pp="true"
 zero_stage=1

 ## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+nvidia-smi || count_GPU=0
+if [[ ${count_GPU} == 0 ]];then
+    num_gpus=$(lspci | grep -i "Processing accelerators: Habana Labs Ltd." | wc -l)
+    num_gpus_pernode=${num_gpus}
+else
+    num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
+    num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+fi
 num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))

 ## Data parallel size.

@@ -187,21 +195,28 @@ host="${HOSTNAME}"
 seed=1234
 num_workers=0

-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+USE_INTERNAL_DATA="false"
+if [ "${USE_INTERNAL_DATA}" = "true" ]; then
+    data_path="BookCorpusDataset_text_document"
+    if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
+        wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
+    fi
+    if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
+        wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
+    fi
+    vocab_path="gpt2-vocab.json"
+    if [ ! -f "$vocab_path" ]; then
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+    fi
+    merge_path="gpt2-merges.txt"
+    if [ ! -f "$merge_path" ]; then
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+    fi
+else
+    BASE_DATA_PATH=${HL_DATA_DIR_ROOT:-/data/bigscience/oscar-en/}
+    data_path=${BASE_DATA_PATH}/meg-gpt2_text_document
+    vocab_path=${BASE_DATA_PATH}/gpt2-vocab.json
+    merge_path=${BASE_DATA_PATH}/gpt2-merges.txt
 fi

 prescale_grad="true"

@@ -282,11 +297,12 @@ megatron_options=" \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
+    --no-gradient-accumulation-fusion \
     --tensorboard-dir ${tensorboard_path}"

 if [ "${activation_checkpoint}" = "true" ]; then
     megatron_options="${megatron_options} \
-        --checkpoint-activations"
+        --checkpoint-activations --recompute-granularity=full --recompute-method=uniform"
 fi

 if [ "${log_optimizer_state}" = "true" ]; then

@@ -338,4 +354,4 @@ if [[ $iteration -gt 0 ]]; then
     ds_ssh "echo $iteration_2 > $iteration_file_2"
 fi

-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
+deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
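
A usage note on the `USE_INTERNAL_DATA` toggle introduced above: the default path now reads a user-supplied corpus from `HL_DATA_DIR_ROOT`, while the original BookCorpus download flow is kept behind the toggle. A brief sketch (the dataset path is a placeholder):

```
# Default (external data): point the script at a preprocessed corpus root.
HL_DATA_DIR_ROOT=/path/to/oscar-en \
bash examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh

# To restore the original BookCorpus download flow, edit the script and set:
# USE_INTERNAL_DATA="true"
```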
