
Commit 62bc456

Merge azure branch manually (deepspeedai#65)
1 parent 78592ae commit 62bc456

File tree: 11 files changed (+502 −155 lines)


README.md

Lines changed: 6 additions & 2 deletions
@@ -1,5 +1,8 @@
 ## Megatron-DeepSpeed
-DeepSpeed version of NVIDIA's Megatron-LM that adds additional support for several features such as MoE, Curriculum Learning, 3D Parallelism, etc.
+DeepSpeed version of NVIDIA's Megatron-LM that adds additional support for several features such as MoE model training, Curriculum Learning, 3D Parallelism, and others.
+
+### Run on Azure and AzureML
+To try out DeepSpeed on Azure, this fork of Megatron offers easy-to-use recipes and bash scripts. We strongly recommend starting with the AzureML recipe in the ```examples/azureml``` folder. If you have a custom infrastructure (e.g. HPC clusters) or an Azure VM-based environment, please refer to the bash scripts in the ```examples/azure``` folder.

 ------

@@ -76,7 +79,8 @@ The models require vocabulary files to run. The BERT WordPiece vocab file can b
 Additional notes for DeepSpeed. We have added a helper script to download the checkpoints and make the example runnable.

 Steps to follow:
-- bash ds_download_ckpt.sh -- this will download and extract the checkpoint and GPT merges and vocab files.
+- bash dataset/download_ckpt.sh -- this will download and extract the checkpoint.
+- bash dataset/download_vocab.sh -- this will download the GPT merges and vocab files.
 - bash examples/generate_text.sh -- this will generate examples using the 345m GPT model.

 # Usage
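
For quick reference, the updated "Steps to follow" list above amounts to roughly the following sequence, run from the Megatron-DeepSpeed root (a minimal sketch, not part of the changed files):

```bash
# Sketch of the steps listed above, run from the repository root
bash dataset/download_ckpt.sh      # download and extract the 345M GPT-2 checkpoint
bash dataset/download_vocab.sh     # download gpt2-vocab.json and gpt2-merges.txt
bash examples/generate_text.sh     # generate sample text with the 345M model
```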

dataset/README.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Run the scripts below to set up the dataset
+
+bash download_books.sh
+
+bash download_vocab.sh
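
Since these helpers use plain wget with no output directory, the files land in the current working directory. A minimal sketch (assuming the Azure recipes below, which read the data from ```$PWD/dataset/``` relative to the repository root):

```bash
# Sketch: run the helpers from inside dataset/ so the .bin/.idx and vocab files
# end up where the Azure recipes look for them (BASE_PATH=$PWD/dataset/)
cd dataset
bash download_books.sh
bash download_vocab.sh
cd ..
```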

dataset/download_books.sh

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
+wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx

ds_download_ckpt.sh renamed to dataset/download_ckpt.sh

Lines changed: 0 additions & 4 deletions
@@ -1,7 +1,3 @@
-
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-
 mkdir -p checkpoints/gpt2_345m

 cd checkpoints/gpt2_345m

dataset/download_vocab.sh

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt

examples/azure/README.md

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+## Recipes for experimentation on Azure
+
+The recipes have been tested from the command line on a cluster set up using Azure VMs and VMSS, as well as inside Docker-based environments.
+
+To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run as follows:
+
+```bash examples/azure/run-benchmark-model.sh```
+
+### Pre-requisites
+
+To run the above script, you will need to either set up your own dataset and modify the scripts, or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following helper scripts from the ```dataset``` folder:
+
+```bash dataset/download_books.sh```
+
+```bash dataset/download_vocab.sh```
+
+### Run 175B and 1T models
+
+We have included two recipes, one for the 175B model and one for the 1T model. To train these models, we assume that users will modify and tune hyperparameters and configurations themselves. To facilitate initial training, we have made the recipes runnable with the Books dataset as follows.
+
+```bash examples/azure/run-175b.sh```
+
+```bash examples/azure/run-1t.sh```
+
+### Note about ZeRO stage 3 and CPU offload
+
+By default, we have enabled ZeRO Stage 3 for both recipes above. For the 1T model, we have also enabled the CPU-offload feature to save memory and enable a larger batch size, which offers better performance.
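
Taken together, an end-to-end run of one of these recipes could look roughly as follows (a sketch, not part of the commit; the recipes read data from ```$PWD/dataset/```, and hyperparameters are expected to be tuned by the user):

```bash
# Sketch: fetch the Books dataset and GPT-2 vocab files, then launch a recipe
(cd dataset && bash download_books.sh && bash download_vocab.sh)
bash examples/azure/run-175b.sh      # or: bash examples/azure/run-1t.sh
```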

examples/azure/run-175b.sh

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+#!/bin/bash
+set -ex
+
+data_options=" \
+    --vocab-file ${VOCAB_PATH} \
+    --merge-file ${MERGE_PATH} \
+    --data-path ${DATA_PATH} \
+    --data-impl mmap"
+
+BASE_PATH=$PWD/dataset/
+DATA_PATH=${BASE_PATH}/BookCorpusDataset_text_document
+DS_CONFIG=ds_config.json
+
+# Hostfile path
+HF=/job/hostfile
+
+# Disabling tensor/pipeline parallelism
+TP=1
+PP=1
+
+# HEADS ~= HIDDEN/128
+
+# Model: 175B
+NLAYERS=96
+HIDDEN=12288
+HEADS=96
+SEQ=1024
+
+
+MICRO_BATCH=4
+NODES=1
+GPN=8
+GLOBAL_BATCH=$(( ${GPN} * ${MICRO_BATCH} * ${NODES} ))
+
+# Initial power scale for loss
+SP=15
+
+# Uncomment/comment one of the following blocks.
+
+# For 1T model, start with microbatch=1, try to get 2 and 4. If OOM w/ 4, use cpu-offloading
+
+# Set to cpu for offloading to cpu for larger models
+#OFFLOAD_DEVICE="cpu"
+#CPU_OPTIM=" --cpu-optimizer"
+
+# Set to none and empty string for no cpu offloading
+OFFLOAD_DEVICE="none"
+CPU_OPTIM=" "
+
+ZERO_STAGE=3
+OUTPUT_DIR=ds_z_off-${OFFLOAD_DEVICE}_stage_${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_nodes${NODES}
+#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
+mkdir -p $OUTPUT_DIR
+
+cat <<EOT > $DS_CONFIG
+{
+  "train_batch_size" : $GLOBAL_BATCH,
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
+  "steps_per_print": 1,
+  "gradient_accumulation_steps": 1,
+  "zero_optimization": {
+    "stage": 3,
+    "stage3_max_live_parameters": 3e9,
+    "stage3_max_reuse_distance": 3e9,
+    "stage3_param_persistence_threshold": 1e5,
+    "stage3_prefetch_bucket_size": 5e7,
+    "contiguous_gradients": true,
+    "overlap_comm": true,
+    "reduce_bucket_size": 90000000,
+    "sub_group_size": 1e9,
+    "offload_optimizer": {
+      "device": "$OFFLOAD_DEVICE",
+      "buffer_count": 4,
+      "pipeline_read": false,
+      "pipeline_write": false,
+      "pin_memory": true
+    }
+  },
+  "gradient_clipping": 1.0,
+  "fp16": {
+    "enabled": true,
+    "initial_scale_power" : $SP,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "wall_clock_breakdown": true,
+  "zero_allow_untested_optimizer": false,
+  "aio": {
+    "block_size": 1048576,
+    "queue_depth": 16,
+    "single_submit": false,
+    "overlap_events": true,
+    "thread_count": 2
+  }
+}
+EOT
+
+export NCCL_DEBUG=warn
+
+ds_args=" "
+ds_args=" --deepspeed ${ds_args}"
+ds_args=" --no-pipeline-parallel ${ds_args}"
+ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
+ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
+ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
+
+
+
+deepspeed --force_multi --num_nodes=$NODES --hostfile $HF pretrain_gpt.py \
+    --tensor-model-parallel-size $TP \
+    --pipeline-model-parallel-size $PP \
+    --num-layers $NLAYERS \
+    --hidden-size $HIDDEN \
+    --num-attention-heads $HEADS \
+    --seq-length $SEQ \
+    --loss-scale $SP \
+    --max-position-embeddings $SEQ \
+    --micro-batch-size $MICRO_BATCH \
+    --global-batch-size $GLOBAL_BATCH \
+    --train-iters 1000 \
+    --lr 6.0e-5 \
+    --min-lr 6.0e-6 \
+    --lr-decay-style cosine \
+    --log-interval 1 \
+    --eval-iters 40 \
+    --eval-interval 1000 \
+    --data-path $DATA_PATH \
+    --vocab-file $BASE_PATH/gpt2-vocab.json \
+    --merge-file $BASE_PATH/gpt2-merges.txt \
+    --save-interval 1000 \
+    --split 98,2,0 \
+    --clip-grad 1.0 \
+    --weight-decay 0.1 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --init-method-std 0.006 \
+    --fp16 \
+    --checkpoint-activations \
+    --tensorboard-dir $OUTPUT_DIR \
+    $CPU_OPTIM $ds_args \
+    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
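
The recipe above launches through ```deepspeed --hostfile /job/hostfile``` with NODES=1 by default. To scale out, one would raise NODES in the script and provide a matching hostfile; a minimal two-node sketch using DeepSpeed's hostfile format (hostnames here are placeholders, not from the commit):

```bash
# Sketch: write a two-node hostfile in DeepSpeed's "<hostname> slots=<gpus>" format
# (worker-1 / worker-2 are placeholder hostnames; slots=8 matches GPN=8 above),
# then set NODES=2 inside the recipe before launching it.
cat > /job/hostfile <<EOF
worker-1 slots=8
worker-2 slots=8
EOF
```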

examples/azure/run-1t.sh

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+#!/bin/bash
+set -ex
+
+data_options=" \
+    --vocab-file ${VOCAB_PATH} \
+    --merge-file ${MERGE_PATH} \
+    --data-path ${DATA_PATH} \
+    --data-impl mmap"
+
+BASE_PATH=$PWD/dataset/
+DATA_PATH=${BASE_PATH}/BookCorpusDataset_text_document
+DS_CONFIG=ds_config.json
+
+# Hostfile path
+HF=/job/hostfile
+
+# Disabling tensor/pipeline parallelism
+TP=1
+PP=1
+
+# HEADS ~= HIDDEN/128
+
+# Refer to the Megatron table in the README.md file for model sizes
+# Model: 310B
+#NLAYERS=96
+#HIDDEN=16384
+#HEADS=128
+#SEQ=2048
+
+# Model 530B
+#NLAYERS=105
+#HIDDEN=20480
+#HEADS=160
+#SEQ=2048
+
+# Model 1T
+NLAYERS=128
+HIDDEN=25600
+HEADS=160
+SEQ=1024
+
+MICRO_BATCH=1
+NODES=1
+GPN=8
+GLOBAL_BATCH=$(( ${GPN} * ${MICRO_BATCH} * ${NODES} ))
+
+# Initial power scale for loss
+SP=15
+
+# Uncomment/comment one of the following blocks.
+
+# For 1T model, start with microbatch=1, try to get 2 and 4. If OOM w/ 4, use cpu-offloading
+
+# Set to cpu for offloading to cpu for larger models
+OFFLOAD_DEVICE="cpu"
+CPU_OPTIM=" --cpu-optimizer"
+
+# Set to none and empty string for no cpu offloading
+#OFFLOAD_DEVICE="none"
+#CPU_OPTIM=" "
+
+ZERO_STAGE=3
+OUTPUT_DIR=ds_z_off-${OFFLOAD_DEVICE}_stage_${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_nodes${NODES}
+#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
+mkdir -p $OUTPUT_DIR
+
+cat <<EOT > $DS_CONFIG
+{
+  "train_batch_size" : $GLOBAL_BATCH,
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
+  "steps_per_print": 1,
+  "gradient_accumulation_steps": 1,
+  "zero_optimization": {
+    "stage": 3,
+    "stage3_max_live_parameters": 3e9,
+    "stage3_max_reuse_distance": 3e9,
+    "stage3_param_persistence_threshold": 1e5,
+    "stage3_prefetch_bucket_size": 5e7,
+    "contiguous_gradients": true,
+    "overlap_comm": true,
+    "reduce_bucket_size": 90000000,
+    "sub_group_size": 1e9,
+    "offload_optimizer": {
+      "device": "$OFFLOAD_DEVICE",
+      "buffer_count": 4,
+      "pipeline_read": false,
+      "pipeline_write": false,
+      "pin_memory": true
+    }
+  },
+  "gradient_clipping": 1.0,
+  "fp16": {
+    "enabled": true,
+    "initial_scale_power" : $SP,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "wall_clock_breakdown": true,
+  "zero_allow_untested_optimizer": false,
+  "aio": {
+    "block_size": 1048576,
+    "queue_depth": 16,
+    "single_submit": false,
+    "overlap_events": true,
+    "thread_count": 2
+  }
+}
+EOT
+
+export NCCL_DEBUG=warn
+
+ds_args=" "
+ds_args=" --deepspeed ${ds_args}"
+ds_args=" --no-pipeline-parallel ${ds_args}"
+ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
+ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
+ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
+
+
+
+deepspeed --force_multi --num_nodes=$NODES --hostfile $HF pretrain_gpt.py \
+    --tensor-model-parallel-size $TP \
+    --pipeline-model-parallel-size $PP \
+    --num-layers $NLAYERS \
+    --hidden-size $HIDDEN \
+    --num-attention-heads $HEADS \
+    --seq-length $SEQ \
+    --loss-scale $SP \
+    --max-position-embeddings $SEQ \
+    --micro-batch-size $MICRO_BATCH \
+    --global-batch-size $GLOBAL_BATCH \
+    --train-iters 1000 \
+    --lr 6.0e-5 \
+    --min-lr 6.0e-6 \
+    --lr-decay-style cosine \
+    --log-interval 1 \
+    --eval-iters 40 \
+    --eval-interval 1000 \
+    --data-path $DATA_PATH \
+    --vocab-file $BASE_PATH/gpt2-vocab.json \
+    --merge-file $BASE_PATH/gpt2-merges.txt \
+    --save-interval 1000 \
+    --split 98,2,0 \
+    --clip-grad 1.0 \
+    --weight-decay 0.1 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --init-method-std 0.006 \
+    --fp16 \
+    --checkpoint-activations \
+    --tensorboard-dir $OUTPUT_DIR \
+    $CPU_OPTIM $ds_args \
+    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
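
As a sanity check on the 1T hyperparameters above (an editor's back-of-the-envelope estimate, not from the commit): with the common 12·L·H² approximation for decoder-only transformer parameters, NLAYERS=128 and HIDDEN=25600 land at roughly one trillion parameters before embeddings:

```bash
# Rough parameter count for NLAYERS=128, HIDDEN=25600 (ignores embeddings and biases):
# params ≈ 12 * NLAYERS * HIDDEN^2
echo $(( 12 * 128 * 25600 * 25600 ))   # 1006632960000, i.e. ~1.0T parameters
```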
