From 34aafb2c21b8fecbb9524872ff357d6eb0713fc1 Mon Sep 17 00:00:00 2001 From: Nathan Mailhot Date: Mon, 6 Jan 2025 19:02:03 +0000 Subject: [PATCH] minor documentation changes post-2.21 --- .../inference/Dockerfile-inference | 2 +- .../inference/Dockerfile-inference-dlc | 4 +- .../inference/Dockerfile-libmode | 2 +- .../inference/Dockerfile.mxnet-serving | 2 +- .../v1/inference/Dockerfile.app-rt-diff | 2 +- .../v1/inference/Dockerfile.torch-neuron | 2 +- frameworks/jax/setup/jax-setup.rst | 5 +- .../tutorials/bert_mxnet/index.rst | 2 +- .../tutorials/bert_demo/bert_demo.rst | 6 +- .../tutorials/ssd300_demo/ssd300_detection.py | 2 +- .../ssd300_demo/ssd300_evaluation.py | 2 +- .../tutorials/ssd300_demo/ssd300_model.py | 6 +- .../setup/pytorch-install-cxx11.rst | 10 +- .../torch-neuron/troubleshooting-guide.rst | 2 +- .../pytorch-neuron-programming-guide.rst | 142 +++++++++--------- .../install-templates/pytorch-dev-install.txt | 4 +- .../neuronx-2.3.0-pytorch-install.rst | 4 +- .../neuronx-2.5.0-pytorch-install.rst | 4 +- .../setup/pytorch-neuronx-install-cxx11.rst | 16 +- .../torch-neuronx/tutorials/training/bert.rst | 10 +- .../announcements/neuron1.x/announcements.rst | 6 +- .../appnotes/torch-neuron/rcnn-app-note.rst | 6 +- .../torch-neuronx/introducing-pytorch-2-x.rst | 4 +- .../migration-from-xla-downcast-bf16.rst | 78 ++++++++-- .../quick-start/torch-neuron-tab-training.rst | 7 +- .../inf1/neuron-pip-setup.rst | 2 +- libraries/neuronx-distributed/setup/index.rst | 2 +- .../tutorials/finetuning_llama2_7b_ptl.rst | 2 +- .../finetuning_llama3_8B_ptl_lora.rst | 2 +- .../gpt_neox_tp_zero1/gpt_neox_20b.sh | 22 +++ .../gpt_neox_tp_zero1/gpt_neox_6_9b.sh | 22 +++ .../llama_tp_pp/llama_2_13b.sh | 24 +++ .../llama_tp_pp/llama_2_70b.sh | 24 +++ .../llama_tp_pp/llama_31_70b.sh | 24 +++ .../llama_tp_pp/llama_3_70b.sh | 24 +++ .../llama_tp_pp/llama_tp_pp_setup.sh | 12 ++ .../llama_tp_pp_ptl/llama_2_13b.sh | 25 +++ .../llama_tp_pp_ptl/llama_2_70b.sh | 25 +++ .../llama_tp_pp_ptl/llama_2_7b.sh | 25 +++ .../llama_tp_pp_ptl/llama_tp_pp_ptl_setup.sh | 13 ++ .../llama_tp_zero1/llama_2_7b.sh | 23 +++ .../llama_tp_zero1/llama_31_8b.sh | 25 +++ .../llama_tp_zero1/llama_3_8b.sh | 23 +++ .../llama_tp_zero1/llama_tp_zero1_setup.sh | 10 ++ .../tutorials/training-gpt-neox-20b.rst | 43 ++---- .../tutorials/training-gpt-neox.rst | 46 ++---- .../tutorials/training.rst | 2 +- .../tutorials/training_llama2_tp_pp_ptl.rst | 105 +++++-------- .../tutorials/training_llama_tp_pp.rst | 90 ++++------- .../tutorials/training_llama_tp_zero1.rst | 96 +++++------- libraries/nxd-inference/nxdi-setup.rst | 2 +- .../general/installation_guide.rst | 4 +- .../transformers-neuronx/setup/index.rst | 2 +- release-notes/torch/torch-neuronx/index.rst | 4 +- .../sagemaker_container_neuron.ipynb | 2 +- .../t5-inference/t5-inference-tutorial.ipynb | 2 +- .../keras_resnet50/keras_resnet50.ipynb | 4 +- .../tensorflow/openpose_demo/openpose.ipynb | 4 +- .../ssd300_demo/ssd300_detection.py | 2 +- .../ssd300_demo/ssd300_evaluation.py | 2 +- .../tensorflow/ssd300_demo/ssd300_model.py | 6 +- .../tensorflow_resnet50/resnet50.ipynb | 4 +- .../tensorflow/yolo_v3_demo/yolo_v3.ipynb | 4 +- .../tensorflow/yolo_v4_demo/evaluate.ipynb | 4 +- src/helperscripts/n2-helper.py | 2 +- src/helperscripts/neuronsetuphelper.py | 2 +- tools/neuronperf/index.rst | 2 +- tools/neuronperf/neuronperf_install.rst | 2 +- ...ing-started-tensorboard-neuronx-plugin.rst | 2 +- 69 files changed, 678 insertions(+), 419 deletions(-) create mode 100755 
libraries/neuronx-distributed/tutorials/nxd-source-code/gpt_neox_tp_zero1/gpt_neox_20b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/gpt_neox_tp_zero1/gpt_neox_6_9b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp/llama_2_13b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp/llama_2_70b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp/llama_31_70b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp/llama_3_70b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp/llama_tp_pp_setup.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp_ptl/llama_2_13b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp_ptl/llama_2_70b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp_ptl/llama_2_7b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_pp_ptl/llama_tp_pp_ptl_setup.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_zero1/llama_2_7b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_zero1/llama_31_8b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_zero1/llama_3_8b.sh create mode 100755 libraries/neuronx-distributed/tutorials/nxd-source-code/llama_tp_zero1/llama_tp_zero1_setup.sh diff --git a/containers/docker-example/inference/Dockerfile-inference b/containers/docker-example/inference/Dockerfile-inference index 26e5535a..916c9786 100644 --- a/containers/docker-example/inference/Dockerfile-inference +++ b/containers/docker-example/inference/Dockerfile-inference @@ -32,7 +32,7 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" # Include framework tensorflow-neuron or torch-neuronx and compiler (compiler not needed for inference) RUN pip3 install \ torch-neuronx \ - --extra-index-url=https://pip.repos.neuron.amazonaws.com + --index-url=https://pip.repos.neuron.amazonaws.com # Include your APP dependencies here. # RUN ... 
diff --git a/containers/docker-example/inference/Dockerfile-inference-dlc b/containers/docker-example/inference/Dockerfile-inference-dlc index 6aaaef1c..e0645e60 100644 --- a/containers/docker-example/inference/Dockerfile-inference-dlc +++ b/containers/docker-example/inference/Dockerfile-inference-dlc @@ -105,8 +105,8 @@ RUN pip install --no-cache-dir -U \ "awscli<2" \ boto3 -RUN pip install neuron-cc[tensorflow] --extra-index-url https://pip.repos.neuron.amazonaws.com \ - && pip install "torch-neuron>=1.10.2,<1.10.3" --extra-index-url https://pip.repos.neuron.amazonaws.com \ +RUN pip install neuron-cc[tensorflow] --index-url https://pip.repos.neuron.amazonaws.com \ + && pip install "torch-neuron>=1.10.2,<1.10.3" --index-url https://pip.repos.neuron.amazonaws.com \ && pip install torchserve==$TS_VERSION \ && pip install --no-deps --no-cache-dir -U torchvision==0.11.3 \ # Install TF 1.15.5 to override neuron-cc[tensorflow]'s installation of tensorflow==1.15.0 diff --git a/containers/docker-example/inference/Dockerfile-libmode b/containers/docker-example/inference/Dockerfile-libmode index 423d2d21..b5ae24e3 100644 --- a/containers/docker-example/inference/Dockerfile-libmode +++ b/containers/docker-example/inference/Dockerfile-libmode @@ -32,7 +32,7 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" # Include framework tensorflow-neuron or torch-neuron and compiler (compiler not needed for inference) RUN pip3 install \ torch-neuron \ - --extra-index-url=https://pip.repos.neuron.amazonaws.com + --index-url=https://pip.repos.neuron.amazonaws.com # Include your APP dependencies here. # RUN ... diff --git a/containers/docker-example/inference/Dockerfile.mxnet-serving b/containers/docker-example/inference/Dockerfile.mxnet-serving index 8ef04517..0f543951 100644 --- a/containers/docker-example/inference/Dockerfile.mxnet-serving +++ b/containers/docker-example/inference/Dockerfile.mxnet-serving @@ -15,7 +15,7 @@ RUN cd /tmp \ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 RUN update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3 1 -RUN pip install mxnet-neuron --extra-index-url=https://pip.repos.neuron.amazonaws.com +RUN pip install mxnet-neuron --index-url=https://pip.repos.neuron.amazonaws.com RUN pip install multi-model-server diff --git a/containers/docker-example/v1/inference/Dockerfile.app-rt-diff b/containers/docker-example/v1/inference/Dockerfile.app-rt-diff index 7c9ca1cb..84b772e4 100644 --- a/containers/docker-example/v1/inference/Dockerfile.app-rt-diff +++ b/containers/docker-example/v1/inference/Dockerfile.app-rt-diff @@ -26,7 +26,7 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU # Include framework tensorflow-neuron or torch-neuron and compiler (compiler not needed for inference) RUN pip3 install \ torch-neuron \ - --extra-index-url=https://pip.repos.neuron.amazonaws.com + --index-url=https://pip.repos.neuron.amazonaws.com # Include your APP dependencies here. # RUN/ENTRYPOINT/CMD ... 
diff --git a/containers/docker-example/v1/inference/Dockerfile.torch-neuron b/containers/docker-example/v1/inference/Dockerfile.torch-neuron index 285dd35a..aea601c3 100644 --- a/containers/docker-example/v1/inference/Dockerfile.torch-neuron +++ b/containers/docker-example/v1/inference/Dockerfile.torch-neuron @@ -36,7 +36,7 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" # Include framework tensorflow-neuron or torch-neuron and compiler (compiler not needed for inference) RUN pip3 install \ torch-neuron \ - --extra-index-url=https://pip.repos.neuron.amazonaws.com + --index-url=https://pip.repos.neuron.amazonaws.com # Include your APP dependencies here. # RUN ... diff --git a/frameworks/jax/setup/jax-setup.rst b/frameworks/jax/setup/jax-setup.rst index 11cad4de..10b04071 100644 --- a/frameworks/jax/setup/jax-setup.rst +++ b/frameworks/jax/setup/jax-setup.rst @@ -54,7 +54,7 @@ pip repository. .. code:: bash - python3 -m pip install jax-neuronx[stable] --extra-index-url=https://pip.repos.neuron.amazonaws.com + python3 -m pip install jax-neuronx[stable] --index-url=https://pip.repos.neuron.amazonaws.com The second is to install packages ``jax``, ``jaxlib``, ``libneuronxla``, and ``neuronx-cc`` separately, with ``jax-neuronx`` being an optional addition. @@ -65,7 +65,8 @@ pip repository. .. code:: bash - python3 -m pip install jax==0.4.31 jaxlib==0.4.31 jax-neuronx libneuronxla neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com + python3 -m pip install jax==0.4.31 jaxlib==0.4.31 + python3 -m pip install jax-neuronx libneuronxla neuronx-cc==2.* --index-url=https://pip.repos.neuron.amazonaws.com We can now run some simple JAX programs on the Trainium or Inferentia accelerators. diff --git a/frameworks/mxnet-neuron/tutorials/bert_mxnet/index.rst b/frameworks/mxnet-neuron/tutorials/bert_mxnet/index.rst index ac5f1da6..3bbc8825 100644 --- a/frameworks/mxnet-neuron/tutorials/bert_mxnet/index.rst +++ b/frameworks/mxnet-neuron/tutorials/bert_mxnet/index.rst @@ -108,7 +108,7 @@ Modify Pip repository configurations to point to the Neuron repository: tee $VIRTUAL_ENV/pip.conf > /dev/null <=1.8" --no-deps - pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuron==1.9.1" --no-deps - pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuron<1.10" --no-deps + pip install --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuron>=1.8" --no-deps + pip install --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuron==1.9.1" --no-deps + pip install --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuron<1.10" --no-deps .. important:: @@ -117,7 +117,7 @@ is to download the wheel and unpack the contents: .. code:: bash - pip download --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 torch-neuron --no-deps + pip download --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 torch-neuron --no-deps wheel unpack torch_neuron-*.whl If the exact version of the ``torch-neuron`` package is known and no diff --git a/frameworks/torch/torch-neuron/troubleshooting-guide.rst b/frameworks/torch/torch-neuron/troubleshooting-guide.rst index 3b2bcd2e..7a22f174 100644 --- a/frameworks/torch/torch-neuron/troubleshooting-guide.rst +++ b/frameworks/torch/torch-neuron/troubleshooting-guide.rst @@ -33,7 +33,7 @@ If you encounter an error like below, it is because the model size is larger tha To compile such large models, use the :ref:`separate_weights=True ` flag. 
Note, ensure that you have the latest version of compiler installed to support this flag. You can upgrade neuron-cc using -:code:`python3 -m pip install neuron-cc[tensorflow] -U --force --extra-index-url=https://pip.repos.neuron.amazonaws.com` +:code:`python3 -m pip install neuron-cc[tensorflow] -U --force --index-url=https://pip.repos.neuron.amazonaws.com` :: diff --git a/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.rst b/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.rst index 1ad38081..38c78d28 100644 --- a/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.rst +++ b/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.rst @@ -218,83 +218,82 @@ compiled and executed if there are extra mark-steps or functions with implicit mark-steps. Additionally, more graphs can be generated if there are different execution paths taken due to control-flows. -Automatic casting of float tensors to BFloat16 ---------------------------------------------- - -With PyTorch Neuron, the default behavior is for torch.float (FP32) and torch.double (FP64) tensors -to be mapped to torch.float in hardware. To reduce memory footprint and improve performance, -torch.float and torch.double tensors can automatically be converted to BFloat16 by setting -the environment variable ``XLA_USE_BF16=1``. Alternatively, torch.float can automatically be converted -to BFloat16 and torch.double converted to FP32 by setting the environment variable ``XLA_DOWNCAST_BF16=1``. - -Automatic Mixed-Precision ------------------------- - -BF16 mixed-precision using PyTorch Autocast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By default, the compiler automatically cast internal FP32 operations to -BF16. You can disable this and allow PyTorch's BF16 mixed-precision to -do the casting. PyTorch's BF16 mixed-precision is achieved by casting -certain operations to operate BF16. We currently use CUDA's list of -operations that can operate in BF16: - -.. code:: bash - - _convolution - _convolution - _convolution_nogroup - conv1d - conv2d - conv3d - conv_tbc - conv_transpose1d - conv_transpose2d - conv_transpose3d - convolution - cudnn_convolution - cudnn_convolution_transpose - cudnn_convolution - cudnn_convolution_transpose - cudnn_convolution - cudnn_convolution_transpose - prelu - addmm - addmv - addr - matmul - mm - mv - linear - addbmm - baddbmm - bmm - chain_matmul - linalg_multi_dot +Full BF16 with stochastic rounding enabled +------------------------------------------ -To enable PyTorch's BF16 mixed-precision, first turn off the Neuron -compiler auto-cast: +Previously, on torch-neuronx 2.1 and earlier, the environment variables ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` provided full casting to BF16 with stochastic rounding enabled by default. These environment variables are deprecated in torch-neuronx 2.5; they still work, but emit deprecation warnings. To replace ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` with stochastic rounding on Neuron, set ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1`` and use the ``torch.nn.Module.to`` method to cast model floating-point parameters and buffers to data-type BF16 as follows: .. code:: python - os.environ["NEURON_CC_FLAGS"] = "--auto-cast=none" + os.environ["NEURON_RT_STOCHASTIC_ROUNDING_EN"] = "1" + + # model is created + model.to(torch.bfloat16) + +Stochastic rounding is needed to enable faster convergence for full BF16 models.
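Putting these pieces together, below is a minimal sketch of a single training step with full BF16 and stochastic rounding; the model, data, and optimizer are illustrative placeholders, not part of the tutorial:

.. code:: python

    import os
    os.environ["NEURON_RT_STOCHASTIC_ROUNDING_EN"] = "1"  # enable stochastic rounding

    import torch
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    model = torch.nn.Linear(16, 4).to(device)  # placeholder model
    model.to(torch.bfloat16)                   # cast parameters and buffers to BF16
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    inputs = torch.randn(8, 16, dtype=torch.bfloat16).to(device)   # placeholder data
    targets = torch.randn(8, 4, dtype=torch.bfloat16).to(device)

    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    loss.backward()
    optimizer.step()
    xm.mark_step()  # materialize the lazily-recorded graph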
+ +If the loss is to be kept in FP32, initialize it with ``dtype=torch.float`` as follows: + +.. code:: python + + running_loss = torch.zeros(1, dtype=torch.float).to(device) -Next, overwrite torch.cuda.is_bf16_supported to return True: +Similarly, if the optimizer states are to be kept in FP32, convert the gradients to FP32 before optimizer computations: .. code:: python - torch.cuda.is_bf16_supported = lambda: True + grad = p.grad.data.float() -Next, per recommendation from official PyTorch documentation, place only -the forward-pass of the training step in the torch.autocast scope: +For a full example, please see the :ref:`PyTorch Neuron BERT Pretraining Tutorial (Data-Parallel) <hf-bert-pretraining-tutorial>`, which has been updated to use ``torch.nn.Module.to`` instead of ``XLA_DOWNCAST_BF16``. + +BF16 in GPU-compatible mode without stochastic rounding enabled +--------------------------------------------------------------- + +Full BF16 training in GPU-compatible mode would enable faster convergence without the need for stochastic rounding, but would require an FP32 copy of weights/parameters to be saved and used in the optimizer. To enable BF16 in GPU-compatible mode without stochastic rounding enabled, use the ``torch.nn.Module.to`` method to cast model floating-point parameters and buffers to data-type bfloat16 as follows, without setting ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1``: .. code:: python - with torch.autocast(dtype=torch.bfloat16, device_type='cuda'): + # model is created + model.to(torch.bfloat16) + +In the initializer of the optimizer, for example AdamW, you can add code like the following snippet to keep an FP32 copy of the weights: + +.. code:: python + + # keep a copy of weights in highprec + self.param_groups_highprec = [] + for group in self.param_groups: + params = group['params'] + param_groups_highprec = [p.data.float() for p in params] + self.param_groups_highprec.append({'params': param_groups_highprec}) + +In the :ref:`PyTorch Neuron BERT Pretraining Tutorial (Data-Parallel) <hf-bert-pretraining-tutorial>`, this mode can be enabled by passing the ``--optimizer=AdamW_FP32ParamsCopy`` option to ``dp_bert_large_hf_pretrain_hdf5.py`` and setting ``NEURON_RT_STOCHASTIC_ROUNDING_EN=0`` (or leaving it unset). + +.. _automatic_mixed_precision_autocast: + +BF16 automatic mixed precision using PyTorch Autocast +----------------------------------------------------- + +By default, the compiler automatically casts internal FP32 operations to +BF16. You can disable this and allow PyTorch's BF16 automatic mixed precision function (``torch.autocast``) to +cast certain operations to run in BF16. + +To enable PyTorch's BF16 mixed-precision, first turn off the Neuron +compiler auto-cast: + +.. code:: python + + os.environ["NEURON_CC_FLAGS"] = "--auto-cast=none" + +Next, per recommendation from official PyTorch `torch.autocast documentation `__, place only +the forward-pass of the training step in the ``torch.autocast`` scope with ``xla`` device type: + +.. code:: python + + with torch.autocast(dtype=torch.bfloat16, device_type='xla'): # forward pass -The device type is CUDA because we are using CUDA's list of BF16 -compatible operations as mentioned above. +The device type is XLA because we are using PyTorch-XLA's autocast backend. The PyTorch-XLA `autocast mode source code `_ lists which operations are cast to lower-precision BF16 ("lower precision fp cast policy" section), which are maintained in FP32 ("fp32 cast policy"), and which are promoted to the widest input types ("promote" section).
Example showing the original training code snippet: @@ -319,7 +318,7 @@ The following shows the training loop modified to use BF16 autocast: def train_loop_fn(train_loader): for i, data in enumerate(train_loader): torch.cuda.is_bf16_supported = lambda: True - with torch.autocast(dtype=torch.bfloat16, device_type='cuda'): + with torch.autocast(dtype=torch.bfloat16, device_type='xla'): inputs = data[0] labels = data[3] outputs = model(inputs, labels=labels) @@ -328,7 +327,7 @@ The following shows the training loop modified to use BF16 autocast: optimizer.step() xm.mark_step() -For a full example of BF16 mixed-precision, see :ref:`PyTorch Neuron BERT Pretraining Tutorial <hf-bert-pretraining-tutorial>`. +For a full example of BF16 mixed-precision, see :ref:`PyTorch Neuron BERT Pretraining Tutorial (Data-Parallel) <hf-bert-pretraining-tutorial>`. See official PyTorch documentation for more details about `torch.autocast `__ @@ -370,6 +369,12 @@ intermediate results such as loss values. In such case, the printing of lazy tensors should be wrapped using ``xm.add_step_closure()`` to avoid unnecessary compilation-and-executions. +Aggregate the data transfers between host CPUs and devices +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For best performance, you may try to aggregate the data transfers between host CPUs and devices. +For example, increasing the value of the ``batches_per_execution`` argument when instantiating ``MpDeviceLoader`` can help increase performance for certain workloads with frequent host-device traffic, such as ViT, as described in `a blog `_. Note: increasing the ``batches_per_execution`` value delays the mark-step until the specified number of batches have been processed, which increases the graph size and could lead to a device out-of-memory (OOM) error. + Ensure common initial weights across workers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -396,13 +401,6 @@ be loaded using ``serialization.load`` api. More information on this here: `Savi FAQ --- - -What is the difference between Trainium and Inferentia? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Trainium is an accelerator designed to speed up training, whereas -Inferentia is an accelerator designed to speed up inference.
- Debugging and troubleshooting ----------------------------- diff --git a/frameworks/torch/torch-neuronx/setup/install-templates/pytorch-dev-install.txt b/frameworks/torch/torch-neuronx/setup/install-templates/pytorch-dev-install.txt index 37ce4ae4..15579071 100644 --- a/frameworks/torch/torch-neuronx/setup/install-templates/pytorch-dev-install.txt +++ b/frameworks/torch/torch-neuronx/setup/install-templates/pytorch-dev-install.txt @@ -72,7 +72,7 @@ pip install awscli # Install packages from repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python -m pip config set global.index-url "https://pip.repos.neuron.amazonaws.com" # Install Python packages - Transformers package is needed for BERT python -m pip install torch-neuronx=="1.11.0.1.*" "neuronx-cc==2.*" @@ -144,7 +144,7 @@ pip install awscli # Install packages from repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python -m pip config set global.index-url "https://pip.repos.neuron.amazonaws.com" # Install Python packages - Transformers package is needed for BERT python -m pip install torch-neuronx=="1.11.0.1.*" "neuronx-cc==2.*" diff --git a/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.3.0-pytorch-install.rst b/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.3.0-pytorch-install.rst index eb2616aa..ecc73da7 100644 --- a/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.3.0-pytorch-install.rst +++ b/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.3.0-pytorch-install.rst @@ -66,7 +66,7 @@ Install PyTorch Neuron (Neuron 2.3.0) pip install -U pip # Install packages from beta repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python -m pip config set global.index-url "https://pip.repos.neuron.amazonaws.com" # Install Python packages - Transformers package is needed for BERT python -m pip install torch-neuronx=="1.11.0.1.1.1" "neuronx-cc==2.1.0.76" torchvision @@ -130,7 +130,7 @@ Install PyTorch Neuron (Neuron 2.3.0) python -m pip install -U pip # Install packages from beta repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python -m pip config set global.index-url "https://pip.repos.neuron.amazonaws.com" # Install Python packages - Transformers package is needed for BERT python -m pip install torch-neuronx=="1.11.0.1.1.1" "neuronx-cc==2.1.0.76" torchvision diff --git a/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.5.0-pytorch-install.rst b/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.5.0-pytorch-install.rst index 208827e0..bbada777 100644 --- a/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.5.0-pytorch-install.rst +++ b/frameworks/torch/torch-neuronx/setup/prev-releases/neuronx-2.5.0-pytorch-install.rst @@ -66,7 +66,7 @@ Install PyTorch NeuronX (Neuron 2.5.0) pip install -U pip # Install packages from beta repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python -m pip config set global.index-url "https://pip.repos.neuron.amazonaws.com" # Install Python packages - Transformers package is needed for BERT python -m pip install torch-neuronx=="1.11.0.1.2.0" "neuronx-cc==2.2.0.73" @@ -130,7 +130,7 @@ Install PyTorch NeuronX (Neuron 2.5.0) python -m pip install -U pip # Install packages from beta repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python -m pip 
config set global.index-url "https://pip.repos.neuron.amazonaws.com" # Install Python packages - Transformers package is needed for BERT python -m pip install torch-neuronx=="1.11.0.1.2.0" "neuronx-cc==2.2.0.73" diff --git a/frameworks/torch/torch-neuronx/setup/pytorch-neuronx-install-cxx11.rst b/frameworks/torch/torch-neuronx/setup/pytorch-neuronx-install-cxx11.rst index 81bb76eb..198cb986 100644 --- a/frameworks/torch/torch-neuronx/setup/pytorch-neuronx-install-cxx11.rst +++ b/frameworks/torch/torch-neuronx/setup/pytorch-neuronx-install-cxx11.rst @@ -55,7 +55,7 @@ index. :: - pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 torch-neuronx --no-deps + pip install --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 torch-neuronx --no-deps Specific versions of ``torch-neuronx`` with C++11 ABI support can be installed @@ -63,8 +63,8 @@ just like standard versions of ``torch-neuronx``. :: - pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuronx==2.1.*" --no-deps - pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuronx==2.5,*" --no-deps + pip install --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuronx==2.1.*" --no-deps + pip install --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 "torch-neuronx==2.5.*" --no-deps .. important:: @@ -87,9 +87,9 @@ just like standard versions of ``torch-neuronx``. Building ``torch`` and ``torch-xla`` with C++11 ABI ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The instructions for building ``torch`` from source is at https://github.com/pytorch/pytorch#from-source +The instructions for building ``torch`` from source are at https://github.com/pytorch/pytorch#from-source -The instructions for building ``torch-xla`` from source is at https://github.com/pytorch/xla/blob/master/CONTRIBUTING.md +The instructions for building ``torch-xla`` from source are at https://github.com/pytorch/xla/blob/master/CONTRIBUTING.md The following are simplified instructions (subject to change): @@ -169,7 +169,7 @@ is to download the wheel and unpack the contents: .. code:: bash - pip download --extra-index-url=https://pip.repos.neuron.amazonaws.com/cxx11 torch-neuronx --no-deps + pip download --index-url=https://pip.repos.neuron.amazonaws.com/cxx11 torch-neuronx --no-deps wheel unpack torch_neuronx-*.whl If the exact version of the ``torch-neuronx`` package is known and no @@ -184,8 +184,8 @@ package file directly and ``unzip`` the wheel: .. _pytorch-neuronx-cxx11-versioning: -How can I know which ABI torch-neuron is using? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How can I know which ABI torch-neuronx is using? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Packages which use the pre-C++11 ABI have no local identifier and use the following version scheme: diff --git a/frameworks/torch/torch-neuronx/tutorials/training/bert.rst b/frameworks/torch/torch-neuronx/tutorials/training/bert.rst index a5aab02a..04aae17a 100644 --- a/frameworks/torch/torch-neuronx/tutorials/training/bert.rst +++ b/frameworks/torch/torch-neuronx/tutorials/training/bert.rst @@ -1,10 +1,10 @@ .. _hf-bert-pretraining-tutorial: -Hugging Face BERT Pretraining Tutorial -====================================== +Hugging Face BERT Pretraining Tutorial (Data-Parallel) +====================================================== This tutorial explains how to run Hugging Face BERT-Large model -pretraining on Trainium using PyTorch Neuron.
+pretraining on Trainium using PyTorch Neuron and data-parallel mode. The Hugging Face BERT pretraining example demonstrates the steps required to perform single-node, multi-accelerator PyTorch model @@ -44,7 +44,7 @@ Phase 1 BFloat16 BERT-Large pretraining with AdamW and stochastic rounding Setting up the training environment on trn1.32xlarge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The BERT training script ``dp_bert_large_hf_pretrain_hdf5.py`` +The BERT training script ``dp_bert_large_hf_pretrain_hdf5.py`` (`source `_) can run on a Trainium instance (trn1.32xlarge) that contains the appropriate Neuron runtime and Python dependencies. @@ -60,7 +60,7 @@ For all the commands below, make sure you are in the virtual environment that yo source ~/aws_neuron_venv_pytorch/bin/activate -Next, clone the AWS Neuron Samples repository and install requirements in the BERT tutorial directory ``aws-neuron-samples/torch-neuronx/training/dp_bert_hf_pretrain``: +Next, clone the `AWS Neuron Samples repository `_ and install requirements in the BERT tutorial directory ``aws-neuron-samples/torch-neuronx/training/dp_bert_hf_pretrain`` (`directory link `_): .. code:: shell diff --git a/general/announcements/neuron1.x/announcements.rst b/general/announcements/neuron1.x/announcements.rst index 9174aaf3..3ac797b0 100644 --- a/general/announcements/neuron1.x/announcements.rst +++ b/general/announcements/neuron1.x/announcements.rst @@ -157,7 +157,7 @@ Starting with Neuron SDK 1.14.0, run one of the following commands to upgrade to .. code-block:: source activate aws_neuron_pytorch_p36 - pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + pip config set global.index-url https://pip.repos.neuron.amazonaws.com pip install --upgrade torch-neuron neuron-cc[tensorflow] torchvision * To upgrade TensorFlow Neuron: @@ -165,7 +165,7 @@ Starting with Neuron SDK 1.14.0, run one of the following commands to upgrade to .. code-block:: source activate aws_neuron_tensorflow_p36 - pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + pip config set global.index-url https://pip.repos.neuron.amazonaws.com pip install --upgrade tensorflow-neuron tensorboard-neuron neuron-cc * To upgrade MXNet Neuron: @@ -173,7 +173,7 @@ Starting with Neuron SDK 1.14.0, run one of the following commands to upgrade to .. code-block:: source activate aws_neuron_mxnet_p36 - pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + pip config set global.index-url https://pip.repos.neuron.amazonaws.com pip install --upgrade mxnet-neuron neuron-cc For more information please check the `blog `__. 
diff --git a/general/appnotes/torch-neuron/rcnn-app-note.rst b/general/appnotes/torch-neuron/rcnn-app-note.rst index 03c363d9..bbea55bd 100644 --- a/general/appnotes/torch-neuron/rcnn-app-note.rst +++ b/general/appnotes/torch-neuron/rcnn-app-note.rst @@ -88,10 +88,10 @@ leads to a RoI Align latency reduction two to three times larger than the defaul !sudo apt install python3.7-dev -y # Install Neuron packages - !pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com !pip uninstall -y torchvision - !pip install --force-reinstall torch-neuron==1.11.0.* neuron-cc[tensorflow] "protobuf==3.20.1" ninja opencv-python - + !pip install --force-reinstall "protobuf==3.20.1" ninja opencv-python + !pip install --force-reinstall torch-neuron==1.11.0.* neuron-cc[tensorflow] --index-url https://pip.repos.neuron.amazonaws.com + # Change cuda to 10.2 for Detectron2 !sudo rm /usr/local/cuda !sudo ln -s /usr/local/cuda-10.2 /usr/local/cuda diff --git a/general/appnotes/torch-neuronx/introducing-pytorch-2-x.rst b/general/appnotes/torch-neuronx/introducing-pytorch-2-x.rst index a7d1f6af..2418b367 100644 --- a/general/appnotes/torch-neuronx/introducing-pytorch-2-x.rst +++ b/general/appnotes/torch-neuronx/introducing-pytorch-2-x.rst @@ -63,7 +63,7 @@ To migrate the training scripts from PyTorch NeuronX 2.1 to PyTorch NeuronX 2.5, ``xm`` below refers to ``torch_xla.core.xla_model`` and ``xr`` refers to ``torch_xla.runtime`` -* The environment variables ``XLA_DOWNCAST_BF16`` and ``XLA_USE_BF16`` are deprecated (warning when used). Please switch to automatic mixed-precision or use ``model.to(torch.bfloat16)`` command to convert model to BF16 format. (see :ref:``) +* The environment variables ``XLA_DOWNCAST_BF16`` and ``XLA_USE_BF16`` are deprecated (warning when used). Please switch to automatic mixed-precision or use ``model.to(torch.bfloat16)`` command to convert model to BF16 format. (see :ref:`migration_from_xla_downcast_bf16`) * The ``torch_xla.experimental.pjrt`` module which was replaced by ``torch_xla.runtime`` in Torch-XLA 2.1, has been removed in Torch-XLA 2.5. Users should now utilize the ``torch_xla.runtime`` module as a replacement. * ``torch_xla.runtime.using_pjrt`` is removed because PJRT is the sole Torch-XLA runtime. * ``xm.all_reduce`` no longer operates in-place for single tensors. To fix this, please convert the single tensor to an array (e.g.. ``[single_tensor]``) or assign the output of ``xm.all_reduce`` to a variable. @@ -108,7 +108,7 @@ This is a warning that ``torch_xla.core.xla_model.xrt_world_size()`` will be rem WARNING:torch_xla.core.xla_model.xla_model.get_ordinal() will be removed in release 2.7. is deprecated. Use torch_xla.runtime.global_ordinal instead. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This is a warning that ``torch_xla.core.xla_model.xla_model.get_ordinal() `` will be removed in a future release. Please switch to using ``torch_xla.runtime.global_ordinal`` instead. +This is a warning that ``torch_xla.core.xla_model.xla_model.get_ordinal()`` will be removed in a future release. Please switch to using ``torch_xla.runtime.global_ordinal`` instead. 
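As a quick reference, a minimal before-and-after sketch of these replacements (assuming a script that already initializes ``torch_xla``):

.. code:: python

    import torch_xla.core.xla_model as xm  # older API, shown only for contrast
    import torch_xla.runtime as xr

    # Deprecated calls that now emit removal warnings:
    #   world_size = xm.xrt_world_size()
    #   rank = xm.get_ordinal()

    # Replacements from torch_xla.runtime:
    world_size = xr.world_size()
    rank = xr.global_ordinal()
    print(f"worker {rank} of {world_size}")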
AttributeError: module 'torch_xla.runtime' has no attribute 'using_pjrt' diff --git a/general/appnotes/torch-neuronx/migration-from-xla-downcast-bf16.rst b/general/appnotes/torch-neuronx/migration-from-xla-downcast-bf16.rst index 954ba352..f141a9e2 100644 --- a/general/appnotes/torch-neuronx/migration-from-xla-downcast-bf16.rst +++ b/general/appnotes/torch-neuronx/migration-from-xla-downcast-bf16.rst @@ -8,15 +8,12 @@ Introduction The environmental variables ``XLA_USE_BF16`` and ``XLA_DOWNCAST_BF16`` were created to provide an easy cast-to-bf16 option before automatic mixed-precision or ``model.to(torch.bfloat16)`` as available in Torch-XLA. Now that both automatic mixed precision and ``model.to(torch.bfloat16)`` are available in Torch-XLA, ``XLA_USE_BF16`` and ``XLA_DOWNCAST_BF16`` are redundant and can be replaced with these options as a more familiar experience as on other platforms such as CPUs and GPUs. (They are deprecated in Torch-XLA 2.5 as warnings, and will be removed in a future release). -This change can best be made to scripts running with Torch-XLA 2.1 and 2.5. +The changes recommended below can best be made to scripts running with Torch-XLA 2.1 and 2.5. The same recommendations are also available in :ref:`pytorch-neuronx-programming-guide`. Full BF16 with stochastic rounding enabled ------------------------------------------ -On Neuron, when the environmental variable ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` is set, stochastic rounding is enabled by default. If they are not used, then stochastic rounding is off unless ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1``. - -To replace ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` with stochastic rounding on Neuron, set ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1`` and use the “to” function to move the model to data-type bfloat16 as follows: - +Previously, on torch-neuronx 2.1 and earlier, the environment variables ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` provided full casting to BF16 with stochastic rounding enabled by default. These environment variables are deprecated in torch-neuronx 2.5; they still work, but emit deprecation warnings. To replace ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` with stochastic rounding on Neuron, set ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1`` and use the ``torch.nn.Module.to`` method to cast model floating-point parameters and buffers to data-type BF16 as follows: .. code:: python @@ -25,6 +22,8 @@ To replace ``XLA_USE_BF16`` or ``XLA_DOWNCAST_BF16`` with stochastic rounding on # model is created model.to(torch.bfloat16) +Stochastic rounding is needed to enable faster convergence for full BF16 models. + If the loss is to be kept in FP32, initialize it with ``dtype=torch.float`` as follows: .. code:: python @@ -37,12 +36,12 @@ Similarly, if the optimizer states are to be kept in FP32, convert the gradients grad = p.grad.data.float() -For a full example, please see the updated BERT pretraining tutorial. +For a full example, please see the :ref:`PyTorch Neuron BERT Pretraining Tutorial (Data-Parallel) <hf-bert-pretraining-tutorial>`, which has been updated to use ``torch.nn.Module.to`` instead of ``XLA_DOWNCAST_BF16``.
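To make the gradient-conversion recommendation above concrete, here is a toy optimizer sketch that keeps its update arithmetic in FP32 while the parameters stay in BF16 (illustrative only; it is not the tutorial's ``AdamW_FP32ParamsCopy`` optimizer):

.. code:: python

    import torch

    class ToySGDWithFP32Compute(torch.optim.Optimizer):
        """Toy SGD variant: parameters stay in BF16, update math runs in FP32."""

        def __init__(self, params, lr=1e-3):
            super().__init__(params, dict(lr=lr))

        @torch.no_grad()
        def step(self, closure=None):
            for group in self.param_groups:
                for p in group["params"]:
                    if p.grad is None:
                        continue
                    grad = p.grad.data.float()        # upcast BF16 gradient to FP32
                    update = grad.mul_(-group["lr"])  # FP32 arithmetic
                    p.data.add_(update.to(p.dtype))   # write back in the parameter dtype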
BF16 in GPU-compatible mode without stochastic rounding enabled --------------------------------------------------------------- -To enable BF16 in GPU-compatible mode without stochastic rounding enabled, use the “to” function to move the model to data-type bfloat16 as follows without setting ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1``: +Full BF16 training in GPU-compatible mode would enable faster convergence without the need for stochastic rounding, but would require an FP32 copy of weights/parameters to be saved and used in the optimizer. To enable BF16 in GPU-compatible mode without stochastic rounding enabled, use the ``torch.nn.Module.to`` method to cast model floating-point parameters and buffers to data-type bfloat16 as follows, without setting ``NEURON_RT_STOCHASTIC_ROUNDING_EN=1``: .. code:: python @@ -76,7 +75,66 @@ From then, you can use the usual gradients but updating the FP32 copy of weights p_highprec.data.addcdiv_(exponential_avg, denominator, value=-step_size) -Automatic Mixed Precision ------------------------- +In the :ref:`PyTorch Neuron BERT Pretraining Tutorial (Data-Parallel) <hf-bert-pretraining-tutorial>`, this mode can be enabled by passing the ``--optimizer=AdamW_FP32ParamsCopy`` option to ``dp_bert_large_hf_pretrain_hdf5.py`` and setting ``NEURON_RT_STOCHASTIC_ROUNDING_EN=0`` (or leaving it unset). + +BF16 automatic mixed precision using PyTorch Autocast +----------------------------------------------------- + +By default, the compiler automatically casts internal FP32 operations to +BF16. You can disable this and allow PyTorch's BF16 automatic mixed precision function (``torch.autocast``) to +cast certain operations to run in BF16. + +To enable PyTorch's BF16 mixed-precision, first turn off the Neuron +compiler auto-cast: + +.. code:: python + + os.environ["NEURON_CC_FLAGS"] = "--auto-cast=none" + +Next, per recommendation from official PyTorch `torch.autocast documentation `__, place only +the forward-pass of the training step in the ``torch.autocast`` scope with ``xla`` device type: + +.. code:: python + + with torch.autocast(dtype=torch.bfloat16, device_type='xla'): + # forward pass + +The device type is XLA because we are using PyTorch-XLA's autocast backend. The PyTorch-XLA `autocast mode source code `_ lists which operations are cast to lower-precision BF16 ("lower precision fp cast policy" section), which are maintained in FP32 ("fp32 cast policy"), and which are promoted to the widest input types ("promote" section). + +Example showing the original training code snippet: + +.. code:: python + + def train_loop_fn(train_loader): + for i, data in enumerate(train_loader): + inputs = data[0] + labels = data[3] + outputs = model(inputs, labels=labels) + loss = outputs.loss / flags.grad_acc_steps + loss.backward() + optimizer.step() + xm.mark_step() + +The following shows the training loop modified to use BF16 autocast: + +.. code:: python -See the existing `Automatic Mixed Precision example `_. + os.environ["NEURON_CC_FLAGS"] = "--auto-cast=none" + + def train_loop_fn(train_loader): + for i, data in enumerate(train_loader): + torch.cuda.is_bf16_supported = lambda: True + with torch.autocast(dtype=torch.bfloat16, device_type='xla'): + inputs = data[0] + labels = data[3] + outputs = model(inputs, labels=labels) + loss = outputs.loss / flags.grad_acc_steps + loss.backward() + optimizer.step() + xm.mark_step() + +For a full example of BF16 mixed-precision, see :ref:`PyTorch Neuron BERT Pretraining Tutorial (Data-Parallel) <hf-bert-pretraining-tutorial>`.
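One defensive variant of the ``NEURON_CC_FLAGS`` setting shown above, assuming the variable may already carry other compiler options in your environment, is to append rather than overwrite:

.. code:: python

    import os

    # Preserve any pre-existing compiler flags while disabling auto-cast
    flags = os.environ.get("NEURON_CC_FLAGS", "")
    os.environ["NEURON_CC_FLAGS"] = (flags + " --auto-cast=none").strip()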
+ +See official PyTorch documentation for more details about +`torch.autocast `__. diff --git a/general/quick-start/torch-neuron-tab-training.rst b/general/quick-start/torch-neuron-tab-training.rst index 35814b8c..3c15d256 100644 --- a/general/quick-start/torch-neuron-tab-training.rst +++ b/general/quick-start/torch-neuron-tab-training.rst @@ -76,16 +76,13 @@ source aws_neuron_venv_pytorch/bin/activate pip install -U pip - # Install packages from repos - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" - # Install wget, awscli pip install wget pip install awscli # Install Neuron packages - pip install torch-neuronx==1.13.0.1.* - pip install neuronx-cc==2.* + pip install torch-neuronx==1.13.0.1.* --index-url=https://pip.repos.neuron.amazonaws.com + pip install neuronx-cc==2.* --index-url=https://pip.repos.neuron.amazonaws.com .. dropdown:: Run Tutorial diff --git a/general/setup/install-templates/inf1/neuron-pip-setup.rst b/general/setup/install-templates/inf1/neuron-pip-setup.rst index 5d280d77..fe2b5f3b 100644 --- a/general/setup/install-templates/inf1/neuron-pip-setup.rst +++ b/general/setup/install-templates/inf1/neuron-pip-setup.rst @@ -4,5 +4,5 @@ Modify Pip repository configurations to point to the Neuron repository: tee $VIRTUAL_ENV/pip.conf > /dev/null <` to install neuron python packages. -We also need to install the ``neuronx-distributed`` package using the following command: +We also need to install and clone the ``neuronx-distributed`` package using the following commands: .. code:: ipython3 - python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install neuronx_distributed --index-url https://pip.repos.neuron.amazonaws.com + git clone git@github.com:aws-neuron/neuronx-distributed.git Let’s download the scripts for pretraining. -.. code:: ipython3 - - mkdir -p ~/examples/tp_dp_gpt_neox_hf_pretrain - cd ~/examples/tp_dp_gpt_neox_hf_pretrain - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.sh - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/modeling_gpt_neox_nxd.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/utils.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/common/adamw_fp32_optim_params.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/common/get_dataset.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/tp_dp_gpt_neox_hf_pretrain/common/requirements.txt - python3 -m pip install -r requirements.txt +.. literalinclude:: nxd-source-code/gpt_neox_tp_zero1/gpt_neox_6_9b.sh + :language: shell + :lines: 4-10 Next let’s download and pre-process the dataset: -..
literalinclude:: nxd-source-code/gpt_neox_tp_zero1/gpt_neox_6_9b.sh + :language: shell + :lines: 12 At this point, you are all set to start training. @@ -50,25 +41,20 @@ At this point, you are all set to start training. We first pre-compile the graphs using the ``neuron_parallel_compile``. -Suppose the cluster queue name is ``compute1-dy-training-0`` and we are using node 1-4, -let’s run the command below: +Let’s run the command below: -.. code:: ipython3 - - sbatch --exclusive \ - --nodelist=compute1-dy-training-0-[1-4] \ - --wrap="srun neuron_parallel_compile bash $(pwd)/tp_dp_gpt_neox_6.9b_hf_pretrain.sh" +.. literalinclude:: nxd-source-code/gpt_neox_tp_zero1/gpt_neox_6_9b.sh + :language: shell + :lines: 16-18 This script uses a tensor-parallel size of 8. This will automatically set the zero-1 sharding degree to 16 (4 * 32 workers / tensor_parallel_size). Once the graphs are compiled we can now run training and observe our loss goes down. To run the training, we just the above command but without ``neuron_parallel_compile``. -.. code:: ipython3 - - sbatch --exclusive \ - --nodelist=compute1-dy-training-0-[1-4] \ - --wrap="srun bash $(pwd)/tp_dp_gpt_neox_6.9b_hf_pretrain.sh" +.. literalinclude:: nxd-source-code/gpt_neox_tp_zero1/gpt_neox_6_9b.sh + :language: shell + :lines: 20-22 **ZeRO-1 Optimizer** diff --git a/libraries/neuronx-distributed/tutorials/training.rst b/libraries/neuronx-distributed/tutorials/training.rst index 84ede50d..9e8622e5 100644 --- a/libraries/neuronx-distributed/tutorials/training.rst +++ b/libraries/neuronx-distributed/tutorials/training.rst @@ -24,7 +24,7 @@ following command: .. code:: ipython3 - python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install neuronx_distributed --index-url https://pip.repos.neuron.amazonaws.com Make sure the transformers version is set to ``4.26.0`` (Note: If you have transformers-neuronx in your environment, you need to uninstall it to avoid a conflict with the transformers version.) diff --git a/libraries/neuronx-distributed/tutorials/training_llama2_tp_pp_ptl.rst b/libraries/neuronx-distributed/tutorials/training_llama2_tp_pp_ptl.rst index e1dd95dd..b0c01631 100644 --- a/libraries/neuronx-distributed/tutorials/training_llama2_tp_pp_ptl.rst +++ b/libraries/neuronx-distributed/tutorials/training_llama2_tp_pp_ptl.rst @@ -20,65 +20,47 @@ We also need to install the ``neuronx-distributed`` package inside the virtual e .. code:: ipython3 - python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install neuronx_distributed --index-url https://pip.repos.neuron.amazonaws.com + git clone git@github.com:aws-neuron/neuronx-distributed.git Let’s download the scripts for pretraining: -1. Creating a directory to hold our experiments +1. Navigate to a directory to hold our experiments -.. code:: ipython3 - - mkdir -p ~/examples/llama2_lightning - cd ~/examples/llama2_lightning - -2. Downloading training scripts for our experiments +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_tp_pp_ptl_setup.sh + :language: shell + :lines: 4 -.. 
code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lightning/data_module.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lightning/module_llama.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lightning/run_llama_nxd_ptl.py +2. Link the training scripts for our experiments - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/get_dataset.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lr.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/modeling_llama_nxd.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/requirements.txt - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/requirements_ptl.txt - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/training_utils.py +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_tp_pp_ptl_setup.sh + :language: shell + :lines: 5-10 If you want to pre-train Llama 7B, you would need to run the following steps - -.. code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lightning/run_llama_7b_tp_ptl.sh - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/tp_zero1_llama_hf_pretrain/7B_config_llama2/config.json - chmod +x run_llama_7b_tp_ptl.sh +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_7b.sh + :language: shell + :lines: 5-8 If you want to pre-train Llama 13B, you would need to run the following steps - -.. code:: ipython3 - - mkdir -p ~/examples/llama2_lightning/13B_config - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lightning/run_llama_13b_tp_pp_ptl.sh - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/tp_pp_llama_hf_pretrain/13B_config_llama2/config.json -P 13B_config/ - chmod +x run_llama_13b_tp_pp_ptl.sh +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_13b.sh + :language: shell + :lines: 5-8 If you want to pre-train Llama 70B, you would need to run the following steps - -.. code:: ipython3 - - mkdir -p ~/examples/llama2_lightning/70B_config - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/lightning/run_llama_70b_tp_pp_ptl.sh - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/main/examples/training/llama/tp_pp_llama_hf_pretrain/70B_config_llama2/config.json -P 70B_config/ - chmod +x run_llama_70b_tp_pp_ptl.sh +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_70b.sh + :language: shell + :lines: 5-8 3. Installing the additional requirements and giving the right permissions to our shell script -.. code:: ipython3 - - python3 -m pip install -r requirements.txt - python3 -m pip install -r requirements_ptl.txt # Currently we're supporting Lightning version 2.1.0 +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_tp_pp_ptl_setup.sh + :language: shell + :lines: 12-13 Next, we tokenize our dataset. @@ -92,10 +74,9 @@ Once you have downloaded the tokenizer and model weights, you can copy the ``tok Next let’s download and pre-process the dataset: -.. 
code:: ipython3 - - cd ~/examples/llama2_lightning - python3 get_dataset.py --llama-version 2 # currently we only support Llama-2 models +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_7b.sh + :language: shell + :lines: 13 ``Note``: In case you see an error of the following form when downloading data: ``huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/ubuntu/examples/llama2_lightning'. Use `repo_type` argument if needed.`` This could be because of a stale cache. Try deleting the cache using: @@ -114,12 +95,9 @@ By this step, the ParallelCluster is all setup for running experiments. Before we run training, we first pre-compile the graphs using the :ref:`neuron_parallel_compile `. Let’s run the command below: -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 4 \ - --cpus-per-task 128 \ - --wrap="srun neuron_parallel_compile bash $(pwd)/run_llama_7b_tp_ptl.sh" +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_7b.sh + :language: shell + :lines: 17-20 This script uses a tensor-parallel size of 8. This will automatically set the zero-1 sharding degree to 16 (4 * 32 workers / tensor_parallel_size). @@ -133,12 +111,9 @@ created. Once the graphs are compiled we can now run training and observe our loss goes down. To run the training, we just run the above command but without ``neuron_parallel_compile``. -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 4 \ - --cpus-per-task 128 \ - --wrap="srun bash $(pwd)/run_llama_7b_tp_ptl.sh" +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_7b.sh + :language: shell + :lines: 22-25 Training Llama2-13B/70B with Tensor Parallelism and Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -149,22 +124,16 @@ Let’s run the command below: Pre-compiling -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 32 \ - --cpus-per-task 128 \ - --wrap="srun neuron_parallel_compile bash $(pwd)/run_llama_70b_tp_pp_ptl.sh" +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_70b.sh + :language: shell + :lines: 17-20 This script uses a tensor-parallel size of 8, pipeline-parallel size of 8 To run the training, we just use the above command but without ``neuron_parallel_compile``. -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 32 \ - --cpus-per-task 128 \ - --wrap="srun bash $(pwd)/run_llama_70b_tp_pp_ptl.sh" +.. literalinclude:: nxd-source-code/llama_tp_pp_ptl/llama_2_70b.sh + :language: shell + :lines: 22-25 Checkpointing: diff --git a/libraries/neuronx-distributed/tutorials/training_llama_tp_pp.rst b/libraries/neuronx-distributed/tutorials/training_llama_tp_pp.rst index d1daec98..6c93b0e9 100644 --- a/libraries/neuronx-distributed/tutorials/training_llama_tp_pp.rst +++ b/libraries/neuronx-distributed/tutorials/training_llama_tp_pp.rst @@ -17,68 +17,43 @@ We also need to install the ``neuronx-distributed`` package using the following .. code:: ipython3 - python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install neuronx_distributed --index-url https://pip.repos.neuron.amazonaws.com + git clone git@github.com:aws-neuron/neuronx-distributed.git Let’s download the scripts for pretraining: -..
code:: ipython3 - - mkdir -p ~/examples/tp_pp_llama_hf_pretrain - cd ~/examples/tp_pp_llama_hf_pretrain - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/activation_checkpoint.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/logger.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/lr.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/run_llama_nxd.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/training_utils.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/convert_checkpoints.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/get_dataset.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/modeling_llama_nxd.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/requirements.txt +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_tp_pp_setup.sh + :language: shell + :lines: 4-10 If you want to pre-train Llama3.1 70B, you would need to run the following steps - -.. code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/run_llama3_70B_tp_pp.sh - chmod +x run_llama3_70B_tp_pp.sh - mkdir 70B_config_llama3 && cd 70B_config_llama3 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/70B_config_llama3.1/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_31_70b.sh + :language: shell + :lines: 6-7 If you want to pre-train Llama3 70B, you would need to run the following steps - -.. code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/run_llama3_70B_tp_pp.sh - chmod +x run_llama3_70B_tp_pp.sh - mkdir 70B_config_llama3 && cd 70B_config_llama3 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/70B_config_llama3/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_3_70b.sh + :language: shell + :lines: 6-7 For llama2 13B, you would need to run the following steps - -.. code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/run_llama2_13B_tp_pp.sh - chmod +x run_llama2_13B_tp_pp.sh - mkdir 13B_config_llama2 && cd 13B_config_llama2 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/13B_config_llama2/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_2_13b.sh + :language: shell + :lines: 6-7 If you want to pre-train Llama2 70B, you would need to run the following steps - -.. 
code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/run_llama2_70B_tp_pp.sh - chmod +x run_llama2_70B_tp_pp.sh - mkdir 70B_config_llama2 && cd 70B_config_llama2 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_pp_llama_hf_pretrain/70B_config_llama2/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_2_70b.sh + :language: shell + :lines: 6-7
@@ -86,9 +61,9 @@ The below tutorial uses ``Llama3.1 70B`` as an example. To run Llama2 70B or 13B First, let's get all the needed dependencies -.. code:: ipython3 - - python3 -m pip install -r requirements.txt +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_tp_pp_setup.sh + :language: shell + :lines: 12 To tokenize the data, we must request the tokenizer from Hugging Face and Meta by following the instructions at the following link: `HuggingFace Llama 3 8B Model `__ .
@@ -122,10 +97,9 @@ For Llama2, you can just copy the ``tokenizer.model`` to the ``~/examples/tp_pp_ Next let’s download and pre-process the dataset: -.. code:: ipython3 - - cd ~/examples/tp_pp_llama_hf_pretrain - python3 get_dataset.py --llama-version 3 # change the version number to 2 for Llama-2 models +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_3_70b.sh + :language: shell + :lines: 12 In case you see an error of the following form when downloading data: ``huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/ubuntu/examples/tp_pp_llama2_hf_pretrain'. Use `repo_type` argument if needed.`` This could be because of a stale cache. Try deleting the cache using:
@@ -147,22 +121,16 @@ Running training We first pre-compile the graphs using the ``neuron_parallel_compile``. Let’s run the command below: -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 32 \ - --cpus-per-task 128 \ - --wrap="srun neuron_parallel_compile bash $(pwd)/run_llama3_70B_tp_pp.sh" +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_3_70b.sh + :language: shell + :lines: 16-19 This script uses a tensor-parallel size of 8 and a pipeline-parallel size of 8. To run the training, we just use the above command but without ``neuron_parallel_compile``. -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 32 \ - --cpus-per-task 128 \ - --wrap="srun bash $(pwd)/run_llama3_70B_tp_pp.sh" +.. literalinclude:: nxd-source-code/llama_tp_pp/llama_3_70b.sh + :language: shell + :lines: 21-24 To achieve better performance, the script applies a few techniques:
diff --git a/libraries/neuronx-distributed/tutorials/training_llama_tp_zero1.rst b/libraries/neuronx-distributed/tutorials/training_llama_tp_zero1.rst index eb410c25..95f1c181 100644 --- a/libraries/neuronx-distributed/tutorials/training_llama_tp_zero1.rst +++ b/libraries/neuronx-distributed/tutorials/training_llama_tp_zero1.rst @@ -23,67 +23,52 @@ introduces how to set up and use a ParallelCluster. To set up the packages on the headnode of the ParallelCluster, follow the instructions mentioned here: :ref:`Install PyTorch Neuron on Trn1 `. -We also need to install the ``neuronx-distributed`` package inside the virtual env using the following command: +We also need to install the ``neuronx-distributed`` package and clone its source repository inside the virtual env using the following commands: .. 
code:: ipython3 - python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install neuronx_distributed --index-url https://pip.repos.neuron.amazonaws.com + git clone git@github.com:aws-neuron/neuronx-distributed.git Let’s download the scripts for pretraining: -1. Creating a directory to hold our experiments +1. Navigate to a directory to hold our experiments -.. code:: ipython3 - - mkdir -p ~/examples/tp_zero1_llama_hf_pretrain - cd ~/examples/tp_zero1_llama_hf_pretrain - -2. Downloading training scripts for our experiments - -.. code:: ipython3 +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_tp_zero1_setup.sh + :language: shell + :lines: 4 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/tp_zero1_llama_hf_pretrain.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/logger.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/training_utils.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/modeling_llama_nxd.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/get_dataset.py - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/requirements.txt +2. Link the training scripts for our experiments -If you want to pre-train Llama3.108B, you would need to run the following steps - +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_tp_zero1_setup.sh + :language: shell + :lines: 5-8 -.. code:: ipython3 +If you want to pre-train Llama3.1 8B, you would need to run the following steps - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/tp_zero1_llama3_8B_hf_pretrain.sh - mkdir 8B_config_llama3 && cd 8B_config_llama3 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/8B_config_llama3.1/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_31_8b.sh + :language: shell + :lines: 5-7 If you want to pre-train Llama3 8B, you would need to run the following steps - -.. code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/tp_zero1_llama3_8B_hf_pretrain.sh - mkdir 8B_config_llama3 && cd 8B_config_llama3 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/8B_config_llama3/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_3_8b.sh + :language: shell + :lines: 5-6 If you want to pre-train Llama2 7B, run the following steps - -.. code:: ipython3 - - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/tp_zero1_llama2_7B_hf_pretrain.sh - mkdir 7B_config_llama2 && cd 7B_config_llama2 - wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed/master/examples/training/llama/tp_zero1_llama_hf_pretrain/7B_config_llama2/config.json - cd .. +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_2_7b.sh + :language: shell + :lines: 5-6 -3. 
Installing the additional requirements and giving the right permissions to our shell script +3. Installing the additional requirements -.. code:: ipython3 - - python3 -m pip install -r requirements.txt - chmod +x tp_zero1_llama2_7B_hf_pretrain.sh +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_tp_zero1_setup.sh + :language: shell + :lines: 10 To tokenize the data, we must request the tokenizer from Hugging Face and Meta by following the instructions at the following link: `HuggingFace Llama 3 8B Model `__ .
@@ -115,17 +100,16 @@ For Llama3.1/Llama3, make sure your ``~/examples/tp_zero1_llama_hf_pretrain`` di For Llama2, you just copy the ``tokenizer.model`` to the ``~/examples/tp_zero1_llama_hf_pretrain`` directory. Next let’s download and pre-process the dataset: -.. code:: ipython3 - - cd ~/examples/tp_zero1_llama_hf_pretrain - python3 get_dataset.py --llama-version 3 # change the version number to 2 for Llama-2 models +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_3_8b.sh + :language: shell + :lines: 11 `Note:` In case you see an error of the following form when downloading data: ``huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/ubuntu/examples/tp_zero1_llama_hf_pretrain'. Use `repo_type` argument if needed.`` This could be because of a stale cache. Try deleting the cache using: -.. code:: ipython3 - - sudo rm -rf /home/ubuntu/.cache/ +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_3_8b.sh + :language: shell + :lines: 8 At this point, you are all set to start training. The below tutorial uses ``Llama3 8B`` as an example. To run Llama2 7B, simply change the script from ``tp_zero1_llama3_8B_hf_pretrain.sh`` to ``tp_zero1_llama2_7B_hf_pretrain.sh``
@@ -137,12 +121,9 @@ By this step, the ParallelCluster is all set up for running experiments. Before we run training, we first pre-compile the graphs using the :ref:`neuron_parallel_compile `. Let’s run the command below: -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 4 \ - --cpus-per-task 128 \ - --wrap="srun neuron_parallel_compile bash $(pwd)/tp_zero1_llama3_8B_hf_pretrain.sh" +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_3_8b.sh + :language: shell + :lines: 15-18 This script uses a tensor-parallel size of 8. This will automatically set the zero-1 sharding degree to 16 (4 * 32 workers / tensor_parallel_size).
@@ -156,12 +137,9 @@ created. Once the graphs are compiled we can now run training and observe our loss goes down. To run the training, we just run the above command but without ``neuron_parallel_compile``. -.. code:: ipython3 - - sbatch --exclusive \ - --nodes 4 \ - --cpus-per-task 128 \ - --wrap="srun bash $(pwd)/tp_zero1_llama3_8B_hf_pretrain.sh" +.. literalinclude:: nxd-source-code/llama_tp_zero1/llama_3_8b.sh + :language: shell + :lines: 20-23 Performance:
diff --git a/libraries/nxd-inference/nxdi-setup.rst b/libraries/nxd-inference/nxdi-setup.rst index ee52935f..22df76b6 100644 --- a/libraries/nxd-inference/nxdi-setup.rst +++ b/libraries/nxd-inference/nxdi-setup.rst @@ -67,7 +67,7 @@ Run this command to install NxD Inference. 
source aws_neuron_venv_pytorch/bin/activate pip install -U pip - pip install --upgrade neuronx-cc==2.* neuronx-distributed-inference --extra-index-url https://pip.repos.neuron.amazonaws.com + pip install --upgrade neuronx-cc==2.* neuronx-distributed-inference --index-url https://pip.repos.neuron.amazonaws.com Verify NxD Inference Installation
diff --git a/libraries/nxd-training/general/installation_guide.rst b/libraries/nxd-training/general/installation_guide.rst index dd15ec08..6aef4f5b 100644 --- a/libraries/nxd-training/general/installation_guide.rst +++ b/libraries/nxd-training/general/installation_guide.rst @@ -37,7 +37,7 @@ Install the neuron packages using the command: .. code-block :: shell pip install -U pip - pip install --upgrade neuronx-cc==2.* torch-neuronx torchvision neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com + pip install --upgrade neuronx-cc==2.* torch-neuronx torchvision neuronx_distributed --index-url https://pip.repos.neuron.amazonaws.com .. _nxdt_nemo_deps:
@@ -132,7 +132,7 @@ To install the library, one can run the following command: .. code-block :: shell - pip install neuronx_distributed_training --extra-index-url https://pip.repos.neuron.amazonaws.com + pip install neuronx_distributed_training --index-url https://pip.repos.neuron.amazonaws.com .. _nxdt_installation_common_failures:
diff --git a/libraries/transformers-neuronx/setup/index.rst b/libraries/transformers-neuronx/setup/index.rst index 2d1066d4..3de07cc1 100644 --- a/libraries/transformers-neuronx/setup/index.rst +++ b/libraries/transformers-neuronx/setup/index.rst @@ -8,7 +8,7 @@ the following instruction. .. code-block:: - pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com + pip install transformers-neuronx --index-url=https://pip.repos.neuron.amazonaws.com If you are starting from scratch, Neuron Multi Framework DLAMI is recommended as it comes pre-installed with Transformers NeuronX virtual environment.
diff --git a/release-notes/torch/torch-neuronx/index.rst b/release-notes/torch/torch-neuronx/index.rst index 4310489d..c5723416 100644 --- a/release-notes/torch/torch-neuronx/index.rst +++ b/release-notes/torch/torch-neuronx/index.rst @@ -56,7 +56,7 @@ Currently, BERT pretraining performance is ~11% lower with torch-neuronx 2.5 com Warning "XLA_DOWNCAST_BF16 will be deprecated after the 2.5 release, please downcast your model directly" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Environment variables ``XLA_DOWNCAST_BF16`` and ``XLA_USE_BF16`` are deprecated (warning when used). Please switch to automatic mixed-precision or use ``model.to(torch.bfloat16)`` command to cast model to BF16. (see :ref:``) +Environment variables ``XLA_DOWNCAST_BF16`` and ``XLA_USE_BF16`` are deprecated (warning when used). Please switch to automatic mixed-precision or use the ``model.to(torch.bfloat16)`` command to cast the model to BF16. (see :ref:`migration_from_xla_downcast_bf16`) WARNING:root:torch_xla.core.xla_model.xrt_world_size() will be removed in release 2.7. is deprecated. Use torch_xla.runtime.world_size instead.
@@ -68,7 +68,7 @@ This is a warning that ``torch_xla.core.xla_model.xrt_world_size()`` will be rem WARNING:torch_xla.core.xla_model.xla_model.get_ordinal() will be removed in release 2.7. is deprecated. Use torch_xla.runtime.global_ordinal instead. 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This is a warning that ``torch_xla.core.xla_model.xla_model.get_ordinal() `` will be removed in a future release. Please switch to using ``torch_xla.runtime.global_ordinal`` instead. +This is a warning that ``torch_xla.core.xla_model.xla_model.get_ordinal()`` will be removed in a future release. Please switch to using ``torch_xla.runtime.global_ordinal`` instead. AttributeError: module 'torch_xla.runtime' has no attribute 'using_pjrt'
diff --git a/src/examples/pytorch/byoc_sm_bert_tutorial/sagemaker_container_neuron.ipynb b/src/examples/pytorch/byoc_sm_bert_tutorial/sagemaker_container_neuron.ipynb index 0bfd1051..43cd042a 100644 --- a/src/examples/pytorch/byoc_sm_bert_tutorial/sagemaker_container_neuron.ipynb +++ b/src/examples/pytorch/byoc_sm_bert_tutorial/sagemaker_container_neuron.ipynb @@ -62,7 +62,7 @@ "outputs": [], "source": [ "%env TOKENIZERS_PARALLELISM=True #Suppresses tokenizer warnings making errors easier to detect\n", - "!pip install --upgrade --no-cache-dir torch-neuron neuron-cc[tensorflow] torchvision torch --extra-index-url=https://pip.repos.neuron.amazonaws.com\n", + "!pip install --upgrade --no-cache-dir torch-neuron neuron-cc[tensorflow] torchvision torch --index-url=https://pip.repos.neuron.amazonaws.com\n", "!pip install --upgrade --no-cache-dir 'transformers==4.6.0'" ] },
diff --git a/src/examples/pytorch/neuronx_distributed/t5-inference/t5-inference-tutorial.ipynb b/src/examples/pytorch/neuronx_distributed/t5-inference/t5-inference-tutorial.ipynb index 6c2f502c..d5827015 100644 --- a/src/examples/pytorch/neuronx_distributed/t5-inference/t5-inference-tutorial.ipynb +++ b/src/examples/pytorch/neuronx_distributed/t5-inference/t5-inference-tutorial.ipynb @@ -325,7 +325,7 @@ "outputs": [], "source": [ "# Let us install NeuronPerf. We will use it to measure the performance.\n", - "! pip install neuronperf --extra-index-url=https://pip.repos.neuron.amazonaws.com" + "! 
pip install neuronperf --index-url=https://pip.repos.neuron.amazonaws.com" ] }, { diff --git a/src/examples/tensorflow/keras_resnet50/keras_resnet50.ipynb b/src/examples/tensorflow/keras_resnet50/keras_resnet50.ipynb index 6b75c9ec..d899d9f5 100644 --- a/src/examples/tensorflow/keras_resnet50/keras_resnet50.ipynb +++ b/src/examples/tensorflow/keras_resnet50/keras_resnet50.ipynb @@ -55,8 +55,8 @@ "outputs": [], "source": [ "!pip install pillow requests # Necessary for loading images\n", - "!pip install tensorflow_neuron==1.15.5.2.8.9.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com/\n", - "!pip install neuron_cc==1.13.5.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com" + "!pip install tensorflow_neuron==1.15.5.2.8.9.0 --index-url=https://pip.repos.neuron.amazonaws.com/\n", + "!pip install neuron_cc==1.13.5.0 --index-url=https://pip.repos.neuron.amazonaws.com" ] }, { diff --git a/src/examples/tensorflow/openpose_demo/openpose.ipynb b/src/examples/tensorflow/openpose_demo/openpose.ipynb index 34af5146..d854e64d 100644 --- a/src/examples/tensorflow/openpose_demo/openpose.ipynb +++ b/src/examples/tensorflow/openpose_demo/openpose.ipynb @@ -61,8 +61,8 @@ "source": [ "!wget -c --tries=2 $( wget -q -O - http://www.mediafire.com/file/qlzzr20mpocnpa3/graph_opt.pb | grep -o 'http*://download[^\"]*' | tail -n 1 ) -O graph_opt.pb\n", "\n", - "!pip install tensorflow_neuron==1.15.5.2.8.9.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com/\n", - "!pip install neuron_cc==1.13.5.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com" + "!pip install tensorflow_neuron==1.15.5.2.8.9.0 --index-url=https://pip.repos.neuron.amazonaws.com/\n", + "!pip install neuron_cc==1.13.5.0 --index-url=https://pip.repos.neuron.amazonaws.com" ] }, { diff --git a/src/examples/tensorflow/ssd300_demo/ssd300_detection.py b/src/examples/tensorflow/ssd300_demo/ssd300_detection.py index 4abf99d5..51107d2d 100644 --- a/src/examples/tensorflow/ssd300_demo/ssd300_detection.py +++ b/src/examples/tensorflow/ssd300_demo/ssd300_detection.py @@ -25,7 +25,7 @@ def main(): if tfn_version < LooseVersion('1.15.0.1.0.1333.0'): raise RuntimeError( 'tensorflow-neuron version {} is too low for this demo. Please upgrade ' - 'by "pip install -U tensorflow-neuron --extra-index-url=https://pip.repos.neuron.amazonaws.com"'.format(tfn_version)) + 'by "pip install -U tensorflow-neuron --index-url=https://pip.repos.neuron.amazonaws.com"'.format(tfn_version)) with open(args.image, 'rb') as f: img_jpg_bytes = f.read() diff --git a/src/examples/tensorflow/ssd300_demo/ssd300_evaluation.py b/src/examples/tensorflow/ssd300_demo/ssd300_evaluation.py index 0c1559a3..525be7c8 100644 --- a/src/examples/tensorflow/ssd300_demo/ssd300_evaluation.py +++ b/src/examples/tensorflow/ssd300_demo/ssd300_evaluation.py @@ -39,7 +39,7 @@ def main(): if tfn_version < LooseVersion('1.15.0.1.0.1333.0'): raise RuntimeError( 'tensorflow-neuron version {} is too low for this demo. 
Please upgrade ' - 'by "pip install -U tensorflow-neuron --extra-index-url=https://pip.repos.neuron.amazonaws.com"'.format(tfn_version)) + 'by "pip install -U tensorflow-neuron --index-url=https://pip.repos.neuron.amazonaws.com"'.format(tfn_version)) predictor_list = [tf.contrib.predictor.from_saved_model(args.saved_model) for _ in range(args.num_sessions)] val_dataset = get_val_dataset(args.instances_val2017_json, args.val2017) diff --git a/src/examples/tensorflow/ssd300_demo/ssd300_model.py b/src/examples/tensorflow/ssd300_demo/ssd300_model.py index 30881cb7..2e3f2105 100644 --- a/src/examples/tensorflow/ssd300_demo/ssd300_model.py +++ b/src/examples/tensorflow/ssd300_demo/ssd300_model.py @@ -296,12 +296,12 @@ def main(): if neuroncc_version < LooseVersion('1.0.18000'): raise RuntimeError( 'neuron-cc version {} is too low for this demo. Please upgrade ' - 'by "pip install -U neuron-cc --extra-index-url=https://pip.repos.neuron.amazonaws.com"'.format(neuroncc_version)) + 'by "pip install -U neuron-cc --index-url=https://pip.repos.neuron.amazonaws.com"'.format(neuroncc_version)) tfn_version = LooseVersion(pkg_resources.get_distribution('tensorflow-neuron').version) if tfn_version < LooseVersion('1.15.3.1.0.1900.0'): raise RuntimeError( 'tensorflow-neuron version {} is too low for this demo. Please upgrade ' - 'by "pip install -U tensorflow-neuron --extra-index-url=https://pip.repos.neuron.amazonaws.com"'.format(tfn_version)) + 'by "pip install -U tensorflow-neuron --index-url=https://pip.repos.neuron.amazonaws.com"'.format(tfn_version)) sys.path.append(os.getcwd()) from DeepLearningExamples.PyTorch.Detection.SSD.src import model as torch_ssd300_model @@ -336,7 +336,7 @@ def main(): if not op.get_attr('executable'): raise AttributeError( 'Neuron executable (neff) is empty. 
Please check neuron-cc is installed and working properly ' - '("pip install neuron-cc --force --extra-index-url=https://pip.repos.neuron.amazonaws.com" ' + '("pip install neuron-cc --force --index-url=https://pip.repos.neuron.amazonaws.com" ' 'to force reinstall neuron-cc).') model_config = op.node_def.attr['model_config'].list if model_config.i:
diff --git a/src/examples/tensorflow/tensorflow_resnet50/resnet50.ipynb b/src/examples/tensorflow/tensorflow_resnet50/resnet50.ipynb index 50fe7e53..500246dc 100644 --- a/src/examples/tensorflow/tensorflow_resnet50/resnet50.ipynb +++ b/src/examples/tensorflow/tensorflow_resnet50/resnet50.ipynb @@ -49,8 +49,8 @@ }, "outputs": [], "source": [ - "!pip install tensorflow_neuron==1.15.5.2.8.9.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com/\n", - "!pip install neuron_cc==1.13.5.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com" + "!pip install tensorflow_neuron==1.15.5.2.8.9.0 --index-url=https://pip.repos.neuron.amazonaws.com/\n", + "!pip install neuron_cc==1.13.5.0 --index-url=https://pip.repos.neuron.amazonaws.com" ] }, {
diff --git a/src/examples/tensorflow/yolo_v3_demo/yolo_v3.ipynb b/src/examples/tensorflow/yolo_v3_demo/yolo_v3.ipynb index 257d80dd..c7816651 100644 --- a/src/examples/tensorflow/yolo_v3_demo/yolo_v3.ipynb +++ b/src/examples/tensorflow/yolo_v3_demo/yolo_v3.ipynb @@ -53,8 +53,8 @@ }, "outputs": [], "source": [ - "%pip install tensorflow_neuron==1.15.5.2.8.9.0 neuron_cc==1.13.5.0 requests pillow matplotlib pycocotools==2.0.1 numpy==1.18.2 torch~=1.5.0 --force \\\n", - " --extra-index-url=https://pip.repos.neuron.amazonaws.com" + "%pip install tensorflow_neuron==1.15.5.2.8.9.0 neuron_cc==1.13.5.0 --index-url=https://pip.repos.neuron.amazonaws.com --force\n", + "%pip install requests pillow matplotlib pycocotools==2.0.1 numpy==1.18.2 torch~=1.5.0 --force" ] }, {
diff --git a/src/examples/tensorflow/yolo_v4_demo/evaluate.ipynb b/src/examples/tensorflow/yolo_v4_demo/evaluate.ipynb index 5f49ecc6..0b0aa0c3 100644 --- a/src/examples/tensorflow/yolo_v4_demo/evaluate.ipynb +++ b/src/examples/tensorflow/yolo_v4_demo/evaluate.ipynb @@ -42,8 +42,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install tensorflow_neuron==1.15.5.2.8.9.0 neuron_cc==1.13.5.0 requests pillow matplotlib pycocotools==2.0.1 numpy==1.18.2 torch~=1.5.0 --force \\\n", - " --extra-index-url=https://pip.repos.neuron.amazonaws.com" + "!pip install requests pillow matplotlib pycocotools==2.0.1 numpy==1.18.2 torch~=1.5.0 --force\n", + "!pip install tensorflow_neuron==1.15.5.2.8.9.0 neuron_cc==1.13.5.0 --index-url=https://pip.repos.neuron.amazonaws.com --force" ] }, {
diff --git a/src/helperscripts/n2-helper.py b/src/helperscripts/n2-helper.py index 5655f8d6..21908f95 100644 --- a/src/helperscripts/n2-helper.py +++ b/src/helperscripts/n2-helper.py @@ -659,7 +659,7 @@ def set_pip_repository(self): indentation = '\t' if args.venv_install_type == 'parallel-cluster' else '' str += f'\n{indentation}# Set pip repository pointing to the Neuron repository \n' - str += f'{indentation}python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com\n' + str += f'{indentation}python -m pip config set global.index-url https://pip.repos.neuron.amazonaws.com\n' return str
diff --git a/src/helperscripts/neuronsetuphelper.py b/src/helperscripts/neuronsetuphelper.py index 10ee79b4..578b3a19 100644 --- a/src/helperscripts/neuronsetuphelper.py +++ b/src/helperscripts/neuronsetuphelper.py @@ -437,7 +437,7 @@ def hlpr_build_pip_command(nr_setup, 
neuron_version, component,include_compiler, def hlpr_pip_repos_setup(): str = '\n' str += '# Set Pip repository to point to the Neuron repository' + '\n' - str += 'pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com'+ '\n' + str += 'pip config set global.index-url https://pip.repos.neuron.amazonaws.com'+ '\n' return str ################################################# diff --git a/tools/neuronperf/index.rst b/tools/neuronperf/index.rst index 33aa37d2..3d96ae3e 100644 --- a/tools/neuronperf/index.rst +++ b/tools/neuronperf/index.rst @@ -15,7 +15,7 @@ To install NeuronPerf in your Neuron environment, execute: .. code:: bash - $ pip install neuronperf --extra-index-url=https://pip.repos.neuron.amazonaws.com + $ pip install neuronperf --index-url=https://pip.repos.neuron.amazonaws.com Refer to the :ref:`neuronperf_examples` and :ref:`neuronperf_user_guide` to get started. diff --git a/tools/neuronperf/neuronperf_install.rst b/tools/neuronperf/neuronperf_install.rst index bc04960b..f2107535 100644 --- a/tools/neuronperf/neuronperf_install.rst +++ b/tools/neuronperf/neuronperf_install.rst @@ -7,5 +7,5 @@ Activate your Neuron environment, and execute: .. code:: bash - $ pip install neuronperf --extra-index-url=https://pip.repos.neuron.amazonaws.com + $ pip install neuronperf --index-url=https://pip.repos.neuron.amazonaws.com diff --git a/tools/tensorboard/getting-started-tensorboard-neuronx-plugin.rst b/tools/tensorboard/getting-started-tensorboard-neuronx-plugin.rst index f6f17c62..368654f6 100644 --- a/tools/tensorboard/getting-started-tensorboard-neuronx-plugin.rst +++ b/tools/tensorboard/getting-started-tensorboard-neuronx-plugin.rst @@ -45,7 +45,7 @@ In this step, we will process the Neuron profile data and launch TensorBoard. .. code:: bash - python -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com" + python3 -m pip config set global.index-url https://pip.repos.neuron.amazonaws.com pip install tensorboard-plugin-neuronx
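As a quick check that the ``--index-url`` switch in the commands above took effect, pip's own configuration can be inspected. The snippet below is a minimal sketch and is not part of the patched files: ``pip config list`` is standard pip, ``pip index versions`` is an experimental subcommand (pip >= 21.2) that may change, and the expected output shown in the comments is illustrative.

.. code:: bash

    # Show the effective pip configuration; after running any of the setup
    # commands above, global.index-url should point at the Neuron repository:
    # global.index-url='https://pip.repos.neuron.amazonaws.com'
    pip config list

    # Optionally ask the configured index which versions of a Neuron package it
    # serves. 'pip index' is experimental; if it is unavailable in your pip,
    # 'pip download --no-deps neuronperf -d /tmp' exercises the same resolution.
    pip index versions neuronperf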