From 1457d054649b2fe4e01d1a1a103b4cccf0a75511 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Tue, 31 Oct 2023 13:58:53 -0400 Subject: [PATCH 01/45] MPI operator code for distributed training --- ai-ml/trainium-inferentia/addons.tf | 17 +++++++++++++++++ ai-ml/trainium-inferentia/variables.tf | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 472dfd94a..797aa8e05 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -499,3 +499,20 @@ resource "aws_launch_template" "trn1_lt" { } } } + +#--------------------------------------------------------------- +# MPI Operator for distributed training on Trainium +#--------------------------------------------------------------- +data "http" "mpi_operator_yaml" { + url = "https://raw.githubusercontent.com/kubeflow/mpi-operator/${var.mpi_operator_version}/deploy/v2beta1/mpi-operator.yaml" +} + +data "kubectl_file_documents" "mpi_operator_yaml" { + content = data.http.mpi_operator_yaml.response_body +} + +resource "kubectl_manifest" "mpi_operator" { + for_each = data.kubectl_file_documents.mpi_operator_yaml.manifests + yaml_body = each.value + depends_on = [module.eks.eks_cluster_id] +} \ No newline at end of file diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index a63d8cb8c..0139da8c2 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -37,3 +37,8 @@ variable "enable_amazon_prometheus" { type = bool default = true } + +variable "mpi_operator_version" { + description = "The version of the MPI Operator to install" + default = "v0.4.0" +} \ No newline at end of file From 0a46f3aabf0cd2f813707894f84dcf5f22c6bd54 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Sun, 5 Nov 2023 20:23:23 -0500 Subject: [PATCH 02/45] Making MPI operator optional for users --- ai-ml/trainium-inferentia/addons.tf | 5 +++-- 
ai-ml/trainium-inferentia/variables.tf | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 797aa8e05..8ea843a65 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -512,7 +512,8 @@ data "kubectl_file_documents" "mpi_operator_yaml" { } resource "kubectl_manifest" "mpi_operator" { - for_each = data.kubectl_file_documents.mpi_operator_yaml.manifests + for_each = var.enable_mpi_operator ? data.kubectl_file_documents.mpi_operator_yaml.manifests : {} yaml_body = each.value depends_on = [module.eks.eks_cluster_id] -} \ No newline at end of file +} + diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 0139da8c2..e2480b637 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -41,4 +41,10 @@ variable "enable_amazon_prometheus" { variable "mpi_operator_version" { description = "The version of the MPI Operator to install" default = "v0.4.0" -} \ No newline at end of file +} + +variable "enable_mpi_operator" { + description = "Flag to enable the MPI Operator deployment" + type = bool + default = false +} From fbd3ba9dd2f5ca4e7bcd77c60e58a0d1f5378fd7 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Mon, 6 Nov 2023 17:44:55 -0500 Subject: [PATCH 03/45] added type string to mpi operator variable version --- ai-ml/trainium-inferentia/variables.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index e2480b637..22a957f8e 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -41,6 +41,7 @@ variable "enable_amazon_prometheus" { variable "mpi_operator_version" { description = "The version of the MPI Operator to install" default = "v0.4.0" + type = string } variable "enable_mpi_operator" { From 
c41509776711ccbba3f101ef933cff50446bfae8 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Wed, 13 Dec 2023 14:44:59 -0500 Subject: [PATCH 04/45] llama2 examples --- ai-ml/trainium-inferentia/eks.tf | 4 +- .../1-llama2-neuronx-pretrain-build-image.sh | 138 ++++++++++++++++++ .../llama2/2-llama2-neuronx-mpi-compile.sh | 0 .../llama2/3-llama2-neuronx-mpi-train.sh | 0 ai-ml/trainium-inferentia/main.tf | 10 +- ai-ml/trainium-inferentia/outputs.tf | 7 +- ai-ml/trainium-inferentia/variables.tf | 2 +- 7 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh create mode 100644 ai-ml/trainium-inferentia/examples/llama2/2-llama2-neuronx-mpi-compile.sh create mode 100644 ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-train.sh diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index aac9e814b..02dd4437e 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -176,9 +176,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" 
EOT - min_size = 0 + min_size = 2 max_size = 2 - desired_size = 0 + desired_size = 2 # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ diff --git a/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh new file mode 100644 index 000000000..beace7713 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +# Set strict error handling +set -euo pipefail + +# Environment Variables +AWS_ACCOUNT_ID="123255318457" +AWS_REGION="us-west-2" +ECR_REPO_NAME="neuronx_nemo" +FSX_PATH="fs-08515da32c3dbe2ef" +# Add other necessary environment variables here + +# Function to Clone the NeuronX-Nemo-Megatron Repository +clone_repo() { + echo "Checking if the NeuronX-Nemo-Megatron repository already exists..." + if [ -d "neuronx-nemo-megatron" ]; then + echo "The NeuronX-Nemo-Megatron repository already exists. Skipping cloning." + else + echo "Cloning the NeuronX-Nemo-Megatron repository..." + if ! git clone https://github.com/aws-neuron/neuronx-nemo-megatron.git; then + echo "Error cloning repository" + exit 1 + fi + fi +} + + +# Function to Install Prerequisites +install_prereqs() { + echo "Installing prerequisites..." + if ! pip3 install torch --index-url https://download.pytorch.org/whl/cpu; then + echo "Error installing PyTorch" + exit 1 + fi + if ! pip3 install wheel; then + echo "Error installing Wheel" + exit 1 + fi + # Add other installation commands here +} + +# Function to Modify Configuration Files +modify_files() { + echo "Modifying configuration files..." + local test_llama_path="./neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh" + + # Cross-platform sed in-place edit: Using '' for macOS, -i for Linux + if [[ "$OSTYPE" == "darwin"* ]]; then + SED_CMD="sed -i ''" + else + SED_CMD="sed -i" + fi + + if ! 
$SED_CMD 's|old_tokenizer_path|new_tokenizer_path|g' "$test_llama_path"; then + echo "Error modifying $test_llama_path" + exit 1 + fi + if ! $SED_CMD 's|old_dataset_path|new_dataset_path|g' "$test_llama_path"; then + echo "Error modifying $test_llama_path" + exit 1 + fi + # Add other file modifications here +} + + +# Function to Build Docker Image and Push to ECR +build_and_push_docker() { + echo "Building Docker image and pushing to ECR..." + + # Change to the neuronx-nemo-megatron directory + if [[ -d "neuronx-nemo-megatron" ]]; then + cd neuronx-nemo-megatron + else + echo "neuronx-nemo-megatron directory not found. Exiting." + exit 1 + fi + + # Check for build.sh in the neuronx-nemo-megatron directory + if [[ ! -f "build.sh" ]]; then + echo "build.sh not found in the neuronx-nemo-megatron directory. Exiting." + exit 1 + fi + + # Enable Docker BuildKit for better performance + export DOCKER_BUILDKIT=1 + + # Build the Nemo and Apex wheels + echo "Building Nemo and Apex wheels..." + ./build.sh || { echo "Failed to build Nemo and Apex wheels"; exit 1; } + + # AWS / ECR / Repo Info + AWS_ACCT=$(aws sts get-caller-identity | jq -r ".Account") || { echo "Failed to get AWS account"; exit 1; } + REGION=us-west-2 + ECR_REPO=$AWS_ACCT.dkr.ecr.$REGION.amazonaws.com/neuronx_nemo + + # Authenticate with ECR + echo "Logging into AWS ECR..." + aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $AWS_ACCT.dkr.ecr.$REGION.amazonaws.com || { echo "Failed to login to AWS ECR"; exit 1; } + + # Build and push the image + echo "Building and pushing the Docker image to ECR..." + docker build . -f ./k8s/docker/Dockerfile -t $ECR_REPO:latest || { echo "Docker build failed"; exit 1; } + docker push $ECR_REPO:latest || { echo "Failed to push Docker image to ECR"; exit 1; } + + echo "Docker image successfully built and pushed to ECR." +} + + + + + + + + +# Function to Set Up Kubernetes +setup_kubernetes() { + echo "Setting up Kubernetes..." + if ! 
kubectl apply -f ./k8s/example_manifests/mpi_compile_llama7b.yaml; then + echo "Error applying mpi_compile_llama7b.yaml" + exit 1 + fi + if ! kubectl apply -f ./k8s/example_manifests/mpi_train_llama7b.yaml; then + echo "Error applying mpi_train_llama7b.yaml" + exit 1 + fi + # Add other Kubernetes setup commands here +} + +# Main Execution Flow +echo "Starting the installation and setup process..." + +clone_repo +install_prereqs +modify_files +build_and_push_docker +setup_kubernetes + +echo "Installation and setup completed successfully." diff --git a/ai-ml/trainium-inferentia/examples/llama2/2-llama2-neuronx-mpi-compile.sh b/ai-ml/trainium-inferentia/examples/llama2/2-llama2-neuronx-mpi-compile.sh new file mode 100644 index 000000000..e69de29bb diff --git a/ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-train.sh b/ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-train.sh new file mode 100644 index 000000000..e69de29bb diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 29863e450..0ede9c5c3 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -30,6 +30,14 @@ provider "kubectl" { load_config_file = false } +resource "random_string" "this" { + length = 5 + special = false + upper = false + lower = true + numeric = true +} + data "aws_eks_cluster_auth" "this" { name = module.eks.cluster_name } @@ -39,7 +47,7 @@ data "aws_ecrpublic_authorization_token" "token" { } locals { - name = var.name + name = "${var.name}-${random_string.this.result}" region = var.region # Training and Inference instances are available in the following AZs us-east-1 and us-west-2 # You can find the list of supported AZs here: https://aws.amazon.com/ec2/instance-types/trn1/ diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf index f6444daab..c66da693f 100755 --- a/ai-ml/trainium-inferentia/outputs.tf +++ b/ai-ml/trainium-inferentia/outputs.tf @@ -1,4 +1,9 @@ 
-output "configure_kubectl" { +/* output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}" +} */ + +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}" } diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 22a957f8e..749d3c6db 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "trainium-inferentia" + default = "tr-inf" type = string } From 08634edb0436155256e7c813d8d00e3d0e6dd59e Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Thu, 14 Dec 2023 00:53:55 -0700 Subject: [PATCH 05/45] llama2 pretraining updates simplified docker build for neuronx-nemo-megatron container added scripts for cli pod launch, precompilation, training added script for tensorboard deployment --- .../1-llama2-neuronx-pretrain-build-image.sh | 193 ++++++------------ .../examples/llama2/2-launch-cmd-shell-pod.sh | 51 +++++ .../llama2/2-llama2-neuronx-mpi-compile.sh | 0 .../llama2/3-llama2-neuronx-mpi-compile.sh | 27 +++ .../llama2/3-llama2-neuronx-mpi-train.sh | 0 .../llama2/4-llama2-neuronx-mpi-train.sh | 27 +++ .../examples/llama2/5-deploy-tensorboard.sh | 126 ++++++++++++ .../llama2/docker/Dockerfile.llama_pretrain | 123 +++++++++++ .../llama2/docker/Dockerfile.tensorboard | 12 ++ .../llama2/docker/assets/nginx_auth.conf | 11 + .../example_manifests/llama_compile.yaml | 83 ++++++++ .../llama2/example_manifests/llama_train.yaml | 81 ++++++++ 12 files changed, 600 
insertions(+), 134 deletions(-) mode change 100644 => 100755 ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh create mode 100755 ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh delete mode 100644 ai-ml/trainium-inferentia/examples/llama2/2-llama2-neuronx-mpi-compile.sh create mode 100755 ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-compile.sh delete mode 100644 ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-train.sh create mode 100755 ai-ml/trainium-inferentia/examples/llama2/4-llama2-neuronx-mpi-train.sh create mode 100755 ai-ml/trainium-inferentia/examples/llama2/5-deploy-tensorboard.sh create mode 100644 ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.llama_pretrain create mode 100644 ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard create mode 100644 ai-ml/trainium-inferentia/examples/llama2/docker/assets/nginx_auth.conf create mode 100644 ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_compile.yaml create mode 100644 ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_train.yaml diff --git a/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh old mode 100644 new mode 100755 index beace7713..0f4d63549 --- a/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh @@ -1,138 +1,63 @@ #!/bin/bash -# Set strict error handling -set -euo pipefail - -# Environment Variables -AWS_ACCOUNT_ID="123255318457" -AWS_REGION="us-west-2" +# Check that we are running on an x86_64 instance to avoid issues with docker build +arch=$(uname -m) +if [[ ! 
"$arch" = "x86_64" ]]; then + echo "Error: please run this script on an x86_64-based instance" + exit 1 +fi + +# Check if docker is installed +junk=$(which docker 2>&1 > /dev/null) +if [[ "$?" -ne 0 ]]; then + echo "Error: please install docker and try again. ex: for AL2023 you can run:" + echo " sudo yum install docker -y" + echo " sudo systemctl start docker" + echo " sudo usermod -aG docker ec2-user" + echo " newgrp docker" + exit 1 +fi + +# Check that AWS CLI is installed and configured +junk=$(aws sts get-caller-identity) +if [[ "$?" -ne 0 ]]; then + echo "Error: please make sure that the AWS CLI is installed and configured using 'aws configure'." + exit 1 +fi + +# Prompt user for desired region +read -p "Enter the ECR region: " region +echo $region > .eks_region + +# Replace with your desired repository name ECR_REPO_NAME="neuronx_nemo" -FSX_PATH="fs-08515da32c3dbe2ef" -# Add other necessary environment variables here - -# Function to Clone the NeuronX-Nemo-Megatron Repository -clone_repo() { - echo "Checking if the NeuronX-Nemo-Megatron repository already exists..." - if [ -d "neuronx-nemo-megatron" ]; then - echo "The NeuronX-Nemo-Megatron repository already exists. Skipping cloning." - else - echo "Cloning the NeuronX-Nemo-Megatron repository..." - if ! git clone https://github.com/aws-neuron/neuronx-nemo-megatron.git; then - echo "Error cloning repository" - exit 1 - fi - fi -} - - -# Function to Install Prerequisites -install_prereqs() { - echo "Installing prerequisites..." - if ! pip3 install torch --index-url https://download.pytorch.org/whl/cpu; then - echo "Error installing PyTorch" - exit 1 - fi - if ! pip3 install wheel; then - echo "Error installing Wheel" - exit 1 - fi - # Add other installation commands here -} - -# Function to Modify Configuration Files -modify_files() { - echo "Modifying configuration files..." 
- local test_llama_path="./neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh" - - # Cross-platform sed in-place edit: Using '' for macOS, -i for Linux - if [[ "$OSTYPE" == "darwin"* ]]; then - SED_CMD="sed -i ''" - else - SED_CMD="sed -i" - fi - - if ! $SED_CMD 's|old_tokenizer_path|new_tokenizer_path|g' "$test_llama_path"; then - echo "Error modifying $test_llama_path" - exit 1 - fi - if ! $SED_CMD 's|old_dataset_path|new_dataset_path|g' "$test_llama_path"; then - echo "Error modifying $test_llama_path" - exit 1 - fi - # Add other file modifications here -} - - -# Function to Build Docker Image and Push to ECR -build_and_push_docker() { - echo "Building Docker image and pushing to ECR..." - - # Change to the neuronx-nemo-megatron directory - if [[ -d "neuronx-nemo-megatron" ]]; then - cd neuronx-nemo-megatron - else - echo "neuronx-nemo-megatron directory not found. Exiting." - exit 1 - fi - - # Check for build.sh in the neuronx-nemo-megatron directory - if [[ ! -f "build.sh" ]]; then - echo "build.sh not found in the neuronx-nemo-megatron directory. Exiting." - exit 1 - fi - - # Enable Docker BuildKit for better performance - export DOCKER_BUILDKIT=1 - - # Build the Nemo and Apex wheels - echo "Building Nemo and Apex wheels..." - ./build.sh || { echo "Failed to build Nemo and Apex wheels"; exit 1; } - - # AWS / ECR / Repo Info - AWS_ACCT=$(aws sts get-caller-identity | jq -r ".Account") || { echo "Failed to get AWS account"; exit 1; } - REGION=us-west-2 - ECR_REPO=$AWS_ACCT.dkr.ecr.$REGION.amazonaws.com/neuronx_nemo - - # Authenticate with ECR - echo "Logging into AWS ECR..." - aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $AWS_ACCT.dkr.ecr.$REGION.amazonaws.com || { echo "Failed to login to AWS ECR"; exit 1; } - - # Build and push the image - echo "Building and pushing the Docker image to ECR..." - docker build . 
-f ./k8s/docker/Dockerfile -t $ECR_REPO:latest || { echo "Docker build failed"; exit 1; } - docker push $ECR_REPO:latest || { echo "Failed to push Docker image to ECR"; exit 1; } - - echo "Docker image successfully built and pushed to ECR." -} - - - - - - - - -# Function to Set Up Kubernetes -setup_kubernetes() { - echo "Setting up Kubernetes..." - if ! kubectl apply -f ./k8s/example_manifests/mpi_compile_llama7b.yaml; then - echo "Error applying mpi_compile_llama7b.yaml" - exit 1 - fi - if ! kubectl apply -f ./k8s/example_manifests/mpi_train_llama7b.yaml; then - echo "Error applying mpi_train_llama7b.yaml" - exit 1 - fi - # Add other Kubernetes setup commands here -} - -# Main Execution Flow -echo "Starting the installation and setup process..." - -clone_repo -install_prereqs -modify_files -build_and_push_docker -setup_kubernetes -echo "Installation and setup completed successfully." +# Check if the ECR repository exists +if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$region" >/dev/null 2>&1; then + echo "ECR repository '$ECR_REPO_NAME' already exists." + + # Get the ECR_REPO_URI for the existing repository + ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$region" --output text) + echo "Repository URL: $ECR_REPO_URI" + echo $ECR_REPO_URI > .ecr_repo_uri +else + # Create the ECR repository + aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$region" + + # Get the ECR_REPO_URI for the newly created repository + ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$region" --output text) + echo "ECR repository '$ECR_REPO_NAME' created successfully." 
+ echo "Repository URL: $ECR_REPO_URI" +fi + +# Login to ECR +echo -e "\nLogging in to ECR" +aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin "$ECR_REPO_URI" + +# Build neuronx-nemo-megatron docker image +echo -e "\nBuilding neuronx-nemo-megatron docker image" +docker build ./docker -f ./docker/Dockerfile.llama_pretrain -t $ECR_REPO_URI + +# Push image to ECR +echo -e "\nPushing image to ECR" +docker push $ECR_REPO_URI:latest diff --git a/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh new file mode 100755 index 000000000..75cabf752 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Check if kubectl is installed +junk=$(which kubectl 2>&1 > /dev/null) +if [[ "$?" -ne 0 ]]; then + echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" + exit 1 +fi + +# Check if kubectl is configured +junk=$(kubectl get nodes) +if [[ "$?" -ne 0 ]]; then + echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again" + exit 1 +fi + +# Read in our ECR REPO URI, created by 1-llama2-neuronx-pretrain-build-image.sh +ECR_REPO_URI=$(cat .ecr_repo_uri) +echo -e "Using container image $ECR_REPO_URI:latest" + +# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh +kubectl apply -f - <&1 > /dev/null) +if [[ "$?" -ne 0 ]]; then + echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" + exit 1 +fi + +# Check if kubectl is configured +junk=$(kubectl get nodes) +if [[ "$?" -ne 0 ]]; then + echo "Error: kubectl is installed but not configured. 
Please use 'aws eks update-kubeconfig' to configure it and try again" + exit 1 +fi + +# Read in our ECR REPO URI, created by 1-llama2-neuronx-pretrain-build-image.sh +ECR_REPO_URI=$(cat .ecr_repo_uri) +echo -e "Using container image $ECR_REPO_URI:latest" + +# Launch the llama2-7B pre-compilation pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh +sed "s|IMG_PLACEHOLDER|$ECR_REPO_URI:latest|" ./example_manifests/llama_compile.yaml | kubectl apply -f - + +if [[ "$?" -eq 0 ]]; then + echo + kubectl get pods +fi diff --git a/ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-train.sh b/ai-ml/trainium-inferentia/examples/llama2/3-llama2-neuronx-mpi-train.sh deleted file mode 100644 index e69de29bb..000000000 diff --git a/ai-ml/trainium-inferentia/examples/llama2/4-llama2-neuronx-mpi-train.sh b/ai-ml/trainium-inferentia/examples/llama2/4-llama2-neuronx-mpi-train.sh new file mode 100755 index 000000000..71391e253 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/4-llama2-neuronx-mpi-train.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Check if kubectl is installed +junk=$(which kubectl 2>&1 > /dev/null) +if [[ "$?" -ne 0 ]]; then + echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" + exit 1 +fi + +# Check if kubectl is configured +junk=$(kubectl get nodes) +if [[ "$?" -ne 0 ]]; then + echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again" + exit 1 +fi + +# Read in our ECR REPO URI, created by 1-llama2-neuronx-pretrain-build-image.sh +ECR_REPO_URI=$(cat .ecr_repo_uri) +echo -e "Using container image $ECR_REPO_URI:latest" + +# Launch the llama2-7B training pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh +sed "s|IMG_PLACEHOLDER|$ECR_REPO_URI:latest|" ./example_manifests/llama_train.yaml | kubectl apply -f - + +if [[ "$?" 
-eq 0 ]]; then + echo + kubectl get pods +fi diff --git a/ai-ml/trainium-inferentia/examples/llama2/5-deploy-tensorboard.sh b/ai-ml/trainium-inferentia/examples/llama2/5-deploy-tensorboard.sh new file mode 100755 index 000000000..c043d2209 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/5-deploy-tensorboard.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Check if kubectl is installed +junk=$(which kubectl 2>&1 > /dev/null) +if [[ "$?" -ne 0 ]]; then + echo "Error: please install kubectl and try again. See: https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" + exit 1 +fi + +# Check if kubectl is configured +junk=$(kubectl get nodes) +if [[ "$?" -ne 0 ]]; then + echo "Error: kubectl is installed but not configured. Please use 'aws eks update-kubeconfig' to configure it and try again" + exit 1 +fi + +# Determine ECR REPO URI to which we'll push the Tensorboard image +ECR_REPO_URI=$(cat .ecr_repo_uri):tensorboard + +# Generate a random password which will be used for Tensorboard +PASSWORD=$(head /dev/random|md5sum|head -c12) + +# Build and push the Tensorboard image +echo -e "Building Tensorboard container" +DOCKER_BUILDKIT=1 docker build --build-arg TB_PASSWORD=$PASSWORD ./docker -f ./docker/Dockerfile.tensorboard -t $ECR_REPO_URI +echo -e "\nPushing Tensorboard container to $ECR_REPO_URI" +docker push $ECR_REPO_URI + +# Create the Tensorboard deployment +echo -e "\nCreating Tensorboard pod" +kubectl apply -f - < http://admin:$PASSWORD@$LB_HOST\n\n" +echo "http://admin:$PASSWORD@$LB_HOST" > tensorboard_url.txt diff --git a/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.llama_pretrain b/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.llama_pretrain new file mode 100644 index 000000000..51985d9c0 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.llama_pretrain @@ -0,0 +1,123 @@ +FROM public.ecr.aws/lts/ubuntu:22.04_stable +ARG PYTHON="python3.10" +ARG PIP="${PYTHON} -m pip" +ARG 
DEBIAN_FRONTEND=noninteractive + +# Neuron repos +ARG APT_REPO=https://apt.repos.neuron.amazonaws.com +ARG PIP_REPO=https://pip.repos.neuron.amazonaws.com + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + git \ + git-lfs \ + jq \ + software-properties-common \ + wget \ + unzip \ + vim \ + nano \ + zlib1g-dev \ + openssl \ + libssl-dev \ + libsqlite3-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + tk-dev \ + libffi-dev \ + libcap-dev \ + gnupg2 \ + gpg-agent \ + pciutils \ + ${PYTHON}-full \ + ${PYTHON}-dev \ + cython3 \ + inetutils-ping \ + google-perftools \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# EFA Installer - required - installs libfabric (but no EFA driver) inside the container +RUN apt-get update \ + && cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ + && cat aws-efa-installer.key | gpg --fingerprint \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ + && tar -xf aws-efa-installer-latest.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -fr 
/root/aws-efa-installer* \ + && cd $HOME \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Neuron system packages (minus driver) +RUN echo "deb $APT_REPO focal main" > /etc/apt/sources.list.d/neuron.list \ + && wget -qO - $APT_REPO/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - \ + && apt-get update \ + && apt-get install -y aws-neuronx-tools aws-neuronx-collectives aws-neuronx-runtime-lib \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +WORKDIR / +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && $PYTHON get-pip.py && $PIP install --upgrade pip +RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt + +# PyTorch Neuron packages (2.12) +RUN $PIP config set global.extra-index-url $PIP_REPO \ + && $PIP install torch==1.13.1.* --index-url https://download.pytorch.org/whl/cpu \ + && $PIP install torch-neuronx==1.13.1.* neuronx-cc==2.* \ + && $PIP install --no-cache-dir -U python-etcd \ + && rm -fr /root/.cache/ + +# Install packages and configure SSH for MPI operator in k8s +RUN apt-get update && apt-get install -y openmpi-bin openssh-server \ + && mkdir -p /var/run/sshd \ + && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ + && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Add Neuron tools to path +RUN echo "export PATH=/opt/aws/neuron/bin:\$PATH" >> /root/.bashrc \ + && echo "export TERM=screen" >> /root/.bashrc + +# Clone, build, and install neuronx-nemo-megatron +RUN git clone https://github.com/aws-neuron/neuronx-nemo-megatron.git \ + && cd neuronx-nemo-megatron \ + && bash ./build.sh \ + && $PIP install ./build/*.whl \ + && $PIP install -r requirements.txt torch==1.13.1 protobuf==3.20.3 \ + && cp ./k8s/docker/nodelist_helper.py / \ + && cd 
/usr/local/lib/${PYTHON}/dist-packages/nemo/collections/nlp/data/language_modeling/megatron/ \ + && make -C . \ + && rm -fr /root/.cache + +# Setup git lfs (required to download RedPajama dataset and others) +RUN git lfs install + +CMD ["/bin/bash"] diff --git a/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard b/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard new file mode 100644 index 000000000..c0b1d21f2 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard @@ -0,0 +1,12 @@ +# Tensorboard container for use in the EKS/Trn1 BERT pretraining tutorial +FROM public.ecr.aws/lts/ubuntu:22.04_stable +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y nginx python3-pip apache2-utils curl +RUN python3 -m pip install --upgrade pip && python3 -m pip install tensorboard + +# TB_PASSWORD is specified during container build +ARG TB_PASSWORD="" + +RUN htpasswd -c -b /etc/nginx/htpasswd admin $TB_PASSWORD +COPY assets/nginx_auth.conf /etc/nginx/sites-enabled/default diff --git a/ai-ml/trainium-inferentia/examples/llama2/docker/assets/nginx_auth.conf b/ai-ml/trainium-inferentia/examples/llama2/docker/assets/nginx_auth.conf new file mode 100644 index 000000000..551f5c1c8 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/docker/assets/nginx_auth.conf @@ -0,0 +1,11 @@ +server { + listen 80 default_server; + + location / { + auth_basic "Restricted"; + auth_basic_user_file htpasswd; + + proxy_pass http://tensorboard-service:6006; + proxy_read_timeout 900; + } +} diff --git a/ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_compile.yaml b/ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_compile.yaml new file mode 100644 index 000000000..245b64c74 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_compile.yaml @@ -0,0 +1,83 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: 
test-mpi-compile +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + backoffLimit: 20 + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: IMG_PLACEHOLDER + name: mpitest + imagePullPolicy: Always + env: + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + command: + - mpirun + - --allow-run-as-root + - -np + - "4" + - -bind-to + - none + - -map-by + - slot + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -x + - POD_UID + - -x + - COMPILE=1 + - -wdir + - /shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/ + - ./llama_7b.sh + initContainers: + - name: wait-hostfilename + image: IMG_PLACEHOLDER + command: + - bash + - -cx + - "[[ $(cat /etc/mpi/discover_hosts.sh | wc -l) != 1 ]] && (date; echo Ready; cat /etc/mpi/discover_hosts.sh) || (date; echo 'not ready ...'; sleep 10; exit 1) && while read host; do while ! ssh $host echo $host ; do date; echo \"Pod $host is not up ...\"; sleep 10; done; date; echo \"Pod $host is ready\"; done <<< \"$(/etc/mpi/discover_hosts.sh)\"" + volumeMounts: + - mountPath: /etc/mpi + name: mpi-job-config + - mountPath: /root/.ssh + name: ssh-auth + + Worker: + replicas: 4 + template: + spec: + containers: + - image: IMG_PLACEHOLDER + name: mpitest + imagePullPolicy: Always + resources: + limits: + aws.amazon.com/neuron: "16" + vpc.amazonaws.com/efa: "8" + requests: + aws.amazon.com/neuron: "16" + vpc.amazonaws.com/efa: "8" + volumeMounts: + - name: persistent-storage + mountPath: /shared + - name: dshm + mountPath: /dev/shm + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: dshm + emptyDir: + medium: Memory diff --git a/ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_train.yaml b/ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_train.yaml new file mode 100644 index 000000000..b7caee81d --- /dev/null +++ 
b/ai-ml/trainium-inferentia/examples/llama2/example_manifests/llama_train.yaml @@ -0,0 +1,81 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: test-mpi-train +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + backoffLimit: 20 + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: IMG_PLACEHOLDER + name: mpitest + imagePullPolicy: Always + env: + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + command: + - mpirun + - --allow-run-as-root + - -np + - "4" + - -bind-to + - none + - -map-by + - slot + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -x + - POD_UID + - -wdir + - /shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/ + - ./llama_7b.sh + initContainers: + - name: wait-hostfilename + image: IMG_PLACEHOLDER + command: + - bash + - -cx + - "[[ $(cat /etc/mpi/discover_hosts.sh | wc -l) != 1 ]] && (date; echo Ready; cat /etc/mpi/discover_hosts.sh) || (date; echo 'not ready ...'; sleep 10; exit 1) && while read host; do while ! 
ssh $host echo $host ; do date; echo \"Pod $host is not up ...\"; sleep 10; done; date; echo \"Pod $host is ready\"; done <<< \"$(/etc/mpi/discover_hosts.sh)\"" + volumeMounts: + - mountPath: /etc/mpi + name: mpi-job-config + - mountPath: /root/.ssh + name: ssh-auth + + Worker: + replicas: 4 + template: + spec: + containers: + - image: IMG_PLACEHOLDER + name: mpitest + imagePullPolicy: Always + resources: + limits: + aws.amazon.com/neuron: "16" + vpc.amazonaws.com/efa: "8" + requests: + aws.amazon.com/neuron: "16" + vpc.amazonaws.com/efa: "8" + volumeMounts: + - name: persistent-storage + mountPath: /shared + - name: dshm + mountPath: /dev/shm + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: dshm + emptyDir: + medium: Memory From fe66508f93178b08e21728abdb6709d76828e691 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Thu, 14 Dec 2023 09:56:30 -0700 Subject: [PATCH 06/45] fix typo --- .../examples/llama2/docker/Dockerfile.tensorboard | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard b/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard index c0b1d21f2..8a21900fa 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard +++ b/ai-ml/trainium-inferentia/examples/llama2/docker/Dockerfile.tensorboard @@ -1,4 +1,4 @@ -# Tensorboard container for use in the EKS/Trn1 BERT pretraining tutorial +# Tensorboard container for use in the EKS/Trn1 Llama2 pretraining tutorial FROM public.ecr.aws/lts/ubuntu:22.04_stable ARG DEBIAN_FRONTEND=noninteractive From 74cbfd2a4c4d51e4003b5212ddcdf15619c84eaa Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 14:22:32 -0500 Subject: [PATCH 07/45] install pre-req script --- .../examples/llama2/install-pre-requsites.sh | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 
ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh new file mode 100644 index 000000000..010754eb3 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Function to install Docker +install_docker() { + echo "Installing Docker..." + sudo yum install docker -y + sudo systemctl start docker + sudo usermod -aG docker $(whoami) + newgrp docker +} + +# Check for Git +if ! command -v git &> /dev/null; then + echo "Git is not installed. Installing..." + sudo yum install git -y +else + echo "Git is already installed." +fi + +# Check for Kubectl +if ! command -v kubectl &> /dev/null; then + echo "kubectl is not installed. Installing..." + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl.sha256" + echo "$(cat kubectl.sha256) kubectl" | sha256sum --check + + if [ $? -eq 0 ]; then + echo "kubectl checksum is valid." + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + kubectl version --client + else + echo "kubectl checksum is invalid. Installation aborted." + exit 1 + fi +else + echo "kubectl is already installed." +fi + +# Check for AWS CLI v2 +if ! command -v aws &> /dev/null || [[ ! $(aws --version) =~ "aws-cli/2" ]]; then + echo "AWS CLI v2 is not installed. Installing..." + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install +else + echo "AWS CLI v2 is already installed." +fi + +# Check for Docker +if ! command -v docker &> /dev/null; then + install_docker +else + echo "Docker is already installed." +fi + +echo "Installation check complete." 
From de1d173511c975473e2406475966f8d40d0f9c1f Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 14:29:29 -0500 Subject: [PATCH 08/45] more tools to prereq shell script --- .../examples/llama2/install-pre-requsites.sh | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh index 010754eb3..eff005f25 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh @@ -53,4 +53,25 @@ else echo "Docker is already installed." fi +# Check for Terraform +if ! command -v terraform &> /dev/null; then + echo "Terraform is not installed. Installing..." + sudo yum install -y yum-utils + sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo + sudo yum -y install terraform +else + echo "Terraform is already installed." +fi +terraform -help + +# Check for Helm +if ! command -v helm &> /dev/null; then + echo "Helm is not installed. Installing..." + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 + chmod 700 get_helm.sh + ./get_helm.sh +else + echo "Helm is already installed." +fi + echo "Installation check complete." 
From 92a8873b60d27b9ca4b72f96df65e73fc4423f9b Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 16:23:03 -0500 Subject: [PATCH 09/45] addtional tooling --- .../examples/llama2/install-pre-requsites.sh | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh index eff005f25..747fdcfd7 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh @@ -9,13 +9,25 @@ install_docker() { newgrp docker } -# Check for Git -if ! command -v git &> /dev/null; then - echo "Git is not installed. Installing..." - sudo yum install git -y -else - echo "Git is already installed." -fi +# Install a package if it is not already installed +install_package() { + PACKAGE=$1 + echo "Checking for $PACKAGE..." + if ! command -v $PACKAGE &> /dev/null; then + echo "$PACKAGE is not installed. Installing..." + sudo yum install $PACKAGE -y + else + echo "$PACKAGE is already installed." + fi +} + +# Install Docker +install_docker + +# Install Git, Python3, and unzip (required for AWS CLI v2 installation) +install_package git +install_package python3 +install_package unzip # Check for Kubectl if ! command -v kubectl &> /dev/null; then @@ -46,22 +58,10 @@ else echo "AWS CLI v2 is already installed." fi -# Check for Docker -if ! command -v docker &> /dev/null; then - install_docker -else - echo "Docker is already installed." -fi - # Check for Terraform -if ! command -v terraform &> /dev/null; then - echo "Terraform is not installed. Installing..." - sudo yum install -y yum-utils - sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo - sudo yum -y install terraform -else - echo "Terraform is already installed." 
-fi +install_package yum-utils +sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo +install_package terraform terraform -help # Check for Helm @@ -74,4 +74,12 @@ else echo "Helm is already installed." fi +# Check for Boto3 +if ! python3 -c "import boto3" &> /dev/null; then + echo "Boto3 is not installed. Installing..." + pip3 install boto3 +else + echo "Boto3 is already installed." +fi + echo "Installation check complete." From 3cde52610ef59b973c2750ca2a28472557f678d4 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 16:29:02 -0500 Subject: [PATCH 10/45] addtional tooling python --- ...stall-pre-requsites.sh => install-pre-requsites-for-ec2.sh} | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename ai-ml/trainium-inferentia/examples/llama2/{install-pre-requsites.sh => install-pre-requsites-for-ec2.sh} (97%) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh similarity index 97% rename from ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh rename to ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh index 747fdcfd7..8c8adfc10 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh @@ -24,10 +24,11 @@ install_package() { # Install Docker install_docker -# Install Git, Python3, and unzip (required for AWS CLI v2 installation) +# Install Git, Python3, unzip, and pip install_package git install_package python3 install_package unzip +install_package python3-pip # Check for Kubectl if ! 
command -v kubectl &> /dev/null; then From 79b2b6df705582b5df7987ff082e0f8ff5f99426 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 16:40:57 -0500 Subject: [PATCH 11/45] AZ fix --- ai-ml/trainium-inferentia/get-eks-azs.sh | 59 ++++++++++++++++++++++++ ai-ml/trainium-inferentia/main.tf | 18 +++++++- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 ai-ml/trainium-inferentia/get-eks-azs.sh diff --git a/ai-ml/trainium-inferentia/get-eks-azs.sh b/ai-ml/trainium-inferentia/get-eks-azs.sh new file mode 100644 index 000000000..1d9224bec --- /dev/null +++ b/ai-ml/trainium-inferentia/get-eks-azs.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Function to get AWS region using Python and Boto3 +get_region_with_python() { + python3 - </dev/null); [ -z "$REGION_CODE" ]; then + REGION_CODE=$(get_region_with_python) +fi + +# Validate if REGION_CODE is set +if [ -z "$REGION_CODE" ]; then + echo "Unable to determine AWS region." + exit 1 +fi + +echo "Using AWS region: $REGION_CODE" + +# Determine appropriate EKS AZs based on the region +if [[ $REGION_CODE == "us-east-1" ]]; then + AZ1="use1-az6" + AZ2="use1-az5" +elif [[ $REGION_CODE == "us-west-2" ]]; then + AZ1="usw2-az4" + AZ2="usw2-az3" +else + echo "Unsupported region: $REGION_CODE" + exit 1 +fi + +# Fetch and set the actual names of the availability zones +EKSAZ1=$(aws ec2 describe-availability-zones \ + --region $REGION_CODE \ + --filters "Name=zone-id,Values=$AZ1" \ + --query "AvailabilityZones[].ZoneName" \ + --output text) + +EKSAZ2=$(aws ec2 describe-availability-zones \ + --region $REGION_CODE \ + --filters "Name=zone-id,Values=$AZ2" \ + --query "AvailabilityZones[].ZoneName" \ + --output text) + +echo "Your EKS availability zones are $EKSAZ1 and $EKSAZ2" diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 0ede9c5c3..8909f26aa 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -46,7 +46,7 @@ data 
"aws_ecrpublic_authorization_token" "token" { provider = aws.ecr } -locals { +/* locals { name = "${var.name}-${random_string.this.result}" region = var.region # Training and Inference instances are available in the following AZs us-east-1 and us-west-2 @@ -56,4 +56,20 @@ locals { Blueprint = local.name GithubRepo = "github.com/awslabs/data-on-eks" } +} */ + +data "external" "eks_azs" { + program = ["bash", "${path.module}/get_eks_azs.sh"] +} + +locals { + name = var.name + region = var.region + azs = [data.external.eks_azs.result["EKSAZ1"], data.external.eks_azs.result["EKSAZ2"]] + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } } + + From b1f8343d7432f71f8869203a4af5333f28e6f32b Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 16:50:08 -0500 Subject: [PATCH 12/45] added jq --- .../examples/llama2/install-pre-requsites-for-ec2.sh | 3 ++- .../{get-eks-azs.sh => get_eks_azs.sh} | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) rename ai-ml/trainium-inferentia/{get-eks-azs.sh => get_eks_azs.sh} (86%) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh index 8c8adfc10..cd6000834 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh @@ -24,11 +24,12 @@ install_package() { # Install Docker install_docker -# Install Git, Python3, unzip, and pip +# Install Git, Python3, unzip, pip, and jq install_package git install_package python3 install_package unzip install_package python3-pip +install_package jq # Check for Kubectl if ! 
command -v kubectl &> /dev/null; then diff --git a/ai-ml/trainium-inferentia/get-eks-azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh similarity index 86% rename from ai-ml/trainium-inferentia/get-eks-azs.sh rename to ai-ml/trainium-inferentia/get_eks_azs.sh index 1d9224bec..995888a23 100644 --- a/ai-ml/trainium-inferentia/get-eks-azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -50,10 +50,20 @@ EKSAZ1=$(aws ec2 describe-availability-zones \ --query "AvailabilityZones[].ZoneName" \ --output text) +if [ -z "$EKSAZ1" ]; then + echo "Failed to fetch the name for availability zone $AZ1 in region $REGION_CODE." + exit 1 +fi + EKSAZ2=$(aws ec2 describe-availability-zones \ --region $REGION_CODE \ --filters "Name=zone-id,Values=$AZ2" \ --query "AvailabilityZones[].ZoneName" \ --output text) +if [ -z "$EKSAZ2" ]; then + echo "Failed to fetch the name for availability zone $AZ2 in region $REGION_CODE." + exit 1 +fi + echo "Your EKS availability zones are $EKSAZ1 and $EKSAZ2" From a8cdf82790c17628545e362c8afb32ad15e8ec6f Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 16:56:15 -0500 Subject: [PATCH 13/45] added tool checks --- .../llama2/install-pre-requsites-for-ec2.sh | 62 +++---------------- 1 file changed, 7 insertions(+), 55 deletions(-) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh index cd6000834..f2cf7d19c 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh @@ -2,7 +2,7 @@ # Function to install Docker install_docker() { - echo "Installing Docker..." + echo "Checking and installing Docker..." sudo yum install docker -y sudo systemctl start docker sudo usermod -aG docker $(whoami) @@ -14,13 +14,15 @@ install_package() { PACKAGE=$1 echo "Checking for $PACKAGE..." if ! 
command -v $PACKAGE &> /dev/null; then - echo "$PACKAGE is not installed. Installing..." + echo "Installing $PACKAGE..." sudo yum install $PACKAGE -y else echo "$PACKAGE is already installed." fi } +echo "Starting installation of prerequisites..." + # Install Docker install_docker @@ -31,57 +33,7 @@ install_package unzip install_package python3-pip install_package jq -# Check for Kubectl -if ! command -v kubectl &> /dev/null; then - echo "kubectl is not installed. Installing..." - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl.sha256" - echo "$(cat kubectl.sha256) kubectl" | sha256sum --check - - if [ $? -eq 0 ]; then - echo "kubectl checksum is valid." - sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl - kubectl version --client - else - echo "kubectl checksum is invalid. Installation aborted." - exit 1 - fi -else - echo "kubectl is already installed." -fi - -# Check for AWS CLI v2 -if ! command -v aws &> /dev/null || [[ ! $(aws --version) =~ "aws-cli/2" ]]; then - echo "AWS CLI v2 is not installed. Installing..." - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install -else - echo "AWS CLI v2 is already installed." -fi - -# Check for Terraform -install_package yum-utils -sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo -install_package terraform -terraform -help - -# Check for Helm -if ! command -v helm &> /dev/null; then - echo "Helm is not installed. Installing..." - curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - chmod 700 get_helm.sh - ./get_helm.sh -else - echo "Helm is already installed." -fi - -# Check for Boto3 -if ! python3 -c "import boto3" &> /dev/null; then - echo "Boto3 is not installed. 
Installing..." - pip3 install boto3 -else - echo "Boto3 is already installed." -fi +# Additional installations (kubectl, AWS CLI v2, Terraform, Helm, Boto3)... +# (Include the existing logic for these installations here, with similar echo statements for tracking) -echo "Installation check complete." +echo "Installation of prerequisites complete." From 30b259b0601565219ee6d23fd868c9c13e921247 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 17:02:09 -0500 Subject: [PATCH 14/45] get az script update --- ai-ml/trainium-inferentia/get_eks_azs.sh | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh index 995888a23..3ac09d86e 100644 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -13,13 +13,9 @@ print(get_region()) EOF } -# Prompt user to enter region or press enter to detect automatically -read -p "Enter AWS region or press enter to detect automatically: " user_region - -# Determine region either from user input, EC2 metadata, or Python script -if [ -n "$user_region" ]; then - REGION_CODE=$user_region -elif REGION_CODE=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .region 2>/dev/null); [ -z "$REGION_CODE" ]; then +# Attempt to determine region from EC2 metadata, then fall back to Python script +REGION_CODE=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .region 2>/dev/null) +if [ -z "$REGION_CODE" ]; then REGION_CODE=$(get_region_with_python) fi @@ -50,20 +46,10 @@ EKSAZ1=$(aws ec2 describe-availability-zones \ --query "AvailabilityZones[].ZoneName" \ --output text) -if [ -z "$EKSAZ1" ]; then - echo "Failed to fetch the name for availability zone $AZ1 in region $REGION_CODE." 
- exit 1 -fi - EKSAZ2=$(aws ec2 describe-availability-zones \ --region $REGION_CODE \ --filters "Name=zone-id,Values=$AZ2" \ --query "AvailabilityZones[].ZoneName" \ --output text) -if [ -z "$EKSAZ2" ]; then - echo "Failed to fetch the name for availability zone $AZ2 in region $REGION_CODE." - exit 1 -fi - echo "Your EKS availability zones are $EKSAZ1 and $EKSAZ2" From f5dbc090da724e0e9f0e4a1c052de6f5e2c20e5e Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 17:05:58 -0500 Subject: [PATCH 15/45] az code fix --- ai-ml/trainium-inferentia/get_eks_azs.sh | 33 +++--------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh index 3ac09d86e..f24a2e30b 100644 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -1,37 +1,12 @@ #!/bin/bash -# Function to get AWS region using Python and Boto3 -get_region_with_python() { - python3 - </dev/null) -if [ -z "$REGION_CODE" ]; then - REGION_CODE=$(get_region_with_python) -fi - -# Validate if REGION_CODE is set -if [ -z "$REGION_CODE" ]; then - echo "Unable to determine AWS region." 
- exit 1 -fi +# Hardcoded AWS region +REGION_CODE="us-west-2" echo "Using AWS region: $REGION_CODE" -# Determine appropriate EKS AZs based on the region -if [[ $REGION_CODE == "us-east-1" ]]; then - AZ1="use1-az6" - AZ2="use1-az5" -elif [[ $REGION_CODE == "us-west-2" ]]; then +# Determine appropriate EKS AZs based on the hardcoded region +if [[ $REGION_CODE == "us-west-2" ]]; then AZ1="usw2-az4" AZ2="usw2-az3" else From 407fa4925cb294eb383e16b364430d159d352ce6 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 17:07:38 -0500 Subject: [PATCH 16/45] az code fix --- ai-ml/trainium-inferentia/get_eks_azs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh index f24a2e30b..62962c347 100644 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -27,4 +27,4 @@ EKSAZ2=$(aws ec2 describe-availability-zones \ --query "AvailabilityZones[].ZoneName" \ --output text) -echo "Your EKS availability zones are $EKSAZ1 and $EKSAZ2" +echo "{\"EKSAZ1\": \"$EKSAZ1\", \"EKSAZ2\": \"$EKSAZ2\"}" From 0c35b437d4826671b909b862dc3755fff0064c15 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 19:30:27 -0500 Subject: [PATCH 17/45] fix az script --- ai-ml/trainium-inferentia/get_eks_azs.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh index 62962c347..bd39769ae 100644 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -27,4 +27,10 @@ EKSAZ2=$(aws ec2 describe-availability-zones \ --query "AvailabilityZones[].ZoneName" \ --output text) -echo "{\"EKSAZ1\": \"$EKSAZ1\", \"EKSAZ2\": \"$EKSAZ2\"}" +# Example of correctly formatted JSON output +if [ ! -z "$EKSAZ1" ] && [ ! 
-z "$EKSAZ2" ]; then + echo "{\"EKSAZ1\": \"$EKSAZ1\", \"EKSAZ2\": \"$EKSAZ2\"}" +else + # Even errors must be output as JSON + echo "{\"error\": \"Unable to determine EKS availability zones\"}" +fi From 289bbfdba14a288eaa7c7170eebee773d92ddc38 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 14 Dec 2023 19:34:31 -0500 Subject: [PATCH 18/45] fix az script json output --- ai-ml/trainium-inferentia/get_eks_azs.sh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh index bd39769ae..d45648e46 100644 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -3,14 +3,12 @@ # Hardcoded AWS region REGION_CODE="us-west-2" -echo "Using AWS region: $REGION_CODE" - # Determine appropriate EKS AZs based on the hardcoded region if [[ $REGION_CODE == "us-west-2" ]]; then AZ1="usw2-az4" AZ2="usw2-az3" else - echo "Unsupported region: $REGION_CODE" + echo "{\"error\": \"Unsupported region: $REGION_CODE\"}" exit 1 fi @@ -27,10 +25,11 @@ EKSAZ2=$(aws ec2 describe-availability-zones \ --query "AvailabilityZones[].ZoneName" \ --output text) -# Example of correctly formatted JSON output -if [ ! -z "$EKSAZ1" ] && [ ! 
-z "$EKSAZ2" ]; then +# Check if EKSAZ1 and EKSAZ2 are not empty and output as JSON +if [ -n "$EKSAZ1" ] && [ -n "$EKSAZ2" ]; then echo "{\"EKSAZ1\": \"$EKSAZ1\", \"EKSAZ2\": \"$EKSAZ2\"}" else - # Even errors must be output as JSON + # Output errors as JSON echo "{\"error\": \"Unable to determine EKS availability zones\"}" + exit 1 fi From 53cb92eabd4d23695e27833a9e67fa03ecc54cb4 Mon Sep 17 00:00:00 2001 From: Scott P <48838323+5cp@users.noreply.github.com> Date: Thu, 14 Dec 2023 19:16:38 -0700 Subject: [PATCH 19/45] bug fix - always store ecr repo uri --- .../examples/llama2/1-llama2-neuronx-pretrain-build-image.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh index 0f4d63549..eecf1a060 100755 --- a/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/1-llama2-neuronx-pretrain-build-image.sh @@ -39,7 +39,6 @@ if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$ # Get the ECR_REPO_URI for the existing repository ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$region" --output text) echo "Repository URL: $ECR_REPO_URI" - echo $ECR_REPO_URI > .ecr_repo_uri else # Create the ECR repository aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$region" @@ -50,6 +49,9 @@ else echo "Repository URL: $ECR_REPO_URI" fi +# Store ECR REPO URI for later use +echo $ECR_REPO_URI > .ecr_repo_uri + # Login to ECR echo -e "\nLogging in to ECR" aws ecr get-login-password --region "$region" | docker login --username AWS --password-stdin "$ECR_REPO_URI" From 080abb5020ffa3a2574c9c3221ff2c52e78d0154 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Fri, 15 Dec 2023 15:35:51 -0500 Subject: 
[PATCH 20/45] eks and main code changes --- ai-ml/trainium-inferentia/eks.tf | 6 +++--- ai-ml/trainium-inferentia/main.tf | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 02dd4437e..1815f068f 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -176,9 +176,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = 2 - max_size = 2 - desired_size = 2 + min_size = 4 + max_size = 4 + desired_size = 4 # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 8909f26aa..d55ebb65f 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -63,7 +63,7 @@ data "external" "eks_azs" { } locals { - name = var.name + name = "${var.name}-${random_string.this.result}" region = var.region azs = [data.external.eks_azs.result["EKSAZ1"], data.external.eks_azs.result["EKSAZ2"]] tags = { From 401e17756c3db43ee1cc2f20bcf0d9259c3a2230 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Fri, 15 Dec 2023 16:24:12 -0500 Subject: [PATCH 21/45] llama2 trainium doc --- .../gen-ai/inference/img/llama2-trainium.png | Bin 0 -> 67486 bytes website/docs/gen-ai/training/Llama2.md | 271 ++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 website/docs/gen-ai/inference/img/llama2-trainium.png create mode 100644 website/docs/gen-ai/training/Llama2.md diff --git a/website/docs/gen-ai/inference/img/llama2-trainium.png b/website/docs/gen-ai/inference/img/llama2-trainium.png new file mode 100644 index 0000000000000000000000000000000000000000..1be596b667e104f0155802a7c876b06c711a2303 GIT binary patch literal 67486 zcmaHT1ymi&?=UW|-%@%1-|IW) zJ()t?pzOfF!a(d?LK?x3YLomqft}B&OxQm;{@JK6rm6-PpbVB0=l6vT}cXbaeb4IjhE@XOF-; zX6;zxb*YtwRnf{jVFrm5QGPa!gt3~JAC}!OWl!-w%sVTd(DGihIBPDHwa^$v3eIEQ 
zrXHq>wL2ir(kcw<%tOA2SD8CaGI+y1xJ|JN*_SXv9&^g8?Z9N-@Z8qV>QGt(e7Q~-cNXIP5KlczuPY((x3W4;vM2sE$e(e94ej;qOl=%Y zt*r>2#?{rcc68t&B6HBJd)TfdBjQ$t8)(h^sZq6ZeJ^`}f5H z{(z*AkNWpT!U+DMk>r3|IsNb9DR!=K|AP$) znH)*T?(nDBpQE^>waFqZ#z&G}UID{L5<{)5g_L)Q-SRN&v}bP(~&W1!Q$A^V#^T|h&k%6FDVG z`Zx5brv}twJ{9bL(;6HVE)+~eQ>7j=`ETf`PYrtKu;xO4(G)@#C94nt0fYD#Mg9*5 zem$T;vgAR>@;@A+fq3DlyP{|CH*|nrFC0TA@kODK^W;IRs9TvvC-KFs4yqJ#e z2^IYfU(5vn3%?{%Lhuu;cPyB&YJCDLTzgsX`IIVQDT-CNS z0-O?r7}5JBn9ts{(G3dr^}%tf9PD)ry+_L48u@@Bffe?LU;zT&*m*Bt923NUQV{eG z!l;of|J|%enmVa(KvP~e{-1%#fPp)5N2-1eyeiQq)$qJlPGvu%#W4K=s(M*~ou~{Y zDm$Uw^*_8UK@!2~G?>c%;3lO{*Q<`fvtgTnkF8bPvcH*^jSBEA z{>57KPdbs%f-~naq1wVm2cC)T%6`X1cbGaz7DxE-lKiKviAey%i_ch=usmVt;>W)= zoW2n)iWlOlk6xZpHc;*Uq;My550-39TR@ojIsbB1C33VvaC zOnP+Gi9C=c;73ABOPjws%o~o{F$`BJr@kv?J!U(bN7zpMlM?*80G8BJH#DhF!l;nO zUm%kXgRL_OXK$9TJInKtW8ZEpSKJpF_cuNL zB?)Ln8*I=RdEuk@S5VsC*7n7c-%^1|1~pIyl3P8qwYP9aBbTYsewP1bCY>4C7>hj9 z!;Y8w!_)0BcbZ{EM&$P<0`1IPR9O$Uv?cO3pGPbVwzx<#b^7OQZ3G=UmI#{=(n0eSL)JVRqD+Ys^kBYTC7n(IBxOSSIX_NQLpq7NinokGYz#m@%> zceJra$glSTm4XK14JL>65n&ZXgH7jt4(J-D=}nrBp+73Q3f}OM&beJ+kV25$D*LVv zi__L~=PBavtS5Gxx+6|Z)I4T#O8FX-L~%l~-!nPhq|lNA^Y_Um z29<9oIx!fTZ7k3$)w-KA8oxGS)Frc`7-twdlZ-7=oEJFauQ)`cDCZwWFymj69}o?afWF09HepSaOe*SFml<%KZzXd52$C7iKJgvqb(LH z%*Or$C-eT8r9C?F-XK!SiZA0Tk|IvJ`D({d=r#_7_ELg?T3AU8+wzLUh03a2#FED>p{cV8=8|c3AQs~LCAm6K_3{!7g3rl)1WCM8 zx!LU}Fzx_t2YLRGrmwy=C`1fg_GLzA zO=J5sN*CFUbj^vonp!i=u;oZ{ZnxQP{@WD%n~nsg@R-L&5#nx7BM5`cP+DJeyd|r=%p}D57ol zlbMYpa`W=UPdA4kr!0FTa8GUzTX!+W@m<%W!Pl7N&WZ)<-Dz6ki=@1pwDau_tM}n$ zy>sYRnu1@F_~eow^^s@7X%#RS4JKt-V)1&^YuFF5Rv8W?WZ6QBwB24u$+o_~hY$&+ z*lW2xh}DneQBI&$N0+}*Avc?;%B!(n;x%DJz@VGBU(LJ3c(Z%E27}z1I8V|Y{`_7c z0FZ=FXDoBSY}2wh#LUbL_#8C2p>q%tZk4u~52M|>nCW=FG@h3+H`D{5z)vpbtOpJj znoheAm4mpWC!u%zJ$UcW6piInrrj?O5UJnS5CzRGiuISOR_AAF)3x56h{a&ABi=l= z`9MMs_h$IC9!ByYjQ}!0L=1Q~SZw)3KSi0qCkZ8qQ9qEya`vr`Bj@3l(=&%=MOhxD zhwCkTO){&``0uZDi_~gA85kJAb)eI0m2_&kZIh;76cuE|pvkZc1R`SRTMxq`>>#|DOs@ykOy|Qz 
z&1vXyNL@!kT(jvi+DN{KdUT86i5w9GfiF!5jgtKdw8gt8*ISvpXA6yv0xVm)2sY4 znbkZ&#(68#2`8o$+KilEtCSHc&E5`Cm!7umD-(~gI5E#iokVMHr z*;Kchj1sw?^pT&lbTAqAe+7tF2#sgevVt6~+~{D+aXD}AsM6%LZ!nUmy^FwtAmO0p*6MmQ)f$TJx*|!JinM>W(Xn`Msa1RFC=!=TnPDIKTBvZP z@%{Y`1PXM}xlVAXY3g`^Y$6_yn~4`(8-})re5Meb$NA*DlfCMZeiTuh>|KasFa%IO zK}WV`saoyW)+Tbl#rKNr!Aw5we5vG|X1=>4A;mUCu=_G{s6&!!-9_Dz^o!Y~;- z3|6>aMVL>OjJ`LapZ$P5e{;U;$N`eDQR#*8fh7ct!%I)k(;>EjM#o(Urs)v-$1KmS zH2VlEcVNlXD4m9oXGHOOmQ3q^Uuyv-1%aYftW4Xm9wS+kJPVve6-#+_ANT?5SiycHWIbbOB8)wy!n$EbbZTT z9d(93@?6fZp^1X@KyQ*RzPEaL(c*Z(PxQ^!n%AoM;#)Q9YWlDix}EPpNTQTj+kgbT z9NkS-*OyCPwY6}T0=T+Z{^uYhJ*abMPUa1pyQ^cjakQFZ z@mR9i6eD3`f4m(dJP)4JLFQo#!4^DLv$#7z)7Sxvf0xAsvaB711Tj`a4YKbFI7jtG z-@TDz)6jNV3E&k7WA9-#`NkhgtAf`)T0fM^mOAdy3#eSP6)1t{LA$LFHdvsoajB(tukLThW%;(<1mNBucVk50+-on|4bYzK zmbc#IPj}q>xO?FL`er;EH=lNNFwr%dzY4T`kk~h-SVI2GW$j1t2J4*Y0XVsxZlinc zI|hF{7Ez=~xrLb3Uz| z*y4G+L!h$3P&&L#FtyxrcOrQ|im?;v>4&P)v@=y&?8l(=Vp!^Q9OM0ijvEDzaH{E= z$+Xr6=i~(D?nt@Zgu*LIPs+Ic$AgOhiY= zXNLpX>qittf>0{j+CJH}Lt|X6VoM@%Gl-xFB6c8+gQ376B=?Pz`0Ds$`nz2v!JYBY~g_{VLNq)`ILcKfR@F*i1362^f?L@8aZRCq@mgXBLFA?3j>l zV%F;|8B%ZYXy9RAw}(rUOCs*nkKlN-)#P?=<|#Y_`u6>-Yul>z-pNwh^K!wtR%9%0 zB@Cn7oY-t1olhUypW=BuIiB!0U{}_x5aUF*TY%;Q~ED01+UNudgw-?*e!W9khi^op0rUl!} zIU=DuA1fF630@uhz9Y&)f`obg&5>c8c>kJnM7P_=?|irUa?u`U*PIy|4ZE?;&S1x* z`{lbIHY)gx3&R!lH#^0r8-C7*?d=MI=N{Q#BnY(0`XSF*H=y*9b}91d_4)~?81+2y zrfzlD(7cdRV|1QTmxS9q(C9pjiFV>!cD+!AlVoT;8AAh)~{9(?R zGitOc0HFm$61MUd4b-_smx29VbO;)`TbZT>2+;c`Pz>iNY?C1U3SnJbEGH8Gctj#d zMas%!_%5*5gA9s0X$|`0FRI@oq?+^72->h0j87K7lhuju48%}@WVKo5Pajqp(24gG zQk%~bqRG}O?1aVlASg()`|uo>b3)al#T+h}>!4of7BGDA*BvY1)Y+99Gtk(=sIOhy zIoZOZRgB@#t)_%kGAi;xNU1(L%vep#t78XSP7y9E483n&FZ&{D@ZRn*4|VUSd#FqY zyMW}9R$TxMy1P0woO{uTDPVAkurWXL#@;f5Oj|=A5?#gqxVg<45d!zT(~VPVh?YY3 z_Oe%M2{8{byQ|%r)Qet-u$x_rG%7y>X2Ye{)&k?#61PLUGHq`<8C_rOTL_+?#aF5e zop;%6BCLhR_HU~SbhNB^uIonpxiUzk_kdti2i#YOTI)+a55*G4RDkXfZi#EXX}Mgs zU<-Q>u)?c8f=g-Ejn`Kk@VNy$Y>o=xiFPl*we7I-+|=!CxRIyS4FDWCM#5 
zT{6`HSKwY``~fhcV;f1VTHwU6@XzBpN-|V}VCal^Qq`w)yb;ZICpYthS~2McP*2=t-E6ASrSmhjg|3LP;7?Aeo;Xg#;qRa|plILI4eB--nR= zkkeK-$_&+|Y13&0p91VjE-#pl?#79mp#zr`0aKNzQ5C_0Mwhi5HKU#qxKc{+Ms5 zF2kOlB(=b()MF%T+fHPJ8QlA|uOTAmM|r`)l~3~1pA^Fdwt&f;ofs9d7J zMABio;-!E4oxi+Ka8#g!hGODV2Qo?@z$|TFvXYvft6KiqB5;7;DR->g2Re{MeE@8$ zLgn;~(eJiD;ldHn@|om_11YxP?;ZAUo%sLhkX3jp^{$Fgi^Q81!sQ+ApWsE5w->}! zDa-E8U9TP-yh|eg+v$nYxs}qyF1Nb5nJNQ9hMh`zCjD92lWVq2QptE#**|D<`D7TU zmk+ET8qSYax52@`rB$Tuh&$-Zq1xfSJICGT+vL5B`62nVU-O}jbskvUfX%&8+USrO zfck?i71Y$1YH7Ck*2JQ*wZ4VnCeF)f;1g~}{6Uztj>2%N#*BH}bJm+;qsKywz!m;W z^#)IJwR0J_?RD_FC6qbf7yb;vGLqRWvG2kpw*pKW+Cz}iSx#Dn9`2;|^MB#EA!*cC zBt2qNOn)y%kOyawrmaDyl-Bi#OgXu0LGw#G>Y;|YPRY~#FC$lC^zU5IFexZWc2*G+ zH=_A-P}?NmUK33U@8eE|{@kMx$ivwvRtj&!PkpR_v$-7TAR4jzB^STLQBQ$@XHcuJ z@2jMnoyPLtt7~o;lShOJP9{VDmeGGiB#K`{7#9jR4aMI16Rx%qT`D2-V>+H4^4?6H zEYsL{p2}zr=TN&)kOBQ26O+N?!~JZzK?rJ#jSGQCX9BGPDYfJW4C?%~Bjmpab_wAh z6v2hUNkehi#CcdzIY6gXCo7&VO=OlmC_GK&v2L+gYEaCyo^4c;K&Pi`y4Z_uWHl3; z(^u~sdwhsBZ5>J|vD+{hS!`A)wKLqd(J;r&^b#Xbh0_Ud0~U|rYF9{~g5gx`?_KMv zOqTuDaHG_NRu3)@f)ogX5yR1>K=T`i2wsWHQFX7azP+#?4Aa(OfSs7?51n-N&vtp} zrl%P6(VNn#64il-2A3JlC)@S5Y7+uy=~EXZMe=zc42+EjrqRgzcnwECb`OWs-Txnb zf5z!!;`!gQN%Yyud8>4?NUnutHK{jSj7asdC^8JyhU{_+hAB~~D~B{F%?bu>CxR?& z%wLxzLQ5um9r3y&%wuM*HAEU`m3C=N`8;z!p-Ax~P0saapyIc50+JPV1_em-yb0#0 zQHM^n`H`IA(uCv{Hnk>$uCsqt5AEA=hDWVNsh#N()2g@TTNeWnnoW6b@%?sXsMa$x z95J!6@AS&FTD8h6Pf1t*#C88+KHvfm=4;=ene?b_8NeQI(T9i3RXABDvl^Fuo^Tw{ zR1d1SsWsc)P9)PR8jNvMj3=d57VG46G?S=YB4?c2bprr zPj+NQ!Q0Sk5;}v0(mOyfqB*MN7QMs#K0SaHnpQmtvj5J~tz91yX&N_4^)6;;?z4CU zlVMohZlz)>+Swb*>dgmAF=vlc{oG)+P|-gDynjfdMFe7&hZO^vf*c=8>|scz7NF4_ z$PS9g(+y;NzqzkqrpDU)XyNT7=aell{*&w7!A2ReAm1}Vj-e&^OIOE zZ>%}|gX9DJQQa6Hc3>_HxY~}w$IKatf5QNeL=6tfv)W!p#Ov-h`W2)8hbvJ)ULiOH zNGy<5y(WqJ0>$aLLvr=xvi*AEiu`AEIafUPPE-d_m}!8H@8wDwlC*+&;n26>Rl@U2 zrFg&{5t?|gSI2dc0^!M!Ikpn4zt<=fDG-);fkOO$pnn13Uj^dQlJhOPLt^~p$jh(5 z|H(lSFaW0EYJS3?;B7U02rFM%LbJ~7$TDU zM2s+DIOlhF0uDH-Gm}0wM$?Vt2m5-oGe0?DxP5eqisZuicYhc@8NzqGc7@ZwdY- 
zRDjKW_LO26l`bLq8)!XR@D4TN*!)0n!yyo^(9?oJAn@rRsL=04nyItSJmS*AJX0n6 z*$dc>wONzmc+tn^^%9jE52dgY%kHhj$-i_A=-!@=43rr6afD^W8* zjtq^4##>@)DNLgqh18CN@}i#Uk_*o11}hx1XGy`@xKknj<_iG|oQvRuDgoiT4UDca zZt1QDpQBj+5zZiK-mehwAKwE2>Xq&#_mpEu~=I^k@?oUA-L9$xTvW)#RHXJ`# zYy6eNk8gzhG%(TL&hRcB3b43|rBb#tGgFM@mnNFj>4{rvs{1Dzu*f5Ya{~R(XGuxR z@rnloZuHV`B$Gv*iq^nH->+2T~4W(=bR$GA&T&OtaDDY{BEwxcU!pY-&z$754*A@2m>>`#LiEr?azLtt^Prp z09>REJBQA>InDZ`nEoP@av5Biq&D9a`KD9LqLM@!6Tg?PgL1<)uEuMnBT8EdFI~Hs{SXVKE+bXv{&HvMmzg3wxHn?nX zV`o)wVZ;2#%4jU54Li*mf!j_!nUH$i^7Nvvd6w>Hr~?_9h?&D9uIV4gRRtNO;hSJ< z_=QV&__*^QI8PDIXO)p;|5i(F0&Qqb+Gf;=V`3crX)vK`g`#lfOl%$n-54}xbJ{$L z7xc+HDT_AD*Ieb{d=X8p_*AKQ6s4sO6|;tjbJ?ZN*;4~}BXOO-5`cenI)HUO$`h4o z2jr)iPe-}rVCw1AT0h1$A9K;x=%~&;NU+VYu2gz&BXY;-4i(gR4AxT4y$|PRThxAY z;qjP6SG=mvH7RdduUDMP)6&<8_x}4cQ?oXGDwMW?w@r?_2@=vq>VI^k1>kQ3PV@Kg zNo_(<+>H0AXjr<0%)?Hfng>~@2I4svQqbWU>&`c06z8FP(}=4VX$GK}OE&WBnYE=O4vOL`G0&@M^**C!hvXE`6d z!l-17-ayC`dM`<^KRkv#lbM#v;Ynyj7aR$OcM+N>+Ej<36{D>G(mSR}BM@|MmxSsH zwmOJD67%+YoG?>C^X9f|`F*X=2TIt~B9mh6P9#qH;Mc|to3k0Mqz8f5Ckq8Tt0j@* zLHacd#C~{6ub6MHQ`}xJ)yeh8(U9#jQT4Av8iun)qM#<-&skSF4ZU(TI9i6Y*I_~x zvxFdb-Xq}|-oy|hCy&yfT3fAme1H#ERsd}VH{$rcQMc;4H;|V9aS7XMtF$>9`=@0y!5G2 ze}^sp?(Uo9?mXM;^BAqVfEJlUm-yU`4AD|KBjKmrjsTp;MR%Q@SaTqm@BNmv4pE=l8D2C5gH^yZK<(f3)wLIdzYmiK>^_V~RW|Xo| zQmFQIOo$;5S4+`2c?zVLya!qZf-S45AphA4Je^w#%6_OM+O-y;RHJJOVv zxK}K;L(7(bEaEfr%~!vdO6y|Zq{v1Oq3~<-tM~JRj$|Vb8O{R zq7q^~2+f9L>XDA~S65WA=ggt}DiOvRO zOI!#HxqrFp^F6g{Ug&NzLiF2=HDe&@+fxiU-P@BHhE0N`%ZTy?!g8P0=o4WybBEJK zi4Q=V=n+@5*L}UTp2=kudkf_>?&AiBd{bpf*Zr(BH324Tc4`BK4hF!)|0wX@u6)8znNg7?{=Wo5`CR)9)xeLR@ zMRW59^P=V%i?(>v1;tIcQ&MXO5|cCDafBJxr_1noa9t(L+c@dAztZ2=oGDAlnmfXA znsA_JV^e*&Sx>;Gng0|N^0MjU>6?$>`6`<-hkaU2Tu#uiUz6D=z?T_<0U4o$)3GR3 zSxl*l*pB{A${sbK#X&D;^HmpzPv}i|@RKE7#)1GGtAr?_Oib0yADJ4~&H2M9aa#)A z_O)R%V;I)+T2z-eYUFbl3u_Tq5htG8Qj`%kF4X0nhK>HIx$@jrsgKnqkC(EH=nZN6 z0mSoymN5+iE7Yihq&-iSHUZ*ombgqi_J!W#lf_>mPF(A8No2(v!B;YMs&-1LrBzSXR1TTq9$xXFMzs&2xg6CYJe0%`y(dL0- 
zw7&D2GIRcl?73*2jkfz93WCd`$D?5K?TJfzXysSKLyos?8f&pQY~(iIMXJBX%jS0V zfZ=l0y!3dAyQ8LpIl%<_iO_vFA`h5EB|f_HE~#+y-sp*GYkO!8P-M^-PjV&a1h)=u z)G$vRS&cV8Vw&Czc66TQ=m_^gKK9^}{}fD?&T&KIu*6H}jxIeg^c2GXzM4R(fk;<0b8 za|oW>0y_z!TJ!0C3>jop8{()!WsbVW#%qD)Jle<3Q)YZr%doi?&!897%1^2LgDKh$ z#^IL;JitPL0Er=k65a~Te#)be-L-Ye zWAECWIaU3A!#S9a`<=(e<`i`ir*FKp1Vp0d`(_9BhT^4>f=o}F+}SW(hU2}ZeG_BA7}R0a2c({< zx3=GI3ioGD%HrGZqc2SeiT2;Gp^rSPl5j?tCT#E>%fJ{>3~Gy`9Cy^$=gG_6#VY6< z6=fYa6u8^_Ua1J+P9@4>S#tl1rLZ7!(525-*;<*Mr)y zKQ7^8B(DOQ2cNvxjx;uXX{043hL%I^-Obz~XDX11?dN->i7s0hPHf$Jq&nZ3gir9) z=sYHQn4Dgmvc!70%BoKo>Hwy4f-2_Eh*XV0*KnrBW2NiSeg1gARB&=Wt#5c; zT(o;ShI1#Z{^=WOy*#1$k^$AX;tf`p16s8(BPE^8$5GE*p*|(@SXnW0=Yqg#ZhDM8 z!ltLfagGZkcf>7aW0xM2sM(erO6ToP9cR66&N4SL-0{EdzI38$XS*4&UcbN8ijDr5 z5nZZY*B5}12Bh|?hB=Ev)4U!U6v`}AuU$9G?be2zZ%&Y$PL9blcPUy2({e@BwiHJa zGT*0=w%#$R6Rd`1skiLW7il!a)NmZXpha(e)XjN8udMNqbXt(16_QwMy;La_h=iN> z;AOab15G`L6_H9C=WH}RZg_EM5A}TZvn4Tx_QW_j!E~7>xponD6{AG3a+!iSXAPgY z+Znqk(jFgyEYH2~Fe-%W(Z|lbcjc+)6XVFdSHcLB(y6ZpY+-D9ZpC+=HP4LU`oEvcLPYk!1YOImpQ9NF*6!a4{zES(k{>G~Qfogg? z|7&OjltKVMQJ8Hg2K}UOnbFWIrrT*sU&`(HPZQI~qv-UiRJCT4%a{c%QPB~$LV@99 z+FL4*TKWBtSf81%fBIMlAif@D<5Hn&j}xa5L!ncAE4{fjEIu6CM^;&<#GT8CQD4h) zR4b7A&c4fa|I$gVntK6`f4UeSDOiv>ipj~S&rRfP^Eheq%R?KQjA(vcl%!Uj?7PFf z>5b+~zA-j}LfEb`>hhL6FK=y^ViuQuO}Vm%TzNpAQlZu?o?BHXT+5s^D$LtrzCL>} zwT?f-{lT#PI!Y8j?&V|YU@Y19?#S5uuT6*VDK0*TV?X*yy&dQ2eSe$GABVe?Uv0%l z6AH2hS$q`rB97#{2>x={pV~qc$aCSZ;o42g+1hfCcXHi{di{M(w%_0OW1ipa1`DRF z)He4;eMF;_f$!Om#NzT6DXoX4@sla_!vWg5g|z{E*|H^%U7qBtH8x(!68fm8(kxzx zW6U*z&3tKGa>IeR(Y5&rQ)Y*2OuiHrtK0`|>t)P$-i*W3$=rlKFG!a|SRJFRjd)%; zm9tFIdv;}N7;A1J*NKcaY45nqG#p#_T%4u0RzYER;ZDt5Q+vf$?a?-HNP_TJ%i1N? 
z&ql4%g<2lQs7ryMC%h0?N1oEarW*eXdac2C<&yNx{K|J+vP)5Ir=@jE)4F+*i7|nQ zyOPA`@ZGu}HY!WizP7>Q#|MV1n`KXwEri%<>?*sR#)!fq$IV$a3m8M2$0d)sD%UZu zi?OD|uh=**PQH7?bDYNOwRVn?3lwtQaL4e00v>77Xz@L-5@aG+JwR#WaT`doakd{X z4#!$+Tts*XFM#YkF~a73&e~d_<>A5k?ATSe$u4flCq1s<^zkweOb9 z8>J>pAGE92^Dr{U`WD~cLeAt`1>T&w%W3o7Ppnn#w6<>CWh}4?sa!nScwe;O4a}fQ zFny@kW*uR{cs%seFB^A<)ob%`u~}@x)-W;d)cY|>mSq56e%S2QqEf-|#O($R?&g=z z!fIJZsAg)0SvKwu?&va!O0o{qBJbFn$~iY_%MHen5(4PzctMU=d#dcxx5nxAtxA?~ zoV4>N`Ij5U-K*NDK(`^n?MCymC@LkYfsI7X+;NHjJ=Zk0 z=VlYulyM(roIbphGcE`pi)>oUIrXALI%+2QG^5FSJO_=<$Od`ds~pE`zVe29j4x5k z&Idhi%jqzGx|a}N%98s{>2>w+JIU_;^Yy8@!+31#dbRgg57f=Cdn|)Sj`drkBKh3Z z>k786-vz5e+AX(%rzc*oMHZ~sjE7IdO>uMxUXF(yo*UmcBJxsGz1DL1v7%e$wBVM$ z@x5{>wZs+h^5OUiwDfUkPLmXnf`aJ2?~k!Z*b?Ea4c7UHqQa$nvk7aKG5E6E-D2Wc z>GmM&6&s9S#5l_E-7`M0TfJh|P>$Hipg&yjl8yDU8tg|?&JlMna$Ul_7tjjVn%Nr| zS%FjG{oAVqmLOAT0JR(;LkZAuglR(gz!uTk@ihp)UU- zoG*h#ozAJM_*ezl?-Ip^6_6Pk=kX(!<~tCn*sjsySLjc?gPEi4zY*w&4Tbd6VLt``l7 zBs@Ht2y*WA9GAWJ_cw@~&$JN0gIc^dUS#sPBQL$6OKO!XZMapupvJ~k=e6aHoj;9&E(N_x%jmw<<{JoTVmZ+0rt-;$m(=IdNL|l_ z`;N_u4-I$06TG<|H{e4x>ec;KT% z*{qSPMxcD{?;|vKq!y%X)~vT75nqGga}swu{Q?kXm)ePd|ig|87%TAz*Tk`HfkTY+wFT%Rz4?X_$b+$)rW-~Y1i1vL$KGOP+FA`Zx_sH9+I2QNX*&eQ$8#{&ivtJ$A=F195 zv-gv3>RkuWn^EA)2&0DgZHrjAc3kdVZ;~+_uZHU8!*!Y&pV7Q^+3~!^ZLZa>dE_@i z!s1jYYuQ9Mbahu%o?UDWO6LfDSzVG)=(mU?!Xw*}@`^pU>DCST8<+d)tMO7nAP%MZ zW~IV)0FmuHrh2$REc7dx#QG}q@%JLMgB$F_DEGr%`hBj9?4{eFRB<3O=5vKs5ot@F z;qs*nEI+Q!yxa+1xf*G(QOAO+n^DGJU*oRzBh4t+)6P;yU>tveeg_BX{cN)SLEd7! 
zLONHc`4`6Z%cizs;;Qiqr(njW9x^3|}>09u&_$3qlmDh$WeCz627_(uDxxJKR-7C|b)zi!I zE9$a(m9mCSJ53C!g05Y-WgXJaVi>cDLb^mvr|b)%s{>8<^_8F!TPVGwHQ5%BMKA=L zZilcZi&vT>!vm03*eh6UW0^bxAZ& zA&jr2I*(qPTdfU|@jPWC<4$_Wb(yKhFz;P#pb>cR4wh>fzf)pvS+}0YWDpH84!e2x zG7evKuTsMM3lK=F28R(+hZ!8m^Zl+3Q04rd@@5&2`lbaRX`7Tb!6Rz}{gt0O^KIpc z>wWH6gJ&{4ngTCBgfC~7fo7~UD|F+8_D%@AU;-=NdJ=)uQhez%`6)P$^# zvZo98Xne}6`o$Dq#8bFj!1d_1|;!@2tos)?p1H&{Dr zofbTclr)w4XDRLvVfD|sHG5gc^twNY`(MbVEk z_ac?gJw<0LDWlO~o2e*YYv21FH#=^Ab(vPRGJ?hzdIqHhV{H%WpsG-b$kP(((wuHx zARTn_b-A7CM`Ib&THme1XG^pAn39;OO@XER>VrF zZO4$KLEqSPrpo59$!H{l7Msl~E`!JYO>4C6;Bi-U@7KIuO-g6So;RVR_98inN`zav zH2OX4Fh0=65lv@q^(YG4Us_(H7?} zE&V{6S{vIhq47$fPC)(qZF6+U(pi*0P~f5lRgs!EGs-loo}rZOySz3MPa!{lsb|yG zH%OzpoaiI;L(l$&7ZO+Wst}OKiE@F`Y3?l6is`lulT66a<55iaCKtvPj(*KwU2w?WJcWmu%KyLi$&g2>}}_PnT}bj#=8`LQ<1C*?p>3(Or>Akp1HqjS~>#~+uM zw>X)8(94b5G6g*(A@BqSrZQhDTBH+)BJ;%|k_J(Fggl(y83Ybwtzi+ElzKdzUks0`rO6cVYyu z?7V0=z)K+-Y*EozOD_GxzH#cG?7|BO$lG3{f;G8*BOFvQD2NOcM>R?;=ohQDz}zPt z`lESmjUB~OP6ld!1SCaTYZZLZmMBdv9s_p6YI;@%WroNNLPL}oRxQd@57P2LG)67; zB-iA0#^QIr5l+IpHC_OP(YF`gWi=kAqDogf-LcEnWx8Va-c^KDa+y-V>8uI6h5>=R zb&z;g;{M^|IqXt$p3sO==BdHf-L~#@ua;LP*x40E^w3|4#o(|fZl+}TNz2qEb;i%i zGx?eaH#H=_yHGuhaFq?=VijSQd!N)eqtj+`evJw}Fh;mV$v7<((trhBh76tlrl0RA zO!q$*IGl^vM3JLoLk^vKdfV*F?ei2_(XVL(RKf0QqbA7^W-Q0y6(`v9XcwtCIj;O3F=FAK2kyh z!)t<3{8_#Ov24a3L#>mkE3TBzY}%X`tBBU?ij04` zU0|^E-XZSupz13+w~f4+opOXV{NF74Cog#<^yIEHg{%4^?Y}*dV4LPNb4_$FMMeoI zmq+JXb1FQy134-;T$~V%;8PNct(;6Povi&Qb#6nvj3Qs)kG-h4WgoN=ywm}NgzSL;r zR|RocxaBoFbH_XE@hVr{4tw7|A5piz`kgi8_eEPiH1u@_6=T>#;GsBHjOB@VQmWS^ z_K3t}j(A{flndseV#t6*oXZ zF_5+7yw_qgEb7#v1PKfiPh>>IhOMADTe`r{@!myPq%kXz`$E%~T1lRp5^npIDIA&x zmw#Uif8*lW`~4|`D+<;z>?USZ+_@VXV%>_w5HeoF)CCRzxoB>uY5=Q@o+(NA0?%JWKcWeo5*XP z@-?5MvfNt|9t0f8OP6G5{v|arY2MDln%fdF(b3tBkj3BZr0ulWb2tKiq-ag4Q{hd% zk)>xEXATGJL=@~fgC|5G8!?YLtGE|5(V$8oezA;S3a8)65Y8^cdKJdSIUwJ#Tq0wrSFu8~VFCp)6p1jH~;rPRk;NzDO^V~ zzkb3921H%c2ads31McVzF9FWq1K&#**=zO)A_V^G()LS#Tn<&4}QeytV;UV99 
z%<99HuJoileAt0Az@ca+<>-Ff$yduT(7U30vYDD5j>qb%OzGp{JlOa&?*qPoA{niCOxsRO`Kq1`=0}1|14etpY^BH z#@wFWDSy?KM)C6k#VprR-~6CZg8&l{^v_Aur(({#=ChGN!Xg20^Pi5}^f0aA_QF=fU4bJAq@To&` zE)=Je3H86Kq8KW?tXEzD4kYHPC(wWKQ2#SS2)x7Ice)Vsk>97;qxe1GmM>#klaFAp~ybIt%V-y1;>fdEMBu|H!yY?`qK0vYfP%7pl z@cyg*>+*N~S6Pu{_ZwLzm4FuUw@Q7ka@LUjU*~rTd>U?wpY<0=~ygx=>4|EX(Uf93E?A~h7Z?s;XB(a|Z@B$z%J z+{~t?d-5aU0(m~N;#43Dx%}i_{#pXdncmmUI3tcf9tat`#)0v<7n+Mk*_)D5uSc|! z8%!c=OcfN^TBO%b)Fw74be-kkx&K{yar6oxR{6Qfv)W#|&5V1aP^)YEC95HOi}8B@ zGtOe)8q>6*QkNj4gZz5Q>_fOzi9e9}A&U3`2E25e(YJK?z(Kq|pHf8YEL$$0L)ThA zz+1z&Qefn?_id_F_3R(+SD-8(<{`q!D#ie{a}rMOcCR^c7`H64`D_z^>>eB~;hERqQi2OJ2uhHU!G=FmE{72b-X#H0h{jD*T!_@cN%jg~=?^5t z+-%JkG^?yXty)k&9ivr<^!EJ1QYw2E^e#Vx`c#1bQqCFLuE5jOE1zCG zL0vV-Mg&obZf_OutI`#;IvkGlRPT$L?5#P-!4@XiZ2XC}+Xi!~g1TY`C5ai01^rj) z!3gMQVLO?#&@-Vyju0$cXBWlzj?9E2?fg%Rifr%qFot(dQh;bv06Y|$y$nkTdke?z zS94SfI3GGq_aEzb%`|W#`O-#!eD%${;}vqDpI|7N&xVCwut>&Qvx_pbGDml5vY#72 z73&?M!o3^h5V!HLM_pn4X-fP6%CX%rQmX4>IE;eqLAo|7LcX;mHb$H=EpVri_nTYI zO~t#c&K@J0NgyoX6(9suP-%Kd33a~gw^v4`P1JYE@X)5n<4+Xj{{$2-N}gNu6Mp6+ zc4m(2siP0ANAF1qd>&R2?WVnjFlaio*>`F6S7t^&c{Pp6b111g7hGtJQ% z%{jOeRz*hPiQNP~U1Wg$_{6Qz?z&qupDq*N2l_prBB#el^~x(`j!6KP z$p+zqOGZ~%C`wr zW=Fk((&HIo()e)Qoqk@%rFPIk+IRJ|XhHY@_l{Qv6*STfMhnPL$YU>;K8Kj zZQ@&pQ2MzdoNP@_VbS7{Tz&-av^TKNax&_ee@4OP4}6+*g7O140;|yclijH-=HoDW z$r30**to4-LRzbGs778Ho!HohXn$vZvY)yhP2|9-86qRF{nyubr9n(z`obDejjE#l zu+U;hXsvs!h2}-qCwf;F7lN~wjPwmS>`}12Alu29oY#1bihSqq3No46f{BQ%aod-ABrnjp3wv&QNPRb?-c8V*>8f6PKyQ+2%B{NdEOeKUn7X8&SdpgRHS-3tUnNXm{ks?79ph#dHsKA_i>;h{t7Lp;&#=Qa9i1Y(;0k z)^0!5_8&3!tOkDF(M#)3?Rl6ZxqpbL2U!TzrG)BKJhK2jqd*su>$)~6oyw^FfL)kC zJi3sKM&Kt%w2A6k6jHstwMu)9f4Qj_$;1U|5xvUiY>~dkH|ubckr8Xdie}CcAqJL% z3|i7fc|RsfL3V6gu7zHtOe5wrN<}+IJQzD7UEP*p^-2rB20}FO^4L?Y( zgZe>D1E4qyf>h6hrfVAyw(x5vvV*PFtVXjMM?`?9scG5afBF@mqCs)n)|J6281iM@ z4?#^0WoLc^?5Lzdo+AwlDeGj*V?uLIEV41h@pn{!VjO?$_FjH^ z%`~WL`mPhY#(7m)f%ycgc=DB%g{Ig<(*A1UCur9q@ag_6>K3;L>zlp3rS?3|qzQ&U 
zoi*GpsdT42(H1M7+-7AFnQ0-Dyf&$Mh86`WEfK2)yVi52xSW#+9L#wew2I?nK@2bO zY9D7kHYp97Q=E`(5Mi6bC#9c)PYrS&-O$&fUwU)UHJMID)uh{>{OT~mHW0!WD(9+L z%noK|H)}E~FdaU&p^Gi|>A*_sNwJXpj0TR*O?UnXW;fN$PJy<%T_U*>1*zfAkU^I%)Nr*i;N~8J}r8VlcFmEHWpJ+a#;ntRxyAZVUx1Q9?gG(5vl_WZg&ZQ+M6b)hhqIO)>0Z*H7=QMr|%SeJU&kNNn zN8@l`=W8$M1tqsIYHKC|wj}9mk5>CSH8Cg+_&D`?3?y&kVJ>|4kNXtR%L| zA9!Lr6M%3HMG7D3M7zo;{mRWoOVJSH_Eb_|<96a4nJd4hp-|vhz^hwro?*FMQjkNr z(4=EUp`P0``sPvH!Rm87S3-+6oXC(|^SF^%W4K_A+v!-`tJu(BxkB(!p+%HRA}Kx- z!RCn+W`iaZn;>e%xbw#SzoGpRZqZwTjej^E}a{; zHzJYoTY1cQQH*}0!K;wDL{*AW%6+3mu|xruqSh;PdHF@Xc=CF@%8f)3Y*a~{PDf`# z$@eGBF1_`b5<(9C@2VARcEx${J#6yth7A34ayb1Wl|ZrR{l|xwHW4g~(m%g^%=B&Vn2oLOC8@V%0FWn^T~kfl2WL^WCs z$i_48?u=YSC;gFa*YZwzw-_`NjC6IY6mVPCdC1G0Zdq}zig3Z`Qs;n%OY-SL2HuLRxG=wQ!$=GkA3Wq1D*I z>!)L)1#r8PBa}}J$WH4&@R+w(T3&R#O}C9GFJIhTke)Q%pwrAPV61ZE1DECK=ravo znGg;IWiY69$alP}xYM_a+21)ZH(T}@T!tH(KNLGd-8$*4?T*C2@cBW)kODc#P^O#; zRbhgwON}rCZh49{GU{&DT1^aL(l33>{_kDg0oOYb8{Z2&2L{eNMZT?h1mJAsrh2kn znKnQ9Aw6v`@@N2-)HElYPgl#!-I~7qzMZ+?9G+EIN%f&qCxGIsvEwOLFR!6{dIyJ` z-RD~BuxLafLS5@g_x?I`5@j)-tp8>m3(y9lL#@GDW8!Nj_Kls}_KF_JX7|UkjTEFt zw?MiNE^tpyqq7^I!ORJ)K6&_}nFFk^B2r-I1cSL1u6Q!4u|8xieA>>F@Cr2zkNx{T zjxAoFjK{&e-fu3FY3{^Or1D!#3;(A4K!gAn0%1}N zMoQ5LG(Yf{vf|_%-gyNxyt;bL0liAVbVICt=>eIsh>iEz`Lb;| zE(y_gc>60WgyH_NdB@4*b!-|LE^R_!`+aP~Ih@BkZbsG0kN z6&ifIG#jBThU_nnumQUP{1H8^foB4$hAz&?)kP8M-Q8o0_YAuF{RUPl)5h;5-z%GC zItM?kg0Y$?ZO+W>&+`7adfQ?E2E&bhj=LIq`PIZmQ#8d5&WB^;ueUYPZh7(a3SFcv(^OB<1N`bUt>S!uHJ6_=au_~ z4!3?rqWqP#8}-i+pXNO4Dq?m_INdyxe`Ry0Duj%Moh^iCtEn?wgs_P#L!R@cEy7jS=%I)V6GrX}`cK2-& zSvQL{Cv$FX_=wf|Fw{Y|u->oyKwPi=$mqRuu}q|d@JLv_AmZf@+=xmze zfkG%XwG ztjLg^tp&T}KUiq?fd#1OKXc*$__qj<0G^4%)C3dlccJ$Prmxsc`X#as_w`Sx(SKZyvN=+ywCV&G=M$8(`)<=^o=o|eJ;jHDkf1so>{4a=o&wL#CK5NyKT zT6#~^^*s)--oO`5!z?s(J)T7KoUFSxglGjG}jEiL&I$gbU=w^Nn%AUqRV9yRa1$l@?Pk)(FESn$gldpS6bjb2^Af-4) zo>&pKRh-SKCYcyS^kB4ygF`3C;19>8>{VrF7nE5+r~%^MmkRvHD!S#;Ti3;V94# 
z>7YAsoeZSJ17gHW&;=Ls^D*XDj3#;1wwlK=9%hBm_|0W*{thAt;RADBXj}dEBl$}+T}Bu_~&a&%I4+skYRHXXKVnXF92cW~Jfetn*}{n6}cJ zUX;vbWF(VrT=$24RvmkdJirD@mOVtm@;BI&DXX9_TBY#SHYbjSW<)P%o02nT->?J{ ziM_(!P@P*gT4XBB52mw9$&}`wK&5{94aZwlZC$O&E4syR(mychGuA8-owRO<^3?yh zyZ@k{sRmlvjG2%RKDunkBoCuhzGmrVFqjB#qA{ibqVGJv;(o05d)1!*-qGzAy`4_f z9C_};!=Oefxaa;-QGDZxL8c4-r`b-Dy?mmUK@x^H6#f@l=6Az*q$U&%0H$PI)wW)H zx}C|fd|HwiCLMz!vxnnZ(Hp?$O8bas*Nx*~x#PJ2BB`ASTN7PGmvCT6`M@Qy5YQp@ zvd()6O;b0{)4MEhz36ZO)qpX8auPq@D&c&-yd#eLPO`q?GPP8A7-*{FRLOn&*TVjw z7kFeL?ZbJbHew!JoT<9@RM#qZa*SAdH_CS^qIB4{c<(@}YYw9l2to{AF zrY;=k>Il7twCx zm*y{v0w6t60y#X7EnLj^E-Uu{)ZKu3YghBg*Q@P~pG~IU*2)(KyEsE0baGaMFPdd9 z9@QyI(<8jk?6gN)v!W9zYTpsdHl&)Jx=jYZ_nePC*3RPcvqpT z0Uxp#dS{lQ6bE+}staEmdB)uj9lRT5JHk*MUr#(v_T1C^PE6?c?OtzkWDetN=49iv z&C3~Zp&yB23JPfU9$~*PeDvyW#J(Gz$L9x>bIfiPL^HoTOz@Ctw>l>t&y@?%PkSm5 ztgSt)k_ml#h=ti3A@OtwN2SSMNY&#+TIawRDNgAio6-_XzTANI_<9vpKr*5#aWpDj zZ&GeQiOYIswl+?%&uXdk38c{fx_=5O`V_N+Ee?u>d8J zf}RH$io4%X2hTT8W+Ty->5KU5_1P~<-dc3aUg<$R^j5G0(mdp5bggQ;9F^;|gF*I< zRNOqBA%~6A%`@{l39n{=!E+V~V=y^lfA9Nl)nn3jd*H2bb2_o3(^q%2wU8EWU@ygC?u?z?=%>a_hZETpIXb#{?Kfg9P}gOV28Gn6okL1Ns5a*j zJj$R_tKPZxL#fexUBLd0wG_VxM$hmCDmB>?Td-zw&16U*sUo1^^sN+wF*@Or0^pjs@k1~qmGlKO=&c%SQl&d}oCxI)v zrGmJgWDmBX6=&NA&iaZxgSbZjIS|bSI#asPSINxN&@n2V+_w^NHrkFz`KFFVID|Lx z>`7=;8hHu$a}C|YJH4;}Uz$FQ|Ni|fF2}h`+UVisHGlee@LPrQEyn%?UMd8P15IEi zC>4`QrpGGlSH(2LU^ssW^;J8&dPMiftLcO;;exKMd&mBxR6HFa4=5Q2rAM}T3eKfa z)Msh_nHK2D>1(nhiSe?nM3tdKUtsK|_|Asg9@Iv?aL-M%AtUiJ@_ba5iS6nDn8F~J zsT9+V-N2x>8>35iXP||44d?&e_!~vQqRMWVmj%vgV>wl!Wf=Mx4Z*zJu1@<4TSSi< z6x&(AmYezgb0Ny^*hFWt${5zL1HyU&*=Qce{a}z={}8{~wNV!=vbYEs$5Qd4c>@Oo zh?ei?4c;s@J8*%YPNVloUv#rIuX*aV}wX2VX~49TEcTgA@uqPt$& zLX7{$`#zm5?W0dxcrwmMGhuDlpY^XoeF*6nOHY5eOt8pb_t&qHZd(!Yq@Da`h=s2e`@f$h(MZSCUxe7*i$BYQ!}L2H2-l?qUW;}M{Sb$&1%5G+3UrDKGhEBX&Ge@q5q zy)>Z+-pSE}Mlg3i`3}U=i}Qm0FpQ9zy_$?zK+LXNOkLlO1-(*epbCh5gUZ@5Z|IbV zK?469k!EicG(lgt=H{CWxnFW<=%B)eO*^p9rKA5fK2;s*_*O331ugBfg0$yTQyRvF 
zH|>AgVck$T9!A^a8c-@OLI0*COFqTQmEC`zQ6qklK7+oKuq{eU`Aawf#u69$S+au< z^<@-(GA#!pG+58l2zfYRP;D1e@b1U2&!@#L0+OOvTpQ~D>;I26`>BPLf1;m^v_H*q zv`A+6ai8ty<0U8m0>xPyMWdz@0!gJ?N`ws&ffddTjy}l$`Rye>sCodwnKBKP3oOSH zU9VvYUI}(}S1&`7e7;l8CL1gaTv;5t{T zk^q9whgENo@Pqz7Nm9S;jq_y3N-C~;uU_5mxf4~*mRc|PyNqP8W|QJb!XP8!i~K&y zb?6R*zd4x?K2p?v`s#dxU-qFI+VfNb(1D!j-%uQX0!-LOGj;h6T@UlCCml~f(0};- z4RXvl(=flxUZGZFPY@SDK3FKYW-g_P54d)UQAD&&@@J*SD)=)-X119R+0kco#y?5_ z%uWGt+VK6lAaa}CB*|yu=tY>OY>XPt7AY>gcxBh$Z@Idk_6{-#7PJ<}l#R;O8znTH z+n4EfCv-7Md0H~5qj_(V+OE5kE3QOuJGGxz?Ni}FaN@U?>j=N!pa%Pq(86PmdxcJ; ze2bP{|Wj{)7y%=VYv!R1(MEn4NDF0a@OPwq@zcI^!4f5`g9d_aSPtUoRuB zlQWg2X@B7u@7KuoK%nQOLrD8j93KZPN4ESs2@tAN?r+u-K4k(oER`(#4#Dx_-nB>F zPlwWrwFO;jAdjbMPE@eNKqrx@_KYS7D^HY%Enhr|T_mnl@^09;R{4h8c*sTBOi8$Uj;-4Tg=epfTY0`gk#O|bLY+79bvzzf^w&qfyu#LtzS~_P zmtjn@#-RoOva3iBbYCh`BW++9dGaDerb)? z$?8il{p?~d zowts;AbqE%gFmBbg!@5)7Xio;o85wZ8^G)wY8qmiOBk-P_p33|4~tCe9qZG~@}%nY zd}}c>$~!3zCDlotPF3J!Vv5Gg?B{#fFI?sfqKHUg)M!zXkN`?0+Z#&_siG8638ax? zq|d2q;LZt~-o0vONoM8HrC5Rk2&O@NB288&=P28pV}(Z6?OUC$Bj*kCb19qH&yiI$ zo{5+}ss-~CQflWcFD{AYI!?)DJR5be!qHbD&38SKdal`*f62v(P9b?y$X*BJ5MTbh z9eom(H;(xo0^!+FzWt+WvyJyRd-Io_-7>}HmWF9w0HGkwN-rzX=J7%W!QM#H{1F3y z_iFfK4-+AcrG$#_gBRH1$Sn{#hGMj{Up}qQ6UYf-emLPwpwnH?BcHh!v-Ebh_(oLB zrM7p0I+u}#DN~akgE#J4ub!ty#VQ<9v^)=v+Ngk%$W(e|KGfRev8zehfP+1AThFq z3cJ%|CtW#jTkZfR<5_a^_3=k!>;PVqau|AbP(9T-7Io*SI{q4G7V(7-gRG3cyW;-a zWyi@F%KUUU<`?ubKw9JZB-?ioi&dS;`OA#A3f24WjX9(4WQc5-Y~!`4Z`y~cbCONO zMaLy))m7c4{o7^UjWvv#a0Zt0-dZ8vH8dx8aX<42>;$JsA^5?IO)KMs(g-q1*b#Ob zmNMlKhGF4gcHu@${tWNj$-#I>Lv;H5yKD1FOZau-O2MA|HRzc`4-Zo6vg)$)w8{LF z#=5jIkvdv_$5s){ILCu^CcAreng7r+iLgS?0w9D!?_^A?y?e;KMVOxWxEN9s4RRgVF;bG)rC112}x}S3Et5EfmOb1|IXx&Of zbbVYos zgeIXTIQ9JTYP%SCZ6wiCmix1t+aEYt&UL-4$z!y#-l@E_oQz0kV)EoYL}%_)AvCh^ zl`4Hk%!d%9yAp@+9S`O4l0-6wpA_!ItfS8^oy}Cm>3cqsw@1RF2tHvf*_m|znpy~} z*a9)eaoDIUyYn)4D(eN=DnRmG<1M3??a8cP2o)wACL&&L7r(cEA^`+wiVY9=)_)+7 zO{kZIy_wsxnO(U%HDfwz!YG@YKenAIN#gcvltug!>%l_MrIuGSNjy9tH$TvYOz31# 
z;me|k(S+F>qu_ggZEd@oLhPM=Wm#(Ssw~UmTd(ns3Wks1z7zmcI`+KGsU89XyA*dN zs7Q{EEFyj*uZm>D0wH8=>Y(;T>Fv5LEz{V{ixY_+o4ZF5O?=(wMw@9Ksp9*PQ z4veQKm~kb4CXPxncL4$WKd78EMJO=15OgSr%Axt!qWNKt~Flbzeeif;CpL^&2Yti3(6bqz8LGRBRsS6!f7 zv8X)Fs3GiJcJy8St6X?|7}GfZx_ zwuuZTM+qZ_*gql(+hwB0&b8+LdyJo;oDv^!bUWoWYZJ3|OjFy5lkM;5_4*`iymu*C zY^vOv{$b1QNNUjO07U7(x3Icx2$9sjy*9C8CDRN_cdLWv9?YP-3KvTk3pxS)rvvJUu6asVJ>x_AnWkfp#8t42nq%CC z$_%-=Xw-o)!Kow=Flz@W9ZWqoS*jgj^n6y!#huqETBZ;Q0(ES)_?zMA<$A{sH*B8& zP*f-TwpJqwdlD??dE3h!wqa(dC1b~43`PY%K0tS>0k7+J2vnY`??!qOPx6$LyGtm` za`l}y)Z179xmq`Wb+YxbJ}#$(4vu@c5hh$i4pS=I=M+WsGP&CtbBGtdqZPrf?BOqt zy7i)>#ImkI4+yy!rJTb;5Sm!*!txkkgTnsY$bWJp1g>`nw~J~wchD*g3*Q~`+;0B) zB9}c&Fig}VcQR4X7q7&kHsw#mJxT}ZXjLlB3AEP98>I>crWh|JL>x^I@`=f+>4b~t zw*JgQq5yZ(k5aDC50+~@LhCwV3xMFIlRK>9Y?M) zRHM!aH`>?VMF4?D=?!eZCDro<9-i$uYog9be zSRMomZsF{tola^=tKYWhk@ z`$pYU_zjC!$8B=taHq5w2^#L`tKeh?5Y+xCNVkp7xD9r7x_GrM>Y)UTyF* zt)Xxz4wJEUQQaZ$5r<tjsp2`pCTE7Zw`0T9Jaf8%$5(K(y!Lldu44Z9lc>4g(sVA&@sL&exL;c z8Bs2aSUs;^rJ(!VOgKK2Gv3PtXt0zMIwJ}rZ-RP?E(g*B_D51>o}PjzC)uleT|<8U z)pWT~or>(mLSXo2@U`OMg+?h5=-^W|aB!jdK%Re}HsMiRl+`@KoDk(DI)s4oEw#49 zFe;c80R^wdb{Oco(?R zLv)SBI}YZ6Pwun`-<2_{Ri!||N)pb?KqstI7S~Ppl{9P-96pv_zb54E0Dln!fl57bthd)wm+dBkv%uKXVnKl$KZC|cfUdKD1b4uAF5QA093%z^%n_=$STZR82fogEU zoVfr$*q&`Jyg~$x^SJr(lUD^}&gq0fYoqd}`DNX}GT1ki+)^E7-nHnDlL7Lmx(c>m zyP=8X3(J-jz=HhB!&H1XAcFpLr>pYG!|4nzZ0#Aqh$~rwU zVM$Po|F)kf;6h6ciT)3jyOGf|N>U}Pj(45lkJM`-2Lv6QOTX(_?qZ(TI>civKZUT} ziTqcI)w>fA&Q-`3G(1|bNYz+!NTzFmIn#w6jE#>e6qDSG-v;BjidIMlm zd{rH<=b+X8rLoXiKwXgBUijK_rAan{9GH3R;sVFi|0tZO=fh{I5yf*$UJV-C6M|hIt~U%X3&LZyh(Y6{;@7yPf^W! 
zd3nHi<9zYJSn9MztiporwM#pzd6eQi1xwn=(awZe5dhmF>Y{<`83=9uzq@j}l0MIU zU0o&rf0KELykN|o-|@)yubIF?9!*F((3$m){dMym-?eD9t4q~Mb1TpITvzkkISEV~ zBH{?u6*#>1+q7EjwaJ2?#}f!5iCm#r$KPHiq0yEYJ(9lXT#M{Oi<_Tr?Y0n9 zHggL1R7se;&n!iZAP3MId~B5KQ?&Pnn>f| z9u@rl==)X%R62e=^{e69BHEPdmgBzrxv(Kd-Rt(pXMH_x>pr9PS~_w6`YZ*QKYG(A z$YQY$>>N(GNG@(qs%li2fFJ8~!{b=B-Crf4&}mmL@)#q7C4*x~2)XXhw3{)WKE)eL z#u@14JO63PGpUq8R5gTs_FQCz3SaiJ9!tgu2_>LY{uEqeGOi|&;sdPl*!$isdK8NU z#352xe~|KywVS~h%rPv?8>XTgv``+A{KJ`uE3)Y({q7ygFW=X{t-Z{e)F5-rUeU@` zG+!3(Xuc-rUS~t9JE1|nVDqi`?PGP%w8aLc>n~zIk}igeA;B}6a=ca~r)SB%+{UF3LiuOiDfS~RP+T+fD$XPXjEn%WK4I|%>UY})5QK*aKHYb?N=0%%?n_BzL=NhNc8lop{g0CSRf~<>4*oBmg$MM8y<6N0 zC9Yexs&^8FDhdbX0ENb7D)Y)V+l|2pUZJiTHGLNJ&keVx104cPERfL&qo6||pvMk+@=+lPm)?C`X@?X-Ivhva=Q`)K4Ti)nR=8&l>zCqCf32}ES? ziHvtE68Xte!;jr~Lj@T=Lm{EHBtpxx(`Rd>TSWxKumioee+>G65c22K26fdgr;u!( z=l=tfa1;Jrhq&q*4XspHUmmK|I`Hy-A2wXsqhBVAZjJ9{_&84TWYIK1&j~&ZD{nQt zFW)%XFGzjYPAIfWXKXl@w z^>Fgce(BPB)`E>P9h*cvaUA;>J7SS*;u^<0(Ox44eysTtauCK)HFnz$Nk(p?X7Mf$ zb60;o7=$+Ly9Z!EW5bL1hI)tmR#M!%0ikXfXY~^6~qDSO~Ct zV+3iDb_E*E3jK?18nikclxT4IC9;W(CF51>vMJ5tXAEK0w!QQ1aU6ykW=q*YSkdX& za3p8_W12OWX$0^yukHQEH+0wy#bqfF0n{0+TIHiGK5xqPd}L(;RR+Q3+B($of2#|R zo@rXYQu&M>x0jsH_^<=akzX6AzI>CcK8e?FwUpqWP!LnyIo&+*k1Vb{*Ca_p2ihO8 zdcrxsJMsUXt1KARs3J^Ymj_ela!J?xI-DOPPCEKzSKm_W_NMxkX>DUvJ=YV@(Qb_I z2l*Kvp!ikUghqp8qGMU|9y_ccQO_AS7mdD@d%tc()ut$NSl4V)VEu$eD_d{7FJFED znC~85VQ%6&*krTUjAiwGCIfMX5{NUQ+mbshNaZ(*yS^l>`57l zYK+XUz0B!{nhK(dlN;Euxf_^clF1Ir`Ru=vlWO?PiOMM**F9Qu>o8Gtno@K!mm@;1I1&iGLlr+gdNe~R z75uxr1vBh`rTd3^tBlK&)s1yBnm8zyxJ@-71Jf$5ote?o6z>Xx*d7QsO}Xof-yMo; zsyqLCR7+Vt5K}4mq^0-EY`QAb8r+iE^};Y2gK+fqT%Dy?9x@?k_g5+BxSiZ`ip(2` zCd!ZC4-I%C&?&3x`IXqB>%BqCLYbr@IiH)@IkeF0KG-UAQt}$fd7H@df}lPga>3;) zoxd}{M;OOuioVq1So535dfv_VH_f~084g7ZS*t$StZ0~uRnDj&!^~OepNd-9ImR&T zae08suBg|tgs!uv;XQVZ@q+ge>!H*q!=jLT(y5`7-+Lhk>pyf0U=^q@msq3xq|B5d zN7)!!-#d$M5-6xgnalI|&vAxzuZQcgDe)tf0R^%AmIS^GVBP6u#5Frxi)J@Q%WYX& zVXqckc4}h?%rGs(CgP!Ulv4)`C4*n$j3YI{(WLr9@pQ 
zy?17^mtaTosiCdb7}a#a7;L+Q&k19o(zR<8-JsE=h+RUrIEhh2t=zwt_j8?jtUX8m z<}7{g-z8Ex;H>l1F87tidkfvAS`g$T5+(`!c*- z#G8YL^=1ff{LR}@=2%7f-Pq4=r=+HXNo6;CR>02hX-h0%fZ`+Uf2(*l|VCTj$x6rl+efgkFM(;@^-MC_pjiJOm*Z zpl6|UEUjJ(Mka%?ndt13rJMdQZN%;QC}*zht)5{hvLX4&eON}YAd;K>%Nt?z@Bs`w z2fSi1GHLQUKDb=EMT!+MBKWSh`w~m`bmR<|qxn86R36ZQfC9P^5V;k>V3V$J$l2&7 z6m)N1Sg|*js!R%hMtkNSEq5?t8ey*qIUocn3s{T+nH-xJm6hTc+7XL+sLzG=*y7F` zZh7Euvk`><)h^XeI$zDfGYKA3W^;|PLf@_Bh8BgB$332N>8!G6abBAak&*I{_F4*@ zPvMb8QnD$nR`E2XCxi00K_xLj*?hT2jUTK_ODp$K1WIB4eG1e0hlSHUfz(UyM_vBk)6&Wo) z^xI}Ag~9<>at$N?0INXCFfkf8H*9|>wofb2T|zJ$vMh#B&zEazVsXu>;DzmhN5aV;S?`U=Qy`?ve|d9F4CDeB%)y*#HrZ#?9W$ya|59eIoBI#{6k(_V z7P%7~4LF{m?;P7H&TOh}kpQOaTYODX7u0Kce+s;S&_veb0!ukzJDb&2XD(v`vrVMa zCRCD#ipduN4uWQSH+uK)GXvs%wV{MZ=Lc!aO}NB2DWKUQh1`a>EnQmgJwlIzj>uyc zDj+#Z1fe`b+>(~;reGnVe8KYJWwxcC8_n$}S@GxOy%p zVL=65BvL)xK8qSs@2N?SCr51x()rWD`F|1rN>1+ z>bg$t;_Y7ve_kWpsFal0QgC%zk@9+;dwsy^Set8t%Sbtsb@zW6NFv$GHqLN>L4MLz zylL~Llu*uKDvLdOI}PoUQ*#|eOc-SfnoOHt`z(7x-%=5!^Y?-Sl)6w;NVobbCPF0q z4~)^szt!G{V8LYZhkYRN{}p%nlD(=9wipYRt4oMgmAL4|&8=sN364i++C8jzbP&c) z7R(7EEum_yA@l^51}5Cw3RU(QJ}IlQ>y7%#$Y3~gB}C~(Cro-3t1J6e`r5z=e9u&U z+R!UF|2Ym}s(JsPee??-j=OBb=^qnwOnY6i@zYgnWW&J|Ym}q|LNYNM&P4qAR9LDuWus7lK2E^;z=O;?htQEESuOj{u>iu5#^ zkrn#HTMBLDVm0r@x^dx&gFpYql4dv#;;$42vuEpn%4v&O3dNv$;zI#>YbIVKw?u>$ zpa@q}zq(!aG+0e+)a<`h4@cXHM?064?ThLqtwn>+rL44lY}yG4d=D)#qYWSUj3-?W zEqLmL_y`#R{x+|!e&#EQ%AHnr-Eeq|c%*!@}cVXq<`;aA}e;t{$?%xTbSvl7&U{n!QI`APXtf;5oY__0@rND3;{JIW`M!yL< zTnur2d#&6}Kq9z(?W6L?^nwVU+SW>vUD#;n3VO`g#PNfYwNlTc`^72x^U{s)tt7LO z2pj=KeW3cCya+WYGwg&6x`w=repHO@r9`(gfV>EMyH)k+q6a@?p48c;AM3)OXr9Pm zsJ}+={ed23{X)Y0{R7nWUmKCBqQ=dtw&|lSOsrD^1@m?U);40}KQa{yKB~;2zJG_# z&X>6P+znM}HxSc1b<-($`p`yO!Co9= zf}sYIbnIT_vem$s0;k+~jIZ;e;EZ|Z!ts2{>4AbuKp5y0Xgy1qi67{hzB2S5w=7i9Ot7UpMXxSq%DS%8h;_u&4zdY5;X4G&kwdhe zxsvPvQ8e|W6BiH1{R)4g%5vxAXJ>Z4Vn9_D;o0 zn_@|VSNDL%#oN9C)Ae4bHS+0}Fu@m7uq{kjJD@9Rv*(b@L~1~1rUUo)HzN)}(0(_Q z$LIQ(IOtJX3i;=ic{31CheBuYMiMX_8phbW!n^MP1Fg2}FOfD~9|KVadda9=jdVd*fBIs%$cT-^ 
zAINSarIc`V##+PuS!+a(;wyM(tbY*|yu~oZ+~_y(A6lydq6wHmg;MZe$(jFmOpY&T z?4)GI1UDrsGbdEnhtNxl6Be^XkD(+c?wJAZ=ief(LFhTdZ!}**goK?pWN& zFA(S7k}3nHFO*KzPe-*ENf3z5$fb^3NFW7d9PvJF&S6IvnL?sy2RCshSe=H$U<1+~ zL(qNMpiCM{yZWabUX8e&P9ObDYqdC^EVU*pTMG~ei5^j|!kLJE8Y~11^dQ5lM5GG4#gxv* z4FRNsgiNwM-`CA8BmHS>#6OR@ga!Mau#FKNn*sXe|NIFl2d(J7d9n%b|H|Y1&#;eQ zL6_ov^OO2#ojoub9qf zZ+j70*I}~!L<-9z!xv#wp*l8G0SKuOhZHN}I3ZWt(nxzdgbI$09WB;>1p~TdA?Qt- zu{-H`Wwr5|L zcPies_W1SJ#YoSE1}g{SgBto@;s4x5&i}*STL#sYZBe5@Ah-t)?ykXuLvRc3gaE;V zyIYXp2MHS7-QDG2!GgO>;9$YwZTfcK+uh%<_wT)`uWDCnSCO-9E}47GG3L6Vexm*R z0Q~VzN&}oVp`;}LCE)!RN>w|!e)@Ya|9HP<1L_yLhs-(sb8Mi2MKkaO|9>E~p@_;w z=j)*Z`TrW@XD{_p_`l@Kzg~po{bbY8g$22R)GU9oEdTRTIDPu`Px}GQhZsM{r?1zm z_W#@Jzh3g9p3whcq-Q&!V*_^TL624dShxQi2?pnT-+xvJRO^)tw35`v&6@w8_J#sF zwfXeoud4pa7qr#`OsE@3kIB?CFYzCbEb$ZRKPv?4|NqO-!-PFAPk$>%&%vHlyoc)H zqLqw;q=yd$Kp(TCp#}!`Oh&&w_ZP@=QaJ4%PcwXm2y-?58WQFrl^bgxm-kiLW#^9+ zRt>PcQq7V*$j>8ZqzgdwwZC$Vd;U380Hf0yuxhti@9TdLFz^$#1M9m_b8P>z9u`6> z6sWn9fF1tdhMG$ROt>gS$o<#tg#WFq8<_Yk)>0#X4V2@b?SQ6PE$DZ6yxy z%X*L3){(53KUf)b>;to1tmdI+;7Wh2>oK}()dfz%%go!9HA`(PuCF~On-r$*WS>+b zWonTGMpy26sBX08=mfAo`>6O+;MLZ4n8sg*r#(v1(0RxlT;w>I%S46&9dB3kGs&$G z7Ol?&EQt5KUi%9Axd-r}TTAyqd`jT7f|p`lE&L20ag-QcW0{BFzis^^_1c7jpUD>X z#?^^%j5mDrTbW8;aHc>im27h7M))07LJEt5tV|{i=uW;L)Y5gemDxi}WmsMs$EzY1 z?cU~m3Sb%zK+rDkt=vl)5JgHuxU=q9+ufDvQ`hFi!n=mA>u^|KjA0^u2zt={X713S zAF5&Uj=DXeZ8dkh7jrPS%dqV%ON0zsPtQ|cn8(@fA6%w{@sDZ`> z;MA4mFnH8Ovn@+BDIcxbz5C+?Fr-{lGT91*N8cU{^EI8$Tr&f^kB#2DdAOin=~ zo>ZlY~^C9?52M34RqTo@0HL1b?uQjqS8jn@>fDJGF7E+xo;A!T_5iQfn4SR1B zxzv6|O0R!K+~6hfn}FJGx|k~G&PBr=+@__c<3d2^OUh4T$rMh+qgd$MFce3Vm>Y&2 zDjP_@ZI|jQvus&0_#Cx5F-P82H%l$?T`hjqj~9U#D{Cq%yMtn(?!C)uVEUzi>=2%cn1LgL~O^)*87^;E@HrN6PduIBLZ^VSgqlB zDyW-mm-KFOYc&7u ztHb6Fa|-{NiThy=dG$d9^Y;X$@hD{w)=Qt3ox;JtK+%R@IE9xJ2t@o9i!(q}o4_{>8Jzv(*p91r>m!XHRkP4E9 zHN|s0F#8!x^jU5cC%$^kZ&1v2Bhrj^uZl)2Ox+fusBHGV(d~G{RJ}gK?Qq())_UPI zsTtiDjZ~PK6qSUQ$8I_CJgK~JGp=q$l`+9FsXSKps{p|-OXdeD1q~l78&8xs@+!N$ 
z$F4#En+G6ffm9KFJ5F%R9X>vj2d8bP^Z{UgNjW3{4*Y3dc68bpmo zHI!&+n5Im9>v$|ix!ohi`H}mC_kIh&6u{t=9Cci}Xl0Z~A*XJKUOD*w!w7qN}Y^?2)LNH&Qb8NM2;m;%S=Lp(ljyGr0Mpma$ zu=PTr(sZ$U#LF3wgWHXzD-cR-GMYhVbOq4PI6@pbWO=>wB)gB1IgD^!>P!0Keo2_v z<|V7yQDuaM9xZ6E8TH}6;;?ZJvzVwe3%d6V6J99`O4kVEVfLJwq!>2o`xHUw`~O6|H-vnj}e10paoomLuu`-4vbC| zE+MVAJ+k^vVeWM-#jjfIr*`G8#@nlpCIJfUOxyU5OOp%2=bUXw!ywjz z^l(>s-|ifJDs~!YgnPTvc6Vl$E*mFkf#pmp|1B$5gHaQyPaEtS0far|_qWq&Dr7n) z;%AQP%aM7EG+Y5n0IKATA^o`*`{h|2yFQ_MVu2M~^tZY_^PI2m)TSG6XNG0QN>i8? z*XS#74gvBnvRUJ-^%9Hg$;$g3G0?CetD8$Qb7<2vYlofYj`E(cL1m>FTo|eF#?HF3DjS-~Ql}RD>f5(FJ!%f=#tfF!HFy;oL-8`EoSpDzH8EnjR4OykJ+_-zkk7Y9wBXM#%ENzCTgkSA#;Pe245{Z zk4Nzfp3V10XIlyCoPsQ^I)}MNRaXebOL{J zd+XEwCx1TmssMKo0s2Tf7PT~4@;(|@f<&pFtsa*OeTo?kZwL(H?o;%NO>u%KoipU! z{*N*|AiSN3Wapwr69Xx^aeu(Q#Hs}N zBm>CbY?1F#-)`4^cnk2I_c@G8J#PF0=I;ZcRm;`vwYj~6RGn0Q7jf=xZ~5*fe7M&e z}_Fc}^u;5|N$kjM21=}rOa?aQ9+7(&Q4cFk7VYMzD>vGCrNCa%G!rr?_ zI$do!CHvgBjDwgzZseKY?HoLLw$5#?e*V~dpB+6^flG>}CW~`_nN=vN;y}Bj-q+b^ zzijF#)8X>AH*LyG^h&9~R613MwD}?AW^3ytsqheW<&IZPi0*^(!J~kyio$QWmqHeX z-_+>U43uWs%EdJ*R`c60Nw4qtuHLk|Qx4yDZ<8dIE?VAN5`>>c=$2jygOE&;xvBW%D z1xx`J`}Db#O0=jZj$eJ4Zi+_F3W#E=a(|zel**6XD+_pSG*hnEgQnV}mHth@Cq{K4 z?p0x_#(AZ~9upgB_tEDm5A_)^PC(6CFfhX*3N7AOs3s>h1LbO zT?69*n!+#>Wi?R%Y3pKbd8hDh?e{hPrJ5xzf9dZUW)rx{uWht=MVIYZNJa5+2`1K` znf%MqNi9;uw_sq+6;9LZJU$F$4oJHyu~_9h9mHvqPqy2#aeC==b$quCY*#C}XHyR{ z^77OEF3<(maf97-`|)crsuZDo19ie|AHU*2&AmE2c!)QWUP~o#8@gXk3+7WFrdh%~ zhdIRSSJlS7WXD((UB>m-08=OpEOru$Ap$7_d7%{Q?9#gG5C@|fsEIoa9k}X%)Sl@k z@-936I*Y=4YT=_N;r%z(0jf@yNQF%bf;9dG;7JH`#9h82cMhHUv8#YGc^`-&+ z3@1aP`Y(zBE=HGQO>b9dPaa2PHdzIePdROj`*iWM&(aiw(eJptYl4JTR{(X2 z#EVReua?8H8*HKlSWxBXx03;+1ld;QE>Z-VVvN=wyd5az4<-d3Dbt*LZu?NIrLx^# z2p?ETz59x(5`^du)D7wLC|0`~LN_vD(*~$q+b(9=v7Wfu61{=2A>j)QI9JFV0n&?7 zq6`g)9oEw$`4<)x!#DSL0ZS+z*)c^y2#@pzyqT&TMubsAlv>hm;%xawdJf$BzHL(O zdDC4>#G6cBZ~|9%Vz*9~%&9Jqc}$c5?IxSs(A8C;;*}4FSKOINAQpTc&mh8aHtn<% zsihjiI%!60BduVmaUE3NtCFzmUtM-0hLoRCn3#YgmRh`jRM~p~ye>&eSfg#!jdU1w 
zsp~Rq{&ofX0xsO3J#GFqP1(-+P2uv934-gOpI4dqW?kO+{8UJ-ehcl5;1ju2jT3F3gy9mikA#PuN+x{&ckfwIyO zqHS?Uk=t1@9BTwc*yZ!XVfp;5v2K zSs&e?e2G4pg-4Qt+%qxqp!|?0gHBP04{&8>^f`A-wdVQU>d2fN?+RQWqht&{G8iVU(Bl7 zpgaExmNiARaE@ zS6p9=%wnNMwQZ^~S5e`(lvJ8Ensm3_<-nmszU<-_FG3Q%4+w~@(IJ;M80^<4NI$GWC2nzbSs1$2B{8f~M{@)3}fmJ7T(q?hqzHXC>; zvb9PtULi@$No>hUn0hVv5nn@#VnDSrCFAhdVw!0s>r2g9ji_}j9y;Pw6B1!lW)0t# zwSXKp_X#zKs}^|ne5XK}a0?GESKlY=?TM3;+N7KrLIqeC3P#Dk?{zy_otJk+N_&&H z^EKG6s$;TY(!GRs1W0OUnDQ@uT)#k?79n2)$Cq~71hRPXWQ?yrQaruc?8EJ{#F#+i zylSd8cNiLmU8U`0=eXb;Idy|!aw>KmV+&KQU5h`(4S&(UU4WAiG4wqdt8~X0W<*v{ zKd?_EjN+!u)=?%x+FZ{D2NzXZnOK475Q_+?mw63!U zZ90l&PF;4Wq|yAMs=M}OiQJCE$N(LJCvZF0r>b>UA)U3Hi98#Vx&zoJq#EAcKIh$P%p@evREz#NnAzo>3>K zwS3m5-&NMB(!|o3CR5g^O8FdceL55QhFiIxX3$MY>tR~-eavhVv$Qy`tDrz~GsS1W zDCMxCA9Gs;B?CW+t=?~mIlffnV;XQ+8QxsOGrEk@AU@E@eL z4@~iZUC}Bx=!Gl#6k&F%l(8Y(?5bD(tA9iZ-~o{J2=Z#~WhRXg&NM8s1NWOri{rC9 zE?Dl2=OD`x#%G?LzIyhv>wQizAvp(T&~lcujF9VTb=q5BZ3naAJ&vZJH4+&t+KX_z z47om)1Snr4>w~C=HKQSwS|eLi9a71eDlIIC1yGUK?P=BN3R#vRlNg1P#%Fpg&Y_In z#aOzkZ@U($u&|6-&nUf!6K3TUepvnW0ls#Vy}4<|SA-a|Ca=q8qxT1OsV+#|Y!!e6 z;LKE#OL7pn_&D;=WZEE^m({R2DDt-UJsqb_t~kb5yvH&KtxJ^1omr^TO!%|!AN;ZY z#*ogTqGM6_hN~x9$IkRc<|aL*r>@jioqN)pAl4UEpoPGd&d66dFmq#2{tA$4BkHk5 zVi9N*9m_Vt&^r}03Zrk5@1km&^UwEPvY0gjfP0>Y5BQ(MtH4DE2zyoZ`2*6Wx8G6Q zxuuSUo%AZ<5X6=Qkbk!<`a&8}*kIm@+7ocOd>i4cC-#qKrGU|NtH~kLVZJx4(%c@D zsrh;?P;GsSu$%pQcW%n2Z$U>jC)>(9N}gNDFZ)il%@uC(9a$DOjHU7BDZ#mhbTqjk z0b^{6nh_I`tG@!J|0e)EVOfZ+5fi}bv`lTmVPm95V#Sd;__Fx0^W$Af5}JPE%$ub^ zm$l8~KAF|%)<5Ro7Zw-*WQ@9qo9QSi>=^S8sss%-E3`z8)`sFUI+1IT&q?L&{y23w zJ$uvSwEaz|=+f@6FW_+)x~Qx@ixF7^!aQeT_>LM^i2MmDfqe zh|Wa~0F->`93Wr}@^Hq$44OcbSPC{Gop@4s5UnKOsi(R&`;F!PI3M6}I|8@)*mDLD z;~BS%drNSyV-e3z?yIHmjP`4xQfN@WXS-Oms*ShUuh zZUF%;AI_{8U__Cfkcixm69G}W>$)$yz#x!n=;;d;>j-r#iz8n? 
z1sj14fI``|yyj^Mhk=C8E${?D0dQDqhJ*^aiW){Gh`F zM&8}ILltWAx2mTjw|iM}_?B*wJuS}V6~Gw%czE2g-Dc1;GpEtTM!H;W)gfYkXAj<$ zRqF- zaT|kvMWvV^a_h@3*?!ZSw6Fq)q~8^;@EVExmC5uepspEJSYt<=ivuExxrX`ANl6540X=wH>C?8@3uB@$H#-u`jG>6kxjRcnLZEV5bd{~k!tjJ z0nHVQS*&p0mJ@60@Ax3Qfl9AwcAoX;rK-oZY6q1S=7EGbudkW}{8%XxI7>f%h(l)# z(UzNue^R&+y>)h%TQ;-ZE%$~8iSgJfYYzP6)rXG)l?V3Ap3fSBEQIf=@B@AEg{8~J z%Jdqvw?kDU6ng2YjLc!~g{Q}qLTw^cU=~7QMdd0N%o4NmUPQk45{%Fo%u3^nXx~Xg zGesjcJkRVw=f5XOYhduRP>H{~Bk;_biMjeV3j$&ta+^`!GmoxpJl|TJzSjO?R~!8x zE)xiItvGIM`4h*pw+TC$);L7_#BrA;Q-3Pd=dw(7@7C2_H|3RaCa<;Aqru6#YJ-gu zU#*XAP(4nimGC^v6A*;u@EUiSXC;E$i821F&Wg*LosfV z@-6Aln%7%QCtEx-{;J!MBn;`EZ#dNUkFwxgIN9auH$*e&yFuj3GtQ;erf%i+&oRp- zdgrLwY7sGwEclJoh|Nnf#P}U-1y8%M^&J9IjP|%u#CcVGdkI@^jFY znb;Wx+{aWclxqTv6KmCP%-7&a9$6t1*LM(mQEmc)^GbMR z(|DmLD>WoH`=~qQ&ev$CT&F*}*SJs*SaWc!_t|yFMCZysxKw*QaKCYkbgG4Zjk$8S zy&EAr)jTsThhmmhwy-~>`EtiV*X88BO)JrBCeW}DYaK&VIa!tg5~3SL)_`PGYzAG} zh+(^iWKJzOm>urXl+B`77E=hH2hdNj0KOTlvGaaigGc!7m2I+Ry?aeQdwXvc1B;r@ zh6EzWf_8+gVB`&jX9_H|CK>*DtvFt-^JXknv!*s>FTJHJ z%4_G)zaKo?^A4SOb_8C_BB&x0)Y)J+SxRc^zaEMJEl?w3jMD5p5c|<~H|3oCq5s+v zH`4!^MZZds>`Fm4OiS6W;p2^8@b+OWd3c>2v6srf@?CJUHSr;hKR!*Cp^l&p``Rc? 
zHc9DZa4D=u%~l~Zax54Eqgr4|kQB}UqNg_`%xBT5KVGRolM}?wJ|b(LU;LVptaFfr z6LjiA3t1$ZDTu!}lK2wX(*o+8uF;xF1S`R|(*kk4m}=C8lFk={=rvgi0*?+$9@p0v z+nf&PKWzX{DumxBPZWL|9%gD1IjNk16jWi(tg(~F77x}ekL4hcBv&ROa=uq9ZrCd} z^J33=1VW17-CmWU&A{b5^r9Ts)EVb?s^=I=w4$vOol_3YXsl+ctBNQl#9{iZNAyhU z4g+q!I2{6_f0BCl9M^R?n)VHuY>=oUD)Ea6TV+<2-JC~GO2|YuAQ|QS$%6U;2GNZS zAXmJ2y@}P2s>Mjw`2}=ewW%~@*j{yeRaSvFn66JOwjM~K+2*e4-|=e#^&C``PeSSJ zqqK^kcBYZDU@6$+mPM@;#S{lfB((W$8qZV$DW-r}*PGK|L3`9ZOib>&WX(mVZ#P80 z#5?0kX6^XnE-kecd&c>)18_J>yUFRowb@=q-FrmobV;PgFiSz8sg4Q5Z@yQ%3n~^`QRmm*ugo)62%)du+c5xIKZRd!fMUU5KERgFG z&oL@%RPf^8-CLW>^ z9RAkVl9D$o)+OrN5gHF>$p#)8-IE0hrpdP;HM6j5@?dd~0&Y~|KJs4=O79r^kZW>` z;RL#@FLTE|Wn;mkA0c!5U{(dZ5NT^=uQgr&Bphb9rn%O0*3H^j#k1?>D)kq2N?07^ zghao+CWTl|H~ZojK*NO~kr6a;wiZ1yr9q|^v=4sjLgxNqW^Ny6O$_m-bKrcqFTZggWBilR4-bn-Xg-ozVw5d?KHrzX(w3_r)O;ro+~yInXR{E>U4t= zg#zN*D=GtJD}~0%ovfOVKcx32>toMu)bU84*k>=T*IPO4Ke<}WdIpg>(7+1){KA09 zzN?NH{nWWNCCk?S$ZMM1x@ungA?KXzcIWnH6I8@M2}G|c=62D27IoTR&&}8qTkJH~ zm@2X&z)qDq`VQE=7+JwnBrHDb^4JhCX@Nb8g%|zt3RZn$vt4$J6swZ{87uH+vrcS7 zNH)~j*k-NBR`n#YHC=K=AqDLjaB_^Ow?>{$i%T3EY2bku!Z9IC}jt z$-yareIbg**6L1%Zm4k-&e1fyFU_p-CexV zr|FRC{2z~<(#Xd`l+NzD?d{i;T#cO{Du(*s#TS7vcDWg+b0=nQN}m0IkEMyGwP;qJ z^rK$&_3q`tD=_gZ{t@}Z!B6~#Ztd&v99E|cIHK=&2jqE#*v7d|Sh;owX8McObU>)H zfwrM^zS1IVI7$IHyvbrIrtwt=p?GX%pMqy+;aEuueJ^rP@y{rYPdLG#1}H*tf0`?gK06eV7_Bg4QWgsM()-ev@JI#joKylVA+Rpa z`Edy)52HCk>)BT;#Za%kGvA!v>16W`$WCH4=Voxi^!_qpCj;;usc~ukxZ^Padrek9 zfzk}_Q3|TWQS)8ddliPN2dD&2uZ!a4w!jVexkek-G&5?b2OnX@3Xmh{yvQU0+|~%F0cpF~k9^Ni z8-@>nFLjTH^U09Cg5Jf$(+ipmi8cu4T=lyr&s_E5zz=U?y~TedoxQ5~S&+D@ocweA zNmh{{mU5iPlYl%td7c$!{f?4Cc%0BAg%}A-NGFa~A^ZLuqY7)0vXhD38N78{-3iZn zy=Mnz!t{B?w$7gV1rXlQB*uqasy#BZnFcjQ&r^NcNp6p-zVvq_;MeyR$Fw;&W9IYN z5bnybpR<#>o7@we+gO|aDaoL!+v?^Y$Ma4D+7&@Qr}O}_#rT4A>~9eP@T|&gpB)y2 z2H2-oa)+w_pyobn7cy=j42VH8A{~u;kvlWI=^=tu+F5K}r$VI|k1AaVn5@ge(;?Zo zD_KmJ)gAg0)W8LiLA=1~4!7{J(LaE|@n@A*6w ze}6h@p0zz&0Oa-W!Y~M6mI_!nrrn~Iun|VoItv!Z`~S$ws|GNY2|!NT+|TWjKm`g` 
z=>g>}B#4Ad_1G+M{O7Xj4;cVCJlXf9HS{0XJLNkFwhjyB<^F~CI9I|6#&yTYu(Iw=Sf?Yi!x5CjCX8OdU=LD8g%QGc$dI{UA*_{n9(h5Guf9E*2aSH=`|w31 z?Rsdtt1Sz^IDO`Mr|@e-=v9cB#K*RqD^amV1x#Lx>E`y7*c+C_v~XY|*)9oya&~Vk zv}P7MdkMBymgTC7$B&qRA7cCaH7X)jU2)mQMLL}P?wdpQdrRxDrMR||9)Xoi<;_&k zdED}>_#-CBAK>+O8>*IrlyS1Jpm9HsfEmJ4ho|HQjVvA~Xt7cwGR)O3v&MLMjtUT# z?2%!-zR2vA%V6X${y;1w{j8l3s-2tM@@qugS`Tcg%4<1#*7;!j3(Q8#1Wd!FtD2lx z+$Y>@kA2;&!h`ZugsDpFOednH@+k)1U|LS10Bj4$wPuS9W4|Z*eqtW?S z7L~N7AWd<-Y5GM`&9;egjx^ZW2_-Vv((BaJCarQM_*f2 z`RWlHQ2Sunu&+!v$~bm#n8;Euq@=}2cAflr$&mYr$k=1&hBGgl3R41&*N63I_q@Q8 zPRMIrYHVcmU^KM>Y*|kR0aC?Uu;G2p9H7kcr=!7ea?hk1JCl7i+r8nP^4t&~kE+~XLuWj3!v&fH zaVyoSKzGmsDu%pG;`M_d1&vtJ8d+6(LUoguFze;)@$Q;+@8ZH)D&g~B7W7kbYOvY;@aR&E4zeNl zr=$M7->VdttvpOu78rad1T4<>ClnJ&bX8m{TD&NhFF4+&v@&Ge*lrh)c31)x*7!-+ z-p!|%kg_HRho%9+R%sw#_Z_UQ;af6j#P<^pU00;030Hu9p3lh@vKZc}VTY?R$tKTu z(f6%g@uo3;`iN=5Tyk-aAeNDV`0BkDhwYBl?vVi$pug=3Yxm#z;);9W+m{jRg>-yATGe}0t7c#=a(fb)Y9jXu=yj5no3U_nhzh$f#DQG zxNmr}k~!Zf=*3io89HwxZzv~C6qt<)I!7~mzH=4_COXoapEX0PpM64;BpG!+6e+`lbr$ zc`Z^wPBKlXC98e|*-Cl%OHB3u)H1csEnINr-3{$8=a^Vs%>yTr0E{T!)!!-NLE@*^RC;*Lh|VDWS1thJnX2anNk{hi5bVR} zYyJIWF+U%Yta7Kb0kc0Ck7pTYugD$24%V!m|!z57 z;8!d7?Trbz2?HKRDqyT%KkpCVdGQbekC`H@*||??z(?qF(DV7^D?`X-nUI9xY@-`! 
zxQM-Ttyn0wu|@$luum|vkNV~kR%uN8j8UKL=9F1>8RF$wXWm>Ly7TO#R5P9+^bzru4)jDksTx-fxth|V*;%Cpr z4<6V6@5{L-6bu@_=8owPh$|YL(fEICI+V>u$=g~Y&6Mu0NsVN$s>2`SUnvbR1&kpP z(3xivvAuA^9Ji3{B_n}-A&s%ctN~0X;4Y-5+neJ-Vds7%932%ul zPJYjiI{cdE=W}%{AI!PK5>|N`mdP96E0?STvq_xaiuu0r0y5e0g4Y4cuG$+ zUA{^xl!o_I+L7$+fxU7uvK#$!zvrwIO|qFusfciLU`gIK)1sfi6@`_p3G4!nM|{6< z(H&MYQ%eg#LJH+ZzNbu$GQDSL7Yf6UfMf%;)oADJ&5q zTJ*8LDl7jje+MA06hnZ8g_RY#&&A@A#FhD&{3}m7c2VoAoUCBGJQic-KcP6?nAauiMgWV_?q`p@Ek8|o33S~lcH&*jO?%yKM&S1a6kSkKeV`x2j>)xSS}$Us>@D%8K~1jgV$Pdo6@k$xh`SfB9EuK_<;J)kT{(Qep( zR}LWt4hyec?W(;07V_uInixbPo9<00^{M#f9OB|ULYP&mR<~-|Gz5- z1{lP@>aJl;|IdCvAvQjj6%d5@e^f5;d@%lBll*gb{eRXZB3W`s*!LNo2^On&-lX%Q z&yxbm|7hOv9l;kMO)9aeX^qjJ6Xgp6Fn2M(Z^OImHdyY%w;AC*&v-I3#9G^V1~a10 zg+JF|rvxzl5T&ennGEjd=YA;s2JWQ+w8@bo-~{?Sx6=d_4IyNj|13!VFIZVh`VwL6 z!vCILb6)=6gi``EJGA{TiZjw9n@G zxBagwQY8D@AB&cZEb(&TsaY;>>?y+{*V75Iq^{j~n0&-*uR?;~tOpR)uA>Dfx!fuL zXRW!+PW?AwPs)=tGGuiTX7RJ(yVbrq^J4$mPgHu3BRV{lrwGJ0dpk+uF zY9kj|U%%5J5oEQSVNnWZ0n`0=W&>@3FvH-;zDn2#K9^$@uWGTUoogT@mD82WV@VXc zR`*R$KpkL}X>%AKP=z`A17`WNaz6o>o9g@|hjE-(SM$eX7UU_L=js2IM9S&Swg$xe z#-j(?LaR#6NqxxU@3Yt|Qsj7KpUO3g>8#_8@2%FbiJ@cBbYHtUOOECV2ru*dxfPSw z#SzEwXD3%VZ3J=5cL|fp7Q!ZQ@4? zYrs_tsZBRoZA1jax&xoNtB!mD0k@J&yLAf1%;8k%Q|weIy0lTNTijb;W0@rC7TDgf z6?cC*^G6Wl?s)NUTEz@8j$QEK-f)=jW1Od}xLGX6&Iq&2`&@=){!<^<^Uyy`iirU+WY70?`M#pzq-@OgYarNT|J*! 
zmo}XN=kn{zfo=`eq6ky;hiJ-OYS(&e%!A9Xj7c9=S6QlUE~}NVaJ-uWym-xusbomJ zb;(OpuF9BR3yalJTCKovILy;Hdp#LcUX*H%O|-L^m=8wG9Po~L?;AN4Bs%O|W9|z$ z7QOQME?<}r55Ec$`dVCEyRls?T_GdPTduS>qe8EmDX&i@X~=RS9X;4JNx%7Vrqph0 zbgXUr5^b5%p#*#f>3X64_)FsE_*_Z<*t^We++wmU6L6Ip^-TWjWB9cp_I@Rz`!d|g zjCORBmhGC-mx1y@y(CJnktRj8;EQtRD3bn-hr-r;HE8<5_pwSbr0v`+2z+K{R!X?i z>LQ8bzLDv@!n8mOUTAy7>l#q3zP&3#3qH;xyv%jZG(K}En^nh7k|$wgH21M7*r{gz z?|HpW6(gkXz~^)7!RyGW>+}r0J7Gt!-ySdd(%El@7$&WD*yNZ5kG0-9f1cIYU(9WO zy-=fX^c&t$zI5+_*CO)uqhrAVwfylx+xd4YrFg~kaguq%W5OMO(5f~4A?|1*QxkIG z=2H}LA`U0bAbn3O=le0iGl$Wa>joD5)vg7Cce18@t*$N9^DVBn!)N=O#555%atU#} zjI(NHsh>^)PARFi!D^~&O?cZ^)aP64J@98@pl@el+dAOdi;0|fXLr$#rsE-o+dDJE zaRqyhQi}}I-|-AIF1ubkT&5Krc(lSI=B~8pfBY4ojm5*Jtn>R(ejBmG*ORAQ-FXSE zncmd$P32K&OD}=A2cw4li%o5_`DTwIYn^uKu?$`s0>`a5Ji{`^ypmbxt+$>VU+%fQ z1JZ<@^^iaelv<};$Qmm@@7csJ(Pz)z-n9ZsZUZc81+%gIsK9bV(8)V=)d5ug-!VM= zm#PC5+@9|`EvCZz&d!hMRdY-QZp`m!oDHt~svT(eLaZO$Q>qRG6gq#rPkmarNVUE* zZQ5c`m1k&Okq33$-dQ-@{t#BpcX`ou{@}7ld$#uIUGyWX$Xbuzu*?IOQCFpy%hu$+ zW1L(`bn`j&#Pjoo%pVYf4m%%fE;X`b=XE;npm8w|v|LhYBk9LysEo!}*q(y<)kjjOD(od~!aWnL8C$rgJ49D7pa zN6VzEXx;I_sgww|thQPnmn{lX4Q*cYP0UeNaV|xEy|`^NU*(b~1o~x_IGl}0X}Yw~ zhUK*RZ5xc4(D+!BDAyI2hidAW?Im!8J@BA>j7xZBH8q&hBm5?zA2I4MRr7I=H$?U7 z>G$yqmhjDp7uVt^SS|f4U)zB18mSy9g-Dh7xt9L9 z#nlx?hHRK{EsVT}-O;LD|95-PviEkEdn(Ht26iR?J4sKx2_K38sBDG+M3Y} z4K3xuWM@7aUOhGi5cXjux!uN2}QF^4Y%Dsv}_2d6_BbYwN~n=+1ggcC5Qm z{n?Eb^QfI@V<@HTy=TMlBW=bB-shXv<}#XekD}C#^~5%FmVDdI-NIcX2dMELL69)A z*Jo=zraY(7t8)FsOG~aBJ>u!~2fK#Wf!BNX$_)zlFQD;k4e9UA{VGmcWFEmZt>REH z7=BQ&WLY_W*A2D_C=4;T2?{TNL9q;1!ic)s4q|_H39q617VPW)yIeK?b-KQ(Q4ch+ z`2x*TU3C8aL&g2bVm;H7OwG6Zz7G3>3=Wf8mz7sd6&mNP7DE$67c*=2XLqDE1H(s( z0pWy(XLu|ufsMnoXv1Ymy`J`%K^A7EypB#cz2xpoY3Z~U`(XlST4+psyg!)I-zg>e zXK>VBKoTz&%M<@T>4qD_6>U%k>x?nwE zH*HtdZ+FDVg>}MyT#`KhofbRb@HyGdhg#Fk?W(?XcPmjdeQ1-sy2Sizj_|I0Hfn@# zt}pH=#n>Z#@VMb%OBzc3ibSjI={Jx^k$c5b7NWx9XAv7Pf!dgKw#c6&3KMxl~2kf1Ct7GI(PkHSgDOq5O*#G#w_S2al7lI)>+Kt@!S*pAT 
zGF0n)1M}@#0r~QHuaNH2{BiA0(i6Oyky*@TP1C^GaUZSS6!vn=P?Dz?GPb|G2tGfB z)_ZElKN|?<)JRL&@ZmXJP_ljRLp9rR$5SylBXOf;kevIOfYfq@+1NXe*yg01B)G88 z^~Fb{{(14yZbpBGJd33Et>l5=Z%)TK*4a>1Pg1RInFv(FKOuSi%!@-=3VeJ2PkUb( zP*vB2DI0Qxf z^Gmc|KVN)?Z{K>MmmwjNq{g96>DV~w2M%(tek@7O7XbUzrqZBE(p+AT#6SeL?fS%5 z5DzK_3;W)ysZ!q3e&a8u0vqhHr*x~54=UOOg*s6$?xh{~INnEfvs^P1cuKFEk`+}a~rDE;(m&CFq%ReqO4 z5gSa(x$^g90Q>>;xs%>j$wgZr1tj-NH~EcTI6Y)c zvgk8fe}9ZgMUGv*)Go1m;`2h|5@&Uf)-!$@Ctaqyg~{~=Bl$%dUnQvd1bv-_F*{q8 z6{kpc`z=O@%Hd(h2rXOhl~&_YV38A=x^3=<>0sL$X2-swN}89zx}V#HU+W#BxJ3B; zmqGo!d`^`WHg?wnczbTQ7-Hf~_z#WR&iiiKXiS>O?OAva(;@g1!pJEEN`LM*Z zB5b}OA~KqGxof$CNwAoO$5~`x>&$1YFGuo^w7kGiUP}T&wS(addt~d(&Xakkqe#P5 z#IbZf&s^iIw(TXa0K((pX8*nV(Wny7em`~}xP!hE8uZ@D>c9Y7-P^#=N@F6u#zTIO z5k}(_MAufxatkLZ(lO&Ss1*xSUEZOuO5C>YaXTaT+?8o_=ewJMMw5J9$03kzp&~xY zGt!18Ule(Eg)ZdgMiftxDQV(_07&oz7KhI2E=k#=zM-DgrKk$UWb)km3A1^*QU(ss zL&p_@ozc@kAGPnzFOZw*oy8%3MV+G-4u7OI~U)Usa9B|oAp4bLx!wx#3M_bQ#E^}ZQU!w%bQT2sdlh`*Ss{> z@&3V%?4lo&LDW!_1(L2h{Zze~2zqgTO41ZwD}a5Z8C)6ilCQO4sEo*%~V;@jsuI!zPU$6XIU@S*ztJvv1{#FrN=!h?$Tj! 
zYNHd9aitShFU&oyp>i-BBJED`+SGjmJgsToc*`fU{%(OozgIVg{1A7jZVrKgYTyI8 zw&G2NidrAEEzDwys%^Qc>uL5$fPI2@+VkO~cD5$tf^;lUlbl2y+1f_#J+sPsC@tcs z?Xwp!iLdhEP;Z=3>0eGN43e*1g=lbwrBlFlz*B}i$X9IPjD`_^9&BSmR7!1HMbL4o zlGB2T3%Ta)o>EwluQy_R)?=rfP|*xfCa4@v!*wqR2G&~3ngzcqCc=CRu7j@ zS_`Kk#*RV8mUh;=*di@X)9GDy>nzAyPixo3<@qVsJWUhTyfGRw7xg_FVQ%!5xPn$$ z-j*dg>K7?5Pz5Sd-TNBY4LM6FMeHll(T66fJ>2XZy-n3ayQ9VE9s`_FaSK;=27Tm1 z&#{V`z~vL^PkN@l7TwJ%8ier7US57L+rQL!CTrQ5l860ZjkJvLtSN4KbG>XUkdWJzQxW|gWnw4}b$(qfM%W9JuliUu| zsvkvdi=o=0!Ems#o||VW4zZ@7RMTKV%rl|BEpDJhC+?ZxehY`#yb&UL-cQjaywoRv zf#}8;#2@S{`rTYv{0`auC6%X`%^)-?UP7`O?IL4#h~A%wGq9E3W|<>mIg50SVx!Xp zEleqVRX3M(9lLlha+|P-=ALJZRCN%j~$VW%~r1lGMzWsMM`e$Tt z{UU1Vvq!L}Juygh{~jyDh*4rx+=`&&P~ddlr- z|JibWMqtW)GWV)jSC=g2{+wBfekEmtO;)Z2R_=|}X6r=ST5LXt;n054M)!f{MqPbb zffFgn)~=kg|5CVdbdH3^$O-0t{j{t9L%j#z=k`}*y+!uCZ^#ha<^vjSZiX~+mC4h< zYPxs#>G3nXX}s>q?*#?HN=3@xl4#P1BDNfqD)4yv^%dI4&KapfK1h1ZG86|>jGK?a zCV~SvaoDmR8w~^5C1IjUtuP#lqUa*dhOd70^|;eQMOD3@4ja*Y4ywVh&%D|Vi37(o zSxbXG{Zn_k?qmjA2T5^`Xu<8O5{CCW@{rdH3_F3lDryqqQv<4M9=VE69}21wb6vFv z-)nVsGA1Ey%9F9q+^1Sj^q!RTw9h4rqd!F-yK0mhNp0hhJ`)Ekf1Rj+@vru8>*ZmUK}sKPeZbuD zl;ru#-CJu|>C(6!27BTr`_i&8VWH+`LCf%-(XG!$oxVILFS24_aC$~a&%7`Cp=G^f z6yf48+%A4#z0zWqj%$N2E};Ww*C5U}A0-}#gc3f5CyA7OlvMwj2#A%jB9mlFu%yY+ zZaBI7n%iPCZ*$m`k0bx!aFRWrdZ(aGdf!8*N5m-JjnxxHPbn`lZ?8hU=6EIQf8oVID?+63$P@j_Z7-%F2fle={hb+W_V&B_DyWY?5?Gevt z-C=YM6$SYrGq=xiJ$s^J#&FUpvPy_jsQw#xmb=K0ZENrt<4JS$!U2EJR!q?)5r0(v-4l$oT zL5qSMAI*q&79nxk-J&2YxbmZgbAX+|D~bglNi72O@c()seo8?0oHolkNc^pH@C;fF zQgS0B<+JNeA<=$*ez$I0*Mu(kSmb;+u{0vfj9R06u7&(w?~K0nyeAKKlCLhFJWd2X zi?Y8PdaCpkf8-DZk}C4Sr=u5ovPPNVh1mu#^y zcl4}988~d=9EQ_&o6LR2CesDq^J`7Nbjj`Z@SM$EkZ(tFiHq#~HkQdjvh$O0WasNp z8rJRC5uP5-jy~?h$03gMWj*AM3@&a){Q-?AIjJ|Bo{(`Uhuv}UwcQH31*bbTi%Y_w zX14q6L$1IgS{@weUcybq=O8sp0k;0J6dnd*WGZPYn+s=&O0AE0EQ7QM{q6H{v3&`1 zb2`Ik504@hgF-?S9##eAA*t;4KH-GyB258j;uzyy-JSA4oIq8T{BB3UbVg>oaclaX zn>(~ri&OMjnU;K%-a_n9j%@Iqc@tOv|}u)%`5fp zJ|$Nym8*jp*_h&3yz6SDb=IpniAV{ 
z`7GZxm}4~NHP=ZmEEmT<6}Zf%m$(}EGqf1;PU0fzTzNRk#sP z+Oby^OhhZw--jhDTZy0bP2_9i6q9rvY# zPB=nbak}EE!^fE@OAWa@ju!j#=;EQX<^ygo7teY1`FHy6KPvCv6>jWFrZpGHG&P$D zWGFI!{j?hybeT0FaD4^m7hCL@YQN`%_@<0dt zt3H>Lyvs4QH;RfT`p|CRfDowC>GnEp@N+?5#7~5;;VQ5#k`*S+RktE?J%<-t9$qnP zTv4S+v8k4Q?A0@%_eZ+8uSX}GbB%25%vfZ`>g&mjfcwCEkka$+xITVB_5F?GKBbx2 zM5Nxd*E0`|I;VL5iyM*jtsMsUf}uzovdlplQMBExsKpk(@w2d=L~>9pi7E$0u4k-l z_}F=~BUD7~dd&WqPVsZ|_FT#*OI1dn#tYw=#Ke~t6Am-o7DA$;TZ)t=i1`MI8V&U> z$9F$!Ey?%huBTkTZ+Gl9iF|{BrT95F30cwABfubSf-)^p-OpY1xBCp^d8+O!Uf}^Q-GN!QSu!14*LycF?r-C` zg*(mOILf$rz9gV&K@8OONU(w*4m1g&%1Ego5a%?SWVBx$bXRQwBW9$@vGxrQwn&ntBzVe`=gH0m%#IZRCw%bNgo(?vw(%Mcyo#`3C zF>bAOyX}Rc@qV9$jQxe{1ifxYBApnV;)3ZkMD!0bPx46BcdVjNGl~&8J)p zkKBnGe(i?turx-bB15bn% zj7;{N4h?oLYQDgmiSg+=)En$`c5xo9Z!$Ak-@q!@CLlP*P{_4Dx9z>|Vuem|^ikwA zG9G2n42@OmqvJ6!N6p`aeQ9xgkU{g~_PEr3>UplXuifH8JyD_1|=A*^UL-QvOet@HdXk<$g6gV_#*cxxA&!@Se$Zq!1Y9~wJ*1P6=p zL5*Zf;6!Yyjp&!NMcFDqU<@Ym=vAA(q#sm`20syViz)TBWS@X6 z*`<@!x{0bzV&t9sm4;pyH!)g zTz{I`yge1e6j1@C!}FHa{Gb3U8ovMq8ezLyW0Q*Ckh7%Rc4zKwloTf^wtp;+igK zd0w+q>k=kjaCzYwSHb*O2oqN0J`906A#jN4EK}zx>3zCS7C1iO|Hvsp@tLvTlt#t( zq}X`1kOl{9}DaLfZoNr^;@)u<(J??&8-0!(;+@U}mt8FtKvu6Fa zU44oXvn4o=?$u3&{t4#_jwiHz3RF}|tW=^3+FOc+_}Q+1ia%q2Ip?sOms*w>yIkWQ zFy`C($oV!EoD*0U11I``%O{(&p)Jpl&)w^C9I0riV!Cx4*V!(R|8ee6YmIhaDM)^O z>+oK4wAsLfILgpN0#0b@>R6cljEFVlienu>{sZcE|c2eK@gVAdJgyFeqcaN66CG@H3WM0L=0SsCcD7nNZWwRwhn2I z%+;0sX!YinXUJVtwC-#lr!vGZgSG92`625b!wnOJvsE7xm*F^_M^;1uoinI5T|9Ru zlT|=a=;QtFn2z|)YFz`5!vX&4`br+{h9J42%40(m$9lR&zvWW+EcZIGY4*Ht2#eK+ zdZAU2u5PZ*S+u!f-63c0=)7^b?>uJKWak?_b56BElW8J7@yGlq4&WUoKHOoO*^~(L z>^CeT{neFsJ(v$vh=yo{#C9j!^v(sxiHP@l2|}5)=&VcVgG<)bYVU6rITmV+y54)_ zBq4)h9?Dy`rp^jcW(srfZ5xjDZwI>U_xH^>X*M7>{16zOhvk@$KWbHvBdc1PP87GP z4#qrQzJ{xhz-Nt8t)!sRBu4^${(hh0XF7XBRF zqLGH#pWx6b;V5eAZmLw_R4itw3qw`C-g@Kn9B8=crNs}|cx5YIy0WA)_n~vta`$_= z{Du6}p>!5j(^2la5}W?&V=hEtjeQ7yAL9x{WMr}$E+OcqeSN8gOWuNgHGlEr`r?hD zb}a+CFVAbA@^PnqUXPIbIhW8rrOOW}jA-!!dJnpV=ER6ea3a!K!%L!iAMHrq(WAP! 
z@u@8+7=s+^Q279FZ4b6*+{Cd>`6t&sffvWQh+d=|P-j?-BcW~+Sdq9(j@vruiovv! zp1RP=G3s3oE!qf;YSW8SD_NKQU^y-_5Vvy7O!mHcG03wy`8NAVaMm=)P`u!}bDEFg zR^@T}L%b-ruZ8u+C%h_5kF{#{AFIoW0S~ILSsZlr_OZ|x>DHD@2Vc)XojeD1+vAxB zYnXWpiyxzNtZ^8MAwW#(R@BbSdijq!mt)4yW2zNfY~1VEgP1FAmVS6YNxqXqSzXP_ z>~rH7M7mZyih}Gel&e3^#OYcgpM98SV6n@|YvyD9J}Q4^Rb9P8WZiAq)gD*wLD1EE zL8q!1vGL;&^4N5w)KBB3KaS9HmofnX;m4Eu@5i;AsHQ~~pKUu3FWUt1IKDR6-&Z0f z0!RKW&8(lCp6}r&Mb$e9#+}jaoo0BKc>{BtR>6NO z-{*U|*XQ`%k$iY5j)|k8->~szbWa-!Vh9!uC?73jBDZc#L?K3bZ!@H}u>S^D}xBjaATf5*nUPx@d|O z&BMUMRpCgdjo6WvZ<4b45M}ZYO(lXm##uiScbuvosTbrZpuyJ{AoCxSy zZuJ1s7^u2GB;9%$x{11X#1P+5(>7hQ_Wt`&ja!iFs5;-Vl&Dok88Wx~r6f{&|GPw#N0)9v* z#1>XzB)-faiX-k!0xOt)l8@56c9%RgxOl~3*Z+V?sqSX(04$qP;W9^Zo7}1Zrk^8644TE~p(=N^q-&-~_&uBPiyZk^!r_XTN zw5$zw_&j@1o*BBW&z)N-Q>EI+YV6`he+~R}x3#6z~YX5{Mg)`S?iv)Q1<|H;@N8 z-qA~M&2@?C%(}8DE2ZEDN0sTb1un+Q=y8wqqWE#%Tv9FFXdDx2s&e+Ae(y76gHQA+ zDldFMIpcJ*$^{!wsb{YpETwMcpAJ1Bb2Nlm2JyGO$=CEjF<3F1Io?6ZO?F#6>{m>B zyEc%H!x+OD-wkChJa~6Z-m)YL3xl?C87IHYFR8u^dGdlzUUrtDWH!3L(k2tan0v^0 zc3bNks>}}8ja!-J&e>C^+)(qlPhxEsbSFZ;ZcL=yt6IO3*6qYdS)U2U>bO~qC zm80{FqtoWJ)sod=_mMLaqSl?i^zf%z7xgnv)PimZ>Uuko#BF}B7yYGNq(ADD^9SzYoD%g`>TJcmzBJMbvIQx2Sk%FwJ@%+_`18qGxQe<12d=H#yNH9{Kn%*l#qf_6eOpA&a)Dfc8kSi72d zbjil|IlMaV9_N?~PXnVRDf-X?xYXllQ_6JqP*XY8Q}YRS{|%q}a~kF#Ej``D3t#>8 z#qN_8jZOjg}0{#N>xeo6U(+7r}FL zC%qHQRTEGPZob3jp2le|_UMoz)9_SVI=&lvs>4zZ+)=@Sgw=_+4)kSRZxv~_NpK8Q zYgWIc_CroC%!SV7DmG0ERNwKWH+rFO3SBjTe4lQKBuzYc6(0-YT-WJ~erYikhojqLy2hg|5*s6UscE zC-1rqyV zoGQ(C3zI8zM(o$ipp|y-4;~TkgWj3lrN1>}<1m()l^(3kttPZMtFl;?%&5*T$}+;= zIvR-o?thv40{`2!aqsAQT-i?(mB-+hO@PJt^iNIIBkuu*R+tT_wgZ-G*%#lHfdx?w zKKKFEM*?Xai|(g{{}ixPU@S>4)oz6hWS4Eeh98hyx^po3E(Zg=;`5)3LZlc1#<<}q zen9^pH(G%p;34$iGNWJdYcGBZY@jzmRzwW{w)58wAg+Cj`?KWhpOge}3FK4&=#`WJ z3E(k+8*z{L67l{nu+b^~naDUB%vr=>Qt*CRl==emtX(xs%pdscO z{@NyVKfb!uIxgLI(8Fo3l1p4g>L5>L6}=>S$Nr`UI#%c_3=G?t^4YkNErU;J)u&9a zHS#&xf1DyLh;&7g1BsWGkhSXAzkObHR^l9IVFhDfh^SMEP&oY94#n4@54#yD9UNaGkk~>$YOx!t$rccOIo9u 
zSEr>x1s_TcMAi~#*l_?Uk-`(zd5Bz;7589F`Gh!8THnmUxOh>guR6H8>QG0HIHG(oFHMqv zYk%OoN+ZZaPflV=Vv30&UfO#`l1yWm+wa3HRLQ=;hy=9W>0wfnStIxn6NuN%(}s}p zEEt%S+`tkh<<#Jw(C9C4=X@PTK4rb+M%dU98k$%8a`T>$urEZ{BX{e5%-4Qzfyry0$!Sg;sOa4>^Ej`|s#5INw}N&7dU zM#DRyV_hG2G+v~SC1@$7W=@5afBl-xtrt#sgxvKSzOnB`X3vxrHPt1@ywk0sf$le& zJh8bS2AplWuFCjkw$}Y(mJ9p61VNerR{+H%9T{*yl4}>i=?JLH23BUcpw@0S+G(ea z-ddbMT0lS~hcF0C`TKAytK)yM%fG3>QChD|h9;i^^nXA3WnfEiPe}@Hye^4X#uL@< zNylxZ|F%Lvovskd1G_?Gne!g`)Q382qkK}TR;8z=;q2A{PRaZCYmZ>YFarbK1kt7| zlC>a`Uqq3%CD%mqs`0xt2&gxEE5?b!SAVx<{)i0!n1l+r*USX~E{cmjii7IgklN9b zRvp2&5ug4o`+5W~DNBOfC8sR$84dSra%Iq9oJ&7*r%;oQasG*mdFJE@-TycOsCr6m z{FHzfKVWP+&kVi?a+u)Rr24^e?CE-ise`E-|Zyio=oizrxbjIs|=_3y(_}MIKDCz(XgtVHI5<4fDcI5u31RcDfr^W zW>vXH0EZ#o|*A^X=2xzdXg~Xh*$|My<1~X8yo(W)m21c0<8c! z&s-%@K-g~sy+MjWh$+z^otkfMGjb|ggFJ3*6w zz`RrfVE$i<{wwqLG>TLC&Quv@i|ikbC&qY*!8j)Fflzm*jsbx#v@^i2r6 z9Lgv&p5H&Z@{dCJBI`(}wFTM0{;q%k`}qH?jAvJ`MEQe#mH=uJZC)25NtB2eIk@b2hN%{S@x@$)|ARJAC;{5AuKtMo zPq+Ujv9_P=llqeKw`u6-r4Bow;QkfD{eQwR05D6Pz}^4aV#gDJHd5);{_x^;tO1zE zl-*DM`)#C`(tueln~yO60pt6#_?7*Q^lv=)^Rm?%_5qRiOaq4fOx^Dh@FylLEmZnm H$LIe5zs#4B literal 0 HcmV?d00001 diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md new file mode 100644 index 000000000..0439a4653 --- /dev/null +++ b/website/docs/gen-ai/training/Llama2.md @@ -0,0 +1,271 @@ +--- +title: Llama2 on Trainium +sidebar_position: 1 +--- +import CollapsibleContent from '../../../src/components/CollapsibleContent'; + + +:::danger + +Note: Use of this Llama-2 model is governed by the Meta license. +In order to download the model weights and tokenizer, please visit the [website](https://ai.meta.com/) and accept the license before requesting access. + +::: + +:::info + +We are actively enhancing this blueprint to incorporate improvements in observability, logging, and scalability aspects. 
+ +::: + + +# Training Llama2 Model using Trainium, Neuronx-Nemo-Megatron and MPI operator +Welcome to the comprehensive guide on training the [Meta Llama-2-7b ](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using AWS Trainium, Neuronx-Nemo-megratron and MPI Operator. (https://github.com/kubeflow/mpi-operator). + +In this tutorial, you will learn how to harness the power of Llama-2, but also gain insights into the intricacies of deploying large language models (LLMs) efficiently, particularly on [trn1/inf2](https://aws.amazon.com/machine-learning/neuron/) (powered by AWS Trainium and Inferentia) instances, such as `inf2.24xlarge` and `inf2.48xlarge`, +which are optimized for deploying and scaling large language models. + +### What is Llama-2? +Llama-2 is a pretrained large language model (LLM) trained on 2 trillion tokens of text and code. It is one of the largest and most powerful LLMs available today. Llama-2 can be used for a variety of tasks, including natural language processing, text generation, and translation. + +#### Llama-2-chat +Llama-2 is a remarkable language model that has undergone a rigorous training process. It starts with pretraining using publicly available online data. + +Llama-2 is available in three different model sizes: + +- **Llama-2-70b:** This is the largest Llama-2 model, with 70 billion parameters. It is the most powerful Llama-2 model and can be used for the most demanding tasks. +- **Llama-2-13b:** This is a medium-sized Llama-2 model, with 13 billion parameters. It is a good balance between performance and efficiency, and can be used for a variety of tasks. +- **Llama-2-7b:** This is the smallest Llama-2 model, with 7 billion parameters. It is the most efficient Llama-2 model and can be used for tasks that do not require the highest level of performance. + +### **Which Llama-2 model size should I use?** +The best Llama-2 model size for you will depend on your specific needs. 
and it may not always be the largest model for achieving the highest performance. It's advisable to evaluate your needs and consider factors such as computational resources, response time, and cost-efficiency when selecting the appropriate Llama-2 model size. The decision should be based on a comprehensive assessment of your application's goals and constraints. + + + +**Performance Boost** +While Llama-2 can achieve high-performance inference on GPUs, Neuron accelerators take performance to the next level. Neuron accelerators are purpose-built for machine learning workloads, providing hardware acceleration that significantly enhances Llama-2's inference speeds. This translates to faster response times and improved user experiences when deploying Llama-2 on Trn1/Inf2 instances. + + + +## Solution Architecture +In this section, we will delve into the architecture of our solution. + +**MPI Worker Pod:** These are Kubernetes pods configured for running MPI (Message Passing Interface) tasks. MPI is a standard for distributed memory parallel computing. Each pod is equipped with: 8 EFAs (Elastic Fabric Adapter) on Trn1.32xl Instances. EFAs are network devices that support high-performance computing applications running on Amazon EC2 instances. +Trn1.32xl Instance: This is an EC2 instance type that is part of the EC2 Trn1 (Trainium) instance family, optimized for machine learning training workloads and has 16 EFAs + +**MPI Launcher:** A component that is likely responsible for initiating and managing the MPI jobs within the cluster. + +**MPI Operator:** An operator in Kubernetes is a method of packaging, deploying, and managing a Kubernetes application. The MPI Operator automates the deployment and management of MPI workloads. 
+ +![Llama-2-inf2](img/llama2-trainium.png) + +## Deploying the Solution + +**Steps to train Llama2 using AWS Trainium on Amazon EKS** + +Note: This post makes use of Meta’s Llama tokenizer, which is protected by a user license that must be accepted before the tokenizer files can be downloaded. Please ensure that you have access to the Llama files by requesting access here. + +Prerequisites}> +Before we begin, ensure you have all the prerequisites in place to make the deployment process smooth and hassle-free. +Ensure that you have installed the following tools on your macEC2 or Cloud9 instance. + +* [EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) or [Cloud9 instance](https://docs.aws.amazon.com/cloud9/latest/user-guide/tutorial-create-environment.html) → for both, please ensure you have 100GB+ of storage +* [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) +* [kubectl](https://Kubernetes.io/docs/tasks/tools/) +* Git(Only for EC2 instance); Cloud9 comes with git installed by default +* Docker +* [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +* Python, pip, jq, unzip + +To install all the pre-reqs on EC2, you can run this [script](https://github.com/sanjeevrg89/data-on-eks/blob/main/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh) + + +**Step1:** Clone the Data on EKS repository + +```bash +git clone https://github.com/awslabs/data-on-eks.git +``` + +Navigate to trainium-inferentia directory. + +``` bash +cd data-on-eks/ai-ml/trainium-inferentia +``` + +Modify the **“trn1-32xl-ng1”** node group size in eks.tf file. Go to line 179 and change the min_size to 4, max_size to 4 and desired_size to 4. + +In addition, also update **variables.tf** for MPI operator to be installed. 
By default it's not installed, and for this post it's important to change the default value from **false** to **true**. + +Run the install script to provision an EKS cluster with all the add-ons needed for the solution. + +```bash +./install.sh +``` + + +### Verify the resources + +Verify the Amazon EKS Cluster + +```bash +aws eks --region us-west-2 describe-cluster --name +``` + +```bash +# Creates k8s config file to authenticate with EKS +aws eks --region us-west-2 update-kubeconfig --name + +kubectl get nodes # Output shows the EKS Managed Node group nodes +``` + + + +## Distributed training +Once the EKS Cluster is deployed, you can proceed with the next steps of building neuronx-nemo-megatron container image and push the image to ECR. + +Navigate to examples/llama2 directory + +cd examples/llama2/ + +Run the 1-llama2-neuronx-pretrain-build-image.sh script to build the neuronx-nemo-megatron container image and push the image into ECR. + +When prompted for a region, enter the region in which you launched your EKS cluster, above. + +./1-llama2-neuronx-pretrain-build-image.sh + +Note: The image building and pushing to ECR will approximately take ~10 minutes +Step 5: In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. + +Run the following script to launch the CLI pod: + +./2-launch-cmd-shell-pod.sh + +Next, periodically run the following command until you see the CLI pod go into ‘Running’ state: + +kubectl get pod + + +Once the CLI pod is ‘Running’, connect to it using the following command: + +```bash +kubectl exec -it cli-cmd-shell -- /bin/bash +``` + +From the CLI pod, we’ll download the Llama tokenizer files: First, run the huggingface-cli login command to login to Hugging Face using your access token. The access token is found under Settings → Access Tokens on the Hugging Face website.
+ + +``` +huggingface-cli login +``` +Paste the access token and hit enter. + +Download the llama7-7b tokenizer files to /shared/llama7b_tokenizer by running the python code + +```bash +python3 < /shared/redpajama_sample.jsonl + +# Run preprocessing script using llama tokenizer +python3 neuronx-nemo-megatron/nemo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=/shared/redpajama_sample.jsonl \ + --json-keys=text \ + --tokenizer-library=huggingface \ + --tokenizer-type=/shared/llama7b_tokenizer \ + --dataset-impl=mmap \ + --output-prefix=/shared/data/redpajama_sample \ + --append-eod \ + --need-pad-id \ + --workers=32 +``` + +Note: When we later launch our training jobs in EKS, the training pods will run the training script from within neuronx-nemo-megatron/nemo/examples directory on FSx. This is convenient, because it will let you modify your training script directly on FSx without requiring that you rebuild the neuronx-nemo-megatron container for every change. + +Modify the test_llama script /shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem. + +You can use any common text editor such as nano or vim to make these changes. + +Run: +nano /shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh + +: ${TOKENIZER_PATH=/shared/llama7b_tokenizer} +: ${DATASET_PATH=/shared/data/redpajama_sample_text_document} + +Before: + +After: +Type exit or enter ctrl+x + +Step 11: When you are finished with the CLI pod you can remove it by running: + +kubectl delete pod cli-cmd-shell + +We are finally ready to launch our pre-compilation and training jobs! + +Before we can run the training job, we first need to run a pre-compilation job in order to prepare the model artifacts. 
This step extracts and compiles the underlying compute graphs for the llama2-7B model and generates Neuron executable files (NEFFs) that can run on the Trainium accelerators. These NEFFs are stored in a persistent Neuron cache on FSx so that the training job can later access them. + +Before you run the compilation job make sure MPI operator is functional by running this command: + +kubectl get all -n mpi-operator + +Run the pre-compilation script + +./3-llama2-neuronx-mpi-compile.sh + +Pre-compilation will take ~10 minutes when using 4 trn1.32xlarge nodes. + +Periodically run kubectl get pods | grep compile and wait until you see that the compile job shows ‘Completed’. + +When pre-compilation is complete, you can then launch the pre-training job on 4 trn1.32xl nodes by running the following script: + +./4-llama2-neuronx-mpi-train.sh + +To monitor the training job output - first, find the name of the launcher pod associated with your training job: + +kubectl get pods | grep launcher + + + +Once you have identified the name of the launcher pod and see that it is ‘Running’, the next step is to determine its UID. Replace test-mpi-train-launcher-xxx with your launcher pod name in the following command and it will output the UID: + +kubectl get pod test-mpi-train-launcher-g52f4 -o json | jq -r ".metadata.uid" + +Use the UID to determine the log path so you can tail the training logs. Replace UID with the above value. + +kubectl exec -it test-mpi-train-worker-0 -- tail -f /shared/nemo_experiments//0/log + + When you are done viewing the logs, you can press CTRL-C to quit the tail command. + +To monitor Trainium accelerator utilization you can use the neuron-top command. Neuron-top is a console-based tool for monitoring Neuron and system-related performance metrics on trn1/inf2/inf1 instances. 
You can launch neuron-top on one of the worker pods as follows: + +kubectl exec -it test-mpi-train-worker-0 -- /bin/bash -l neuron-top + +Create a Tensorboard deployment to visualize these logs by running the following command: + +./5-deploy-tensorboard.sh + + Tensorboard logs are also available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. Once the deployment is ready the script will output a password-protected URL for your new Tensorboard deployment. + +Launch the URL to view your training progress \ No newline at end of file From e57cb421f816215885089133824628e155b95773 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 15:05:37 -0700 Subject: [PATCH 22/45] initial doc updates --- website/docs/gen-ai/training/Llama2.md | 32 ++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 0439a4653..e96acfa0d 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -1,5 +1,5 @@ --- -title: Llama2 on Trainium +title: Llama-2 on Trainium sidebar_position: 1 --- import CollapsibleContent from '../../../src/components/CollapsibleContent'; @@ -19,14 +19,15 @@ We are actively enhancing this blueprint to incorporate improvements in observab ::: -# Training Llama2 Model using Trainium, Neuronx-Nemo-Megatron and MPI operator -Welcome to the comprehensive guide on training the [Meta Llama-2-7b ](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using AWS Trainium, Neuronx-Nemo-megratron and MPI Operator. (https://github.com/kubeflow/mpi-operator). 
+# Training Llama-2 Model using Trainium, Neuronx-Nemo-Megatron and MPI operator +Welcome to the comprehensive guide on training the [Meta Llama-2-7b](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using AWS Trainium, Neuronx-Nemo-Megatron and the [MPI Operator](https://github.com/kubeflow/mpi-operator). -In this tutorial, you will learn how to harness the power of Llama-2, but also gain insights into the intricacies of deploying large language models (LLMs) efficiently, particularly on [trn1/inf2](https://aws.amazon.com/machine-learning/neuron/) (powered by AWS Trainium and Inferentia) instances, such as `inf2.24xlarge` and `inf2.48xlarge`, -which are optimized for deploying and scaling large language models. +In this tutorial you will learn how to run multi-node training jobs using [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/) accelerators in Amazon EKS. Specifically, you will pretrain Llama-2-7b on 4 AWS EC2 trn1.32xlarge instances using a [subset of the RedPajama dataset](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample). ### What is Llama-2? -Llama-2 is a pretrained large language model (LLM) trained on 2 trillion tokens of text and code. It is one of the largest and most powerful LLMs available today. Llama-2 can be used for a variety of tasks, including natural language processing, text generation, and translation. +Llama-2 is a large language model (LLM) trained on 2 trillion tokens of text and code. It is one of the largest and most powerful LLMs available today. Llama-2 can be used for a variety of tasks, including natural language processing, text generation, and translation. + +Although Llama-2 is available as a pretrained model, in this tutorial we will show how to pretrain the model from scratch. #### Llama-2-chat Llama-2 is a remarkable language model that has undergone a rigorous training process.
It starts with pretraining using publicly available online data. @@ -40,28 +41,25 @@ Llama-2 is available in three different model sizes: ### **Which Llama-2 model size should I use?** The best Llama-2 model size for you will depend on your specific needs. and it may not always be the largest model for achieving the highest performance. It's advisable to evaluate your needs and consider factors such as computational resources, response time, and cost-efficiency when selecting the appropriate Llama-2 model size. The decision should be based on a comprehensive assessment of your application's goals and constraints. - - **Performance Boost** While Llama-2 can achieve high-performance inference on GPUs, Neuron accelerators take performance to the next level. Neuron accelerators are purpose-built for machine learning workloads, providing hardware acceleration that significantly enhances Llama-2's inference speeds. This translates to faster response times and improved user experiences when deploying Llama-2 on Trn1/Inf2 instances. - - ## Solution Architecture In this section, we will delve into the architecture of our solution. -**MPI Worker Pod:** These are Kubernetes pods configured for running MPI (Message Passing Interface) tasks. MPI is a standard for distributed memory parallel computing. Each pod is equipped with: 8 EFAs (Elastic Fabric Adapter) on Trn1.32xl Instances. EFAs are network devices that support high-performance computing applications running on Amazon EC2 instances. -Trn1.32xl Instance: This is an EC2 instance type that is part of the EC2 Trn1 (Trainium) instance family, optimized for machine learning training workloads and has 16 EFAs +**Trn1.32xl Instance:** This is an EC2 accelerated instance type that is part of the EC2 Trn1 (Trainium) instance family, optimized for machine learning training workloads + +**MPI Worker Pods:** These are Kubernetes pods configured for running MPI (Message Passing Interface) tasks. 
MPI is a standard for distributed memory parallel computing. Each worker pod runs on a trn1.32xlarge instance which is equipped with 16 Trainium accelerators and 8 Elastic Fabric Adapters (EFAs). EFAs are network devices that support high-performance computing applications running on Amazon EC2 instances. -**MPI Launcher:** A component that is likely responsible for initiating and managing the MPI jobs within the cluster. +**MPI Launcher Pod:** This pod is responsible for coordinating the MPI job across the worker pods. When a training job is first submitted to the cluster, an MPI launcher pod is created which waits for the workers to come online, connects to each worker, and invokes the training script. **MPI Operator:** An operator in Kubernetes is a method of packaging, deploying, and managing a Kubernetes application. The MPI Operator automates the deployment and management of MPI workloads. -![Llama-2-inf2](img/llama2-trainium.png) +![Llama-2-trn1](img/llama2-trainium.png) ## Deploying the Solution -**Steps to train Llama2 using AWS Trainium on Amazon EKS** +**Steps to train Llama-2 using AWS Trainium on Amazon EKS** Note: This post makes use of Meta’s Llama tokenizer, which is protected by a user license that must be accepted before the tokenizer files can be downloaded. Please ensure that you have access to the Llama files by requesting access here. @@ -224,7 +222,7 @@ kubectl delete pod cli-cmd-shell We are finally ready to launch our pre-compilation and training jobs! -Before we can run the training job, we first need to run a pre-compilation job in order to prepare the model artifacts. This step extracts and compiles the underlying compute graphs for the llama2-7B model and generates Neuron executable files (NEFFs) that can run on the Trainium accelerators. These NEFFs are stored in a persistent Neuron cache on FSx so that the training job can later access them. 
+Before we can run the training job, we first need to run a pre-compilation job in order to prepare the model artifacts. This step extracts and compiles the underlying compute graphs for the Llama-2-7b model and generates Neuron executable files (NEFFs) that can run on the Trainium accelerators. These NEFFs are stored in a persistent Neuron cache on FSx so that the training job can later access them. Before you run the compilation job make sure MPI operator is functional by running this command: @@ -268,4 +266,4 @@ Create a Tensorboard deployment to visualize these logs by running the following Tensorboard logs are also available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. Once the deployment is ready the script will output a password-protected URL for your new Tensorboard deployment. -Launch the URL to view your training progress \ No newline at end of file +Launch the URL to view your training progress From 2f8508154f85cd6aab4630bedb09f73ac3979a3a Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 15:24:17 -0700 Subject: [PATCH 23/45] more llama doc updates --- website/docs/gen-ai/training/Llama2.md | 63 ++++++++++++++++++-------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index e96acfa0d..a1c73f892 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -55,6 +55,8 @@ In this section, we will delve into the architecture of our solution. **MPI Operator:** An operator in Kubernetes is a method of packaging, deploying, and managing a Kubernetes application. The MPI Operator automates the deployment and management of MPI workloads. +**FSx for Lustre:** A shared, high-performance filesystem which is well suited for workloads such as machine learning, high performance computing (HPC), video processing, and financial modeling. 
The FSx for Lustre filesystem will be shared across worker pods in the training job, providing a central repository to access the training data and to store model artifacts and logs. + ![Llama-2-trn1](img/llama2-trainium.png) ## Deploying the Solution @@ -65,17 +67,17 @@ Note: This post makes use of Meta’s Llama tokenizer, which is protected by a u Prerequisites}> Before we begin, ensure you have all the prerequisites in place to make the deployment process smooth and hassle-free. -Ensure that you have installed the following tools on your macEC2 or Cloud9 instance. +Ensure that you have installed the following tools on your EC2 or Cloud9 instance. * [EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) or [Cloud9 instance](https://docs.aws.amazon.com/cloud9/latest/user-guide/tutorial-create-environment.html) → for both, please ensure you have 100GB+ of storage -* [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) +* [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) * [kubectl](https://Kubernetes.io/docs/tasks/tools/) * Git(Only for EC2 instance); Cloud9 comes with git installed by default * Docker * [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) * Python, pip, jq, unzip -To install all the pre-reqs on EC2, you can run this [script](https://github.com/sanjeevrg89/data-on-eks/blob/main/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh) +To install all the pre-reqs on EC2, you can run this [script](https://github.com/sanjeevrg89/data-on-eks/blob/main/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh) which is compatible with Amazon Linux 2023. **Step1:** Clone the Data on EKS repository @@ -84,9 +86,9 @@ To install all the pre-reqs on EC2, you can run this [script](https://github.com git clone https://github.com/awslabs/data-on-eks.git ``` -Navigate to trainium-inferentia directory. 
+Navigate to the trainium-inferentia directory. -``` bash +```bash cd data-on-eks/ai-ml/trainium-inferentia ``` @@ -119,29 +121,37 @@ kubectl get nodes # Output shows the EKS Managed Node group nodes ## Distributed training -Once the EKS Cluster is deployed, you can proceed with the next steps of building neuronx-nemo-megatron container image and push the image to ECR. +Once the EKS Cluster is deployed, you can proceed with the next steps of building neuronx-nemo-megatron container image and pushing the image to ECR. Navigate to examples/llama2 directory +```bash cd examples/llama2/ +``` -Run the 1-llama2-neuronx-pretrain-build-image.sh script to build the neuronx-nemo-megatron container image and push the image into ECR. +Run the `1-llama2-neuronx-pretrain-build-image.sh` script to build the neuronx-nemo-megatron container image and push the image into ECR. When prompted for a region, enter the region in which you launched your EKS cluster, above. +```bash ./1-llama2-neuronx-pretrain-build-image.sh +``` + +Note: The image building and pushing to ECR will take ~10 minutes -Note: The image building and pushing to ECR will approximately take ~10 minutes Step 5: In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. Run the following script to launch the CLI pod: +```bash ./2-launch-cmd-shell-pod.sh +``` Next, periodically run the following command until you see the CLI pod go into ‘Running’ state: +```bash kubectl get pod - +``` Once the CLI pod is ‘Running’, connect to it using the following command: @@ -149,15 +159,14 @@ Once the CLI pod is ‘Running’, connect to it using the following command: kubectl exec -it cli-cmd-shell -- /bin/bash ``` -From the CLI pod, we’ll download the Llama tokenizer files: First, run the huggingface-cli login command to login to Hugging Face using your access token. 
The access token is found under Settings → Access Tokens on the Hugging Face website. +From within the CLI pod, we’ll download the Llama tokenizer files. These files are protected by Meta's Llama license, so you will need to run the `huggingface-cli login` command to login to Hugging Face using your access token. The access token is found under Settings → Access Tokens on the Hugging Face website. - -``` +```bash huggingface-cli login ``` Paste the access token and hit enter. -Download the llama7-7b tokenizer files to /shared/llama7b_tokenizer by running the python code +Next, you download the llama7-7b tokenizer files to /shared/llama7b_tokenizer by running the python code ```bash python3 </0/log +``` - When you are done viewing the logs, you can press CTRL-C to quit the tail command. +When you are done viewing the logs, you can press CTRL-C to quit the tail command. To monitor Trainium accelerator utilization you can use the neuron-top command. Neuron-top is a console-based tool for monitoring Neuron and system-related performance metrics on trn1/inf2/inf1 instances. You can launch neuron-top on one of the worker pods as follows: +```bash kubectl exec -it test-mpi-train-worker-0 -- /bin/bash -l neuron-top +``` Create a Tensorboard deployment to visualize these logs by running the following command: +```bash ./5-deploy-tensorboard.sh +``` Tensorboard logs are also available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. Once the deployment is ready the script will output a password-protected URL for your new Tensorboard deployment. -Launch the URL to view your training progress +Launch the URL to view your training progress. 
From 9b76684288c434a7db5e3cae85509834f982a922 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 15:47:17 -0700 Subject: [PATCH 24/45] more updates --- website/docs/gen-ai/training/Llama2.md | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index a1c73f892..c8419cc9e 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -19,7 +19,7 @@ We are actively enhancing this blueprint to incorporate improvements in observab ::: -# Training Llama-2 Model using Trainium, Neuronx-Nemo-Megatron and MPI operator +# Training a Llama-2 Model using Trainium, Neuronx-Nemo-Megatron and MPI operator Welcome to the comprehensive guide on training the [Meta Llama-2-7b ](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using AWS Trainium, Neuronx-Nemo-Megatron and the MPI Operator. (https://github.com/kubeflow/mpi-operator). In this tutorial you will learn how to run multi-node training jobs using [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/) accelerators in Amazon EKS. Specifically, you will pretrain Llama-2-7b on 4 AWS EC2 trn1.32xlarge instances using a [subset of the RedPajama dataset](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample). @@ -210,20 +210,32 @@ python3 neuronx-nemo-megatron/nemo/scripts/nlp_language_modeling/preprocess_data Note: When we later launch our training jobs in EKS, the training pods will run the training script from within neuronx-nemo-megatron/nemo/examples directory on FSx. This is convenient, because it will let you modify your training script directly on FSx without requiring that you rebuild the neuronx-nemo-megatron container for every change. 
-Modify the test_llama script /shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem. +Modify the test_llama.sh script `/shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh` to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem. You can use any common text editor such as nano or vim to make these changes. Run: +```bash nano /shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh +``` + +Before changes: +``` +: ${TOKENIZER_PATH=$HOME/llamav2_weights/7b-hf} +: ${DATASET_PATH=$HOME/examples_datasets/llama_7b/book.jsonl-processed_text_document} +``` + +After changes: +``` : ${TOKENIZER_PATH=/shared/llama7b_tokenizer} : ${DATASET_PATH=/shared/data/redpajama_sample_text_document} +``` + +You can save your changes in nano by pressing `CTRL-X`, then 'y', then 'ENTER'. -Before: +When you are finished, type `exit` or press `CTRL-d` to exit the CLI pod. 
-After: -Type exit or enter ctrl+x Step 11: When you are finished with the CLI pod you can remove it by running: From 1e5e8da99ca7f9bc2e29f2ebd4d57e8c8bdbd308 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:02:43 -0700 Subject: [PATCH 25/45] more updates --- website/docs/gen-ai/training/Llama2.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index c8419cc9e..d52f08ce8 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -80,7 +80,7 @@ Ensure that you have installed the following tools on your EC2 or Cloud9 instanc To install all the pre-reqs on EC2, you can run this [script](https://github.com/sanjeevrg89/data-on-eks/blob/main/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh) which is compatible with Amazon Linux 2023. -**Step1:** Clone the Data on EKS repository +Clone the Data on EKS repository ```bash git clone https://github.com/awslabs/data-on-eks.git @@ -139,7 +139,7 @@ When prompted for a region, enter the region in which you launched your EKS clus Note: The image building and pushing to ECR will take ~10 minutes -Step 5: In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. +In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. Run the following script to launch the CLI pod: @@ -232,14 +232,16 @@ After changes: : ${DATASET_PATH=/shared/data/redpajama_sample_text_document} ``` -You can save your changes in nano by pressing `CTRL-X`, then 'y', then 'ENTER'. 
+You can save your changes in nano by pressing `CTRL-X`, then `y`, then `ENTER`. When you are finished, type `exit` or press `CTRL-d` to exit the CLI pod. -Step 11: When you are finished with the CLI pod you can remove it by running: +When you are finished with the CLI pod you can remove it by running: +```bash kubectl delete pod cli-cmd-shell +``` We are finally ready to launch our pre-compilation and training jobs! @@ -299,6 +301,6 @@ Create a Tensorboard deployment to visualize these logs by running the following ./5-deploy-tensorboard.sh ``` - Tensorboard logs are also available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. Once the deployment is ready the script will output a password-protected URL for your new Tensorboard deployment. +Tensorboard logs are also available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. Once the deployment is ready the script will output a password-protected URL for your new Tensorboard deployment. Launch the URL to view your training progress. From 7b5ac6708dd37a4b92c057d09cc6eee09aa6f6ca Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:21:44 -0700 Subject: [PATCH 26/45] add subheadings to docs --- website/docs/gen-ai/training/Llama2.md | 42 +++++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index d52f08ce8..8e379d119 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -123,6 +123,8 @@ kubectl get nodes # Output shows the EKS Managed Node group nodes ## Distributed training Once the EKS Cluster is deployed, you can proceed with the next steps of building neuronx-nemo-megatron container image and pushing the image to ECR. 
+### Build the neuronx-nemo-megatron container image + Navigate to examples/llama2 directory ```bash @@ -139,6 +141,8 @@ When prompted for a region, enter the region in which you launched your EKS clus Note: The image building and pushing to ECR will take ~10 minutes +### Download the Llama tokenizer and RedPajama dataset + In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. Run the following script to launch the CLI pod: @@ -164,9 +168,9 @@ From within the CLI pod, we’ll download the Llama tokenizer files. These files ```bash huggingface-cli login ``` -Paste the access token and hit enter. +When prompted for your token, paste-in the access token and hit `ENTER`. -Next, you download the llama7-7b tokenizer files to /shared/llama7b_tokenizer by running the python code +Next, you download the llama7-7b tokenizer files to /shared/llama7b_tokenizer by running the following python code: ```bash python3 </0/log +kubectl exec -it test-mpi-train-worker-0 -- tail -f /shared/nemo_experiments/UID/0/log ``` -When you are done viewing the logs, you can press CTRL-C to quit the tail command. +When you are done viewing the logs, you can press `CTRL-C` to quit the tail command. + +### Monitor Trainium accelerator utilization To monitor Trainium accelerator utilization you can use the neuron-top command. Neuron-top is a console-based tool for monitoring Neuron and system-related performance metrics on trn1/inf2/inf1 instances. You can launch neuron-top on one of the worker pods as follows: @@ -295,6 +313,8 @@ To monitor Trainium accelerator utilization you can use the neuron-top command. 
kubectl exec -it test-mpi-train-worker-0 -- /bin/bash -l neuron-top ``` +### View training job metrics in Tensorboard + Create a Tensorboard deployment to visualize these logs by running the following command: ```bash From 7e3d377fce059b6191ae3ab92fbd8c4a14cc0044 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:38:27 -0700 Subject: [PATCH 27/45] update tensorboard blurb --- website/docs/gen-ai/training/Llama2.md | 28 +++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 8e379d119..868cb4916 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -141,7 +141,7 @@ When prompted for a region, enter the region in which you launched your EKS clus Note: The image building and pushing to ECR will take ~10 minutes -### Download the Llama tokenizer and RedPajama dataset +### Launch and connect to a CLI pod In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. @@ -163,6 +163,8 @@ Once the CLI pod is ‘Running’, connect to it using the following command: kubectl exec -it cli-cmd-shell -- /bin/bash ``` +### Download the Llama tokenizer and Redpajama dataset to FSx + From within the CLI pod, we’ll download the Llama tokenizer files. These files are protected by Meta's Llama license, so you will need to run the `huggingface-cli login` command to login to Hugging Face using your access token. The access token is found under Settings → Access Tokens on the Hugging Face website. ```bash @@ -180,7 +182,7 @@ tok.save_pretrained("/shared/llama7b_tokenizer") EOF ``` -Download and tokenize the RedPajama-Data-1T-Sample dataset (a small subset of the full RedPajama dataset that contains 1B tokens). 
+Next, download the RedPajama-Data-1T-Sample dataset (a small subset of the full RedPajama dataset that contains 1B tokens). While still connected to the CLI pod, use git to download the dataset @@ -313,14 +315,30 @@ To monitor Trainium accelerator utilization you can use the neuron-top command. kubectl exec -it test-mpi-train-worker-0 -- /bin/bash -l neuron-top ``` -### View training job metrics in Tensorboard +### View training job metrics in TensorBoard + +[TensorBoard](https://www.tensorflow.org/tensorboard) is a web-based visualization tool that is commonly used to monitor and explore training jobs. It allows you to quickly monitor training metrics, and you can also easily compare metrics across different training runs. + +TensorBoard logs are available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. -Create a Tensorboard deployment to visualize these logs by running the following command: +Run the following script to create a TensorBoard deployment so you can visualize your Llama-2 training job progress: ```bash ./5-deploy-tensorboard.sh ``` -Tensorboard logs are also available in the /shared/nemo_experiments/ directory on the FSx for Lustre filesystem. Once the deployment is ready the script will output a password-protected URL for your new Tensorboard deployment. +Once the deployment is ready the script will output a password-protected URL for your new TensorBoard deployment. Launch the URL to view your training progress. + +When you have opened the TensorBoard interface, choose your training job UID from the left-hand menu, and then explore the various training metrics (ex: reduced-train-loss, throughput, and grad-norm) from the main application window. + +### Stopping the training job + +To stop your training job and remove the launcher/worker pods, run the following command: + +```bash +kubectl delete mpijob test-mpi-train +``` + +You can then run `kubectl get pods` to confirm that the launcher/worker pods have been removed.
From cf691d558da38d07241e4c6b9d571d6d284ca6e6 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:45:56 -0700 Subject: [PATCH 28/45] minor tweak --- website/docs/gen-ai/training/Llama2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 868cb4916..fd48ee296 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -20,7 +20,7 @@ We are actively enhancing this blueprint to incorporate improvements in observab # Training a Llama-2 Model using Trainium, Neuronx-Nemo-Megatron and MPI operator -Welcome to the comprehensive guide on training the [Meta Llama-2-7b ](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using AWS Trainium, Neuronx-Nemo-Megatron and the MPI Operator. (https://github.com/kubeflow/mpi-operator). +Welcome to the comprehensive guide on training the [Meta Llama-2-7b ](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using AWS Trainium, Neuronx-Nemo-Megatron, and the MPI Operator. In this tutorial you will learn how to run multi-node training jobs using [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/) accelerators in Amazon EKS. Specifically, you will pretrain Llama-2-7b on 4 AWS EC2 trn1.32xlarge instances using a [subset of the RedPajama dataset](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample). 
From a5f9d5b8d0bc2ffaecb3d49ac8e7596cd530ced9 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Tue, 19 Dec 2023 10:14:57 -0500 Subject: [PATCH 29/45] missing img folder --- .../{inference => training}/img/llama2-trainium.png | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename website/docs/gen-ai/{inference => training}/img/llama2-trainium.png (100%) diff --git a/website/docs/gen-ai/inference/img/llama2-trainium.png b/website/docs/gen-ai/training/img/llama2-trainium.png similarity index 100% rename from website/docs/gen-ai/inference/img/llama2-trainium.png rename to website/docs/gen-ai/training/img/llama2-trainium.png From ae304782b3ac78a17aea2dda0872d35878e38a21 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Tue, 2 Jan 2024 15:14:08 -0500 Subject: [PATCH 30/45] PR review requested changes --- ai-ml/trainium-inferentia/eks.tf | 24 ++++++++++++------------ ai-ml/trainium-inferentia/outputs.tf | 5 ----- ai-ml/trainium-inferentia/variables.tf | 22 ++++++++++++++++++++-- website/docs/gen-ai/inference/Llama2.md | 8 +++++--- website/docs/gen-ai/training/Llama2.md | 6 ++++-- 5 files changed, 41 insertions(+), 24 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 1815f068f..f63a6c7fb 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -176,9 +176,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = 4 - max_size = 4 - desired_size = 4 + min_size = var.min_size + max_size = var.max_size + desired_size = var.desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -322,9 +322,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" 
EOT - min_size = 0 - max_size = 1 - desired_size = 0 + min_size = var.min_size + max_size = var.max_size + desired_size = var.desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -507,9 +507,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = 0 - max_size = 2 - desired_size = 0 + min_size = var.min_size + max_size = var.max_size + desired_size = var.desired_size labels = { instance-type = "inf2" @@ -555,9 +555,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = 0 - max_size = 2 - desired_size = 0 + min_size = var.min_size + max_size = var.max_size + desired_size = var.desired_size labels = { instance-type = "inf2-48xl" diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf index c66da693f..40adfb3b3 100755 --- a/ai-ml/trainium-inferentia/outputs.tf +++ b/ai-ml/trainium-inferentia/outputs.tf @@ -1,8 +1,3 @@ -/* output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}" -} */ - output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}" diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 749d3c6db..b1a9c3002 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "tr-inf" + default = "trn1-inf2" type = string } @@ -13,7 +13,7 @@ variable "region" { variable "eks_cluster_version" { description = "EKS Cluster version" - default = "1.27" + default = "1.28" type = string } @@ -49,3 
+49,21 @@ variable "enable_mpi_operator" { type = bool default = false } + +variable "min_size" { + description = "Worker node minimum size" + type = number + default = 0 +} + +variable "max_size" { + description = "Worker node max size" + type = number + default = 0 +} + +variable "desired_size" { + description = "Worker node desired size" + type = number + default = 0 +} \ No newline at end of file diff --git a/website/docs/gen-ai/inference/Llama2.md b/website/docs/gen-ai/inference/Llama2.md index 7fc77accb..500222103 100644 --- a/website/docs/gen-ai/inference/Llama2.md +++ b/website/docs/gen-ai/inference/Llama2.md @@ -114,13 +114,15 @@ cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x install.sh Verify the Amazon EKS Cluster +**NOTE:** Replace [cluster-name] with your actual EKS cluster name + ```bash -aws eks --region us-west-2 describe-cluster --name trainium-inferentia +aws eks --region us-west-2 describe-cluster --name [cluster-name] ``` ```bash # Creates k8s config file to authenticate with EKS -aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia +aws eks --region us-west-2 update-kubeconfig --name [cluster-name] kubectl get nodes # Output shows the EKS Managed Node group nodes ``` @@ -148,7 +150,7 @@ Users can also modify the Dockerfile to suit their specific requirements and pus **Ensure the cluster is configured locally** ```bash -aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia +aws eks --region us-west-2 update-kubeconfig --name [cluster-name] ``` **Deploy RayServe Cluster** diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index fd48ee296..99d68f9ed 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -107,13 +107,15 @@ Run the install script to provision an EKS cluster with all the add-ons needed f Verify the Amazon EKS Cluster +**NOTE:** Replace [cluster-name] with your actual EKS cluster name + ```bash -aws eks 
--region us-west-2 describe-cluster --name +aws eks --region us-west-2 describe-cluster --name [cluster-name] ``` ```bash # Creates k8s config file to authenticate with EKS -aws eks --region us-west-2 update-kubeconfig --name +aws eks --region us-west-2 update-kubeconfig --name [cluster-name] kubectl get nodes # Output shows the EKS Managed Node group nodes ``` From 3c2b71f6d39acd80d16419c38ad7eaa2774a2a58 Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Thu, 4 Jan 2024 08:33:39 -0700 Subject: [PATCH 31/45] Automatically select appropriate trn1/inf2-supporting AZs based on user's chosen region --- ai-ml/trainium-inferentia/get_eks_azs.sh | 16 ++++++++++++---- ai-ml/trainium-inferentia/main.tf | 2 +- ai-ml/trainium-inferentia/variables.tf | 3 ++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh index d45648e46..08dc832b1 100644 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ b/ai-ml/trainium-inferentia/get_eks_azs.sh @@ -1,12 +1,20 @@ #!/bin/bash -# Hardcoded AWS region -REGION_CODE="us-west-2" +# Desired AWS region should be passed in as arg1, ex 'us-west-2' +REGION_CODE=$1 -# Determine appropriate EKS AZs based on the hardcoded region +# Determine appropriate EKS AZs based on the AWS region. (EKS requires that we specify 2 AZs) +# The AZs specified here currently support both trn1 and inf2, but inf2 is also supported +# in additional AZs. AZ1 should be preferred when launching nodes. 
if [[ $REGION_CODE == "us-west-2" ]]; then AZ1="usw2-az4" - AZ2="usw2-az3" + AZ2="usw2-az1" +elif [[ $REGION_CODE == "us-east-1" ]]; then + AZ1="use1-az6" + AZ2="use1-az5" +elif [[ $REGION_CODE == "us-east-2" ]]; then + AZ1="use2-az3" + AZ2="use2-az1" else echo "{\"error\": \"Unsupported region: $REGION_CODE\"}" exit 1 diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index d55ebb65f..51de5996d 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -59,7 +59,7 @@ data "aws_ecrpublic_authorization_token" "token" { } */ data "external" "eks_azs" { - program = ["bash", "${path.module}/get_eks_azs.sh"] + program = ["bash", "${path.module}/get_eks_azs.sh", var.region] } locals { diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index b1a9c3002..22c8d1b23 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -4,7 +4,8 @@ variable "name" { type = string } -# NOTE: Trainium and Inferentia are only available in us-west-2 and us-east-1 regions +# NOTE: As of 2024/01/04 Trainium instances are only available in us-west-2, us-east-1, and us-east-2 regions +# Inferentia instances are available in the above regions + several others variable "region" { description = "region" default = "us-west-2" From 700d5e646233fc8dc86e705c06e18412a0cac091 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 4 Jan 2024 11:55:26 -0500 Subject: [PATCH 32/45] added variables for trn1 and inf2 instance sizes --- ai-ml/trainium-inferentia/eks.tf | 24 +++++----- ai-ml/trainium-inferentia/main.tf | 12 ----- ai-ml/trainium-inferentia/variables.tf | 63 ++++++++++++++++++++++++-- 3 files changed, 71 insertions(+), 28 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index f63a6c7fb..b43b0ad66 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -176,9 +176,9 @@ module
"eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = var.min_size - max_size = var.max_size - desired_size = var.desired_size + min_size = var.trn1_32xl_min_size + max_size = var.trn1_32xl_max_size + desired_size = var.trn1_32xl_desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -322,9 +322,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = var.min_size - max_size = var.max_size - desired_size = var.desired_size + min_size = var.trn1n_32xl_min_size + max_size = var.trn1n_32xl_max_size + desired_size = var.trn1n_32xl_desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -507,9 +507,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = var.min_size - max_size = var.max_size - desired_size = var.desired_size + min_size = var.inf2-24xl_min_size + max_size = var.inf2-24xl_max_size + desired_size = var.inf2-24xl_desired_size labels = { instance-type = "inf2" @@ -555,9 +555,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = var.min_size - max_size = var.max_size - desired_size = var.desired_size + min_size = var.inf2-48xl_min_size + max_size = var.inf2-48xl_max_size + desired_size = var.inf2-48xl_desired_size labels = { instance-type = "inf2-48xl" diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 51de5996d..c4d93604a 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -46,18 +46,6 @@ data "aws_ecrpublic_authorization_token" "token" { provider = aws.ecr } -/* locals { - name = "${var.name}-${random_string.this.result}" - region = var.region - # Training and Inference instances are available in the following AZs us-east-1 and us-west-2 - # You can find the list of supported AZs here: https://aws.amazon.com/ec2/instance-types/trn1/ - azs = ["${local.region}c", "${local.region}d"] - tags = { - Blueprint = local.name - GithubRepo = 
"github.com/awslabs/data-on-eks" - } -} */ - data "external" "eks_azs" { program = ["bash", "${path.module}/get_eks_azs.sh", var.region] } diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 22c8d1b23..0b02d86ee 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -51,20 +51,75 @@ variable "enable_mpi_operator" { default = false } -variable "min_size" { +variable "trn1_32xl_min_size" { description = "Worker node minimum size" type = number default = 0 } -variable "max_size" { +variable "trn1_32xl_max_size" { description = "Worker node max size" type = number default = 0 } -variable "desired_size" { +variable "trn1_32xl_desired_size" { description = "Worker node desired size" type = number default = 0 -} \ No newline at end of file +} + +variable "trn1n_32xl_min_size" { + description = "Worker node minimum size" + type = number + default = 0 +} + +variable "trn1n_32xl_max_size" { + description = "Worker node max size" + type = number + default = 0 +} + +variable "trn1n_32xl_desired_size" { + description = "Worker node desired size" + type = number + default = 0 +} + +variable "inf2-24xl_min_size" { + description = "Worker node minimum size" + type = number + default = 0 +} + +variable "inf2-24xl_max_size" { + description = "Worker node max size" + type = number + default = 2 +} + +variable "inf2-24xl_desired_size" { + description = "Worker node desired size" + type = number + default = 0 +} + +variable "inf2-48xl_min_size" { + description = "Worker node minimum size" + type = number + default = 0 +} + +variable "inf2-48xl_max_size" { + description = "Worker node max size" + type = number + default = 2 +} + +variable "inf2-48xl_desired_size" { + description = "Worker node desired size" + type = number + default = 0 +} + From 4ede8ebb44c05fd8ae196d2052af9e85dca24216 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 4 Jan 2024 13:30:10 -0500 Subject: [PATCH 33/45] 
redo instance size variables for inf2 and trn1n --- ai-ml/trainium-inferentia/eks.tf | 24 +++++------ ai-ml/trainium-inferentia/variables.tf | 60 ++------------------------ 2 files changed, 15 insertions(+), 69 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index b43b0ad66..e01099470 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -176,9 +176,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = var.trn1_32xl_min_size - max_size = var.trn1_32xl_max_size - desired_size = var.trn1_32xl_desired_size + min_size = var.min_size + max_size = var.max_size + desired_size = var.desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -322,9 +322,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = var.trn1n_32xl_min_size - max_size = var.trn1n_32xl_max_size - desired_size = var.trn1n_32xl_desired_size + min_size = 0 + max_size = 1 + desired_size = 0 # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -507,9 +507,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = var.inf2-24xl_min_size - max_size = var.inf2-24xl_max_size - desired_size = var.inf2-24xl_desired_size + min_size = 0 + max_size = 2 + desired_size = 0 labels = { instance-type = "inf2" @@ -555,9 +555,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = var.inf2-48xl_min_size - max_size = var.inf2-48xl_max_size - desired_size = var.inf2-48xl_desired_size + min_size = 0 + max_size = 2 + desired_size = 0 labels = { instance-type = "inf2-48xl" diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 0b02d86ee..4e4e16f96 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -51,73 +51,19 @@ variable "enable_mpi_operator" { default = false } -variable "trn1_32xl_min_size" { +variable 
"min_size" { description = "Worker node minimum size" type = number default = 0 } -variable "trn1_32xl_max_size" { +variable "max_size" { description = "Worker node max size" type = number default = 0 } -variable "trn1_32xl_desired_size" { - description = "Worker node desired size" - type = number - default = 0 -} - -variable "trn1n_32xl_min_size" { - description = "Worker node minimum size" - type = number - default = 0 -} - -variable "trn1n_32xl_max_size" { - description = "Worker node max size" - type = number - default = 0 -} - -variable "trn1n_32xl_desired_size" { - description = "Worker node desired size" - type = number - default = 0 -} - -variable "inf2-24xl_min_size" { - description = "Worker node minimum size" - type = number - default = 0 -} - -variable "inf2-24xl_max_size" { - description = "Worker node max size" - type = number - default = 2 -} - -variable "inf2-24xl_desired_size" { - description = "Worker node desired size" - type = number - default = 0 -} - -variable "inf2-48xl_min_size" { - description = "Worker node minimum size" - type = number - default = 0 -} - -variable "inf2-48xl_max_size" { - description = "Worker node max size" - type = number - default = 2 -} - -variable "inf2-48xl_desired_size" { +variable "desired_size" { description = "Worker node desired size" type = number default = 0 From ecbe68a99adf8f48d98143f1dc02a948fca2dc7e Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 4 Jan 2024 15:10:00 -0500 Subject: [PATCH 34/45] instance size variables fix --- ai-ml/trainium-inferentia/eks.tf | 24 +++++------ ai-ml/trainium-inferentia/variables.tf | 60 ++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index e01099470..b43b0ad66 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -176,9 +176,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" 
EOT - min_size = var.min_size - max_size = var.max_size - desired_size = var.desired_size + min_size = var.trn1_32xl_min_size + max_size = var.trn1_32xl_max_size + desired_size = var.trn1_32xl_desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -322,9 +322,9 @@ module "eks" { echo "Bootstrap complete. Ready to Go!" EOT - min_size = 0 - max_size = 1 - desired_size = 0 + min_size = var.trn1n_32xl_min_size + max_size = var.trn1n_32xl_max_size + desired_size = var.trn1n_32xl_desired_size # EFA Network Interfaces configuration for Trn1.32xlarge network_interfaces = [ @@ -507,9 +507,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = 0 - max_size = 2 - desired_size = 0 + min_size = var.inf2-24xl_min_size + max_size = var.inf2-24xl_max_size + desired_size = var.inf2-24xl_desired_size labels = { instance-type = "inf2" @@ -555,9 +555,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = 0 - max_size = 2 - desired_size = 0 + min_size = var.inf2-48xl_min_size + max_size = var.inf2-48xl_max_size + desired_size = var.inf2-48xl_desired_size labels = { instance-type = "inf2-48xl" diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 4e4e16f96..28e3a1024 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -51,19 +51,73 @@ variable "enable_mpi_operator" { default = false } -variable "min_size" { +variable "trn1_32xl_min_size" { + description = "trn1 Worker node minimum size" + type = number + default = 0 +} + +variable "trn1_32xl_max_size" { + description = "trn1 Worker node max size" + type = number + default = 2 +} + +variable "trn1_32xl_desired_size" { + description = "trn1 Worker node desired size" + type = number + default = 0 +} + +variable "trn1n_32xl_min_size" { + description = "Worker node minimum size" + type = number + default = 0 +} + +variable "trn1n_32xl_max_size" { + description = "Worker 
node max size" + type = number + default = 1 +} + +variable "trn1n_32xl_desired_size" { + description = "Worker node desired size" + type = number + default = 0 +} + +variable "inf2-24xl_min_size" { description = "Worker node minimum size" type = number default = 0 } -variable "max_size" { +variable "inf2-24xl_max_size" { description = "Worker node max size" type = number + default = 2 +} + +variable "inf2-24xl_desired_size" { + description = "Worker node desired size" + type = number default = 0 } -variable "desired_size" { +variable "inf2-48xl_min_size" { + description = "Worker node minimum size" + type = number + default = 0 +} + +variable "inf2-48xl_max_size" { + description = "Worker node max size" + type = number + default = 2 +} + +variable "inf2-48xl_desired_size" { description = "Worker node desired size" type = number default = 0 From 51ef0beb6bb6296d0a66b44483315d263879d747 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 4 Jan 2024 15:24:28 -0500 Subject: [PATCH 35/45] fix trn1 default max size setting --- ai-ml/trainium-inferentia/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 28e3a1024..8cf617561 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -60,7 +60,7 @@ variable "trn1_32xl_min_size" { variable "trn1_32xl_max_size" { description = "trn1 Worker node max size" type = number - default = 2 + default = 4 } variable "trn1_32xl_desired_size" { From b71e27f2ff44ce4282d00ecbc40615d52d9217cf Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Thu, 4 Jan 2024 16:03:13 -0500 Subject: [PATCH 36/45] llama2 training doc update --- website/docs/gen-ai/training/Llama2.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 99d68f9ed..656fd6cd6 100644 --- 
a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -92,9 +92,17 @@ Navigate to the trainium-inferentia directory. cd data-on-eks/ai-ml/trainium-inferentia ``` -Modify the **“trn1-32xl-ng1”** node group size in eks.tf file. Go to line 179 and change the min_size to 4, max_size to 4 and desired_size to 4. +By default **MPI operator** is not installed and its set to false. We will run the below export commands to set environment variables. -In addition, also update **variables.tf** for MPI operator to be installed. By default its not installed and for this post its important to change the default value from **false** to **true** +**NOTE:** As of 2024/01/04 Trainium instances only available in us-west-2, us-east-1, and us-east-2 regions. + +```bash +export TF_VAR_enable_mpi_operator=true +export TF_VAR_region=us-west-2 +export TF_VAR_trn1_min_size=4 +export TF_VAR_trn1_desired_size=4 +export TF_VAR_trn1_max_size=4 +``` Run the install script to provision an EKS cluster with all the add-ons needed for the solution. 
@@ -102,7 +110,6 @@ Run the install script to provision an EKS cluster with all the add-ons needed f ./install.sh ``` - ### Verify the resources Verify the Amazon EKS Cluster From 3d0d6740c1d08a501057e54636e87fa098762815 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Mon, 15 Jan 2024 23:50:47 -0500 Subject: [PATCH 37/45] code changes to map AZs --- ai-ml/trainium-inferentia/eks.tf | 17 +++++++++++++++-- ai-ml/trainium-inferentia/main.tf | 18 +++++++++++++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index b43b0ad66..1c7162706 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -1,3 +1,15 @@ +# Compute subnet_ids based on the secondary private subnets and AZs +locals { + subnet_ids = compact([ + for i in range(length(module.vpc.secondary_private_subnets)) : + let + subnet_id = module.vpc.secondary_private_subnets[i], + cidr_block = module.vpc.secondary_private_subnets_cidr_blocks[i] + in + substr(cidr_block, 0, 4) == "100." ? subnet_id : null + ]) +} + #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- @@ -13,8 +25,9 @@ module "eks" { vpc_id = module.vpc.vpc_id # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) + #subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + #substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) + subnet_ids = local.subnet_ids manage_aws_auth_configmap = true aws_auth_roles = [ diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index c4d93604a..98cb7c90b 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -46,7 +46,7 @@ data "aws_ecrpublic_authorization_token" "token" { provider = aws.ecr } -data "external" "eks_azs" { +/* data "external" "eks_azs" { program = ["bash", "${path.module}/get_eks_azs.sh", var.region] } @@ -58,6 +58,22 @@ locals { Blueprint = local.name GithubRepo = "github.com/awslabs/data-on-eks" } +} */ + +locals { + az_mapping = { + "us-west-2" = ["usw2-az4", "usw2-az1"], + "us-east-1" = ["use1-az6", "use1-az5"], + "us-east-2" = ["use2-az3", "use2-az1"] + } + + name = "${var.name}-${random_string.this.result}" + region = var.region + azs = local.az_mapping[var.region] # Retrieves the AZs from the mapping based on the specified region + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } } From 6eb009935aebb0a9c0c2da60662838d27807a8ce Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Tue, 16 Jan 2024 08:58:09 -0500 Subject: [PATCH 38/45] AZ fetch code changes --- ai-ml/trainium-inferentia/eks.tf | 13 +++++++------ ai-ml/trainium-inferentia/outputs.tf | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 1c7162706..11daba67d 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -1,15 +1,16 @@ # Compute subnet_ids based on the secondary private subnets and AZs locals { subnet_ids = compact([ - for i in range(length(module.vpc.secondary_private_subnets)) : - let - subnet_id = module.vpc.secondary_private_subnets[i], - cidr_block = module.vpc.secondary_private_subnets_cidr_blocks[i] - in - substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null + for i in range(length(module.vpc.private_subnets)) : + substr(module.vpc.private_subnets_cidr_blocks[i], 0, 4) == "100." ? module.vpc.private_subnets[i] : null ]) } + + + + + #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf index 40adfb3b3..d787d8e67 100755 --- a/ai-ml/trainium-inferentia/outputs.tf +++ b/ai-ml/trainium-inferentia/outputs.tf @@ -2,3 +2,4 @@ output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}" } + From 49cf49a371d02ad94738239f6d82dd51aa834823 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Wed, 17 Jan 2024 00:21:57 -0500 Subject: [PATCH 39/45] reverted back to original AZ implementation --- ai-ml/trainium-inferentia/eks.tf | 34 ++++++++------------------ ai-ml/trainium-inferentia/main.tf | 30 ++++------------------- ai-ml/trainium-inferentia/outputs.tf | 3 +-- ai-ml/trainium-inferentia/variables.tf | 12 ++++----- 4 files changed, 22 insertions(+), 57 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 11daba67d..35700b69b 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -1,16 +1,3 @@ -# Compute subnet_ids based on the secondary private subnets and AZs -locals { - subnet_ids = compact([ - for i in range(length(module.vpc.private_subnets)) : - substr(module.vpc.private_subnets_cidr_blocks[i], 0, 4) == "100." ? 
module.vpc.private_subnets[i] : null - ]) -} - - - - - - #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- @@ -26,9 +13,9 @@ module "eks" { vpc_id = module.vpc.vpc_id # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created - #subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - #substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) - subnet_ids = local.subnet_ids + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) + manage_aws_auth_configmap = true aws_auth_roles = [ @@ -149,8 +136,7 @@ module "eks" { # The code filters the private subnets based on their CIDR blocks and selects the subnet ID if the CIDR block starts with "100." Otherwise, it assigns a null value. # The element(compact([...]), 0) expression ensures that only the first non-null value is included in the resulting list of subnet IDs. subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0) - ] + substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]), 0)] # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type @@ -521,9 +507,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = var.inf2-24xl_min_size - max_size = var.inf2-24xl_max_size - desired_size = var.inf2-24xl_desired_size + min_size = var.inf2_24xl_min_size + max_size = var.inf2_24xl_max_size + desired_size = var.inf2_24xl_desired_size labels = { instance-type = "inf2" @@ -569,9 +555,9 @@ module "eks" { export PATH=/opt/aws/neuron/bin:$PATH EOT - min_size = var.inf2-48xl_min_size - max_size = var.inf2-48xl_max_size - desired_size = var.inf2-48xl_desired_size + min_size = var.inf2_48xl_min_size + max_size = var.inf2_48xl_max_size + desired_size = var.inf2_48xl_desired_size labels = { instance-type = "inf2-48xl" diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 98cb7c90b..bb13dc384 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -35,7 +35,7 @@ resource "random_string" "this" { special = false upper = false lower = true - numeric = true + numeric = true } data "aws_eks_cluster_auth" "this" { @@ -46,34 +46,14 @@ data "aws_ecrpublic_authorization_token" "token" { provider = aws.ecr } -/* data "external" "eks_azs" { - program = ["bash", "${path.module}/get_eks_azs.sh", var.region] -} - locals { name = "${var.name}-${random_string.this.result}" region = var.region - azs = [data.external.eks_azs.result["EKSAZ1"], data.external.eks_azs.result["EKSAZ2"]] + # Training and Inference instances are available in the following AZs us-east-1 and us-west-2 + # You can find the list of supported AZs here: https://aws.amazon.com/ec2/instance-types/trn1/ + azs = ["${local.region}c", "${local.region}d"] tags = { Blueprint = local.name GithubRepo = "github.com/awslabs/data-on-eks" } -} */ - -locals { 
- az_mapping = { - "us-west-2" = ["usw2-az4", "usw2-az1"], - "us-east-1" = ["use1-az6", "use1-az5"], - "us-east-2" = ["use2-az3", "use2-az1"] - } - - name = "${var.name}-${random_string.this.result}" - region = var.region - azs = local.az_mapping[var.region] # Retrieves the AZs from the mapping based on the specified region - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} - - +} \ No newline at end of file diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf index d787d8e67..35354df9e 100755 --- a/ai-ml/trainium-inferentia/outputs.tf +++ b/ai-ml/trainium-inferentia/outputs.tf @@ -1,5 +1,4 @@ output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}" -} - +} \ No newline at end of file diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 8cf617561..8dec490ef 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -87,37 +87,37 @@ variable "trn1n_32xl_desired_size" { default = 0 } -variable "inf2-24xl_min_size" { +variable "inf2_24xl_min_size" { description = "Worker node minimum size" type = number default = 0 } -variable "inf2-24xl_max_size" { +variable "inf2_24xl_max_size" { description = "Worker node max size" type = number default = 2 } -variable "inf2-24xl_desired_size" { +variable "inf2_24xl_desired_size" { description = "Worker node desired size" type = number default = 0 } -variable "inf2-48xl_min_size" { +variable "inf2_48xl_min_size" { description = "Worker node minimum size" type = number default = 0 } -variable "inf2-48xl_max_size" { +variable "inf2_48xl_max_size" { description = "Worker node max size" type = number default = 2 } -variable "inf2-48xl_desired_size" { +variable 
"inf2_48xl_desired_size" { description = "Worker node desired size" type = number default = 0 From 0620075c14881570a6cc3df02c396061417ba04e Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Fri, 19 Jan 2024 11:30:46 -0500 Subject: [PATCH 40/45] addressed latest PR reviewed changes --- ai-ml/trainium-inferentia/eks.tf | 8 ++--- ai-ml/trainium-inferentia/get_eks_azs.sh | 43 ------------------------ ai-ml/trainium-inferentia/main.tf | 10 +----- ai-ml/trainium-inferentia/variables.tf | 26 +------------- website/docs/gen-ai/inference/Llama2.md | 8 ++--- website/docs/gen-ai/training/Llama2.md | 7 ++-- 6 files changed, 11 insertions(+), 91 deletions(-) delete mode 100644 ai-ml/trainium-inferentia/get_eks_azs.sh diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 35700b69b..12b59bbe9 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -177,7 +177,7 @@ module "eks" { EOT min_size = var.trn1_32xl_min_size - max_size = var.trn1_32xl_max_size + max_size = 4 desired_size = var.trn1_32xl_desired_size # EFA Network Interfaces configuration for Trn1.32xlarge @@ -323,7 +323,7 @@ module "eks" { EOT min_size = var.trn1n_32xl_min_size - max_size = var.trn1n_32xl_max_size + max_size = 2 desired_size = var.trn1n_32xl_desired_size # EFA Network Interfaces configuration for Trn1.32xlarge @@ -508,7 +508,7 @@ module "eks" { EOT min_size = var.inf2_24xl_min_size - max_size = var.inf2_24xl_max_size + max_size = 2 desired_size = var.inf2_24xl_desired_size labels = { @@ -556,7 +556,7 @@ module "eks" { EOT min_size = var.inf2_48xl_min_size - max_size = var.inf2_48xl_max_size + max_size = 2 desired_size = var.inf2_48xl_desired_size labels = { diff --git a/ai-ml/trainium-inferentia/get_eks_azs.sh b/ai-ml/trainium-inferentia/get_eks_azs.sh deleted file mode 100644 index 08dc832b1..000000000 --- a/ai-ml/trainium-inferentia/get_eks_azs.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Desired AWS region should be passed in 
as arg1, ex 'us-west-2' -REGION_CODE=$1 - -# Determine appropriate EKS AZs based on the AWS region. (EKS requires that we specify 2 AZs) -# The AZs specified here currently support both trn1 and inf2, but inf2 is also supported -# in additional AZs. AZ1 should be preferred when launching nodes. -if [[ $REGION_CODE == "us-west-2" ]]; then - AZ1="usw2-az4" - AZ2="usw2-az1" -elif [[ $REGION_CODE == "us-east-1" ]]; then - AZ1="use1-az6" - AZ2="use1-az5" -elif [[ $REGION_CODE == "us-east-2" ]]; then - AZ1="use2-az3" - AZ2="use2-az1" -else - echo "{\"error\": \"Unsupported region: $REGION_CODE\"}" - exit 1 -fi - -# Fetch and set the actual names of the availability zones -EKSAZ1=$(aws ec2 describe-availability-zones \ - --region $REGION_CODE \ - --filters "Name=zone-id,Values=$AZ1" \ - --query "AvailabilityZones[].ZoneName" \ - --output text) - -EKSAZ2=$(aws ec2 describe-availability-zones \ - --region $REGION_CODE \ - --filters "Name=zone-id,Values=$AZ2" \ - --query "AvailabilityZones[].ZoneName" \ - --output text) - -# Check if EKSAZ1 and EKSAZ2 are not empty and output as JSON -if [ -n "$EKSAZ1" ] && [ -n "$EKSAZ2" ]; then - echo "{\"EKSAZ1\": \"$EKSAZ1\", \"EKSAZ2\": \"$EKSAZ2\"}" -else - # Output errors as JSON - echo "{\"error\": \"Unable to determine EKS availability zones\"}" - exit 1 -fi diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index bb13dc384..8503ce60e 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -30,14 +30,6 @@ provider "kubectl" { load_config_file = false } -resource "random_string" "this" { - length = 5 - special = false - upper = false - lower = true - numeric = true -} - data "aws_eks_cluster_auth" "this" { name = module.eks.cluster_name } @@ -47,7 +39,7 @@ data "aws_ecrpublic_authorization_token" "token" { } locals { - name = "${var.name}-${random_string.this.result}" + name = var.name region = var.region # Training and Inference instances are available in the 
following AZs us-east-1 and us-west-2 # You can find the list of supported AZs here: https://aws.amazon.com/ec2/instance-types/trn1/ diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 8dec490ef..094ea931d 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "trn1-inf2" + default = "trainium-inferentia" type = string } @@ -57,12 +57,6 @@ variable "trn1_32xl_min_size" { default = 0 } -variable "trn1_32xl_max_size" { - description = "trn1 Worker node max size" - type = number - default = 4 -} - variable "trn1_32xl_desired_size" { description = "trn1 Worker node desired size" type = number @@ -75,12 +69,6 @@ variable "trn1n_32xl_min_size" { default = 0 } -variable "trn1n_32xl_max_size" { - description = "Worker node max size" - type = number - default = 1 -} - variable "trn1n_32xl_desired_size" { description = "Worker node desired size" type = number @@ -93,12 +81,6 @@ variable "inf2_24xl_min_size" { default = 0 } -variable "inf2_24xl_max_size" { - description = "Worker node max size" - type = number - default = 2 -} - variable "inf2_24xl_desired_size" { description = "Worker node desired size" type = number @@ -111,12 +93,6 @@ variable "inf2_48xl_min_size" { default = 0 } -variable "inf2_48xl_max_size" { - description = "Worker node max size" - type = number - default = 2 -} - variable "inf2_48xl_desired_size" { description = "Worker node desired size" type = number diff --git a/website/docs/gen-ai/inference/Llama2.md b/website/docs/gen-ai/inference/Llama2.md index 500222103..7fc77accb 100644 --- a/website/docs/gen-ai/inference/Llama2.md +++ b/website/docs/gen-ai/inference/Llama2.md @@ -114,15 +114,13 @@ cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x install.sh Verify the Amazon EKS Cluster -**NOTE:** Replace [cluster-name] with your actual EKS cluster name - ```bash -aws eks 
--region us-west-2 describe-cluster --name [cluster-name] +aws eks --region us-west-2 describe-cluster --name trainium-inferentia ``` ```bash # Creates k8s config file to authenticate with EKS -aws eks --region us-west-2 update-kubeconfig --name [cluster-name] +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia kubectl get nodes # Output shows the EKS Managed Node group nodes ``` @@ -150,7 +148,7 @@ Users can also modify the Dockerfile to suit their specific requirements and pus **Ensure the cluster is configured locally** ```bash -aws eks --region us-west-2 update-kubeconfig --name [cluster-name] +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia ``` **Deploy RayServe Cluster** diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 656fd6cd6..8a3f2cef8 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -101,7 +101,6 @@ export TF_VAR_enable_mpi_operator=true export TF_VAR_region=us-west-2 export TF_VAR_trn1_min_size=4 export TF_VAR_trn1_desired_size=4 -export TF_VAR_trn1_max_size=4 ``` Run the install script to provision an EKS cluster with all the add-ons needed for the solution. 
@@ -114,15 +113,13 @@ Run the install script to provision an EKS cluster with all the add-ons needed f Verify the Amazon EKS Cluster -**NOTE:** Replace [cluster-name] with your actual EKS cluster name - ```bash -aws eks --region us-west-2 describe-cluster --name [cluster-name] +aws eks --region us-west-2 describe-cluster --name trainium-inferentia ``` ```bash # Creates k8s config file to authenticate with EKS -aws eks --region us-west-2 update-kubeconfig --name [cluster-name] +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia kubectl get nodes # Output shows the EKS Managed Node group nodes ``` From 100aa25d49753a0ebbebe248007df29f9d79079f Mon Sep 17 00:00:00 2001 From: Scott Perry <48838323+5cp@users.noreply.github.com> Date: Tue, 30 Jan 2024 19:59:17 -0700 Subject: [PATCH 41/45] Fix trn1 nodegroups so they use the preferred subnet/AZ --- ai-ml/trainium-inferentia/eks.tf | 23 +++++++++++------------ ai-ml/trainium-inferentia/main.tf | 14 ++++++++++---- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 12b59bbe9..f0efb4b62 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -133,11 +133,11 @@ module "eks" { trn1-32xl-ng1 = { name = "trn1-32xl-ng1" description = "Tran1 32xlarge node group for hosting ML workloads" - # The code filters the private subnets based on their CIDR blocks and selects the subnet ID if the CIDR block starts with "100." Otherwise, it assigns a null value. - # The element(compact([...]), 0) expression ensures that only the first non-null value is included in the resulting list of subnet IDs. - subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]), 0)] - + # All trn1 instances should be launched into the same subnet in the preferred trn1 AZ + # The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf. + # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: + # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] + subnet_ids = [module.vpc.private_subnets[2]] # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type ami_type = "AL2_x86_64_GPU" # Contains Neuron driver @@ -278,15 +278,14 @@ module "eks" { trn1n-32xl-ng = { name = "trn1n-32xl-ng" description = "trn1n 32xlarge node group for hosting ML workloads" - # The code filters the private subnets based on their CIDR blocks and selects the subnet ID if the CIDR block starts with "100." Otherwise, it assigns a null value. - # The element(compact([...]), 0) expression ensures that only the first non-null value is included in the resulting list of subnet IDs. - subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0) - ] - + # All trn1 instances should be launched into the same subnet in the preferred trn1 AZ + # The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf. 
+ # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: + # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] + subnet_ids = [module.vpc.private_subnets[2]] # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type - ami_type = "AL2_x86_64_GPU" + ami_type = "AL2_x86_64_GPU" # Contains Neuron driver instance_types = ["trn1n.32xlarge"] pre_bootstrap_user_data = <<-EOT diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 8503ce60e..46d413577 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -41,11 +41,17 @@ data "aws_ecrpublic_authorization_token" "token" { locals { name = var.name region = var.region - # Training and Inference instances are available in the following AZs us-east-1 and us-west-2 - # You can find the list of supported AZs here: https://aws.amazon.com/ec2/instance-types/trn1/ - azs = ["${local.region}c", "${local.region}d"] + # Trn1 and Inf2 instances are available in specific AZs in us-east-1, + # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should + # be used. 
+ az_mapping = { + "us-west-2" = ["usw2-az4", "usw2-az1"], + "us-east-1" = ["use1-az6", "use1-az5"], + "us-east-2" = ["use2-az3", "use2-az1"] + } + azs = local.az_mapping[var.region] tags = { Blueprint = local.name GithubRepo = "github.com/awslabs/data-on-eks" } -} \ No newline at end of file +} From c179262b2d7ef984cbc021f042c8c89fa6239eb4 Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Wed, 31 Jan 2024 08:54:45 -0500 Subject: [PATCH 42/45] az changes for trn1 --- .../llama2/install-pre-requsites-for-ec2.sh | 50 +++++++++++++++++-- ai-ml/trainium-inferentia/main.tf | 3 +- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh index f2cf7d19c..cbee17959 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh @@ -9,7 +9,7 @@ install_docker() { newgrp docker } -# Install a package if it is not already installed +# Function to install a package using yum install_package() { PACKAGE=$1 echo "Checking for $PACKAGE..." @@ -21,6 +21,46 @@ install_package() { fi } +# Function to install kubectl +install_kubectl() { + echo "Installing kubectl..." + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl +} + +# Function to install Terraform +install_terraform() { + echo "Installing Terraform..." + sudo yum install -y yum-utils + sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo + sudo yum install -y terraform +} + +# Function to install AWS CLI v2 +install_aws_cli() { + echo "Installing AWS CLI v2..." 
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + echo "AWS CLI v2 installed successfully." +} + +# Function to install Helm +install_helm() { + echo "Installing Helm..." + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 + chmod 700 get_helm.sh + ./get_helm.sh + echo "Helm installed successfully." +} + +# Function to install Boto3 +install_boto3() { + echo "Installing Boto3..." + pip3 install boto3 + echo "Boto3 installed successfully." +} + echo "Starting installation of prerequisites..." # Install Docker @@ -33,7 +73,11 @@ install_package unzip install_package python3-pip install_package jq -# Additional installations (kubectl, AWS CLI v2, Terraform, Helm, Boto3)... -# (Include the existing logic for these installations here, with similar echo statements for tracking) +# Install kubectl, Terraform, AWS CLI v2, Helm, and Boto3 +install_kubectl +install_terraform +install_aws_cli +install_helm +install_boto3 echo "Installation of prerequisites complete." diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 46d413577..b4526f19d 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -42,8 +42,7 @@ locals { name = var.name region = var.region # Trn1 and Inf2 instances are available in specific AZs in us-east-1, - # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should - # be used. + # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should be used. 
az_mapping = { "us-west-2" = ["usw2-az4", "usw2-az1"], "us-east-1" = ["use1-az6", "use1-az5"], From 19dc56c6f2f69bb48d0eb042629eb456faa8baad Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Wed, 31 Jan 2024 08:57:20 -0500 Subject: [PATCH 43/45] pre-req script fix --- .../examples/llama2/install-pre-requsites-for-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh index cbee17959..e1636fbe3 100644 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh @@ -4,7 +4,7 @@ install_docker() { echo "Checking and installing Docker..." sudo yum install docker -y - sudo systemctl start docker + sudo service docker start sudo usermod -aG docker $(whoami) newgrp docker } From 22d10cfd074163a11b6850680b3f492343821b1d Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Wed, 31 Jan 2024 09:13:30 -0500 Subject: [PATCH 44/45] pre-req issue fix --- .../examples/llama2/install-pre-requsites-for-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh diff --git a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh old mode 100644 new mode 100755 index e1636fbe3..430241622 --- a/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh @@ -6,7 +6,7 @@ install_docker() { sudo yum install docker -y sudo service docker start sudo usermod -aG docker $(whoami) - newgrp docker + # newgrp docker removed to prevent script interruption } # Function to install a package using yum From 
74d662f6f3249a19e1db25653e84eb5d3c4a243f Mon Sep 17 00:00:00 2001 From: Sanjeev Ganjihal Date: Wed, 31 Jan 2024 12:00:34 -0500 Subject: [PATCH 45/45] fixed spelling mistakes --- ai-ml/trainium-inferentia/eks.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index f0efb4b62..5175673d0 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -46,7 +46,7 @@ module "eks" { # security group rule from all ipv4 to nodes for port 22 node_security_group_additional_rules = { - # Critical Secruity group rule for EFA enabled nodes + # Critical Security group rule for EFA enabled nodes ingress_efa_self_enabled = { description = "EFA-enabled self-referencing security group Ingress" protocol = "-1" @@ -56,7 +56,7 @@ module "eks" { self = true } - # Critical Secruity group rule for EFA enabled nodes + # Critical Security group rule for EFA enabled nodes egress_efa_self_enabled = { description = "EFA-enabled self-referencing security group Egress" protocol = "-1" @@ -248,7 +248,7 @@ module "eks" { } ] - # Commented to investigate further as the node group creation is failing with palcement group + # Commented to investigate further as the node group creation is failing with placement group # placement = { # spread_domain = "cluster" # groupName = "trn1-32xl-ng1" @@ -457,7 +457,7 @@ module "eks" { }, ] - # Commented to investigate further as the node group creation is failing with palcement group + # Commented to investigate further as the node group creation is failing with placement group # placement = { # spread_domain = "cluster" # groupName = "trn1-32xl-ng1"