AZ mapping Code changes #421

Merged
51 commits merged on Jan 31, 2024
Commits (51)
1457d05
MPI operator code for distributed training
sanjeevrg89 Oct 31, 2023
0a46f3a
Making MPI operator optional for users
sanjeevrg89 Nov 6, 2023
fbd3ba9
added type string to mpi operator variable version
sanjeevrg89 Nov 6, 2023
ed0d72a
Merge branch 'awslabs:main' into main
sanjeevrg89 Nov 10, 2023
017a0ad
Merge branch 'awslabs:main' into main
sanjeevrg89 Dec 13, 2023
c415097
llama2 examples
sanjeevrg89 Dec 13, 2023
08634ed
llama2 pretraining updates
5cp Dec 14, 2023
fe66508
fix typo
5cp Dec 14, 2023
ff51584
Merge pull request #1 from 5cp/llama_updates
sanjeevrg89 Dec 14, 2023
74cbfd2
install pre-req script
sanjeevrg89 Dec 14, 2023
de1d173
more tools to prereq shell script
sanjeevrg89 Dec 14, 2023
92a8873
additional tooling
sanjeevrg89 Dec 14, 2023
3cde526
additional tooling python
sanjeevrg89 Dec 14, 2023
79b2b6d
AZ fix
sanjeevrg89 Dec 14, 2023
b1f8343
added jq
sanjeevrg89 Dec 14, 2023
a8cdf82
added tool checks
sanjeevrg89 Dec 14, 2023
30b259b
get az script update
sanjeevrg89 Dec 14, 2023
f5dbc09
az code fix
sanjeevrg89 Dec 14, 2023
407fa49
az code fix
sanjeevrg89 Dec 14, 2023
0c35b43
fix az script
sanjeevrg89 Dec 15, 2023
289bbfd
fix az script json output
sanjeevrg89 Dec 15, 2023
53cb92e
bug fix - always store ecr repo uri
5cp Dec 15, 2023
feaf9db
Merge pull request #2 from 5cp/llama_updates
sanjeevrg89 Dec 15, 2023
080abb5
eks and main code changes
sanjeevrg89 Dec 15, 2023
401e177
llama2 trainium doc
sanjeevrg89 Dec 15, 2023
e57cb42
initial doc updates
5cp Dec 15, 2023
2f85081
more llama doc updates
5cp Dec 15, 2023
9b76684
more updates
5cp Dec 15, 2023
1e5e8da
more updates
5cp Dec 15, 2023
7b5ac67
add subheadings to docs
5cp Dec 15, 2023
7e3d377
update tensorboard blurb
5cp Dec 15, 2023
cf691d5
minor tweak
5cp Dec 15, 2023
a5f9d5b
missing img folder
sanjeevrg89 Dec 19, 2023
ae30478
PR review requested changes
sanjeevrg89 Jan 2, 2024
3c2b71f
Automatically select appropriate trn1/inf2-supporting AZs based on us…
5cp Jan 4, 2024
700d5e6
added variables for trn1 and inf2 instance sizes
sanjeevrg89 Jan 4, 2024
4ede8eb
redo instance size variables for inf2 and trn1n
sanjeevrg89 Jan 4, 2024
ecbe68a
instance size variables fix
sanjeevrg89 Jan 4, 2024
51ef0be
fix trn1 default max size setting
sanjeevrg89 Jan 4, 2024
b71e27f
llama2 training doc update
sanjeevrg89 Jan 4, 2024
3d0d674
code changes to map AZs
sanjeevrg89 Jan 16, 2024
6eb0099
AZ fetch code changes
sanjeevrg89 Jan 16, 2024
49cf49a
reverted back to original AZ implementation
sanjeevrg89 Jan 17, 2024
0620075
addressed latest PR reviewed changes
sanjeevrg89 Jan 19, 2024
100aa25
Fix trn1 nodegroups so they use the preferred subnet/AZ
5cp Jan 31, 2024
8a43b23
Merge pull request #3 from 5cp/trn1_az_fix
sanjeevrg89 Jan 31, 2024
c179262
az changes for trn1
sanjeevrg89 Jan 31, 2024
19dc56c
pre-req script fix
sanjeevrg89 Jan 31, 2024
22d10cf
pre-req issue fix
sanjeevrg89 Jan 31, 2024
c86172d
AZ mapping changes
sanjeevrg89 Jan 31, 2024
74d662f
fixed spelling mistakes
sanjeevrg89 Jan 31, 2024
23 changes: 11 additions & 12 deletions ai-ml/trainium-inferentia/eks.tf
@@ -133,11 +133,11 @@ module "eks" {
trn1-32xl-ng1 = {
name = "trn1-32xl-ng1"
description = "Tran1 32xlarge node group for hosting ML workloads"
# The code filters the private subnets based on their CIDR blocks and selects the subnet ID if the CIDR block starts with "100." Otherwise, it assigns a null value.
# The element(compact([...]), 0) expression ensures that only the first non-null value is included in the resulting list of subnet IDs.
subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)]

# All trn1 instances should be launched into the same subnet in the preferred trn1 AZ
# The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf.
# We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
# module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
subnet_ids = [module.vpc.private_subnets[2]]


Why is launching them in the same subnet a requirement? This requirement would make it actually more brittle as a generic blueprint, as instance availability is highly dynamic and varies across AZs across different regions. Ideally all available subnets would be supplied so that EC2 Auto Scaling and/or Karpenter can retry in different subnets on failure due to unavailabilty or lack of support.

@vara-bonthu (Collaborator) commented on Jan 31, 2024:

These workloads are intended to operate within a single subnet, and our previous approach also filters to the first private subnet. I recognize that a key challenge here is the availability of Trn1 instances in specific regions and AZs.

@sanjeevrg89's solution addresses this by allowing selection of only those regions and AZs where Trn1 instances are available, as outlined in the az_mapping field section of the main.tf file. This mapping field can be expanded to include additional regions and AZs where Trn1 availability exists.

We can do better by writing documentation to explain this to users.
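
For illustration, here is a minimal sketch of how the az_mapping local in main.tf could be extended with an additional region. The extra entry and its AZ ids below are placeholders only, not part of this PR; any real entry would need to be verified against actual Trn1/Inf2 availability.

locals {
  # Hypothetical extension of az_mapping -- the last entry is a placeholder.
  az_mapping = {
    "us-west-2"    = ["usw2-az4", "usw2-az1"],
    "us-east-1"    = ["use1-az6", "use1-az5"],
    "us-east-2"    = ["use2-az3", "use2-az1"],
    "eu-example-1" = ["euex1-az1", "euex1-az2"] # placeholder entry, verify availability first
  }

  # The first AZ id in the list is treated as the preferred Trn1 AZ.
  azs = local.az_mapping[var.region]
}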

# aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
# ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
ami_type = "AL2_x86_64_GPU" # Contains Neuron driver
@@ -278,15 +278,14 @@ module "eks" {
trn1n-32xl-ng = {
name = "trn1n-32xl-ng"
description = "trn1n 32xlarge node group for hosting ML workloads"
# The code filters the private subnets based on their CIDR blocks and selects the subnet ID if the CIDR block starts with "100." Otherwise, it assigns a null value.
# The element(compact([...]), 0) expression ensures that only the first non-null value is included in the resulting list of subnet IDs.
subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)
]

# All trn1 instances should be launched into the same subnet in the preferred trn1 AZ
# The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf.
# We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
# module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
subnet_ids = [module.vpc.private_subnets[2]]
# aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
# ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
ami_type = "AL2_x86_64_GPU"
ami_type = "AL2_x86_64_GPU" # Contains Neuron driver
instance_types = ["trn1n.32xlarge"]

pre_bootstrap_user_data = <<-EOT
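As a possible follow-up to the brittleness concern raised above, the preferred subnet could be selected by CIDR prefix and AZ id instead of a hard-coded list index. The sketch below is not part of this PR and assumes Terraform >= 1.3 (for startswith) and that local.azs[0] holds the preferred Trn1 AZ id.

# Sketch only: look up each private subnet so the preferred Trn1 subnet can be
# selected by CIDR prefix and AZ id rather than by a fixed index.
data "aws_subnet" "private" {
  for_each = toset(module.vpc.private_subnets)
  id       = each.value
}

locals {
  # e.g. local.azs[0] == "usw2-az4" in us-west-2
  trn1_preferred_subnet_ids = [
    for s in data.aws_subnet.private : s.id
    if startswith(s.cidr_block, "100.") && s.availability_zone_id == local.azs[0]
  ]
}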
56 changes: 51 additions & 5 deletions ai-ml/trainium-inferentia/examples/llama2/install-pre-requsites-for-ec2.sh
100644 → 100755
@@ -4,12 +4,12 @@
install_docker() {
echo "Checking and installing Docker..."
sudo yum install docker -y
sudo systemctl start docker


Why'd you change this? The existing code should be correct.


@sanjeevrg89 Could you please verify the line that you removed and update accordingly

sudo service docker start
sudo usermod -aG docker $(whoami)
newgrp docker
# newgrp docker removed to prevent script interruption
}

# Install a package if it is not already installed
# Function to install a package using yum
install_package() {
PACKAGE=$1
echo "Checking for $PACKAGE..."
@@ -21,6 +21,46 @@ install_package() {
fi
}

# Function to install kubectl
install_kubectl() {
echo "Installing kubectl..."
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
}

# Function to install Terraform
install_terraform() {
echo "Installing Terraform..."
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/AmazonLinux/hashicorp.repo
sudo yum install -y terraform
}

# Function to install AWS CLI v2
install_aws_cli() {
echo "Installing AWS CLI v2..."
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install
echo "AWS CLI v2 installed successfully."
}

# Function to install Helm
install_helm() {
echo "Installing Helm..."
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
chmod 700 get_helm.sh
./get_helm.sh
echo "Helm installed successfully."
}

# Function to install Boto3
install_boto3() {
echo "Installing Boto3..."
pip3 install boto3
echo "Boto3 installed successfully."
}

echo "Starting installation of prerequisites..."

# Install Docker
@@ -33,7 +73,13 @@ install_package unzip
install_package python3-pip
install_package jq

# Additional installations (kubectl, AWS CLI v2, Terraform, Helm, Boto3)...
# (Include the existing logic for these installations here, with similar echo statements for tracking)
# Install kubectl, Terraform, AWS CLI v2, Helm, and Boto3
install_kubectl
install_terraform
install_aws_cli
install_helm
install_boto3

echo "Installation of prerequisites complete."


69 changes: 65 additions & 4 deletions ai-ml/trainium-inferentia/main.tf
@@ -41,11 +41,72 @@ data "aws_ecrpublic_authorization_token" "token" {
locals {
name = var.name
region = var.region
# Training and Inference instances are available in the following AZs us-east-1 and us-west-2
# You can find the list of supported AZs here: https://aws.amazon.com/ec2/instance-types/trn1/
azs = ["${local.region}c", "${local.region}d"]
# Trn1 and Inf2 instances are available in specific AZs in us-east-1,
# us-east-2, and us-west-2. For Trn1, the first AZ id (below) should be used.
az_mapping = {
"us-west-2" = ["usw2-az4", "usw2-az1"],
"us-east-1" = ["use1-az6", "use1-az5"],
"us-east-2" = ["use2-az3", "use2-az1"]
}
azs = local.az_mapping[var.region]
tags = {
Blueprint = local.name
GithubRepo = "github.com/awslabs/data-on-eks"
}
}
provider "aws" {
region = local.region
}

# ECR always authenticates with `us-east-1` region
# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
provider "aws" {
alias = "ecr"
region = "us-east-1"
}

provider "kubernetes" {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
token = data.aws_eks_cluster_auth.this.token
}

provider "helm" {
kubernetes {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
token = data.aws_eks_cluster_auth.this.token
}
}
provider "kubectl" {
apply_retry_count = 30
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
token = data.aws_eks_cluster_auth.this.token
load_config_file = false
}

data "aws_eks_cluster_auth" "this" {
name = module.eks.cluster_name
}

data "aws_ecrpublic_authorization_token" "token" {
provider = aws.ecr
}

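
The static az_mapping table could also be cross-checked against live availability. The sketch below uses the aws_ec2_instance_type_offerings data source to list the AZ ids that currently offer trn1.32xlarge in the selected region; it is shown for illustration only and is not what this PR implements (the PR deliberately reverted to the static mapping).

# Sketch only: discover AZ ids that currently offer trn1.32xlarge in var.region,
# as a cross-check for the static az_mapping table above.
data "aws_ec2_instance_type_offerings" "trn1" {
  filter {
    name   = "instance-type"
    values = ["trn1.32xlarge"]
  }
  location_type = "availability-zone-id"
}

output "trn1_supported_az_ids" {
  description = "AZ ids in var.region that report trn1.32xlarge offerings"
  value       = data.aws_ec2_instance_type_offerings.trn1.locations
}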