diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf index c3e8f4193..60d79725e 100644 --- a/ai-ml/bionemo/eks.tf +++ b/ai-ml/bionemo/eks.tf @@ -1,3 +1,13 @@ +#--------------------------------------------------------------- +# Data Sources +#--------------------------------------------------------------- +data "aws_availability_zones" "available" {} + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + + #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- @@ -143,10 +153,3 @@ module "eks" { } } } - - -data "aws_availability_zones" "available" {} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} diff --git a/ai-ml/bionemo/locals.tf b/ai-ml/bionemo/locals.tf index cf15947ab..abae00987 100644 --- a/ai-ml/bionemo/locals.tf +++ b/ai-ml/bionemo/locals.tf @@ -1,5 +1,5 @@ #--------------------------------------------------------------- -# Local variables +# Local Variables #--------------------------------------------------------------- locals { name = var.name diff --git a/ai-ml/bionemo/providers.tf b/ai-ml/bionemo/providers.tf index cab84e377..80620417c 100644 --- a/ai-ml/bionemo/providers.tf +++ b/ai-ml/bionemo/providers.tf @@ -1,3 +1,6 @@ +#--------------------------------------------------------------- +# Providers +#--------------------------------------------------------------- provider "aws" { region = local.region } diff --git a/ai-ml/emr-spark-rapids/amp.tf b/ai-ml/emr-spark-rapids/amp.tf index 96df2a495..9ea521188 100644 --- a/ai-ml/emr-spark-rapids/amp.tf +++ b/ai-ml/emr-spark-rapids/amp.tf @@ -99,15 +99,8 @@ data "aws_iam_policy_document" "grafana" { } } -#------------------------------------------ -# Amazon Prometheus -#------------------------------------------ -locals { - amp_ingest_service_account = "amp-iamproxy-ingest-service-account" - amp_namespace = "kube-prometheus-stack" -} - resource 
"aws_prometheus_workspace" "amp" { + count = var.enable_amazon_prometheus ? 1 : 0 alias = format("%s-%s", "amp-ws", local.name) diff --git a/ai-ml/emr-spark-rapids/eks.tf b/ai-ml/emr-spark-rapids/eks.tf index 4d01ae4fe..5c6acb096 100644 --- a/ai-ml/emr-spark-rapids/eks.tf +++ b/ai-ml/emr-spark-rapids/eks.tf @@ -1,7 +1,22 @@ #--------------------------------------------------------------- -# EKS Cluster +# Data Sources #--------------------------------------------------------------- +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + +data "aws_availability_zones" "available" {} +data "aws_caller_identity" "current" {} +data "aws_partition" "current" {} + +#--------------------------------------------------------------- +# EKS Cluster +#--------------------------------------------------------------- module "eks" { source = "terraform-aws-modules/eks/aws" version = "~> 19.21" diff --git a/ai-ml/emr-spark-rapids/locals.tf b/ai-ml/emr-spark-rapids/locals.tf new file mode 100644 index 000000000..568732378 --- /dev/null +++ b/ai-ml/emr-spark-rapids/locals.tf @@ -0,0 +1,37 @@ +#--------------------------------------------------------------- +# Local Variables +#--------------------------------------------------------------- +locals { + name = var.name + region = var.region + + # Only two AZs for this example + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + account_id = data.aws_caller_identity.current.account_id + partition = data.aws_partition.current.partition + + #------------------------------------------ + # Amazon Prometheus + #------------------------------------------ + amp_ingest_service_account = "amp-iamproxy-ingest-service-account" + amp_namespace = "kube-prometheus-stack" + + #------------------------------------------ + # VPC + #------------------------------------------ + # Routable Private subnets only for Private NAT Gateway -> Transit 
Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] + + tags = merge(var.tags, { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + }) +} diff --git a/ai-ml/emr-spark-rapids/main.tf b/ai-ml/emr-spark-rapids/main.tf deleted file mode 100644 index 809cc6343..000000000 --- a/ai-ml/emr-spark-rapids/main.tf +++ /dev/null @@ -1,61 +0,0 @@ -provider "aws" { - region = local.region -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - -provider "kubectl" { - apply_retry_count = 30 - host = module.eks.cluster_endpoint - cluster_ca_certificate = 
base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - token = data.aws_eks_cluster_auth.this.token -} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -data "aws_availability_zones" "available" {} - -data "aws_caller_identity" "current" {} -data "aws_partition" "current" {} - -locals { - name = var.name - region = var.region - - # Only two AZs for this example - azs = slice(data.aws_availability_zones.available.names, 0, 2) - - account_id = data.aws_caller_identity.current.account_id - partition = data.aws_partition.current.partition - - tags = merge(var.tags, { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - }) -} diff --git a/ai-ml/emr-spark-rapids/outputs.tf b/ai-ml/emr-spark-rapids/outputs.tf index 8645d7977..012cf29ef 100644 --- a/ai-ml/emr-spark-rapids/outputs.tf +++ b/ai-ml/emr-spark-rapids/outputs.tf @@ -1,7 +1,6 @@ -################################################################################ -# Cluster -################################################################################ - +#--------------------------------------------------------------- +# EKS Cluster +#--------------------------------------------------------------- output "cluster_arn" { description = "The Amazon Resource Name (ARN) of the cluster" value = module.eks.cluster_arn @@ -17,10 +16,9 @@ output "oidc_provider_arn" { value = module.eks.oidc_provider_arn } -################################################################################ +#--------------------------------------------------------------- # EKS Managed Node Group -################################################################################ - +#--------------------------------------------------------------- output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following 
command to update your kubeconfig" value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" @@ -31,10 +29,9 @@ output "emr_on_eks" { value = module.emr_containers } -################################################################################ +#--------------------------------------------------------------- # AMP -################################################################################ - +#--------------------------------------------------------------- output "amp_workspace_id" { description = "The id of amp" value = aws_prometheus_workspace.amp[0].id diff --git a/ai-ml/emr-spark-rapids/providers.tf b/ai-ml/emr-spark-rapids/providers.tf index e69de29bb..c57a6207e 100644 --- a/ai-ml/emr-spark-rapids/providers.tf +++ b/ai-ml/emr-spark-rapids/providers.tf @@ -0,0 +1,35 @@ +#--------------------------------------------------------------- +# Providers +#--------------------------------------------------------------- +provider "aws" { + region = local.region +} + +# ECR always authenticates with `us-east-1` region +# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html +provider "aws" { + alias = "ecr" + region = "us-east-1" +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + } +} + +provider "kubectl" { + apply_retry_count = 30 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false + token = data.aws_eks_cluster_auth.this.token +} diff --git a/ai-ml/emr-spark-rapids/vpc.tf b/ai-ml/emr-spark-rapids/vpc.tf index e7e6473ee..a5601dab6 100644 --- 
a/ai-ml/emr-spark-rapids/vpc.tf +++ b/ai-ml/emr-spark-rapids/vpc.tf @@ -1,15 +1,3 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- diff --git a/ai-ml/jark-stack/terraform/eks.tf b/ai-ml/jark-stack/terraform/eks.tf index aaf11a9e7..cbe48cc30 100644 --- a/ai-ml/jark-stack/terraform/eks.tf +++ b/ai-ml/jark-stack/terraform/eks.tf @@ -1,3 +1,16 @@ +#--------------------------------------------------------------- +# Data Sources +#--------------------------------------------------------------- +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_availability_zones" "available" {} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + #--------------------------------------------------------------- # EKS Cluster #---------------------------------------------------------------
diff --git a/ai-ml/jark-stack/terraform/karpenter.tf b/ai-ml/jark-stack/terraform/karpenter.tf deleted file mode 100644 index e69de29bb..000000000 diff --git a/ai-ml/jark-stack/terraform/locals.tf b/ai-ml/jark-stack/terraform/locals.tf new file mode 100644 index 000000000..8f206f412 --- /dev/null +++ b/ai-ml/jark-stack/terraform/locals.tf @@ -0,0 +1,20 @@ +locals { + name = var.name + region = var.region + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] + + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/jark-stack/terraform/main.tf b/ai-ml/jark-stack/terraform/providers.tf similarity index 71% rename from 
ai-ml/jark-stack/terraform/main.tf rename to ai-ml/jark-stack/terraform/providers.tf index f93511951..96b06477f 100644 --- a/ai-ml/jark-stack/terraform/main.tf +++ b/ai-ml/jark-stack/terraform/providers.tf @@ -29,23 +29,3 @@ provider "kubectl" { token = data.aws_eks_cluster_auth.this.token load_config_file = false } - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_availability_zones" "available" {} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -locals { - name = var.name - region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 2) - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/jark-stack/terraform/vpc.tf b/ai-ml/jark-stack/terraform/vpc.tf index 59c3da89c..e6f0b1dfb 100644 --- a/ai-ml/jark-stack/terraform/vpc.tf +++ b/ai-ml/jark-stack/terraform/vpc.tf @@ -1,15 +1,3 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - 
#--------------------------------------------------------------- # VPC #--------------------------------------------------------------- diff --git a/ai-ml/jupyterhub/addons.tf b/ai-ml/jupyterhub/addons.tf index ed8f7db44..d2393e791 100755 --- a/ai-ml/jupyterhub/addons.tf +++ b/ai-ml/jupyterhub/addons.tf @@ -9,10 +9,6 @@ data "aws_ecrpublic_authorization_token" "token" { provider = aws.ecr } -locals { - cognito_custom_domain = var.cognito_custom_domain -} - #--------------------------------------------------------------- # GP3 Encrypted Storage Class #--------------------------------------------------------------- diff --git a/ai-ml/jupyterhub/main.tf b/ai-ml/jupyterhub/eks.tf similarity index 82% rename from ai-ml/jupyterhub/main.tf rename to ai-ml/jupyterhub/eks.tf index dcccf3de4..e6158756f 100755 --- a/ai-ml/jupyterhub/main.tf +++ b/ai-ml/jupyterhub/eks.tf @@ -1,28 +1,6 @@ -provider "aws" { - region = local.region -} - -# Removed exec plugin as this doesn't work with Terraform Cloud and TOFU controller plugin with backstage -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} +#--------------------------------------------------------------- +# Data Sources +#--------------------------------------------------------------- data "aws_eks_cluster_auth" "this" { name = module.eks.cluster_name @@ -30,16 +8,6 @@ data "aws_eks_cluster_auth" "this" { data "aws_availability_zones" "available" {} -locals { - name = var.name 
- region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 2) - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} - #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- diff --git a/ai-ml/jupyterhub/locals.tf b/ai-ml/jupyterhub/locals.tf new file mode 100755 index 000000000..a2ea0c13b --- /dev/null +++ b/ai-ml/jupyterhub/locals.tf @@ -0,0 +1,28 @@ +#--------------------------------------------------------------- +# Local Variables +#--------------------------------------------------------------- +locals { + name = var.name + region = var.region + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] + + #--------------------------------------------------------------- + # Amazon Cognito + #--------------------------------------------------------------- + cognito_custom_domain = var.cognito_custom_domain + + tags = { + Blueprint = 
local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/jupyterhub/providers.tf b/ai-ml/jupyterhub/providers.tf new file mode 100755 index 000000000..96d237e06 --- /dev/null +++ b/ai-ml/jupyterhub/providers.tf @@ -0,0 +1,28 @@ +#--------------------------------------------------------------- +# Providers +#--------------------------------------------------------------- +provider "aws" { + region = local.region +} + +# Removed exec plugin as this doesn't work with Terraform Cloud and TOFU controller plugin with backstage +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + } +} + +# ECR always authenticates with `us-east-1` region +# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html +provider "aws" { + alias = "ecr" + region = "us-east-1" +} diff --git a/ai-ml/jupyterhub/vpc.tf b/ai-ml/jupyterhub/vpc.tf index 59c3da89c..e6f0b1dfb 100755 --- a/ai-ml/jupyterhub/vpc.tf +++ b/ai-ml/jupyterhub/vpc.tf @@ -1,15 +1,3 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two 
subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- diff --git a/ai-ml/mlflow/amp.tf b/ai-ml/mlflow/amp.tf index 14b47ba4c..aac31c14b 100644 --- a/ai-ml/mlflow/amp.tf +++ b/ai-ml/mlflow/amp.tf @@ -1,11 +1,6 @@ #------------------------------------------ # Amazon Prometheus #------------------------------------------ -locals { - amp_ingest_service_account = "amp-iamproxy-ingest-service-account" - amp_namespace = "kube-prometheus-stack" -} - resource "aws_prometheus_workspace" "amp" { count = var.enable_amazon_prometheus ? 1 : 0 diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf index 15fa077d1..5bac47ee1 100644 --- a/ai-ml/mlflow/eks.tf +++ b/ai-ml/mlflow/eks.tf @@ -1,3 +1,18 @@ +#--------------------------------------------------------------- +# Data Sources +#--------------------------------------------------------------- +data "aws_availability_zones" "available" {} +data "aws_caller_identity" "current" {} +data "aws_partition" "current" {} + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- diff --git a/ai-ml/mlflow/locals.tf b/ai-ml/mlflow/locals.tf new file mode 100755 index 000000000..9684d10af --- /dev/null +++ b/ai-ml/mlflow/locals.tf @@ -0,0 +1,44 @@ +#--------------------------------------------------------------- +# Local Variables 
+#--------------------------------------------------------------- +locals { + name = var.name + region = var.region + vpc_cidr = var.vpc_cidr + azs = slice(data.aws_availability_zones.available.names, 0, 2) + account_id = data.aws_caller_identity.current.account_id + partition = data.aws_partition.current.partition + + #------------------------------------------ + # VPC + #------------------------------------------ + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + + database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] + + #------------------------------------------ + # MLflow + #------------------------------------------ + mlflow_name = "mlflow" + mlflow_namespace = "mlflow" + mlflow_service_account = "mlflow" + + #------------------------------------------ + # Amazon Prometheus + #------------------------------------------ + amp_ingest_service_account = "amp-iamproxy-ingest-service-account" + amp_namespace = "kube-prometheus-stack" + + tags = { + Blueprint = local.name + GithubRepo = 
"github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf deleted file mode 100644 index a5e4360ea..000000000 --- a/ai-ml/mlflow/main.tf +++ /dev/null @@ -1,65 +0,0 @@ -provider "aws" { - region = local.region -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - -provider "kubectl" { - apply_retry_count = 10 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - token = data.aws_eks_cluster_auth.this.token -} - -data "aws_availability_zones" "available" {} -data "aws_caller_identity" "current" {} -data "aws_partition" "current" {} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -#--------------------------------------------------------------- -# Local variables -#--------------------------------------------------------------- -locals { - name = var.name - region = var.region - vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 2) - account_id = data.aws_caller_identity.current.account_id - partition = data.aws_partition.current.partition - - mlflow_name = "mlflow" - mlflow_namespace = "mlflow" - mlflow_service_account = "mlflow" - - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git 
a/ai-ml/bionemo/main.tf b/ai-ml/mlflow/providers.tf old mode 100644 new mode 100755 similarity index 74% rename from ai-ml/bionemo/main.tf rename to ai-ml/mlflow/providers.tf index dd7d220a0..80620417c --- a/ai-ml/bionemo/main.tf +++ b/ai-ml/mlflow/providers.tf @@ -1,3 +1,6 @@ +#--------------------------------------------------------------- +# Providers +#--------------------------------------------------------------- provider "aws" { region = local.region } @@ -30,24 +33,3 @@ provider "kubectl" { load_config_file = false token = data.aws_eks_cluster_auth.this.token } - -data "aws_availability_zones" "available" {} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -#--------------------------------------------------------------- -# Local variables -#--------------------------------------------------------------- -locals { - name = var.name - region = var.region - vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 2) - - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf index 0aa8b7aab..5ccb78cc9 100644 --- a/ai-ml/mlflow/vpc.tf +++ b/ai-ml/mlflow/vpc.tf @@ -1,17 +1,3 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - - database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 
IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- diff --git a/ai-ml/nvidia-triton-server/eks.tf b/ai-ml/nvidia-triton-server/eks.tf index 469947520..a5d5f551b 100644 --- a/ai-ml/nvidia-triton-server/eks.tf +++ b/ai-ml/nvidia-triton-server/eks.tf @@ -1,3 +1,16 @@ +#--------------------------------------------------------------- +# Data Sources +#--------------------------------------------------------------- +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_availability_zones" "available" {} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + #--------------------------------------------------------------- # EKS Cluster #--------------------------------------------------------------- diff --git a/ai-ml/nvidia-triton-server/locals.tf b/ai-ml/nvidia-triton-server/locals.tf new file mode 100644 index 000000000..42a38a246 --- /dev/null +++ b/ai-ml/nvidia-triton-server/locals.tf @@ -0,0 +1,39 @@ +#--------------------------------------------------------------- +# Local Variables +#--------------------------------------------------------------- +locals { + name = var.name + region = var.region + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + #-------------------------------------------------------------------- + # VPC + #-------------------------------------------------------------------- + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", 
"10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] + + #-------------------------------------------------------------------- + # Helm Chart for deploying NIM models + #-------------------------------------------------------------------- + enabled_models = var.enable_nvidia_nim ? { + for model in var.nim_models : model.name => model + if model.enable + } : {} + + #-------------------------------------------------------------------- + # Nvidia Triton Server + #-------------------------------------------------------------------- + triton_model = "triton-vllm" + + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/nvidia-triton-server/nvidia-nim.tf b/ai-ml/nvidia-triton-server/nvidia-nim.tf index 8989e25ed..b1e9c667c 100644 --- a/ai-ml/nvidia-triton-server/nvidia-nim.tf +++ b/ai-ml/nvidia-triton-server/nvidia-nim.tf @@ -104,16 +104,6 @@ resource "null_resource" "download_nim_deploy" { } } -#-------------------------------------------------------------------- -# Helm Chart for deploying NIM models -#-------------------------------------------------------------------- -locals { - enabled_models = var.enable_nvidia_nim ? 
{ - for model in var.nim_models : model.name => model - if model.enable - } : {} -} - resource "helm_release" "nim_llm" { for_each = local.enabled_models name = "nim-llm-${each.key}" diff --git a/ai-ml/nvidia-triton-server/nvidia-triton-server.tf b/ai-ml/nvidia-triton-server/nvidia-triton-server.tf index e0a9fb1ec..b70f855f2 100644 --- a/ai-ml/nvidia-triton-server/nvidia-triton-server.tf +++ b/ai-ml/nvidia-triton-server/nvidia-triton-server.tf @@ -1,7 +1,3 @@ -locals { - triton_model = "triton-vllm" -} - #--------------------------------------------------------------- # Data on EKS Kubernetes Addons #--------------------------------------------------------------- diff --git a/ai-ml/nvidia-triton-server/main.tf b/ai-ml/nvidia-triton-server/providers.tf similarity index 71% rename from ai-ml/nvidia-triton-server/main.tf rename to ai-ml/nvidia-triton-server/providers.tf index f93511951..ac22a9ec6 100644 --- a/ai-ml/nvidia-triton-server/main.tf +++ b/ai-ml/nvidia-triton-server/providers.tf @@ -1,3 +1,6 @@ +#--------------------------------------------------------------- +# Providers +#--------------------------------------------------------------- provider "aws" { region = local.region } @@ -29,23 +32,3 @@ provider "kubectl" { token = data.aws_eks_cluster_auth.this.token load_config_file = false } - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_availability_zones" "available" {} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -locals { - name = var.name - region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 2) - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/nvidia-triton-server/vpc.tf b/ai-ml/nvidia-triton-server/vpc.tf index 59c3da89c..e6f0b1dfb 100644 --- a/ai-ml/nvidia-triton-server/vpc.tf +++ b/ai-ml/nvidia-triton-server/vpc.tf @@ -1,15 +1,3 @@ -locals { - # Routable Private subnets only for Private NAT 
Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- diff --git a/ai-ml/ray/terraform/main.tf b/ai-ml/ray/terraform/eks.tf similarity index 100% rename from ai-ml/ray/terraform/main.tf rename to ai-ml/ray/terraform/eks.tf diff --git a/ai-ml/ray/terraform/locals.tf b/ai-ml/ray/terraform/locals.tf index b63541f5b..0b03ba10c 100644 --- a/ai-ml/ray/terraform/locals.tf +++ b/ai-ml/ray/terraform/locals.tf @@ -1,5 +1,5 @@ #--------------------------------------------------------------- -# Locals +# Local Variables #--------------------------------------------------------------- locals { name = var.name diff --git a/ai-ml/ray/terraform/providers.tf b/ai-ml/ray/terraform/providers.tf index 00119c4c2..61d0e749d 100644 --- a/ai-ml/ray/terraform/providers.tf +++ b/ai-ml/ray/terraform/providers.tf @@ -1,7 +1,6 @@ #--------------------------------------------------------------- # Providers #--------------------------------------------------------------- - provider "aws" { 
region = local.region } diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf deleted file mode 100755 index 93f09df4e..000000000 --- a/ai-ml/trainium-inferentia/main.tf +++ /dev/null @@ -1,75 +0,0 @@ -provider "aws" { - region = local.region -} - -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - -provider "kubectl" { - apply_retry_count = 5 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -data "aws_caller_identity" "current" {} - -data "aws_iam_session_context" "current" { - arn = data.aws_caller_identity.current.arn -} - -locals { - name = var.name - region = var.region - # Trn1 and Inf2 instances are available in specific AZs in us-east-1, - # us-east-2, and us-west-2. 
For Trn1, the first AZ id (below) should be used. - az_mapping = { - "us-west-2" = ["usw2-az4", "usw2-az1"], - "us-east-1" = ["use1-az6", "use1-az5"], - "us-east-2" = ["use2-az3", "use2-az1"] - } - azs = local.az_mapping[var.region] - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/trainium-inferentia/providers.tf b/ai-ml/trainium-inferentia/providers.tf index a62a0bb7c..0e8030375 100755 --- a/ai-ml/trainium-inferentia/providers.tf +++ b/ai-ml/trainium-inferentia/providers.tf @@ -1,3 +1,6 @@ +#--------------------------------------------------------------- +# Providers +#--------------------------------------------------------------- provider "aws" { region = local.region }