From 633f28d1dc07c7e4eb5f0b4f68632a3c02c41041 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 6 Sep 2023 12:46:19 +0100 Subject: [PATCH 1/8] infra --- ai-ml/mlflow/addons.tf | 337 ++++++++++++++++++ ai-ml/mlflow/amp.tf | 136 +++++++ ai-ml/mlflow/eks.tf | 114 ++++++ .../helm-values/aws-for-fluentbit-values.yaml | 102 ++++++ .../cluster-autoscaler-values.yaml | 25 ++ .../coredns-autoscaler-values.yaml | 40 +++ .../kube-prometheus-amp-enable.yaml | 65 ++++ ai-ml/mlflow/helm-values/kube-prometheus.yaml | 36 ++ .../helm-values/metrics-server-values.yaml | 52 +++ .../00-karpenter-provisioner-cpu.yaml | 57 +++ ai-ml/mlflow/main.tf | 66 ++++ ai-ml/mlflow/mlflow-core.tf | 245 +++++++++++++ ai-ml/mlflow/outputs.tf | 14 + ai-ml/mlflow/variables.tf | 41 +++ ai-ml/mlflow/versions.tf | 33 ++ ai-ml/mlflow/vpc.tf | 36 ++ 16 files changed, 1399 insertions(+) create mode 100644 ai-ml/mlflow/addons.tf create mode 100644 ai-ml/mlflow/amp.tf create mode 100644 ai-ml/mlflow/eks.tf create mode 100644 ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml create mode 100644 ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml create mode 100644 ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml create mode 100644 ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml create mode 100644 ai-ml/mlflow/helm-values/kube-prometheus.yaml create mode 100644 ai-ml/mlflow/helm-values/metrics-server-values.yaml create mode 100644 ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml create mode 100644 ai-ml/mlflow/main.tf create mode 100644 ai-ml/mlflow/mlflow-core.tf create mode 100644 ai-ml/mlflow/outputs.tf create mode 100644 ai-ml/mlflow/variables.tf create mode 100644 ai-ml/mlflow/versions.tf create mode 100644 ai-ml/mlflow/vpc.tf diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf new file mode 100644 index 000000000..79b07cf51 --- /dev/null +++ b/ai-ml/mlflow/addons.tf @@ -0,0 +1,337 @@ +#--------------------------------------------------------------- +# IRSA for EBS CSI Driver +#--------------------------------------------------------------- +module "ebs_csi_driver_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.20" + role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") + attach_ebs_csi_policy = true + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + tags = local.tags +} +#--------------------------------------------------------------- +# EKS Blueprints Kubernetes Addons +#--------------------------------------------------------------- +module "eks_blueprints_addons" { + # Short commit hash from 8th May using git rev-parse --short HEAD + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.3" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------- + # Amazon EKS Managed Add-ons + #--------------------------------------- + eks_addons = { + aws-ebs-csi-driver = { + service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn + } + coredns = { + preserve = true + } + vpc-cni = { + preserve = true + } + kube-proxy = { + preserve = true + } + } + + #--------------------------------------------------------------- + # CoreDNS Autoscaler helps to scale for large EKS Clusters + # Further tuning for CoreDNS is to 
leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/ + #--------------------------------------------------------------- + enable_cluster_proportional_autoscaler = true + cluster_proportional_autoscaler = { + values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", { + target = "deployment/coredns" + })] + description = "Cluster Proportional Autoscaler for CoreDNS Service" + } + + #--------------------------------------- + # Metrics Server + #--------------------------------------- + enable_metrics_server = true + metrics_server = { + values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] + } + + #--------------------------------------- + # Cluster Autoscaler + #--------------------------------------- + enable_cluster_autoscaler = true + cluster_autoscaler = { + timeout = "300" + values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { + aws_region = var.region, + eks_cluster_id = module.eks.cluster_name + })] + } + + #--------------------------------------- + # AWS for FluentBit - DaemonSet + #--------------------------------------- + enable_aws_for_fluentbit = true + aws_for_fluentbit_cw_log_group = { + use_name_prefix = false + name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group + retention_in_days = 30 + } + aws_for_fluentbit = { + s3_bucket_arns = [ + module.fluentbit_s3_bucket.s3_bucket_arn, + "${module.fluentbit_s3_bucket.s3_bucket_arn}/*}" + ] + values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", { + region = local.region, + cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" + s3_bucket_name = module.fluentbit_s3_bucket.s3_bucket_id + cluster_name = module.eks.cluster_name + })] + } + + #--------------------------------------- + # Karpenter Autoscaler for EKS Cluster + #--------------------------------------- + enable_karpenter = true + karpenter_enable_spot_termination = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + + #--------------------------------------- + # AWS Load Balancer Controller + #--------------------------------------- + enable_aws_load_balancer_controller = true + aws_load_balancer_controller = { + chart_version = "1.6.0" # min version required to use SG for NLB feature + } + + #--------------------------------------- + # Ingress Nginx external + #--------------------------------------- + enable_ingress_nginx = true + ingress_nginx = { + name = "ingress-nginx-external" + values = [ + <<-EOT + controller: + replicaCount: 3 + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + service.beta.kubernetes.io/aws-load-balancer-security-groups: ${aws_security_group.ingress_nginx_external.id} + service.beta.kubernetes.io/aws-load-balancer-manage-backend-security-group-rules: true + loadBalancerClass: service.k8s.aws/nlb + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/instance: ingress-nginx-external + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/instance: ingress-nginx-external + minAvailable: 2 + ingressClassResource: + name: 
ingress-nginx-external + default: false + EOT + ] + } + + #--------------------------------------- + # Prommetheus and Grafana stack + #--------------------------------------- + #--------------------------------------------------------------- + # Install Kafka Montoring Stack with Prometheus and Grafana + # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` + # 2- Grafana Admin user: admin + # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` + #--------------------------------------------------------------- + enable_kube_prometheus_stack = false + kube_prometheus_stack = { + values = [ + var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { + region = local.region + amp_sa = local.amp_ingest_service_account + amp_irsa = module.amp_ingest_irsa[0].iam_role_arn + amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write" + amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}" + storage_class_type = kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class.id + }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {}) + ] + chart_version = "48.1.1" + set_sensitive = [ + { + name = "grafana.adminPassword" + value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string + } + ], + } + + tags = local.tags + +} + + +#--------------------------------------------------------------- +# Data on EKS Kubernetes Addons +#--------------------------------------------------------------- +module "eks_data_addons" { + source = "aws-ia/eks-data-addons/aws" + version = "~> 1.0" # ensure to update this to the latest/desired version + + oidc_provider_arn = module.eks.oidc_provider_arn + +} + +#--------------------------------------------------------------- +# Ingress Nginx external security groups +#--------------------------------------------------------------- +resource "aws_security_group" "ingress_nginx_external" { + name = "ingress-nginx-external" + description = "Allow public HTTP and HTTPS traffic" + vpc_id = module.vpc.vpc_id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] # modify to your requirements + } + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] # modify to your requirements + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# Grafana Admin credentials resources +#--------------------------------------------------------------- +data "aws_secretsmanager_secret_version" "admin_password_version" { + secret_id = aws_secretsmanager_secret.grafana.id + depends_on = [aws_secretsmanager_secret_version.grafana] +} + +resource "random_password" "grafana" { + length = 16 + special = true + override_special = "@_" +} + +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "grafana" { + name = "${local.name}-grafana" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "grafana" { + secret_id = aws_secretsmanager_secret.grafana.id + secret_string = 
random_password.grafana.result +} + +#--------------------------------------------------------------- +# S3 log bucket for FluentBit +#--------------------------------------------------------------- +#tfsec:ignore:* +module "fluentbit_s3_bucket" { + source = "terraform-aws-modules/s3-bucket/aws" + version = "~> 3.0" + + bucket_prefix = "${local.name}-argo-workflow-logs-" + # For example only - please evaluate for your environment + force_destroy = true + server_side_encryption_configuration = { + rule = { + apply_server_side_encryption_by_default = { + sse_algorithm = "AES256" + } + } + } + + tags = local.tags +} + +#--------------------------------------- +# Karpenter Provisioners for workloads +#--------------------------------------- +data "kubectl_path_documents" "karpenter_provisioners" { + pattern = "${path.module}/karpenter-provisioners/*.yaml" + vars = { + cluster_name = module.eks.cluster_name + } +} + +resource "kubectl_manifest" "karpenter_provisioner" { + for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) + yaml_body = each.value + + depends_on = [module.eks_blueprints_addons] +} + +#--------------------------------------------------------------- +# GP3 Encrypted Storage Class +#--------------------------------------------------------------- + +resource "kubernetes_annotations" "gp2_default" { + annotations = { + "storageclass.kubernetes.io/is-default-class" : "false" + } + api_version = "storage.k8s.io/v1" + kind = "StorageClass" + metadata { + name = "gp2" + } + force = true + + depends_on = [module.eks] +} + +resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { + metadata { + name = "gp3" + annotations = { + "storageclass.kubernetes.io/is-default-class" : "true" + } + } + + storage_provisioner = "ebs.csi.aws.com" + reclaim_policy = "Delete" + allow_volume_expansion = true + volume_binding_mode = "WaitForFirstConsumer" + parameters = { + fsType = "xfs" + encrypted = true + type = "gp3" + } + + depends_on = [kubernetes_annotations.gp2_default] +} diff --git a/ai-ml/mlflow/amp.tf b/ai-ml/mlflow/amp.tf new file mode 100644 index 000000000..14b47ba4c --- /dev/null +++ b/ai-ml/mlflow/amp.tf @@ -0,0 +1,136 @@ +#------------------------------------------ +# Amazon Prometheus +#------------------------------------------ +locals { + amp_ingest_service_account = "amp-iamproxy-ingest-service-account" + amp_namespace = "kube-prometheus-stack" +} + +resource "aws_prometheus_workspace" "amp" { + count = var.enable_amazon_prometheus ? 1 : 0 + + alias = format("%s-%s", "amp-ws", local.name) + tags = local.tags +} +#IAM Policy for Amazon Prometheus & Grafana +resource "aws_iam_policy" "grafana" { + count = var.enable_amazon_prometheus ? 1 : 0 + + description = "IAM policy for Grafana Pod" + name_prefix = format("%s-%s-", local.name, "grafana") + path = "/" + policy = data.aws_iam_policy_document.grafana[0].json +} + +data "aws_iam_policy_document" "grafana" { + count = var.enable_amazon_prometheus ? 
1 : 0 + + statement { + sid = "AllowReadingMetricsFromCloudWatch" + effect = "Allow" + resources = ["*"] + + actions = [ + "cloudwatch:DescribeAlarmsForMetric", + "cloudwatch:ListMetrics", + "cloudwatch:GetMetricData", + "cloudwatch:GetMetricStatistics" + ] + } + + statement { + sid = "AllowGetInsightsCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"] + + actions = [ + "cloudwatch:GetInsightRuleReport", + ] + } + + statement { + sid = "AllowReadingAlarmHistoryFromCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"] + + actions = [ + "cloudwatch:DescribeAlarmHistory", + "cloudwatch:DescribeAlarms", + ] + } + + statement { + sid = "AllowReadingLogsFromCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"] + + actions = [ + "logs:DescribeLogGroups", + "logs:GetLogGroupFields", + "logs:StartQuery", + "logs:StopQuery", + "logs:GetQueryResults", + "logs:GetLogEvents", + ] + } + + statement { + sid = "AllowReadingTagsInstancesRegionsFromEC2" + effect = "Allow" + resources = ["*"] + + actions = [ + "ec2:DescribeTags", + "ec2:DescribeInstances", + "ec2:DescribeRegions", + ] + } + + statement { + sid = "AllowReadingResourcesForTags" + effect = "Allow" + resources = ["*"] + actions = ["tag:GetResources"] + } + + statement { + sid = "AllowListApsWorkspaces" + effect = "Allow" + resources = [ + "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*", + "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*", + "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*", + ] + actions = [ + "aps:ListWorkspaces", + "aps:DescribeWorkspace", + "aps:GetMetricMetadata", + "aps:GetSeries", + "aps:QueryMetrics", + "aps:RemoteWrite", + "aps:GetLabels" + ] + } +} + +module "amp_ingest_irsa" { + count = var.enable_amazon_prometheus ? 1 : 0 + + source = "aws-ia/eks-blueprints-addon/aws" + version = "~> 1.0" + create_release = false + create_role = true + create_policy = false + role_name = format("%s-%s", local.name, "amp-ingest") + role_policies = { amp_policy = aws_iam_policy.grafana[0].arn } + + oidc_providers = { + this = { + provider_arn = module.eks.oidc_provider_arn + namespace = local.amp_namespace + service_account = local.amp_ingest_service_account + } + } + + tags = local.tags +} diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf new file mode 100644 index 000000000..4d486df1c --- /dev/null +++ b/ai-ml/mlflow/eks.tf @@ -0,0 +1,114 @@ +#--------------------------------------------------------------- +# EKS Cluster +#--------------------------------------------------------------- +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.15" + + cluster_name = local.name + cluster_version = var.eks_cluster_version + + cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. 
+ + vpc_id = module.vpc.vpc_id + + subnet_ids = module.vpc.private_subnets + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + } + ] + + #--------------------------------------- + # Note: This can further restricted to specific required for each Add-on and your application + #--------------------------------------- + # Extend cluster security group rules + cluster_security_group_additional_rules = { + ingress_nodes_ephemeral_ports_tcp = { + description = "Nodes on ephemeral ports" + protocol = "tcp" + from_port = 1025 + to_port = 65535 + type = "ingress" + source_node_security_group = true + } + } + + # Extend node-to-node security group rules + node_security_group_additional_rules = { + ingress_self_all = { + description = "Node to node all ports/protocols" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + self = true + } + # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. + # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc. + # Change this according to your security requirements if needed + ingress_cluster_to_node_all_traffic = { + description = "Cluster API to Nodegroup all traffic" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + } + + eks_managed_node_group_defaults = { + iam_role_additional_policies = { + # Not required, but used in the example to access the nodes to inspect mounted volumes + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + eks_managed_node_groups = { + # We recommend to have a MNG to place your critical workloads and add-ons + # Then rely on Karpenter to scale your workloads + # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners + core_node_group = { + name = "core-node-group" + description = "EKS Core node group for hosting critical add-ons" + subnet_ids = module.vpc.private_subnets + + min_size = 3 + max_size = 9 + desired_size = 3 + + instance_types = ["m5.xlarge"] + + ebs_optimized = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } + + labels = { + Environment = "preprod" + Zone = "test" + WorkerType = "ON_DEMAND" + NodeGroupType = "core" + } + + tags = merge(local.tags, { + Name = "core-node-grp", + "karpenter.sh/discovery" = local.name + }) + } + } +} diff --git a/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml new file mode 100644 index 000000000..a08d4260d --- /dev/null +++ b/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml @@ -0,0 +1,102 @@ +global: + +#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server +# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata +hostNetwork: true +dnsPolicy: ClusterFirstWithHostNet + +service: + parsersFiles: + - /fluent-bit/parsers/parsers.conf + extraParsers: | + 
    [PARSER]
+        Name                kubernetes
+        Format              regex
+        Regex               ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
+
+input:
+  name: "tail"
+  enabled: true
+  tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
+  path: "/var/log/containers/*.log"
+  db: "/var/log/flb_kube.db"
+  memBufLimit: 5MB
+  skipLongLines: "On"
+  refreshInterval: 10
+  extraInputs: |
+    multiline.parser        docker, cri
+    Tag_Regex               (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
+
+
+# NOTE: extraFilters config for using the Kubelet to get the metadata instead of talking to the API server in large clusters
+filter:
+  name: "kubernetes"
+  match: "systempods.*"
+  kubeURL: "https://kubernetes.default.svc.cluster.local:443"
+  mergeLog: "On"
+  mergeLogKey: "log_processed"
+  keepLog: "On"
+  k8sLoggingParser: "On"
+  k8sLoggingExclude: "Off"
+  bufferSize: "0"
+  extraFilters: |
+    Kube_Tag_Prefix    systempods.
+    Regex_Parser       kubernetes
+    Labels             On
+    Annotations        Off
+    Use_Kubelet        true
+    Kubelet_Port       10250
+    Kube_CA_File       /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    Kube_Token_File    /var/run/secrets/kubernetes.io/serviceaccount/token
+
+# CAUTION: Do not use the `cloudwatch` plugin. This Golang plugin is no longer recommended by AWS; use the C plugin (`cloudWatchLogs`) instead for better performance.
+# cloudWatch:
+#   enabled: false
+
+# This is the new high-performance C plugin for CloudWatch Logs. See the docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
+cloudWatchLogs:
+  enabled: true
+  match: "systempods.*"
+  region: ${region}
+  logGroupName: ${cloudwatch_log_group}
+  autoCreateGroup: false
+  extraOutputs: |
+    log_key               log
+
+#----------------------------------------------------------#
+# OUTPUT logs to S3
+#----------------------------------------------------------#
+
+# This is an example of writing logs to an S3 bucket.
+# It writes the system pod logs into a dedicated prefix.
+
+additionalOutputs: |
+  [OUTPUT]
+      Name                            s3
+      Match                           systempods.*
+      region                          ${region}
+      bucket                          ${s3_bucket_name}
+      total_file_size                 100M
+      s3_key_format                   /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
+      s3_key_format_tag_delimiters    ..
+      store_dir                       /home/ec2-user/buffer
+      upload_timeout                  10m
+      log_key                         log
+
+
+# Resource config for large clusters
+resources:
+  limits:
+    cpu: 1000m
+    memory: 1500Mi
+  requests:
+    cpu: 500m
+    memory: 500Mi
+
+## Assign a PriorityClassName to pods if set
+priorityClassName: system-node-critical
+
+# This toleration allows the DaemonSet pods to be scheduled onto any node, regardless of that node's taints.
+tolerations: + - operator: Exists diff --git a/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml new file mode 100644 index 000000000..5a42794f2 --- /dev/null +++ b/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml @@ -0,0 +1,25 @@ +autoDiscovery: + clusterName: ${eks_cluster_id} + +awsRegion: ${aws_region} + +cloudProvider: aws + +extraArgs: + aws-use-static-instance-list: true + +# Best practice to update the resource requests and limits for each add-on +resources: + limits: + cpu: 1000m + memory: 1G + requests: + cpu: 200m + memory: 512Mi + +# Best practice to updateStrategy for each add-on +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 diff --git a/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml new file mode 100644 index 000000000..64cb540bf --- /dev/null +++ b/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml @@ -0,0 +1,40 @@ +nameOverride: kube-dns-autoscaler + +# Formula for controlling the replicas. Adjust according to your needs +# replicas = max( ceil( cores * 1/coresPerReplica ) , ceil( nodes * 1/nodesPerReplica ) ) +# replicas = min(replicas, max) +# replicas = max(replicas, min) +config: + linear: + coresPerReplica: 256 + nodesPerReplica: 16 + min: 1 + max: 100 + preventSinglePointFailure: true + includeUnschedulableNodes: true + +# Target to scale. In format: deployment/*, replicationcontroller/* or replicaset/* (not case sensitive). +options: + target: ${target} + +serviceAccount: + create: true + name: kube-dns-autoscaler + +podSecurityContext: + seccompProfile: + type: RuntimeDefault + supplementalGroups: [ 65534 ] + fsGroup: 65534 + +resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + +tolerations: + - key: "CriticalAddonsOnly" + operator: "Exists" diff --git a/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml new file mode 100644 index 000000000..cc7687163 --- /dev/null +++ b/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml @@ -0,0 +1,65 @@ +prometheus: + serviceAccount: + create: true + name: ${amp_sa} + annotations: + eks.amazonaws.com/role-arn: ${amp_irsa} + prometheusSpec: + remoteWrite: + - url: ${amp_remotewrite_url} + sigv4: + region: ${region} + queueConfig: + maxSamplesPerSend: 1000 + maxShards: 200 + capacity: 2500 + retention: 5h + scrapeInterval: 30s + evaluationInterval: 30s + scrapeTimeout: 10s + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: ${storage_class_type} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + # Scrape metrics for Yunikorn add-on + additionalScrapeConfigs: + - job_name: yunikorn + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /ws/v1//metrics + scheme: http + dns_sd_configs: + - names: + - yunikorn-service.yunikorn.svc + type: 'A' + port: 9080 +alertmanager: + enabled: false + +grafana: + enabled: true + defaultDashboardsEnabled: true +# Adding AMP datasource to Grafana config + serviceAccount: + create: false + name: ${amp_sa} + grafana.ini: + auth: + sigv4_auth_enabled: true + additionalDataSources: + - name: AMP + editable: true + jsonData: + sigV4Auth: true + sigV4Region: ${region} + type: prometheus + isDefault: false + url: ${amp_url} diff --git a/ai-ml/mlflow/helm-values/kube-prometheus.yaml b/ai-ml/mlflow/helm-values/kube-prometheus.yaml 
new file mode 100644 index 000000000..dedff553b --- /dev/null +++ b/ai-ml/mlflow/helm-values/kube-prometheus.yaml @@ -0,0 +1,36 @@ +prometheus: + prometheusSpec: + retention: 5h + scrapeInterval: 30s + evaluationInterval: 30s + scrapeTimeout: 10s + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: ${storage_class_type} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + # Scrape metrics for Yunikorn add-on + additionalScrapeConfigs: + - job_name: yunikorn + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /ws/v1//metrics + scheme: http + dns_sd_configs: + - names: + - yunikorn-service.yunikorn.svc + type: 'A' + port: 9080 +alertmanager: + enabled: false + +grafana: + enabled: true + defaultDashboardsEnabled: true diff --git a/ai-ml/mlflow/helm-values/metrics-server-values.yaml b/ai-ml/mlflow/helm-values/metrics-server-values.yaml new file mode 100644 index 000000000..bc806ced6 --- /dev/null +++ b/ai-ml/mlflow/helm-values/metrics-server-values.yaml @@ -0,0 +1,52 @@ +# HA config for metrics-server +image: + repository: registry.k8s.io/metrics-server/metrics-server + pullPolicy: IfNotPresent + +serviceAccount: + create: true + name: metrics-server + +rbac: + create: true + pspEnabled: false + +apiService: + create: true + +podLabels: + k8s-app: metrics-server + +# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true +replicas: 2 + +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + +podDisruptionBudget: + enabled: true + minAvailable: 1 + +defaultArgs: + - --cert-dir=/tmp + - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname + - --kubelet-use-node-status-port + - --metric-resolution=15s + +resources: + requests: + cpu: 200m + memory: 512Mi + +affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + k8s-app: metrics-server + namespaces: + - kube-system + topologyKey: kubernetes.io/hostname diff --git a/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml b/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml new file mode 100644 index 000000000..47a978c89 --- /dev/null +++ b/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml @@ -0,0 +1,57 @@ +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: default +spec: + # Wich AWS Node Template to pick + providerRef: + name: default + + # ttlSecondsAfterEmpty: 30 + + # Requirements that constrain the parameters of provisioned nodes. + # These requirements are combined with pod.spec.affinity.nodeAffinity rules. + # Operators { In, NotIn } are supported to enable including or excluding values + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m", "r"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand + operator: In + values: ["on-demand", "spot"] + limits: + resources: + cpu: 20 # CPU Cores across all instances + memory: 2000Gi + + # Enables consolidation which attempts to reduce cluster cost by both removing un-needed nodes and down-sizing those + # that can't be removed. Mutually exclusive with the ttlSecondsAfterEmpty parameter. 
+ consolidation: + enabled: true +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: default +spec: + subnetSelector: + Name: ${cluster_name}-private* # Name of the Subnets to spin up the nodes + securityGroupSelector: # required, when not using launchTemplate + Name: ${cluster_name}-node* # name of the SecurityGroup to be used with Nodes + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + tags: + managed-by: "karpenter" + intent: "apps" + Name: "karpenter-node-default" diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf new file mode 100644 index 000000000..734c0ec63 --- /dev/null +++ b/ai-ml/mlflow/main.tf @@ -0,0 +1,66 @@ +provider "aws" { + region = local.region +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +# ECR always authenticates with `us-east-1` region +# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html +provider "aws" { + alias = "ecr" + region = "us-east-1" +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + } +} + +provider "kubectl" { + apply_retry_count = 10 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false + token = data.aws_eks_cluster_auth.this.token +} + +data "aws_availability_zones" "available" {} +data "aws_region" "current" {} +data "aws_caller_identity" "current" {} +data "aws_partition" "current" {} + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + +#--------------------------------------------------------------- +# Local variables +#--------------------------------------------------------------- +locals { + name = var.name + region = var.region + vpc_cidr = var.vpc_cidr + azs = slice(data.aws_availability_zones.available.names, 0, 3) + account_id = data.aws_caller_identity.current.account_id + partition = data.aws_partition.current.partition + + mlflow_name = "mlflow" + mlflow_namespace = "mlflow" + mlflow_service_account = "mlflow" + + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf new file mode 100644 index 000000000..b88d9abec --- /dev/null +++ b/ai-ml/mlflow/mlflow-core.tf @@ -0,0 +1,245 @@ +#--------------------------------------------------------------- +# RDS Postgres Database for MLflow Backend +#--------------------------------------------------------------- +module "db" { + count = var.enable_mlflow ? 
1 : 0 + source = "terraform-aws-modules/rds/aws" + version = "~> 5.0" + + identifier = local.mlflow_name + + engine = "postgres" + engine_version = "14.3" + family = "postgres14" + major_engine_version = "14" + instance_class = "db.m6i.xlarge" + + storage_type = "io1" + allocated_storage = 100 + iops = 3000 + + db_name = local.mlflow_name + username = local.mlflow_name + create_random_password = false + password = sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string) + port = 5432 + + multi_az = true + db_subnet_group_name = module.vpc.database_subnet_group + vpc_security_group_ids = [module.security_group[0].security_group_id] + + maintenance_window = "Mon:00:00-Mon:03:00" + backup_window = "03:00-06:00" + enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] + create_cloudwatch_log_group = true + + backup_retention_period = 5 + skip_final_snapshot = true + deletion_protection = false + + performance_insights_enabled = true + performance_insights_retention_period = 7 + create_monitoring_role = true + monitoring_interval = 60 + monitoring_role_name = "mlflow-backend" + monitoring_role_use_name_prefix = true + monitoring_role_description = "MLflow Postgres Backend for monitoring role" + + parameters = [ + { + name = "autovacuum" + value = 1 + }, + { + name = "client_encoding" + value = "utf8" + } + ] + + tags = local.tags +} + +#--------------------------------------------------------------- +# MLflow Postgres Backend DB Master password +#--------------------------------------------------------------- +resource "random_password" "postgres" { + count = var.enable_mlflow ? 1 : 0 + length = 16 + special = false +} +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "postgres" { + count = var.enable_mlflow ? 1 : 0 + name = "postgres-2" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "postgres" { + count = var.enable_mlflow ? 1 : 0 + secret_id = aws_secretsmanager_secret.postgres[0].id + secret_string = random_password.postgres[0].result +} + +#--------------------------------------------------------------- +# PostgreSQL RDS security group +#--------------------------------------------------------------- +module "security_group" { + count = var.enable_mlflow ? 1 : 0 + source = "terraform-aws-modules/security-group/aws" + version = "~> 5.0" + + name = local.name + description = "Complete PostgreSQL example security group" + vpc_id = module.vpc.vpc_id + + # ingress + ingress_with_cidr_blocks = [ + { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + description = "PostgreSQL access from within VPC" + cidr_blocks = "${module.vpc.vpc_cidr_block}" + }, + ] + + tags = local.tags +} + + +#--------------------------------------------------------------- +# S3 bucket for MLflow artifacts +#--------------------------------------------------------------- + +#tfsec:ignore:* +module "mlflow_s3_bucket" { + count = var.enable_mlflow ? 
1 : 0 + source = "terraform-aws-modules/s3-bucket/aws" + version = "~> 3.0" + + bucket_prefix = "${local.name}-logs-" + + # For example only - please evaluate for your environment + force_destroy = true + + server_side_encryption_configuration = { + rule = { + apply_server_side_encryption_by_default = { + sse_algorithm = "AES256" + } + } + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# MLflow Namespace +#--------------------------------------------------------------- +resource "kubernetes_namespace_v1" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + metadata { + name = local.mlflow_namespace + } + timeouts { + delete = "15m" + } +} + +resource "kubernetes_service_account_v1" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + metadata { + name = local.mlflow_service_account + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.mlflow_irsa[0].iam_role_arn } + } + + automount_service_account_token = true +} + +resource "kubernetes_secret_v1" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + metadata { + name = "${local.mlflow_service_account}-secret" + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.mlflow[0].metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.mlflow[0].metadata[0].name + } + } + + type = "kubernetes.io/service-account-token" +} + +# Create IAM Role for Service Account (IRSA) Only if Airflow is enabled +module "mlflow_irsa" { + count = var.enable_mlflow ? 1 : 0 + + source = "aws-ia/eks-blueprints-addon/aws" + version = "~> 1.0" #ensure to update this to the latest/desired version + + # Disable helm release + create_release = false + + # IAM role for service account (IRSA) + create_role = true + create_policy = false # Policy is created in the next resource + + role_name = local.mlflow_service_account + role_policies = { mlflow_policy = aws_iam_policy.mlflow[0].arn } + + oidc_providers = { + this = { + provider_arn = module.eks.oidc_provider_arn + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + service_account = local.mlflow_service_account + } + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# IAM policy for MLflow for accesing S3 artifacts and RDS Postgres backend +#--------------------------------------------------------------- +resource "aws_iam_policy" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + + description = "IAM policy for MLflow" + name_prefix = format("%s-%s-", local.name, "mlflow") + path = "/" + policy = data.aws_iam_policy_document.mlflow[0].json +} + +data "aws_iam_policy_document" "mlflow" { + count = var.enable_mlflow ? 
1 : 0 + statement { + sid = "" + effect = "Allow" + resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}"] + + actions = [ + "s3:ListBucket" + ] + } + statement { + sid = "" + effect = "Allow" + resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}/*"] + + actions = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject" + ] + } + statement { + sid = "" + effect = "Allow" + resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] + + actions = [ + "rds-db:connect", + ] + } +} diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf new file mode 100644 index 000000000..3c2034b62 --- /dev/null +++ b/ai-ml/mlflow/outputs.tf @@ -0,0 +1,14 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + +output "eks_api_server_url" { + description = "Your eks API server endpoint" + value = module.eks.cluster_endpoint +} + +output "grafana_secret_name" { + description = "Grafana password secret name" + value = aws_secretsmanager_secret.grafana.name +} diff --git a/ai-ml/mlflow/variables.tf b/ai-ml/mlflow/variables.tf new file mode 100644 index 000000000..8a41a224b --- /dev/null +++ b/ai-ml/mlflow/variables.tf @@ -0,0 +1,41 @@ +variable "name" { + description = "Name of the VPC and EKS Cluster" + default = "mlflow-eks" + type = string +} + +variable "region" { + description = "Region" + type = string + default = "us-west-2" +} + +variable "eks_cluster_version" { + description = "EKS Cluster version" + default = "1.27" + type = string +} + +variable "vpc_cidr" { + description = "VPC CIDR" + default = "10.1.0.0/16" + type = string +} + +variable "db_private_subnets" { + description = "Private Subnets CIDRs. 254 IPs per Subnet/AZ for Airflow DB." 
+ default = ["10.0.20.0/26", "10.0.21.0/26"] + type = list(string) +} + +variable "enable_amazon_prometheus" { + description = "Enable AWS Managed Prometheus service" + type = bool + default = true +} + +variable "enable_mlflow" { + description = "Enable MMLflow" + type = bool + default = true +} \ No newline at end of file diff --git a/ai-ml/mlflow/versions.tf b/ai-ml/mlflow/versions.tf new file mode 100644 index 000000000..be6e7d672 --- /dev/null +++ b/ai-ml/mlflow/versions.tf @@ -0,0 +1,33 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.72" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.4.1" + } + random = { + source = "hashicorp/random" + version = "3.3.2" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.14" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "doeks-github-actions-e2e-test-state" + # region = "us-west-2" + # key = "e2e/argo-workflow/terraform.tfstate" + # } +} diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf new file mode 100644 index 000000000..ffe29219f --- /dev/null +++ b/ai-ml/mlflow/vpc.tf @@ -0,0 +1,36 @@ +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = local.vpc_cidr + azs = local.azs + + # Three private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k)] + + # ------------------------------ + # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments + # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW + public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 10)] + + # ------------------------------ + # Private Subnets for MLflow backend store + database_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 20)] + create_database_subnet_group = true + create_database_subnet_route_table = true + + enable_nat_gateway = true + single_nat_gateway = true + enable_dns_hostnames = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + } + + tags = local.tags +} From 5c6efe4d0098a8ea1c91a5debe069bfecad8d077 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 6 Sep 2023 13:02:12 +0100 Subject: [PATCH 2/8] :rocket: MLflow first commit --- ai-ml/mlflow/mlflow-core.tf | 4 ++-- ai-ml/mlflow/outputs.tf | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index b88d9abec..fd2273e1a 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -117,7 +117,7 @@ module "mlflow_s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" - bucket_prefix = "${local.name}-logs-" + bucket_prefix = "${local.name}-artifacts-" # For example only - please evaluate for your environment force_destroy = true @@ -236,7 +236,7 @@ data "aws_iam_policy_document" "mlflow" { statement { sid = "" effect = "Allow" - resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] + resources = 
["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_endpoint}/${local.mlflow_name}"] actions = [ "rds-db:connect", diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf index 3c2034b62..2ea6f9293 100644 --- a/ai-ml/mlflow/outputs.tf +++ b/ai-ml/mlflow/outputs.tf @@ -12,3 +12,13 @@ output "grafana_secret_name" { description = "Grafana password secret name" value = aws_secretsmanager_secret.grafana.name } + +output "mlflow_s3_artifacts" { + description = "S3 bucket for MLflow artifacts" + value = module.mlflow_s3_bucket[0].s3_bucket_id +} + +output "mlflow_db_backend" { + description = "Amazon RDS Postgres database for MLflow backend" + value = module.db[0].db_instance_endpoint +} \ No newline at end of file From 3ee59d37efc118d0ff735f5102b8a4ffacf9d9e8 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 6 Sep 2023 13:19:17 +0100 Subject: [PATCH 3/8] :rocket: MLflow first commit --- ai-ml/mlflow/mlflow-core.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index fd2273e1a..6117f4918 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -236,7 +236,7 @@ data "aws_iam_policy_document" "mlflow" { statement { sid = "" effect = "Allow" - resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_endpoint}/${local.mlflow_name}"] + resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] actions = [ "rds-db:connect", From efef5dd853ae853a33786192ddd398a9b6b92e2c Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 4 Oct 2023 15:15:03 +0100 Subject: [PATCH 4/8] feat: MLflow Tracking blueprint running on EKS --- ai-ml/mlflow/README.md | 93 ++++++++++++++++++- ai-ml/mlflow/addons.tf | 81 +++++++++------- ai-ml/mlflow/cleanup.sh | 45 +++++++++ ai-ml/mlflow/eks.tf | 41 +++++++- .../helm-values/ingress-nginx-values.yaml | 11 +++ .../helm-values/mlflow-tracking-values.yaml | 88 ++++++++++++++++++ ai-ml/mlflow/helm-values/nvidia-values.yaml | 10 ++ ai-ml/mlflow/install.sh | 37 ++++++++ ai-ml/mlflow/main.tf | 16 ++-- ai-ml/mlflow/mlflow-core.tf | 34 +++---- ai-ml/mlflow/outputs.tf | 6 +- ai-ml/mlflow/variables.tf | 19 ++-- ai-ml/mlflow/vpc.tf | 35 +++++-- 13 files changed, 436 insertions(+), 80 deletions(-) create mode 100644 ai-ml/mlflow/cleanup.sh create mode 100644 ai-ml/mlflow/helm-values/ingress-nginx-values.yaml create mode 100644 ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml create mode 100644 ai-ml/mlflow/helm-values/nvidia-values.yaml create mode 100755 ai-ml/mlflow/install.sh diff --git a/ai-ml/mlflow/README.md b/ai-ml/mlflow/README.md index ff644528d..3f610da29 100755 --- a/ai-ml/mlflow/README.md +++ b/ai-ml/mlflow/README.md @@ -1 +1,92 @@ -# MLflow on EKS (Coming Soon) +# MLflow on EKS + +Docs comming soon ... 
+ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [aws](#requirement\_aws) | >= 3.72 | +| [helm](#requirement\_helm) | >= 2.4.1 | +| [kubectl](#requirement\_kubectl) | >= 1.14 | +| [kubernetes](#requirement\_kubernetes) | >= 2.10 | +| [random](#requirement\_random) | 3.3.2 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 5.19.0 | +| [aws.ecr](#provider\_aws.ecr) | 5.19.0 | +| [kubectl](#provider\_kubectl) | 1.14.0 | +| [kubernetes](#provider\_kubernetes) | 2.23.0 | +| [random](#provider\_random) | 3.3.2 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | +| [db](#module\_db) | terraform-aws-modules/rds/aws | ~> 5.0 | +| [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 | +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | +| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.3 | +| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.2.3 | +| [fluentbit\_s3\_bucket](#module\_fluentbit\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | +| [mlflow\_irsa](#module\_mlflow\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | +| [mlflow\_s3\_bucket](#module\_mlflow\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | +| [security\_group](#module\_security\_group) | terraform-aws-modules/security-group/aws | ~> 5.0 | +| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | + +## Resources + +| Name | Type | +|------|------| +| [aws_iam_policy.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.mlflow](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_prometheus_workspace.amp](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_workspace) | resource | +| [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | +| [aws_secretsmanager_secret.postgres](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | +| [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | +| [aws_secretsmanager_secret_version.postgres](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | +| [aws_security_group.ingress_nginx_external](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | +| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource | +| [kubernetes_namespace_v1.mlflow](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | +| [kubernetes_secret_v1.mlflow](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | 
resource | +| [kubernetes_service_account_v1.mlflow](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | +| [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | +| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/3.3.2/docs/resources/password) | resource | +| [random_password.postgres](https://registry.terraform.io/providers/hashicorp/random/3.3.2/docs/resources/password) | resource | +| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_ecrpublic_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecrpublic_authorization_token) | data source | +| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_iam_policy_document.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.mlflow](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | +| [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | +| [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no | +| [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | +| [enable\_mlflow\_tracking](#input\_enable\_mlflow\_tracking) | Enable MLflow Tracking | `bool` | `true` | no | +| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"mlflow-on-eks"` | no | +| [region](#input\_region) | Region | `string` | `"us-west-2"` | no | +| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
<pre>[<br>  "100.64.0.0/16"<br>]</pre>
| no | +| [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR | `string` | `"10.1.0.0/21"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [configure\_kubectl](#output\_configure\_kubectl) | Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig | +| [eks\_api\_server\_url](#output\_eks\_api\_server\_url) | Your eks API server endpoint | +| [grafana\_secret\_name](#output\_grafana\_secret\_name) | Grafana password secret name | +| [mlflow\_db\_backend](#output\_mlflow\_db\_backend) | Amazon RDS Postgres database for MLflow backend | +| [mlflow\_s3\_artifacts](#output\_mlflow\_s3\_artifacts) | S3 bucket for MLflow artifacts | diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf index 79b07cf51..3cf69f05e 100644 --- a/ai-ml/mlflow/addons.tf +++ b/ai-ml/mlflow/addons.tf @@ -114,45 +114,33 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_aws_load_balancer_controller = true aws_load_balancer_controller = { - chart_version = "1.6.0" # min version required to use SG for NLB feature + set = [{ + name = "enableServiceMutatorWebhook" + value = "false" + }] } #--------------------------------------- - # Ingress Nginx external + # Ingress Nginx Add-on #--------------------------------------- enable_ingress_nginx = true ingress_nginx = { - name = "ingress-nginx-external" - values = [ - <<-EOT - controller: - replicaCount: 3 - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - service.beta.kubernetes.io/aws-load-balancer-security-groups: ${aws_security_group.ingress_nginx_external.id} - service.beta.kubernetes.io/aws-load-balancer-manage-backend-security-group-rules: true - loadBalancerClass: service.k8s.aws/nlb - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app.kubernetes.io/instance: ingress-nginx-external - - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app.kubernetes.io/instance: ingress-nginx-external - minAvailable: 2 - ingressClassResource: - name: ingress-nginx-external - default: false - EOT - ] + values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] + } + + helm_releases = { + #--------------------------------------- + # NVIDIA Device Plugin Add-on + #--------------------------------------- + nvidia-device-plugin = { + description = "A Helm chart for NVIDIA Device Plugin" + namespace = "nvidia-device-plugin" + create_namespace = true + chart = "nvidia-device-plugin" + chart_version = "0.14.0" + repository = "https://nvidia.github.io/k8s-device-plugin" + values = [file("${path.module}/helm-values/nvidia-values.yaml")] + } } #--------------------------------------- @@ -164,7 +152,7 @@ module "eks_blueprints_addons" { # 2- Grafana Admin user: admin # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` #--------------------------------------------------------------- - enable_kube_prometheus_stack = false + enable_kube_prometheus_stack = true kube_prometheus_stack = { values = [ var.enable_amazon_prometheus ? 
templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { @@ -195,10 +183,31 @@ module "eks_blueprints_addons" { #--------------------------------------------------------------- module "eks_data_addons" { source = "aws-ia/eks-data-addons/aws" - version = "~> 1.0" # ensure to update this to the latest/desired version + version = "~> 1.2.3" # ensure to update this to the latest/desired version oidc_provider_arn = module.eks.oidc_provider_arn + #--------------------------------------------------------------- + # MLflow Tracking Add-on + #--------------------------------------------------------------- + + enable_mlflow_tracking = true + mlflow_tracking_helm_config = { + mlflow_namespace = try(kubernetes_namespace_v1.mlflow[0].metadata[0].name, local.mlflow_namespace) + + values = [templatefile("${path.module}/helm-values/mlflow-tracking-values.yaml", { + mlflow_sa = local.mlflow_service_account + mlflow_irsa = module.mlflow_irsa[0].iam_role_arn + # MLflow Postgres RDS Config + mlflow_db_username = local.mlflow_name + mlflow_db_password = try(sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string), "") + mlflow_db_name = try(module.db[0].db_instance_name, "") + mlflow_db_host = try(element(split(":", module.db[0].db_instance_endpoint), 0), "") + # S3 bucket config for artifacts + s3_bucket_name = try(module.mlflow_s3_bucket[0].s3_bucket_id, "") + })] + } + } #--------------------------------------------------------------- @@ -266,7 +275,7 @@ module "fluentbit_s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" - bucket_prefix = "${local.name}-argo-workflow-logs-" + bucket_prefix = "${local.name}-fluentbit-logs-" # For example only - please evaluate for your environment force_destroy = true server_side_encryption_configuration = { diff --git a/ai-ml/mlflow/cleanup.sh b/ai-ml/mlflow/cleanup.sh new file mode 100644 index 000000000..6f96c6ef5 --- /dev/null +++ b/ai-ml/mlflow/cleanup.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -o errexit +set -o pipefail + +targets=( + "module.eks_data_addons" + "module.eks_blueprints_addons" +) + +#------------------------------------------- +# Helpful to delete the stuck in "Terminating" namespaces +# Rerun the cleanup.sh script to detect and delete the stuck resources +#------------------------------------------- +terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name') + +# If there are no terminating namespaces, exit the script +if [[ -z $terminating_namespaces ]]; then + echo "No terminating namespaces found" +fi + +for ns in $terminating_namespaces; do + echo "Terminating namespace: $ns" + kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - +done + +for target in "${targets[@]}" +do + terraform destroy -target="$target" -auto-approve + destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) + if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then + echo "SUCCESS: Terraform destroy of $target completed successfully" + else + echo "FAILED: Terraform destroy of $target failed" + exit 1 + fi +done + +terraform destroy -auto-approve +destroy_output=$(terraform destroy -auto-approve 2>&1) +if [[ $? 
-eq 0 && $destroy_output == *"Destroy complete!"* ]]; then + echo "SUCCESS: Terraform destroy of all targets completed successfully" +else + echo "FAILED: Terraform destroy of all targets failed" + exit 1 +fi diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf index 4d486df1c..cfb06b56e 100644 --- a/ai-ml/mlflow/eks.tf +++ b/ai-ml/mlflow/eks.tf @@ -79,7 +79,11 @@ module "eks" { core_node_group = { name = "core-node-group" description = "EKS Core node group for hosting critical add-ons" - subnet_ids = module.vpc.private_subnets + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) min_size = 3 max_size = 9 @@ -110,5 +114,40 @@ module "eks" { "karpenter.sh/discovery" = local.name }) } + + gpu1 = { + name = "gpu-node-grp" + description = "EKS Node Group to run GPU workloads" + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) + + ami_type = "AL2_x86_64_GPU" + min_size = 0 + max_size = 1 + desired_size = 0 + + instance_types = ["g5.12xlarge"] + + labels = { + WorkerType = "ON_DEMAND" + NodeGroupType = "gpu" + } + + taints = { + gpu = { + key = "nvidia.com/gpu" + effect = "NO_SCHEDULE" + operator = "EXISTS" + } + } + + tags = merge(local.tags, { + Name = "gpu-node-grp", + "karpenter.sh/discovery" = local.name + }) + } } } diff --git a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml new file mode 100644 index 000000000..126b30152 --- /dev/null +++ b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml @@ -0,0 +1,11 @@ +controller: + service: + externalTrafficPolicy: "Local" + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing # Private Load Balancer can only be accessed within the VPC + targetPorts: + http: http + https: http \ No newline at end of file diff --git a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml new file mode 100644 index 000000000..7614409f7 --- /dev/null +++ b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml @@ -0,0 +1,88 @@ +# Default values for mlflow-tracking-server. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +image: + repository: public.ecr.aws/data-on-eks/mlflow + pullPolicy: Always + tag: 2.7.1 + +imagePullSecrets: [] + +nameOverride: mlflow-tracking-server + +fullnameOverride: mlflow-tracking-server + +podAnnotations: {} + +replicaCount: 1 + +service: + type: ClusterIP + port: 5000 + +serviceAccount: + # Specifies whether a service account should be created + create: false + # Annotations to add to the service account + annotations: + eks.amazonaws.com/role-arn: ${mlflow_irsa} + labels: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: ${mlflow_sa} + +ingress: + enabled: true + className: nginx + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/use-regex: "true" + hosts: + - host: + paths: + - path: / + pathType: Prefix + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +mlflow: + artifacts: + bucketName: ${s3_bucket_name} + database: + name: ${mlflow_db_name} + username: ${mlflow_db_username} + password: ${mlflow_db_password} + host: ${mlflow_db_host} + port: 5432 + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} \ No newline at end of file diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml new file mode 100644 index 000000000..3c50e8c1f --- /dev/null +++ b/ai-ml/mlflow/helm-values/nvidia-values.yaml @@ -0,0 +1,10 @@ +gfd: + enabled: true +nfd: + enabled: true + worker: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" \ No newline at end of file diff --git a/ai-ml/mlflow/install.sh b/ai-ml/mlflow/install.sh new file mode 100755 index 000000000..2832252fb --- /dev/null +++ b/ai-ml/mlflow/install.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# List of Terraform modules to apply in sequence +targets=( + "module.vpc" + "module.eks" + "module.ebs_csi_driver_irsa" + "module.eks_blueprints_addons" + "module.db" +) + +# Initialize Terraform +echo "Initializing ..." +terraform init --upgrade || echo "\"terraform init\" failed" + +# Apply modules in sequence +for target in "${targets[@]}" +do + echo "Applying module $target..." + apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) + if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of $target completed successfully" + else + echo "FAILED: Terraform apply of $target failed" + exit 1 + fi +done + +# Final apply to catch any remaining resources +echo "Applying remaining resources..." 
+apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) +if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of all modules completed successfully" +else + echo "FAILED: Terraform apply of all modules failed" + exit 1 +fi diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf index 734c0ec63..df7b59945 100644 --- a/ai-ml/mlflow/main.tf +++ b/ai-ml/mlflow/main.tf @@ -4,7 +4,7 @@ provider "aws" { provider "kubernetes" { host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) token = data.aws_eks_cluster_auth.this.token } @@ -18,7 +18,7 @@ provider "aws" { provider "helm" { kubernetes { host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) token = data.aws_eks_cluster_auth.this.token } } @@ -26,13 +26,13 @@ provider "helm" { provider "kubectl" { apply_retry_count = 10 host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false token = data.aws_eks_cluster_auth.this.token } data "aws_availability_zones" "available" {} -data "aws_region" "current" {} +# data "aws_region" "current" {} data "aws_caller_identity" "current" {} data "aws_partition" "current" {} @@ -51,12 +51,12 @@ locals { name = var.name region = var.region vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 3) + azs = slice(data.aws_availability_zones.available.names, 0, 2) account_id = data.aws_caller_identity.current.account_id partition = data.aws_partition.current.partition - mlflow_name = "mlflow" - mlflow_namespace = "mlflow" + mlflow_name = "mlflow" + mlflow_namespace = "mlflow" mlflow_service_account = "mlflow" tags = { diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index 6117f4918..babe6bd36 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -2,7 +2,7 @@ # RDS Postgres Database for MLflow Backend #--------------------------------------------------------------- module "db" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "terraform-aws-modules/rds/aws" version = "~> 5.0" @@ -42,7 +42,7 @@ module "db" { create_monitoring_role = true monitoring_interval = 60 monitoring_role_name = "mlflow-backend" - monitoring_role_use_name_prefix = true + monitoring_role_use_name_prefix = true monitoring_role_description = "MLflow Postgres Backend for monitoring role" parameters = [ @@ -63,19 +63,19 @@ module "db" { # MLflow Postgres Backend DB Master password #--------------------------------------------------------------- resource "random_password" "postgres" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 length = 16 special = false } #tfsec:ignore:aws-ssm-secret-use-customer-key resource "aws_secretsmanager_secret" "postgres" { - count = var.enable_mlflow ? 1 : 0 - name = "postgres-2" + count = var.enable_mlflow_tracking ? 
1 : 0 + name = "postgres-mlflow" recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy } resource "aws_secretsmanager_secret_version" "postgres" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 secret_id = aws_secretsmanager_secret.postgres[0].id secret_string = random_password.postgres[0].result } @@ -84,7 +84,7 @@ resource "aws_secretsmanager_secret_version" "postgres" { # PostgreSQL RDS security group #--------------------------------------------------------------- module "security_group" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "terraform-aws-modules/security-group/aws" version = "~> 5.0" @@ -99,7 +99,7 @@ module "security_group" { to_port = 5432 protocol = "tcp" description = "PostgreSQL access from within VPC" - cidr_blocks = "${module.vpc.vpc_cidr_block}" + cidr_blocks = "${module.vpc.vpc_cidr_block},${module.vpc.vpc_secondary_cidr_blocks[0]}" }, ] @@ -113,7 +113,7 @@ module "security_group" { #tfsec:ignore:* module "mlflow_s3_bucket" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" @@ -137,7 +137,7 @@ module "mlflow_s3_bucket" { # MLflow Namespace #--------------------------------------------------------------- resource "kubernetes_namespace_v1" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 metadata { name = local.mlflow_namespace } @@ -147,7 +147,7 @@ resource "kubernetes_namespace_v1" "mlflow" { } resource "kubernetes_service_account_v1" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 metadata { name = local.mlflow_service_account namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name @@ -158,7 +158,7 @@ resource "kubernetes_service_account_v1" "mlflow" { } resource "kubernetes_secret_v1" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 metadata { name = "${local.mlflow_service_account}-secret" namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name @@ -171,9 +171,9 @@ resource "kubernetes_secret_v1" "mlflow" { type = "kubernetes.io/service-account-token" } -# Create IAM Role for Service Account (IRSA) Only if Airflow is enabled +# Create IAM Role for Service Account (IRSA) Only if MLflow is enabled module "mlflow_irsa" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "aws-ia/eks-blueprints-addon/aws" version = "~> 1.0" #ensure to update this to the latest/desired version @@ -203,7 +203,7 @@ module "mlflow_irsa" { # IAM policy for MLflow for accesing S3 artifacts and RDS Postgres backend #--------------------------------------------------------------- resource "aws_iam_policy" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 description = "IAM policy for MLflow" name_prefix = format("%s-%s-", local.name, "mlflow") @@ -212,7 +212,7 @@ resource "aws_iam_policy" "mlflow" { } data "aws_iam_policy_document" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 
1 : 0 statement { sid = "" effect = "Allow" @@ -237,7 +237,7 @@ data "aws_iam_policy_document" "mlflow" { sid = "" effect = "Allow" resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] - + actions = [ "rds-db:connect", ] diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf index 2ea6f9293..10ea6529f 100644 --- a/ai-ml/mlflow/outputs.tf +++ b/ai-ml/mlflow/outputs.tf @@ -15,10 +15,10 @@ output "grafana_secret_name" { output "mlflow_s3_artifacts" { description = "S3 bucket for MLflow artifacts" - value = module.mlflow_s3_bucket[0].s3_bucket_id + value = module.mlflow_s3_bucket[0].s3_bucket_id } output "mlflow_db_backend" { description = "Amazon RDS Postgres database for MLflow backend" - value = module.db[0].db_instance_endpoint -} \ No newline at end of file + value = module.db[0].db_instance_endpoint +} diff --git a/ai-ml/mlflow/variables.tf b/ai-ml/mlflow/variables.tf index 8a41a224b..1600e75b5 100644 --- a/ai-ml/mlflow/variables.tf +++ b/ai-ml/mlflow/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "mlflow-eks" + default = "mlflow-on-eks" type = string } @@ -16,15 +16,18 @@ variable "eks_cluster_version" { type = string } +# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs variable "vpc_cidr" { description = "VPC CIDR" - default = "10.1.0.0/16" + default = "10.1.0.0/21" type = string } -variable "db_private_subnets" { - description = "Private Subnets CIDRs. 254 IPs per Subnet/AZ for Airflow DB." - default = ["10.0.20.0/26", "10.0.21.0/26"] +# RFC6598 range 100.64.0.0/10 +# Note you can only /16 range to VPC. You can add multiples of /16 if required +variable "secondary_cidr_blocks" { + description = "Secondary CIDR blocks to be attached to VPC" + default = ["100.64.0.0/16"] type = list(string) } @@ -34,8 +37,8 @@ variable "enable_amazon_prometheus" { default = true } -variable "enable_mlflow" { - description = "Enable MMLflow" +variable "enable_mlflow_tracking" { + description = "Enable MLflow Tracking" type = bool default = true -} \ No newline at end of file +} diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf index ffe29219f..0aa8b7aab 100644 --- a/ai-ml/mlflow/vpc.tf +++ b/ai-ml/mlflow/vpc.tf @@ -1,3 +1,21 @@ +locals { + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + + database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] +} + +#--------------------------------------------------------------- +# VPC +#--------------------------------------------------------------- + module 
"vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" @@ -6,20 +24,23 @@ module "vpc" { cidr = local.vpc_cidr azs = local.azs - # Three private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB - private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k)] - + # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods + secondary_cidr_blocks = var.secondary_cidr_blocks + + # Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) + # ------------------------------ # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 10)] + public_subnets = local.public_subnets # ------------------------------ # Private Subnets for MLflow backend store - database_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 20)] + database_subnets = local.database_private_subnets create_database_subnet_group = true create_database_subnet_route_table = true - + enable_nat_gateway = true single_nat_gateway = true enable_dns_hostnames = true @@ -30,6 +51,8 @@ module "vpc" { private_subnet_tags = { "kubernetes.io/role/internal-elb" = 1 + # Tags subnets for Karpenter auto-discovery + "karpenter.sh/discovery" = local.name } tags = local.tags From 9a61a731f4612c841fb1b3de4743f6189fc494b7 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 4 Oct 2023 17:27:46 +0100 Subject: [PATCH 5/8] feat: MLflow Tracking blueprint running on EKS --- ai-ml/mlflow/helm-values/ingress-nginx-values.yaml | 2 +- ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml | 6 +++--- ai-ml/mlflow/helm-values/nvidia-values.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml index 126b30152..22e48c7f9 100644 --- a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml +++ b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml @@ -8,4 +8,4 @@ controller: service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing # Private Load Balancer can only be accessed within the VPC targetPorts: http: http - https: http \ No newline at end of file + https: http diff --git a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml index 7614409f7..1f604f610 100644 --- a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml +++ b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml @@ -18,7 +18,7 @@ podAnnotations: {} replicaCount: 1 service: - type: ClusterIP + type: ClusterIP port: 5000 serviceAccount: @@ -34,7 +34,7 @@ serviceAccount: ingress: enabled: true - className: nginx + className: nginx annotations: kubernetes.io/ingress.class: nginx nginx.ingress.kubernetes.io/use-regex: "true" @@ -85,4 +85,4 @@ nodeSelector: {} tolerations: [] -affinity: {} \ No newline at end of file +affinity: {} diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml index 3c50e8c1f..9fa59599e 100644 --- a/ai-ml/mlflow/helm-values/nvidia-values.yaml +++ b/ai-ml/mlflow/helm-values/nvidia-values.yaml @@ -7,4 +7,4 @@ nfd: - key: nvidia.com/gpu operator: Exists effect: NoSchedule - - operator: "Exists" \ No newline at end of file + - operator: "Exists" From 
38c8dc00080c6b5bbd9ee6e5b15855d1a9bba81b Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Tue, 12 Dec 2023 15:18:05 +0000 Subject: [PATCH 6/8] Fix PR comments --- ai-ml/mlflow/addons.tf | 23 ++---- ai-ml/mlflow/cleanup.sh | 0 ai-ml/mlflow/helm-values/nvidia-values.yaml | 91 ++++++++++++++++++++- ai-ml/mlflow/main.tf | 1 - ai-ml/mlflow/versions.tf | 2 +- 5 files changed, 98 insertions(+), 19 deletions(-) mode change 100644 => 100755 ai-ml/mlflow/cleanup.sh diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf index 3cf69f05e..a6994ae4b 100644 --- a/ai-ml/mlflow/addons.tf +++ b/ai-ml/mlflow/addons.tf @@ -128,21 +128,6 @@ module "eks_blueprints_addons" { values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] } - helm_releases = { - #--------------------------------------- - # NVIDIA Device Plugin Add-on - #--------------------------------------- - nvidia-device-plugin = { - description = "A Helm chart for NVIDIA Device Plugin" - namespace = "nvidia-device-plugin" - create_namespace = true - chart = "nvidia-device-plugin" - chart_version = "0.14.0" - repository = "https://nvidia.github.io/k8s-device-plugin" - values = [file("${path.module}/helm-values/nvidia-values.yaml")] - } - } - #--------------------------------------- # Prommetheus and Grafana stack #--------------------------------------- @@ -208,6 +193,14 @@ module "eks_data_addons" { })] } + #--------------------------------------------------------------- + # NVIDIA GPU Operator Add-on + #--------------------------------------------------------------- + enable_nvidia_gpu_operator = true + nvidia_gpu_operator_helm_config = { + values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})] + } + } #--------------------------------------------------------------- diff --git a/ai-ml/mlflow/cleanup.sh b/ai-ml/mlflow/cleanup.sh old mode 100644 new mode 100755 diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml index 9fa59599e..60078daa6 100644 --- a/ai-ml/mlflow/helm-values/nvidia-values.yaml +++ b/ai-ml/mlflow/helm-values/nvidia-values.yaml @@ -1,10 +1,97 @@ +# Default values for gpu-operator. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes + +validator: + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + +operator: + repository: nvcr.io/nvidia + priorityClassName: system-node-critical + defaultRuntime: containerd + image: gpu-operator + cleanupCRD: false # This option doesn't do anything even if you change this to true. NVIDIA recommends to use the manual approach of upgrading the CRDs + upgradeCRD: false + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html +# Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers. 
+# With pre-compiled NVIDIA Drivers this process can be faster hence we are using the config values as driver.version: "515-signed" +driver: + enabled: true + repository: nvcr.io/nvidia + image: driver + # Commented this as latest Ubuntu AMIs are failing with this option enabled + # version: "515-signed" # supported DRIVER_BRANCH value currently are 470, 510 and 515 which will install latest drivers available on that branch for current running kernel version. + manager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + +toolkit: + enabled: true + +devicePlugin: + enabled: true + +dcgm: + enabled: false + +dcgmExporter: + enabled: true + gfd: enabled: true -nfd: + +migManager: + enabled: true + +nodeStatusExporter: + enabled: false + +gds: + enabled: false + +vgpuManager: + enabled: false + +vgpuDeviceManager: + enabled: true + +vfioManager: + enabled: true + +sandboxDevicePlugin: enabled: true + +node-feature-discovery: + enableNodeFeatureApi: true worker: tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" - key: nvidia.com/gpu operator: Exists effect: NoSchedule - - operator: "Exists" + - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf index df7b59945..a5e4360ea 100644 --- a/ai-ml/mlflow/main.tf +++ b/ai-ml/mlflow/main.tf @@ -32,7 +32,6 @@ provider "kubectl" { } data "aws_availability_zones" "available" {} -# data "aws_region" "current" {} data "aws_caller_identity" "current" {} data "aws_partition" "current" {} diff --git a/ai-ml/mlflow/versions.tf b/ai-ml/mlflow/versions.tf index be6e7d672..156fc1e49 100644 --- a/ai-ml/mlflow/versions.tf +++ b/ai-ml/mlflow/versions.tf @@ -28,6 +28,6 @@ terraform { # backend "s3" { # bucket = "doeks-github-actions-e2e-test-state" # region = "us-west-2" - # key = "e2e/argo-workflow/terraform.tfstate" + # key = "e2e/mlflow/terraform.tfstate" # } } From 3d9bb2216756f6e1971231baa0aa1a597f7bc2e1 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Tue, 12 Dec 2023 16:22:57 +0000 Subject: [PATCH 7/8] Add variable name pg secret --- ai-ml/mlflow/mlflow-core.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index babe6bd36..81bfa1d95 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -70,7 +70,7 @@ resource "random_password" "postgres" { #tfsec:ignore:aws-ssm-secret-use-customer-key resource "aws_secretsmanager_secret" "postgres" { count = var.enable_mlflow_tracking ? 
1 : 0 - name = "postgres-mlflow" + name = local.mlflow_name recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy } From a546817bcb96f01218d7881cf76ee57351abbed0 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Tue, 12 Dec 2023 16:35:13 +0000 Subject: [PATCH 8/8] Add eks alias in kubectl output --- ai-ml/mlflow/outputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf index 10ea6529f..b5db71900 100644 --- a/ai-ml/mlflow/outputs.tf +++ b/ai-ml/mlflow/outputs.tf @@ -1,6 +1,6 @@ output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" + value = "aws eks --region ${local.region} update-kubeconfig --alias ${module.eks.cluster_name} --name ${module.eks.cluster_name}" } output "eks_api_server_url" {
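
A minimal verification sketch after applying this series, assuming the defaults used above: cluster name "mlflow-on-eks" in "us-west-2", the "mlflow" namespace, and the "mlflow-tracking-server" service listening on port 5000 (adjust these names if you override the variables or Helm values). The commands are standard AWS CLI / kubectl usage, not part of the patches themselves.

    # Point kubectl at the new cluster (same command as the configure_kubectl output)
    aws eks --region us-west-2 update-kubeconfig --alias mlflow-on-eks --name mlflow-on-eks

    # Confirm the MLflow Tracking server pod is running
    kubectl get pods -n mlflow

    # Reach the UI locally without going through the NLB/ingress
    kubectl port-forward svc/mlflow-tracking-server 5000:5000 -n mlflow
    # then open http://localhost:5000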