From 633f28d1dc07c7e4eb5f0b4f68632a3c02c41041 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 6 Sep 2023 12:46:19 +0100 Subject: [PATCH 1/8] infra --- ai-ml/mlflow/addons.tf | 337 ++++++++++++++++++ ai-ml/mlflow/amp.tf | 136 +++++++ ai-ml/mlflow/eks.tf | 114 ++++++ .../helm-values/aws-for-fluentbit-values.yaml | 102 ++++++ .../cluster-autoscaler-values.yaml | 25 ++ .../coredns-autoscaler-values.yaml | 40 +++ .../kube-prometheus-amp-enable.yaml | 65 ++++ ai-ml/mlflow/helm-values/kube-prometheus.yaml | 36 ++ .../helm-values/metrics-server-values.yaml | 52 +++ .../00-karpenter-provisioner-cpu.yaml | 57 +++ ai-ml/mlflow/main.tf | 66 ++++ ai-ml/mlflow/mlflow-core.tf | 245 +++++++++++++ ai-ml/mlflow/outputs.tf | 14 + ai-ml/mlflow/variables.tf | 41 +++ ai-ml/mlflow/versions.tf | 33 ++ ai-ml/mlflow/vpc.tf | 36 ++ 16 files changed, 1399 insertions(+) create mode 100644 ai-ml/mlflow/addons.tf create mode 100644 ai-ml/mlflow/amp.tf create mode 100644 ai-ml/mlflow/eks.tf create mode 100644 ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml create mode 100644 ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml create mode 100644 ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml create mode 100644 ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml create mode 100644 ai-ml/mlflow/helm-values/kube-prometheus.yaml create mode 100644 ai-ml/mlflow/helm-values/metrics-server-values.yaml create mode 100644 ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml create mode 100644 ai-ml/mlflow/main.tf create mode 100644 ai-ml/mlflow/mlflow-core.tf create mode 100644 ai-ml/mlflow/outputs.tf create mode 100644 ai-ml/mlflow/variables.tf create mode 100644 ai-ml/mlflow/versions.tf create mode 100644 ai-ml/mlflow/vpc.tf diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf new file mode 100644 index 000000000..79b07cf51 --- /dev/null +++ b/ai-ml/mlflow/addons.tf @@ -0,0 +1,337 @@ +#--------------------------------------------------------------- +# IRSA for EBS CSI Driver +#--------------------------------------------------------------- +module "ebs_csi_driver_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.20" + role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") + attach_ebs_csi_policy = true + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + tags = local.tags +} +#--------------------------------------------------------------- +# EKS Blueprints Kubernetes Addons +#--------------------------------------------------------------- +module "eks_blueprints_addons" { + # Short commit hash from 8th May using git rev-parse --short HEAD + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.3" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------- + # Amazon EKS Managed Add-ons + #--------------------------------------- + eks_addons = { + aws-ebs-csi-driver = { + service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn + } + coredns = { + preserve = true + } + vpc-cni = { + preserve = true + } + kube-proxy = { + preserve = true + } + } + + #--------------------------------------------------------------- + # CoreDNS Autoscaler helps to scale for large EKS Clusters + # Further tuning for CoreDNS is to 
leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/ + #--------------------------------------------------------------- + enable_cluster_proportional_autoscaler = true + cluster_proportional_autoscaler = { + values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", { + target = "deployment/coredns" + })] + description = "Cluster Proportional Autoscaler for CoreDNS Service" + } + + #--------------------------------------- + # Metrics Server + #--------------------------------------- + enable_metrics_server = true + metrics_server = { + values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] + } + + #--------------------------------------- + # Cluster Autoscaler + #--------------------------------------- + enable_cluster_autoscaler = true + cluster_autoscaler = { + timeout = "300" + values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { + aws_region = var.region, + eks_cluster_id = module.eks.cluster_name + })] + } + + #--------------------------------------- + # AWS for FluentBit - DaemonSet + #--------------------------------------- + enable_aws_for_fluentbit = true + aws_for_fluentbit_cw_log_group = { + use_name_prefix = false + name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group + retention_in_days = 30 + } + aws_for_fluentbit = { + s3_bucket_arns = [ + module.fluentbit_s3_bucket.s3_bucket_arn, + "${module.fluentbit_s3_bucket.s3_bucket_arn}/*}" + ] + values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", { + region = local.region, + cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" + s3_bucket_name = module.fluentbit_s3_bucket.s3_bucket_id + cluster_name = module.eks.cluster_name + })] + } + + #--------------------------------------- + # Karpenter Autoscaler for EKS Cluster + #--------------------------------------- + enable_karpenter = true + karpenter_enable_spot_termination = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + + #--------------------------------------- + # AWS Load Balancer Controller + #--------------------------------------- + enable_aws_load_balancer_controller = true + aws_load_balancer_controller = { + chart_version = "1.6.0" # min version required to use SG for NLB feature + } + + #--------------------------------------- + # Ingress Nginx external + #--------------------------------------- + enable_ingress_nginx = true + ingress_nginx = { + name = "ingress-nginx-external" + values = [ + <<-EOT + controller: + replicaCount: 3 + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + service.beta.kubernetes.io/aws-load-balancer-security-groups: ${aws_security_group.ingress_nginx_external.id} + service.beta.kubernetes.io/aws-load-balancer-manage-backend-security-group-rules: true + loadBalancerClass: service.k8s.aws/nlb + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/instance: ingress-nginx-external + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/instance: ingress-nginx-external + minAvailable: 2 + ingressClassResource: + name: 
ingress-nginx-external + default: false + EOT + ] + } + + #--------------------------------------- + # Prommetheus and Grafana stack + #--------------------------------------- + #--------------------------------------------------------------- + # Install Kafka Montoring Stack with Prometheus and Grafana + # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` + # 2- Grafana Admin user: admin + # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` + #--------------------------------------------------------------- + enable_kube_prometheus_stack = false + kube_prometheus_stack = { + values = [ + var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { + region = local.region + amp_sa = local.amp_ingest_service_account + amp_irsa = module.amp_ingest_irsa[0].iam_role_arn + amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write" + amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}" + storage_class_type = kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class.id + }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {}) + ] + chart_version = "48.1.1" + set_sensitive = [ + { + name = "grafana.adminPassword" + value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string + } + ], + } + + tags = local.tags + +} + + +#--------------------------------------------------------------- +# Data on EKS Kubernetes Addons +#--------------------------------------------------------------- +module "eks_data_addons" { + source = "aws-ia/eks-data-addons/aws" + version = "~> 1.0" # ensure to update this to the latest/desired version + + oidc_provider_arn = module.eks.oidc_provider_arn + +} + +#--------------------------------------------------------------- +# Ingress Nginx external security groups +#--------------------------------------------------------------- +resource "aws_security_group" "ingress_nginx_external" { + name = "ingress-nginx-external" + description = "Allow public HTTP and HTTPS traffic" + vpc_id = module.vpc.vpc_id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] # modify to your requirements + } + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] # modify to your requirements + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# Grafana Admin credentials resources +#--------------------------------------------------------------- +data "aws_secretsmanager_secret_version" "admin_password_version" { + secret_id = aws_secretsmanager_secret.grafana.id + depends_on = [aws_secretsmanager_secret_version.grafana] +} + +resource "random_password" "grafana" { + length = 16 + special = true + override_special = "@_" +} + +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "grafana" { + name = "${local.name}-grafana" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "grafana" { + secret_id = aws_secretsmanager_secret.grafana.id + secret_string = 
random_password.grafana.result +} + +#--------------------------------------------------------------- +# S3 log bucket for FluentBit +#--------------------------------------------------------------- +#tfsec:ignore:* +module "fluentbit_s3_bucket" { + source = "terraform-aws-modules/s3-bucket/aws" + version = "~> 3.0" + + bucket_prefix = "${local.name}-argo-workflow-logs-" + # For example only - please evaluate for your environment + force_destroy = true + server_side_encryption_configuration = { + rule = { + apply_server_side_encryption_by_default = { + sse_algorithm = "AES256" + } + } + } + + tags = local.tags +} + +#--------------------------------------- +# Karpenter Provisioners for workloads +#--------------------------------------- +data "kubectl_path_documents" "karpenter_provisioners" { + pattern = "${path.module}/karpenter-provisioners/*.yaml" + vars = { + cluster_name = module.eks.cluster_name + } +} + +resource "kubectl_manifest" "karpenter_provisioner" { + for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) + yaml_body = each.value + + depends_on = [module.eks_blueprints_addons] +} + +#--------------------------------------------------------------- +# GP3 Encrypted Storage Class +#--------------------------------------------------------------- + +resource "kubernetes_annotations" "gp2_default" { + annotations = { + "storageclass.kubernetes.io/is-default-class" : "false" + } + api_version = "storage.k8s.io/v1" + kind = "StorageClass" + metadata { + name = "gp2" + } + force = true + + depends_on = [module.eks] +} + +resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { + metadata { + name = "gp3" + annotations = { + "storageclass.kubernetes.io/is-default-class" : "true" + } + } + + storage_provisioner = "ebs.csi.aws.com" + reclaim_policy = "Delete" + allow_volume_expansion = true + volume_binding_mode = "WaitForFirstConsumer" + parameters = { + fsType = "xfs" + encrypted = true + type = "gp3" + } + + depends_on = [kubernetes_annotations.gp2_default] +} diff --git a/ai-ml/mlflow/amp.tf b/ai-ml/mlflow/amp.tf new file mode 100644 index 000000000..14b47ba4c --- /dev/null +++ b/ai-ml/mlflow/amp.tf @@ -0,0 +1,136 @@ +#------------------------------------------ +# Amazon Prometheus +#------------------------------------------ +locals { + amp_ingest_service_account = "amp-iamproxy-ingest-service-account" + amp_namespace = "kube-prometheus-stack" +} + +resource "aws_prometheus_workspace" "amp" { + count = var.enable_amazon_prometheus ? 1 : 0 + + alias = format("%s-%s", "amp-ws", local.name) + tags = local.tags +} +#IAM Policy for Amazon Prometheus & Grafana +resource "aws_iam_policy" "grafana" { + count = var.enable_amazon_prometheus ? 1 : 0 + + description = "IAM policy for Grafana Pod" + name_prefix = format("%s-%s-", local.name, "grafana") + path = "/" + policy = data.aws_iam_policy_document.grafana[0].json +} + +data "aws_iam_policy_document" "grafana" { + count = var.enable_amazon_prometheus ? 
1 : 0 + + statement { + sid = "AllowReadingMetricsFromCloudWatch" + effect = "Allow" + resources = ["*"] + + actions = [ + "cloudwatch:DescribeAlarmsForMetric", + "cloudwatch:ListMetrics", + "cloudwatch:GetMetricData", + "cloudwatch:GetMetricStatistics" + ] + } + + statement { + sid = "AllowGetInsightsCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"] + + actions = [ + "cloudwatch:GetInsightRuleReport", + ] + } + + statement { + sid = "AllowReadingAlarmHistoryFromCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"] + + actions = [ + "cloudwatch:DescribeAlarmHistory", + "cloudwatch:DescribeAlarms", + ] + } + + statement { + sid = "AllowReadingLogsFromCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"] + + actions = [ + "logs:DescribeLogGroups", + "logs:GetLogGroupFields", + "logs:StartQuery", + "logs:StopQuery", + "logs:GetQueryResults", + "logs:GetLogEvents", + ] + } + + statement { + sid = "AllowReadingTagsInstancesRegionsFromEC2" + effect = "Allow" + resources = ["*"] + + actions = [ + "ec2:DescribeTags", + "ec2:DescribeInstances", + "ec2:DescribeRegions", + ] + } + + statement { + sid = "AllowReadingResourcesForTags" + effect = "Allow" + resources = ["*"] + actions = ["tag:GetResources"] + } + + statement { + sid = "AllowListApsWorkspaces" + effect = "Allow" + resources = [ + "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*", + "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*", + "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*", + ] + actions = [ + "aps:ListWorkspaces", + "aps:DescribeWorkspace", + "aps:GetMetricMetadata", + "aps:GetSeries", + "aps:QueryMetrics", + "aps:RemoteWrite", + "aps:GetLabels" + ] + } +} + +module "amp_ingest_irsa" { + count = var.enable_amazon_prometheus ? 1 : 0 + + source = "aws-ia/eks-blueprints-addon/aws" + version = "~> 1.0" + create_release = false + create_role = true + create_policy = false + role_name = format("%s-%s", local.name, "amp-ingest") + role_policies = { amp_policy = aws_iam_policy.grafana[0].arn } + + oidc_providers = { + this = { + provider_arn = module.eks.oidc_provider_arn + namespace = local.amp_namespace + service_account = local.amp_ingest_service_account + } + } + + tags = local.tags +} diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf new file mode 100644 index 000000000..4d486df1c --- /dev/null +++ b/ai-ml/mlflow/eks.tf @@ -0,0 +1,114 @@ +#--------------------------------------------------------------- +# EKS Cluster +#--------------------------------------------------------------- +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.15" + + cluster_name = local.name + cluster_version = var.eks_cluster_version + + cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. 
+ + vpc_id = module.vpc.vpc_id + + subnet_ids = module.vpc.private_subnets + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + } + ] + + #--------------------------------------- + # Note: This can further restricted to specific required for each Add-on and your application + #--------------------------------------- + # Extend cluster security group rules + cluster_security_group_additional_rules = { + ingress_nodes_ephemeral_ports_tcp = { + description = "Nodes on ephemeral ports" + protocol = "tcp" + from_port = 1025 + to_port = 65535 + type = "ingress" + source_node_security_group = true + } + } + + # Extend node-to-node security group rules + node_security_group_additional_rules = { + ingress_self_all = { + description = "Node to node all ports/protocols" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + self = true + } + # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. + # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc. + # Change this according to your security requirements if needed + ingress_cluster_to_node_all_traffic = { + description = "Cluster API to Nodegroup all traffic" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + } + + eks_managed_node_group_defaults = { + iam_role_additional_policies = { + # Not required, but used in the example to access the nodes to inspect mounted volumes + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + eks_managed_node_groups = { + # We recommend to have a MNG to place your critical workloads and add-ons + # Then rely on Karpenter to scale your workloads + # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners + core_node_group = { + name = "core-node-group" + description = "EKS Core node group for hosting critical add-ons" + subnet_ids = module.vpc.private_subnets + + min_size = 3 + max_size = 9 + desired_size = 3 + + instance_types = ["m5.xlarge"] + + ebs_optimized = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } + + labels = { + Environment = "preprod" + Zone = "test" + WorkerType = "ON_DEMAND" + NodeGroupType = "core" + } + + tags = merge(local.tags, { + Name = "core-node-grp", + "karpenter.sh/discovery" = local.name + }) + } + } +} diff --git a/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml new file mode 100644 index 000000000..a08d4260d --- /dev/null +++ b/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml @@ -0,0 +1,102 @@ +global: + +#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server +# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata +hostNetwork: true +dnsPolicy: ClusterFirstWithHostNet + +service: + parsersFiles: + - /fluent-bit/parsers/parsers.conf + extraParsers: | + 
    [PARSER]
+        Name                kubernetes
+        Format              regex
+        Regex               ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
+
+input:
+  name: "tail"
+  enabled: true
+  tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
+  path: "/var/log/containers/*.log"
+  db: "/var/log/flb_kube.db"
+  memBufLimit: 5MB
+  skipLongLines: "On"
+  refreshInterval: 10
+  extraInputs: |
+    multiline.parser        docker, cri
+    Tag_Regex               (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
+
+
+# NOTE: extraFilters config for using the Kubelet to get the metadata instead of talking to the API server in large clusters
+filter:
+  name: "kubernetes"
+  match: "systempods.*"
+  kubeURL: "https://kubernetes.default.svc.cluster.local:443"
+  mergeLog: "On"
+  mergeLogKey: "log_processed"
+  keepLog: "On"
+  k8sLoggingParser: "On"
+  k8sLoggingExclude: "Off"
+  bufferSize: "0"
+  extraFilters: |
+    Kube_Tag_Prefix    systempods.
+    Regex_Parser       kubernetes
+    Labels             On
+    Annotations        Off
+    Use_Kubelet        true
+    Kubelet_Port       10250
+    Kube_CA_File       /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    Kube_Token_File    /var/run/secrets/kubernetes.io/serviceaccount/token
+
+# CAUTION: Do not use the `cloudwatch` plugin. This Golang plugin is no longer recommended by AWS; use the C plugin (`cloudWatchLogs`) instead for better performance.
+# cloudWatch:
+#   enabled: false
+
+# This is the new high-performance C plugin for CloudWatch Logs. See the docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
+cloudWatchLogs:
+  enabled: true
+  match: "systempods.*"
+  region: ${region}
+  logGroupName: ${cloudwatch_log_group}
+  autoCreateGroup: false
+  extraOutputs: |
+    log_key               log
+
+#----------------------------------------------------------#
+# OUTPUT logs to S3
+#----------------------------------------------------------#
+
+# This is an example of writing logs to an S3 bucket.
+# It writes the system pod logs into a dedicated prefix.
+
+additionalOutputs: |
+  [OUTPUT]
+      Name                            s3
+      Match                           systempods.*
+      region                          ${region}
+      bucket                          ${s3_bucket_name}
+      total_file_size                 100M
+      s3_key_format                   /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
+      s3_key_format_tag_delimiters    ..
+      store_dir                       /home/ec2-user/buffer
+      upload_timeout                  10m
+      log_key                         log
+
+
+# Resource config for large clusters
+resources:
+  limits:
+    cpu: 1000m
+    memory: 1500Mi
+  requests:
+    cpu: 500m
+    memory: 500Mi
+
+## Assign a PriorityClassName to pods if set
+priorityClassName: system-node-critical
+
+# This toleration allows the DaemonSet pods to be scheduled onto any node, regardless of that node's taints.
+tolerations: + - operator: Exists diff --git a/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml new file mode 100644 index 000000000..5a42794f2 --- /dev/null +++ b/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml @@ -0,0 +1,25 @@ +autoDiscovery: + clusterName: ${eks_cluster_id} + +awsRegion: ${aws_region} + +cloudProvider: aws + +extraArgs: + aws-use-static-instance-list: true + +# Best practice to update the resource requests and limits for each add-on +resources: + limits: + cpu: 1000m + memory: 1G + requests: + cpu: 200m + memory: 512Mi + +# Best practice to updateStrategy for each add-on +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 diff --git a/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml new file mode 100644 index 000000000..64cb540bf --- /dev/null +++ b/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml @@ -0,0 +1,40 @@ +nameOverride: kube-dns-autoscaler + +# Formula for controlling the replicas. Adjust according to your needs +# replicas = max( ceil( cores * 1/coresPerReplica ) , ceil( nodes * 1/nodesPerReplica ) ) +# replicas = min(replicas, max) +# replicas = max(replicas, min) +config: + linear: + coresPerReplica: 256 + nodesPerReplica: 16 + min: 1 + max: 100 + preventSinglePointFailure: true + includeUnschedulableNodes: true + +# Target to scale. In format: deployment/*, replicationcontroller/* or replicaset/* (not case sensitive). +options: + target: ${target} + +serviceAccount: + create: true + name: kube-dns-autoscaler + +podSecurityContext: + seccompProfile: + type: RuntimeDefault + supplementalGroups: [ 65534 ] + fsGroup: 65534 + +resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + +tolerations: + - key: "CriticalAddonsOnly" + operator: "Exists" diff --git a/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml new file mode 100644 index 000000000..cc7687163 --- /dev/null +++ b/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml @@ -0,0 +1,65 @@ +prometheus: + serviceAccount: + create: true + name: ${amp_sa} + annotations: + eks.amazonaws.com/role-arn: ${amp_irsa} + prometheusSpec: + remoteWrite: + - url: ${amp_remotewrite_url} + sigv4: + region: ${region} + queueConfig: + maxSamplesPerSend: 1000 + maxShards: 200 + capacity: 2500 + retention: 5h + scrapeInterval: 30s + evaluationInterval: 30s + scrapeTimeout: 10s + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: ${storage_class_type} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + # Scrape metrics for Yunikorn add-on + additionalScrapeConfigs: + - job_name: yunikorn + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /ws/v1//metrics + scheme: http + dns_sd_configs: + - names: + - yunikorn-service.yunikorn.svc + type: 'A' + port: 9080 +alertmanager: + enabled: false + +grafana: + enabled: true + defaultDashboardsEnabled: true +# Adding AMP datasource to Grafana config + serviceAccount: + create: false + name: ${amp_sa} + grafana.ini: + auth: + sigv4_auth_enabled: true + additionalDataSources: + - name: AMP + editable: true + jsonData: + sigV4Auth: true + sigV4Region: ${region} + type: prometheus + isDefault: false + url: ${amp_url} diff --git a/ai-ml/mlflow/helm-values/kube-prometheus.yaml b/ai-ml/mlflow/helm-values/kube-prometheus.yaml 
new file mode 100644 index 000000000..dedff553b --- /dev/null +++ b/ai-ml/mlflow/helm-values/kube-prometheus.yaml @@ -0,0 +1,36 @@ +prometheus: + prometheusSpec: + retention: 5h + scrapeInterval: 30s + evaluationInterval: 30s + scrapeTimeout: 10s + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: ${storage_class_type} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + # Scrape metrics for Yunikorn add-on + additionalScrapeConfigs: + - job_name: yunikorn + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /ws/v1//metrics + scheme: http + dns_sd_configs: + - names: + - yunikorn-service.yunikorn.svc + type: 'A' + port: 9080 +alertmanager: + enabled: false + +grafana: + enabled: true + defaultDashboardsEnabled: true diff --git a/ai-ml/mlflow/helm-values/metrics-server-values.yaml b/ai-ml/mlflow/helm-values/metrics-server-values.yaml new file mode 100644 index 000000000..bc806ced6 --- /dev/null +++ b/ai-ml/mlflow/helm-values/metrics-server-values.yaml @@ -0,0 +1,52 @@ +# HA config for metrics-server +image: + repository: registry.k8s.io/metrics-server/metrics-server + pullPolicy: IfNotPresent + +serviceAccount: + create: true + name: metrics-server + +rbac: + create: true + pspEnabled: false + +apiService: + create: true + +podLabels: + k8s-app: metrics-server + +# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true +replicas: 2 + +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + +podDisruptionBudget: + enabled: true + minAvailable: 1 + +defaultArgs: + - --cert-dir=/tmp + - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname + - --kubelet-use-node-status-port + - --metric-resolution=15s + +resources: + requests: + cpu: 200m + memory: 512Mi + +affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + k8s-app: metrics-server + namespaces: + - kube-system + topologyKey: kubernetes.io/hostname diff --git a/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml b/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml new file mode 100644 index 000000000..47a978c89 --- /dev/null +++ b/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml @@ -0,0 +1,57 @@ +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: default +spec: + # Wich AWS Node Template to pick + providerRef: + name: default + + # ttlSecondsAfterEmpty: 30 + + # Requirements that constrain the parameters of provisioned nodes. + # These requirements are combined with pod.spec.affinity.nodeAffinity rules. + # Operators { In, NotIn } are supported to enable including or excluding values + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m", "r"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand + operator: In + values: ["on-demand", "spot"] + limits: + resources: + cpu: 20 # CPU Cores across all instances + memory: 2000Gi + + # Enables consolidation which attempts to reduce cluster cost by both removing un-needed nodes and down-sizing those + # that can't be removed. Mutually exclusive with the ttlSecondsAfterEmpty parameter. 
+ consolidation: + enabled: true +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: default +spec: + subnetSelector: + Name: ${cluster_name}-private* # Name of the Subnets to spin up the nodes + securityGroupSelector: # required, when not using launchTemplate + Name: ${cluster_name}-node* # name of the SecurityGroup to be used with Nodes + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + tags: + managed-by: "karpenter" + intent: "apps" + Name: "karpenter-node-default" diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf new file mode 100644 index 000000000..734c0ec63 --- /dev/null +++ b/ai-ml/mlflow/main.tf @@ -0,0 +1,66 @@ +provider "aws" { + region = local.region +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +# ECR always authenticates with `us-east-1` region +# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html +provider "aws" { + alias = "ecr" + region = "us-east-1" +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + } +} + +provider "kubectl" { + apply_retry_count = 10 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false + token = data.aws_eks_cluster_auth.this.token +} + +data "aws_availability_zones" "available" {} +data "aws_region" "current" {} +data "aws_caller_identity" "current" {} +data "aws_partition" "current" {} + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + +#--------------------------------------------------------------- +# Local variables +#--------------------------------------------------------------- +locals { + name = var.name + region = var.region + vpc_cidr = var.vpc_cidr + azs = slice(data.aws_availability_zones.available.names, 0, 3) + account_id = data.aws_caller_identity.current.account_id + partition = data.aws_partition.current.partition + + mlflow_name = "mlflow" + mlflow_namespace = "mlflow" + mlflow_service_account = "mlflow" + + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf new file mode 100644 index 000000000..b88d9abec --- /dev/null +++ b/ai-ml/mlflow/mlflow-core.tf @@ -0,0 +1,245 @@ +#--------------------------------------------------------------- +# RDS Postgres Database for MLflow Backend +#--------------------------------------------------------------- +module "db" { + count = var.enable_mlflow ? 
1 : 0 + source = "terraform-aws-modules/rds/aws" + version = "~> 5.0" + + identifier = local.mlflow_name + + engine = "postgres" + engine_version = "14.3" + family = "postgres14" + major_engine_version = "14" + instance_class = "db.m6i.xlarge" + + storage_type = "io1" + allocated_storage = 100 + iops = 3000 + + db_name = local.mlflow_name + username = local.mlflow_name + create_random_password = false + password = sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string) + port = 5432 + + multi_az = true + db_subnet_group_name = module.vpc.database_subnet_group + vpc_security_group_ids = [module.security_group[0].security_group_id] + + maintenance_window = "Mon:00:00-Mon:03:00" + backup_window = "03:00-06:00" + enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] + create_cloudwatch_log_group = true + + backup_retention_period = 5 + skip_final_snapshot = true + deletion_protection = false + + performance_insights_enabled = true + performance_insights_retention_period = 7 + create_monitoring_role = true + monitoring_interval = 60 + monitoring_role_name = "mlflow-backend" + monitoring_role_use_name_prefix = true + monitoring_role_description = "MLflow Postgres Backend for monitoring role" + + parameters = [ + { + name = "autovacuum" + value = 1 + }, + { + name = "client_encoding" + value = "utf8" + } + ] + + tags = local.tags +} + +#--------------------------------------------------------------- +# MLflow Postgres Backend DB Master password +#--------------------------------------------------------------- +resource "random_password" "postgres" { + count = var.enable_mlflow ? 1 : 0 + length = 16 + special = false +} +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "postgres" { + count = var.enable_mlflow ? 1 : 0 + name = "postgres-2" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "postgres" { + count = var.enable_mlflow ? 1 : 0 + secret_id = aws_secretsmanager_secret.postgres[0].id + secret_string = random_password.postgres[0].result +} + +#--------------------------------------------------------------- +# PostgreSQL RDS security group +#--------------------------------------------------------------- +module "security_group" { + count = var.enable_mlflow ? 1 : 0 + source = "terraform-aws-modules/security-group/aws" + version = "~> 5.0" + + name = local.name + description = "Complete PostgreSQL example security group" + vpc_id = module.vpc.vpc_id + + # ingress + ingress_with_cidr_blocks = [ + { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + description = "PostgreSQL access from within VPC" + cidr_blocks = "${module.vpc.vpc_cidr_block}" + }, + ] + + tags = local.tags +} + + +#--------------------------------------------------------------- +# S3 bucket for MLflow artifacts +#--------------------------------------------------------------- + +#tfsec:ignore:* +module "mlflow_s3_bucket" { + count = var.enable_mlflow ? 
1 : 0 + source = "terraform-aws-modules/s3-bucket/aws" + version = "~> 3.0" + + bucket_prefix = "${local.name}-logs-" + + # For example only - please evaluate for your environment + force_destroy = true + + server_side_encryption_configuration = { + rule = { + apply_server_side_encryption_by_default = { + sse_algorithm = "AES256" + } + } + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# MLflow Namespace +#--------------------------------------------------------------- +resource "kubernetes_namespace_v1" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + metadata { + name = local.mlflow_namespace + } + timeouts { + delete = "15m" + } +} + +resource "kubernetes_service_account_v1" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + metadata { + name = local.mlflow_service_account + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.mlflow_irsa[0].iam_role_arn } + } + + automount_service_account_token = true +} + +resource "kubernetes_secret_v1" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + metadata { + name = "${local.mlflow_service_account}-secret" + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.mlflow[0].metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.mlflow[0].metadata[0].name + } + } + + type = "kubernetes.io/service-account-token" +} + +# Create IAM Role for Service Account (IRSA) Only if Airflow is enabled +module "mlflow_irsa" { + count = var.enable_mlflow ? 1 : 0 + + source = "aws-ia/eks-blueprints-addon/aws" + version = "~> 1.0" #ensure to update this to the latest/desired version + + # Disable helm release + create_release = false + + # IAM role for service account (IRSA) + create_role = true + create_policy = false # Policy is created in the next resource + + role_name = local.mlflow_service_account + role_policies = { mlflow_policy = aws_iam_policy.mlflow[0].arn } + + oidc_providers = { + this = { + provider_arn = module.eks.oidc_provider_arn + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + service_account = local.mlflow_service_account + } + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# IAM policy for MLflow for accesing S3 artifacts and RDS Postgres backend +#--------------------------------------------------------------- +resource "aws_iam_policy" "mlflow" { + count = var.enable_mlflow ? 1 : 0 + + description = "IAM policy for MLflow" + name_prefix = format("%s-%s-", local.name, "mlflow") + path = "/" + policy = data.aws_iam_policy_document.mlflow[0].json +} + +data "aws_iam_policy_document" "mlflow" { + count = var.enable_mlflow ? 
1 : 0 + statement { + sid = "" + effect = "Allow" + resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}"] + + actions = [ + "s3:ListBucket" + ] + } + statement { + sid = "" + effect = "Allow" + resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}/*"] + + actions = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject" + ] + } + statement { + sid = "" + effect = "Allow" + resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] + + actions = [ + "rds-db:connect", + ] + } +} diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf new file mode 100644 index 000000000..3c2034b62 --- /dev/null +++ b/ai-ml/mlflow/outputs.tf @@ -0,0 +1,14 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + +output "eks_api_server_url" { + description = "Your eks API server endpoint" + value = module.eks.cluster_endpoint +} + +output "grafana_secret_name" { + description = "Grafana password secret name" + value = aws_secretsmanager_secret.grafana.name +} diff --git a/ai-ml/mlflow/variables.tf b/ai-ml/mlflow/variables.tf new file mode 100644 index 000000000..8a41a224b --- /dev/null +++ b/ai-ml/mlflow/variables.tf @@ -0,0 +1,41 @@ +variable "name" { + description = "Name of the VPC and EKS Cluster" + default = "mlflow-eks" + type = string +} + +variable "region" { + description = "Region" + type = string + default = "us-west-2" +} + +variable "eks_cluster_version" { + description = "EKS Cluster version" + default = "1.27" + type = string +} + +variable "vpc_cidr" { + description = "VPC CIDR" + default = "10.1.0.0/16" + type = string +} + +variable "db_private_subnets" { + description = "Private Subnets CIDRs. 254 IPs per Subnet/AZ for Airflow DB." 
+ default = ["10.0.20.0/26", "10.0.21.0/26"] + type = list(string) +} + +variable "enable_amazon_prometheus" { + description = "Enable AWS Managed Prometheus service" + type = bool + default = true +} + +variable "enable_mlflow" { + description = "Enable MMLflow" + type = bool + default = true +} \ No newline at end of file diff --git a/ai-ml/mlflow/versions.tf b/ai-ml/mlflow/versions.tf new file mode 100644 index 000000000..be6e7d672 --- /dev/null +++ b/ai-ml/mlflow/versions.tf @@ -0,0 +1,33 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.72" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.4.1" + } + random = { + source = "hashicorp/random" + version = "3.3.2" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.14" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "doeks-github-actions-e2e-test-state" + # region = "us-west-2" + # key = "e2e/argo-workflow/terraform.tfstate" + # } +} diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf new file mode 100644 index 000000000..ffe29219f --- /dev/null +++ b/ai-ml/mlflow/vpc.tf @@ -0,0 +1,36 @@ +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = local.vpc_cidr + azs = local.azs + + # Three private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k)] + + # ------------------------------ + # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments + # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW + public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 10)] + + # ------------------------------ + # Private Subnets for MLflow backend store + database_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 20)] + create_database_subnet_group = true + create_database_subnet_route_table = true + + enable_nat_gateway = true + single_nat_gateway = true + enable_dns_hostnames = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + } + + tags = local.tags +} From 5c6efe4d0098a8ea1c91a5debe069bfecad8d077 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 6 Sep 2023 13:02:12 +0100 Subject: [PATCH 2/8] :rocket: MLflow first commit --- ai-ml/mlflow/mlflow-core.tf | 4 ++-- ai-ml/mlflow/outputs.tf | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index b88d9abec..fd2273e1a 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -117,7 +117,7 @@ module "mlflow_s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" - bucket_prefix = "${local.name}-logs-" + bucket_prefix = "${local.name}-artifacts-" # For example only - please evaluate for your environment force_destroy = true @@ -236,7 +236,7 @@ data "aws_iam_policy_document" "mlflow" { statement { sid = "" effect = "Allow" - resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] + resources = 
["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_endpoint}/${local.mlflow_name}"] actions = [ "rds-db:connect", diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf index 3c2034b62..2ea6f9293 100644 --- a/ai-ml/mlflow/outputs.tf +++ b/ai-ml/mlflow/outputs.tf @@ -12,3 +12,13 @@ output "grafana_secret_name" { description = "Grafana password secret name" value = aws_secretsmanager_secret.grafana.name } + +output "mlflow_s3_artifacts" { + description = "S3 bucket for MLflow artifacts" + value = module.mlflow_s3_bucket[0].s3_bucket_id +} + +output "mlflow_db_backend" { + description = "Amazon RDS Postgres database for MLflow backend" + value = module.db[0].db_instance_endpoint +} \ No newline at end of file From 3ee59d37efc118d0ff735f5102b8a4ffacf9d9e8 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 6 Sep 2023 13:19:17 +0100 Subject: [PATCH 3/8] :rocket: MLflow first commit --- ai-ml/mlflow/mlflow-core.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index fd2273e1a..6117f4918 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -236,7 +236,7 @@ data "aws_iam_policy_document" "mlflow" { statement { sid = "" effect = "Allow" - resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_endpoint}/${local.mlflow_name}"] + resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] actions = [ "rds-db:connect", From efef5dd853ae853a33786192ddd398a9b6b92e2c Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 4 Oct 2023 15:15:03 +0100 Subject: [PATCH 4/8] feat: MLflow Tracking blueprint running on EKS --- ai-ml/mlflow/README.md | 93 ++++++++++++++++++- ai-ml/mlflow/addons.tf | 81 +++++++++------- ai-ml/mlflow/cleanup.sh | 45 +++++++++ ai-ml/mlflow/eks.tf | 41 +++++++- .../helm-values/ingress-nginx-values.yaml | 11 +++ .../helm-values/mlflow-tracking-values.yaml | 88 ++++++++++++++++++ ai-ml/mlflow/helm-values/nvidia-values.yaml | 10 ++ ai-ml/mlflow/install.sh | 37 ++++++++ ai-ml/mlflow/main.tf | 16 ++-- ai-ml/mlflow/mlflow-core.tf | 34 +++---- ai-ml/mlflow/outputs.tf | 6 +- ai-ml/mlflow/variables.tf | 19 ++-- ai-ml/mlflow/vpc.tf | 35 +++++-- 13 files changed, 436 insertions(+), 80 deletions(-) create mode 100644 ai-ml/mlflow/cleanup.sh create mode 100644 ai-ml/mlflow/helm-values/ingress-nginx-values.yaml create mode 100644 ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml create mode 100644 ai-ml/mlflow/helm-values/nvidia-values.yaml create mode 100755 ai-ml/mlflow/install.sh diff --git a/ai-ml/mlflow/README.md b/ai-ml/mlflow/README.md index ff644528d..3f610da29 100755 --- a/ai-ml/mlflow/README.md +++ b/ai-ml/mlflow/README.md @@ -1 +1,92 @@ -# MLflow on EKS (Coming Soon) +# MLflow on EKS + +Docs comming soon ... 
+ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [aws](#requirement\_aws) | >= 3.72 | +| [helm](#requirement\_helm) | >= 2.4.1 | +| [kubectl](#requirement\_kubectl) | >= 1.14 | +| [kubernetes](#requirement\_kubernetes) | >= 2.10 | +| [random](#requirement\_random) | 3.3.2 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 5.19.0 | +| [aws.ecr](#provider\_aws.ecr) | 5.19.0 | +| [kubectl](#provider\_kubectl) | 1.14.0 | +| [kubernetes](#provider\_kubernetes) | 2.23.0 | +| [random](#provider\_random) | 3.3.2 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | +| [db](#module\_db) | terraform-aws-modules/rds/aws | ~> 5.0 | +| [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 | +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | +| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.3 | +| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.2.3 | +| [fluentbit\_s3\_bucket](#module\_fluentbit\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | +| [mlflow\_irsa](#module\_mlflow\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | +| [mlflow\_s3\_bucket](#module\_mlflow\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | +| [security\_group](#module\_security\_group) | terraform-aws-modules/security-group/aws | ~> 5.0 | +| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | + +## Resources + +| Name | Type | +|------|------| +| [aws_iam_policy.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.mlflow](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_prometheus_workspace.amp](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_workspace) | resource | +| [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | +| [aws_secretsmanager_secret.postgres](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | +| [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | +| [aws_secretsmanager_secret_version.postgres](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | +| [aws_security_group.ingress_nginx_external](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | +| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource | +| [kubernetes_namespace_v1.mlflow](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | +| [kubernetes_secret_v1.mlflow](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | 
resource | +| [kubernetes_service_account_v1.mlflow](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | +| [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | +| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/3.3.2/docs/resources/password) | resource | +| [random_password.postgres](https://registry.terraform.io/providers/hashicorp/random/3.3.2/docs/resources/password) | resource | +| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_ecrpublic_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecrpublic_authorization_token) | data source | +| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_iam_policy_document.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.mlflow](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | +| [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | +| [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no | +| [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | +| [enable\_mlflow\_tracking](#input\_enable\_mlflow\_tracking) | Enable MLflow Tracking | `bool` | `true` | no | +| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"mlflow-on-eks"` | no | +| [region](#input\_region) | Region | `string` | `"us-west-2"` | no | +| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
<pre>[<br>  "100.64.0.0/16"<br>]</pre>
| no | +| [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR | `string` | `"10.1.0.0/21"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [configure\_kubectl](#output\_configure\_kubectl) | Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig | +| [eks\_api\_server\_url](#output\_eks\_api\_server\_url) | Your eks API server endpoint | +| [grafana\_secret\_name](#output\_grafana\_secret\_name) | Grafana password secret name | +| [mlflow\_db\_backend](#output\_mlflow\_db\_backend) | Amazon RDS Postgres database for MLflow backend | +| [mlflow\_s3\_artifacts](#output\_mlflow\_s3\_artifacts) | S3 bucket for MLflow artifacts | diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf index 79b07cf51..3cf69f05e 100644 --- a/ai-ml/mlflow/addons.tf +++ b/ai-ml/mlflow/addons.tf @@ -114,45 +114,33 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_aws_load_balancer_controller = true aws_load_balancer_controller = { - chart_version = "1.6.0" # min version required to use SG for NLB feature + set = [{ + name = "enableServiceMutatorWebhook" + value = "false" + }] } #--------------------------------------- - # Ingress Nginx external + # Ingress Nginx Add-on #--------------------------------------- enable_ingress_nginx = true ingress_nginx = { - name = "ingress-nginx-external" - values = [ - <<-EOT - controller: - replicaCount: 3 - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - service.beta.kubernetes.io/aws-load-balancer-security-groups: ${aws_security_group.ingress_nginx_external.id} - service.beta.kubernetes.io/aws-load-balancer-manage-backend-security-group-rules: true - loadBalancerClass: service.k8s.aws/nlb - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app.kubernetes.io/instance: ingress-nginx-external - - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app.kubernetes.io/instance: ingress-nginx-external - minAvailable: 2 - ingressClassResource: - name: ingress-nginx-external - default: false - EOT - ] + values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] + } + + helm_releases = { + #--------------------------------------- + # NVIDIA Device Plugin Add-on + #--------------------------------------- + nvidia-device-plugin = { + description = "A Helm chart for NVIDIA Device Plugin" + namespace = "nvidia-device-plugin" + create_namespace = true + chart = "nvidia-device-plugin" + chart_version = "0.14.0" + repository = "https://nvidia.github.io/k8s-device-plugin" + values = [file("${path.module}/helm-values/nvidia-values.yaml")] + } } #--------------------------------------- @@ -164,7 +152,7 @@ module "eks_blueprints_addons" { # 2- Grafana Admin user: admin # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` #--------------------------------------------------------------- - enable_kube_prometheus_stack = false + enable_kube_prometheus_stack = true kube_prometheus_stack = { values = [ var.enable_amazon_prometheus ? 
templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { @@ -195,10 +183,31 @@ module "eks_blueprints_addons" { #--------------------------------------------------------------- module "eks_data_addons" { source = "aws-ia/eks-data-addons/aws" - version = "~> 1.0" # ensure to update this to the latest/desired version + version = "~> 1.2.3" # ensure to update this to the latest/desired version oidc_provider_arn = module.eks.oidc_provider_arn + #--------------------------------------------------------------- + # MLflow Tracking Add-on + #--------------------------------------------------------------- + + enable_mlflow_tracking = true + mlflow_tracking_helm_config = { + mlflow_namespace = try(kubernetes_namespace_v1.mlflow[0].metadata[0].name, local.mlflow_namespace) + + values = [templatefile("${path.module}/helm-values/mlflow-tracking-values.yaml", { + mlflow_sa = local.mlflow_service_account + mlflow_irsa = module.mlflow_irsa[0].iam_role_arn + # MLflow Postgres RDS Config + mlflow_db_username = local.mlflow_name + mlflow_db_password = try(sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string), "") + mlflow_db_name = try(module.db[0].db_instance_name, "") + mlflow_db_host = try(element(split(":", module.db[0].db_instance_endpoint), 0), "") + # S3 bucket config for artifacts + s3_bucket_name = try(module.mlflow_s3_bucket[0].s3_bucket_id, "") + })] + } + } #--------------------------------------------------------------- @@ -266,7 +275,7 @@ module "fluentbit_s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" - bucket_prefix = "${local.name}-argo-workflow-logs-" + bucket_prefix = "${local.name}-fluentbit-logs-" # For example only - please evaluate for your environment force_destroy = true server_side_encryption_configuration = { diff --git a/ai-ml/mlflow/cleanup.sh b/ai-ml/mlflow/cleanup.sh new file mode 100644 index 000000000..6f96c6ef5 --- /dev/null +++ b/ai-ml/mlflow/cleanup.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -o errexit +set -o pipefail + +targets=( + "module.eks_data_addons" + "module.eks_blueprints_addons" +) + +#------------------------------------------- +# Helpful to delete the stuck in "Terminating" namespaces +# Rerun the cleanup.sh script to detect and delete the stuck resources +#------------------------------------------- +terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name') + +# If there are no terminating namespaces, exit the script +if [[ -z $terminating_namespaces ]]; then + echo "No terminating namespaces found" +fi + +for ns in $terminating_namespaces; do + echo "Terminating namespace: $ns" + kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - +done + +for target in "${targets[@]}" +do + terraform destroy -target="$target" -auto-approve + destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) + if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then + echo "SUCCESS: Terraform destroy of $target completed successfully" + else + echo "FAILED: Terraform destroy of $target failed" + exit 1 + fi +done + +terraform destroy -auto-approve +destroy_output=$(terraform destroy -auto-approve 2>&1) +if [[ $? 
-eq 0 && $destroy_output == *"Destroy complete!"* ]]; then + echo "SUCCESS: Terraform destroy of all targets completed successfully" +else + echo "FAILED: Terraform destroy of all targets failed" + exit 1 +fi diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf index 4d486df1c..cfb06b56e 100644 --- a/ai-ml/mlflow/eks.tf +++ b/ai-ml/mlflow/eks.tf @@ -79,7 +79,11 @@ module "eks" { core_node_group = { name = "core-node-group" description = "EKS Core node group for hosting critical add-ons" - subnet_ids = module.vpc.private_subnets + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) min_size = 3 max_size = 9 @@ -110,5 +114,40 @@ module "eks" { "karpenter.sh/discovery" = local.name }) } + + gpu1 = { + name = "gpu-node-grp" + description = "EKS Node Group to run GPU workloads" + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) + + ami_type = "AL2_x86_64_GPU" + min_size = 0 + max_size = 1 + desired_size = 0 + + instance_types = ["g5.12xlarge"] + + labels = { + WorkerType = "ON_DEMAND" + NodeGroupType = "gpu" + } + + taints = { + gpu = { + key = "nvidia.com/gpu" + effect = "NO_SCHEDULE" + operator = "EXISTS" + } + } + + tags = merge(local.tags, { + Name = "gpu-node-grp", + "karpenter.sh/discovery" = local.name + }) + } } } diff --git a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml new file mode 100644 index 000000000..126b30152 --- /dev/null +++ b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml @@ -0,0 +1,11 @@ +controller: + service: + externalTrafficPolicy: "Local" + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing # Private Load Balancer can only be accessed within the VPC + targetPorts: + http: http + https: http \ No newline at end of file diff --git a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml new file mode 100644 index 000000000..7614409f7 --- /dev/null +++ b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml @@ -0,0 +1,88 @@ +# Default values for mlflow-tracking-server. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +image: + repository: public.ecr.aws/data-on-eks/mlflow + pullPolicy: Always + tag: 2.7.1 + +imagePullSecrets: [] + +nameOverride: mlflow-tracking-server + +fullnameOverride: mlflow-tracking-server + +podAnnotations: {} + +replicaCount: 1 + +service: + type: ClusterIP + port: 5000 + +serviceAccount: + # Specifies whether a service account should be created + create: false + # Annotations to add to the service account + annotations: + eks.amazonaws.com/role-arn: ${mlflow_irsa} + labels: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: ${mlflow_sa} + +ingress: + enabled: true + className: nginx + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/use-regex: "true" + hosts: + - host: + paths: + - path: / + pathType: Prefix + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +mlflow: + artifacts: + bucketName: ${s3_bucket_name} + database: + name: ${mlflow_db_name} + username: ${mlflow_db_username} + password: ${mlflow_db_password} + host: ${mlflow_db_host} + port: 5432 + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} \ No newline at end of file diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml new file mode 100644 index 000000000..3c50e8c1f --- /dev/null +++ b/ai-ml/mlflow/helm-values/nvidia-values.yaml @@ -0,0 +1,10 @@ +gfd: + enabled: true +nfd: + enabled: true + worker: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" \ No newline at end of file diff --git a/ai-ml/mlflow/install.sh b/ai-ml/mlflow/install.sh new file mode 100755 index 000000000..2832252fb --- /dev/null +++ b/ai-ml/mlflow/install.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# List of Terraform modules to apply in sequence +targets=( + "module.vpc" + "module.eks" + "module.ebs_csi_driver_irsa" + "module.eks_blueprints_addons" + "module.db" +) + +# Initialize Terraform +echo "Initializing ..." +terraform init --upgrade || echo "\"terraform init\" failed" + +# Apply modules in sequence +for target in "${targets[@]}" +do + echo "Applying module $target..." + apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) + if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of $target completed successfully" + else + echo "FAILED: Terraform apply of $target failed" + exit 1 + fi +done + +# Final apply to catch any remaining resources +echo "Applying remaining resources..." 
+apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) +if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of all modules completed successfully" +else + echo "FAILED: Terraform apply of all modules failed" + exit 1 +fi diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf index 734c0ec63..df7b59945 100644 --- a/ai-ml/mlflow/main.tf +++ b/ai-ml/mlflow/main.tf @@ -4,7 +4,7 @@ provider "aws" { provider "kubernetes" { host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) token = data.aws_eks_cluster_auth.this.token } @@ -18,7 +18,7 @@ provider "aws" { provider "helm" { kubernetes { host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) token = data.aws_eks_cluster_auth.this.token } } @@ -26,13 +26,13 @@ provider "helm" { provider "kubectl" { apply_retry_count = 10 host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false token = data.aws_eks_cluster_auth.this.token } data "aws_availability_zones" "available" {} -data "aws_region" "current" {} +# data "aws_region" "current" {} data "aws_caller_identity" "current" {} data "aws_partition" "current" {} @@ -51,12 +51,12 @@ locals { name = var.name region = var.region vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 3) + azs = slice(data.aws_availability_zones.available.names, 0, 2) account_id = data.aws_caller_identity.current.account_id partition = data.aws_partition.current.partition - mlflow_name = "mlflow" - mlflow_namespace = "mlflow" + mlflow_name = "mlflow" + mlflow_namespace = "mlflow" mlflow_service_account = "mlflow" tags = { diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index 6117f4918..babe6bd36 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -2,7 +2,7 @@ # RDS Postgres Database for MLflow Backend #--------------------------------------------------------------- module "db" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "terraform-aws-modules/rds/aws" version = "~> 5.0" @@ -42,7 +42,7 @@ module "db" { create_monitoring_role = true monitoring_interval = 60 monitoring_role_name = "mlflow-backend" - monitoring_role_use_name_prefix = true + monitoring_role_use_name_prefix = true monitoring_role_description = "MLflow Postgres Backend for monitoring role" parameters = [ @@ -63,19 +63,19 @@ module "db" { # MLflow Postgres Backend DB Master password #--------------------------------------------------------------- resource "random_password" "postgres" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 length = 16 special = false } #tfsec:ignore:aws-ssm-secret-use-customer-key resource "aws_secretsmanager_secret" "postgres" { - count = var.enable_mlflow ? 1 : 0 - name = "postgres-2" + count = var.enable_mlflow_tracking ? 
1 : 0 + name = "postgres-mlflow" recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy } resource "aws_secretsmanager_secret_version" "postgres" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 secret_id = aws_secretsmanager_secret.postgres[0].id secret_string = random_password.postgres[0].result } @@ -84,7 +84,7 @@ resource "aws_secretsmanager_secret_version" "postgres" { # PostgreSQL RDS security group #--------------------------------------------------------------- module "security_group" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "terraform-aws-modules/security-group/aws" version = "~> 5.0" @@ -99,7 +99,7 @@ module "security_group" { to_port = 5432 protocol = "tcp" description = "PostgreSQL access from within VPC" - cidr_blocks = "${module.vpc.vpc_cidr_block}" + cidr_blocks = "${module.vpc.vpc_cidr_block},${module.vpc.vpc_secondary_cidr_blocks[0]}" }, ] @@ -113,7 +113,7 @@ module "security_group" { #tfsec:ignore:* module "mlflow_s3_bucket" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" @@ -137,7 +137,7 @@ module "mlflow_s3_bucket" { # MLflow Namespace #--------------------------------------------------------------- resource "kubernetes_namespace_v1" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 metadata { name = local.mlflow_namespace } @@ -147,7 +147,7 @@ resource "kubernetes_namespace_v1" "mlflow" { } resource "kubernetes_service_account_v1" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 metadata { name = local.mlflow_service_account namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name @@ -158,7 +158,7 @@ resource "kubernetes_service_account_v1" "mlflow" { } resource "kubernetes_secret_v1" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 metadata { name = "${local.mlflow_service_account}-secret" namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name @@ -171,9 +171,9 @@ resource "kubernetes_secret_v1" "mlflow" { type = "kubernetes.io/service-account-token" } -# Create IAM Role for Service Account (IRSA) Only if Airflow is enabled +# Create IAM Role for Service Account (IRSA) Only if MLflow is enabled module "mlflow_irsa" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 source = "aws-ia/eks-blueprints-addon/aws" version = "~> 1.0" #ensure to update this to the latest/desired version @@ -203,7 +203,7 @@ module "mlflow_irsa" { # IAM policy for MLflow for accesing S3 artifacts and RDS Postgres backend #--------------------------------------------------------------- resource "aws_iam_policy" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 1 : 0 description = "IAM policy for MLflow" name_prefix = format("%s-%s-", local.name, "mlflow") @@ -212,7 +212,7 @@ resource "aws_iam_policy" "mlflow" { } data "aws_iam_policy_document" "mlflow" { - count = var.enable_mlflow ? 1 : 0 + count = var.enable_mlflow_tracking ? 
1 : 0 statement { sid = "" effect = "Allow" @@ -237,7 +237,7 @@ data "aws_iam_policy_document" "mlflow" { sid = "" effect = "Allow" resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] - + actions = [ "rds-db:connect", ] diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf index 2ea6f9293..10ea6529f 100644 --- a/ai-ml/mlflow/outputs.tf +++ b/ai-ml/mlflow/outputs.tf @@ -15,10 +15,10 @@ output "grafana_secret_name" { output "mlflow_s3_artifacts" { description = "S3 bucket for MLflow artifacts" - value = module.mlflow_s3_bucket[0].s3_bucket_id + value = module.mlflow_s3_bucket[0].s3_bucket_id } output "mlflow_db_backend" { description = "Amazon RDS Postgres database for MLflow backend" - value = module.db[0].db_instance_endpoint -} \ No newline at end of file + value = module.db[0].db_instance_endpoint +} diff --git a/ai-ml/mlflow/variables.tf b/ai-ml/mlflow/variables.tf index 8a41a224b..1600e75b5 100644 --- a/ai-ml/mlflow/variables.tf +++ b/ai-ml/mlflow/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "mlflow-eks" + default = "mlflow-on-eks" type = string } @@ -16,15 +16,18 @@ variable "eks_cluster_version" { type = string } +# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs variable "vpc_cidr" { description = "VPC CIDR" - default = "10.1.0.0/16" + default = "10.1.0.0/21" type = string } -variable "db_private_subnets" { - description = "Private Subnets CIDRs. 254 IPs per Subnet/AZ for Airflow DB." - default = ["10.0.20.0/26", "10.0.21.0/26"] +# RFC6598 range 100.64.0.0/10 +# Note you can only /16 range to VPC. You can add multiples of /16 if required +variable "secondary_cidr_blocks" { + description = "Secondary CIDR blocks to be attached to VPC" + default = ["100.64.0.0/16"] type = list(string) } @@ -34,8 +37,8 @@ variable "enable_amazon_prometheus" { default = true } -variable "enable_mlflow" { - description = "Enable MMLflow" +variable "enable_mlflow_tracking" { + description = "Enable MLflow Tracking" type = bool default = true -} \ No newline at end of file +} diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf index ffe29219f..0aa8b7aab 100644 --- a/ai-ml/mlflow/vpc.tf +++ b/ai-ml/mlflow/vpc.tf @@ -1,3 +1,21 @@ +locals { + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + + database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] +} + +#--------------------------------------------------------------- +# VPC +#--------------------------------------------------------------- + module 
"vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" @@ -6,20 +24,23 @@ module "vpc" { cidr = local.vpc_cidr azs = local.azs - # Three private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB - private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k)] - + # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods + secondary_cidr_blocks = var.secondary_cidr_blocks + + # Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) + # ------------------------------ # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 10)] + public_subnets = local.public_subnets # ------------------------------ # Private Subnets for MLflow backend store - database_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 20)] + database_subnets = local.database_private_subnets create_database_subnet_group = true create_database_subnet_route_table = true - + enable_nat_gateway = true single_nat_gateway = true enable_dns_hostnames = true @@ -30,6 +51,8 @@ module "vpc" { private_subnet_tags = { "kubernetes.io/role/internal-elb" = 1 + # Tags subnets for Karpenter auto-discovery + "karpenter.sh/discovery" = local.name } tags = local.tags From 9a61a731f4612c841fb1b3de4743f6189fc494b7 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Wed, 4 Oct 2023 17:27:46 +0100 Subject: [PATCH 5/8] feat: MLflow Tracking blueprint running on EKS --- ai-ml/mlflow/helm-values/ingress-nginx-values.yaml | 2 +- ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml | 6 +++--- ai-ml/mlflow/helm-values/nvidia-values.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml index 126b30152..22e48c7f9 100644 --- a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml +++ b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml @@ -8,4 +8,4 @@ controller: service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing # Private Load Balancer can only be accessed within the VPC targetPorts: http: http - https: http \ No newline at end of file + https: http diff --git a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml index 7614409f7..1f604f610 100644 --- a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml +++ b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml @@ -18,7 +18,7 @@ podAnnotations: {} replicaCount: 1 service: - type: ClusterIP + type: ClusterIP port: 5000 serviceAccount: @@ -34,7 +34,7 @@ serviceAccount: ingress: enabled: true - className: nginx + className: nginx annotations: kubernetes.io/ingress.class: nginx nginx.ingress.kubernetes.io/use-regex: "true" @@ -85,4 +85,4 @@ nodeSelector: {} tolerations: [] -affinity: {} \ No newline at end of file +affinity: {} diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml index 3c50e8c1f..9fa59599e 100644 --- a/ai-ml/mlflow/helm-values/nvidia-values.yaml +++ b/ai-ml/mlflow/helm-values/nvidia-values.yaml @@ -7,4 +7,4 @@ nfd: - key: nvidia.com/gpu operator: Exists effect: NoSchedule - - operator: "Exists" \ No newline at end of file + - operator: "Exists" From 
38c8dc00080c6b5bbd9ee6e5b15855d1a9bba81b Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Tue, 12 Dec 2023 15:18:05 +0000 Subject: [PATCH 6/8] Fix PR comments --- ai-ml/mlflow/addons.tf | 23 ++---- ai-ml/mlflow/cleanup.sh | 0 ai-ml/mlflow/helm-values/nvidia-values.yaml | 91 ++++++++++++++++++++- ai-ml/mlflow/main.tf | 1 - ai-ml/mlflow/versions.tf | 2 +- 5 files changed, 98 insertions(+), 19 deletions(-) mode change 100644 => 100755 ai-ml/mlflow/cleanup.sh diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf index 3cf69f05e..a6994ae4b 100644 --- a/ai-ml/mlflow/addons.tf +++ b/ai-ml/mlflow/addons.tf @@ -128,21 +128,6 @@ module "eks_blueprints_addons" { values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] } - helm_releases = { - #--------------------------------------- - # NVIDIA Device Plugin Add-on - #--------------------------------------- - nvidia-device-plugin = { - description = "A Helm chart for NVIDIA Device Plugin" - namespace = "nvidia-device-plugin" - create_namespace = true - chart = "nvidia-device-plugin" - chart_version = "0.14.0" - repository = "https://nvidia.github.io/k8s-device-plugin" - values = [file("${path.module}/helm-values/nvidia-values.yaml")] - } - } - #--------------------------------------- # Prommetheus and Grafana stack #--------------------------------------- @@ -208,6 +193,14 @@ module "eks_data_addons" { })] } + #--------------------------------------------------------------- + # NVIDIA GPU Operator Add-on + #--------------------------------------------------------------- + enable_nvidia_gpu_operator = true + nvidia_gpu_operator_helm_config = { + values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})] + } + } #--------------------------------------------------------------- diff --git a/ai-ml/mlflow/cleanup.sh b/ai-ml/mlflow/cleanup.sh old mode 100644 new mode 100755 diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml index 9fa59599e..60078daa6 100644 --- a/ai-ml/mlflow/helm-values/nvidia-values.yaml +++ b/ai-ml/mlflow/helm-values/nvidia-values.yaml @@ -1,10 +1,97 @@ +# Default values for gpu-operator. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes + +validator: + repository: nvcr.io/nvidia/cloud-native + image: gpu-operator-validator + +operator: + repository: nvcr.io/nvidia + priorityClassName: system-node-critical + defaultRuntime: containerd + image: gpu-operator + cleanupCRD: false # This option doesn't do anything even if you change this to true. NVIDIA recommends to use the manual approach of upgrading the CRDs + upgradeCRD: false + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html +# Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers. 
+# With pre-compiled NVIDIA Drivers this process can be faster hence we are using the config values as driver.version: "515-signed" +driver: + enabled: true + repository: nvcr.io/nvidia + image: driver + # Commented this as latest Ubuntu AMIs are failing with this option enabled + # version: "515-signed" # supported DRIVER_BRANCH value currently are 470, 510 and 515 which will install latest drivers available on that branch for current running kernel version. + manager: + image: k8s-driver-manager + repository: nvcr.io/nvidia/cloud-native + +toolkit: + enabled: true + +devicePlugin: + enabled: true + +dcgm: + enabled: false + +dcgmExporter: + enabled: true + gfd: enabled: true -nfd: + +migManager: + enabled: true + +nodeStatusExporter: + enabled: false + +gds: + enabled: false + +vgpuManager: + enabled: false + +vgpuDeviceManager: + enabled: true + +vfioManager: + enabled: true + +sandboxDevicePlugin: enabled: true + +node-feature-discovery: + enableNodeFeatureApi: true worker: tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" - key: nvidia.com/gpu operator: Exists effect: NoSchedule - - operator: "Exists" + - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf index df7b59945..a5e4360ea 100644 --- a/ai-ml/mlflow/main.tf +++ b/ai-ml/mlflow/main.tf @@ -32,7 +32,6 @@ provider "kubectl" { } data "aws_availability_zones" "available" {} -# data "aws_region" "current" {} data "aws_caller_identity" "current" {} data "aws_partition" "current" {} diff --git a/ai-ml/mlflow/versions.tf b/ai-ml/mlflow/versions.tf index be6e7d672..156fc1e49 100644 --- a/ai-ml/mlflow/versions.tf +++ b/ai-ml/mlflow/versions.tf @@ -28,6 +28,6 @@ terraform { # backend "s3" { # bucket = "doeks-github-actions-e2e-test-state" # region = "us-west-2" - # key = "e2e/argo-workflow/terraform.tfstate" + # key = "e2e/mlflow/terraform.tfstate" # } } From 3d9bb2216756f6e1971231baa0aa1a597f7bc2e1 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Tue, 12 Dec 2023 16:22:57 +0000 Subject: [PATCH 7/8] Add variable name pg secret --- ai-ml/mlflow/mlflow-core.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf index babe6bd36..81bfa1d95 100644 --- a/ai-ml/mlflow/mlflow-core.tf +++ b/ai-ml/mlflow/mlflow-core.tf @@ -70,7 +70,7 @@ resource "random_password" "postgres" { #tfsec:ignore:aws-ssm-secret-use-customer-key resource "aws_secretsmanager_secret" "postgres" { count = var.enable_mlflow_tracking ? 
1 : 0 - name = "postgres-mlflow" + name = local.mlflow_name recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy } From a546817bcb96f01218d7881cf76ee57351abbed0 Mon Sep 17 00:00:00 2001 From: Ovidiu Valeanu Date: Tue, 12 Dec 2023 16:35:13 +0000 Subject: [PATCH 8/8] Add eks alias in kubectl output --- ai-ml/mlflow/outputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf index 10ea6529f..b5db71900 100644 --- a/ai-ml/mlflow/outputs.tf +++ b/ai-ml/mlflow/outputs.tf @@ -1,6 +1,6 @@ output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" + value = "aws eks --region ${local.region} update-kubeconfig --alias ${module.eks.cluster_name} --name ${module.eks.cluster_name}" } output "eks_api_server_url" {
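
A minimal verification sketch after applying this series, assuming the defaults used above: cluster name "mlflow-on-eks" in "us-west-2", the "mlflow" namespace, and the "mlflow-tracking-server" service listening on port 5000 (adjust these names if you override the variables or Helm values). The commands are standard AWS CLI / kubectl usage, not part of the patches themselves.

    # Point kubectl at the new cluster (same command as the configure_kubectl output)
    aws eks --region us-west-2 update-kubeconfig --alias mlflow-on-eks --name mlflow-on-eks

    # Confirm the MLflow Tracking server pod is running
    kubectl get pods -n mlflow

    # Reach the UI locally without going through the NLB/ingress
    kubectl port-forward svc/mlflow-tracking-server 5000:5000 -n mlflow
    # then open http://localhost:5000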