diff --git a/.github/workflows/deploy-grafana-dashboards.yaml b/.github/workflows/deploy-grafana-dashboards.yaml index d330ab335..f27600ec2 100644 --- a/.github/workflows/deploy-grafana-dashboards.yaml +++ b/.github/workflows/deploy-grafana-dashboards.yaml @@ -34,6 +34,7 @@ jobs: - cluster_name: jupyter-meets-the-earth - cluster_name: kitware - cluster_name: leap + - cluster_name: maap - cluster_name: nasa-cryo - cluster_name: nasa-ghg - cluster_name: nasa-veda diff --git a/config/clusters/maap/cluster.yaml b/config/clusters/maap/cluster.yaml new file mode 100644 index 000000000..c0a00cdab --- /dev/null +++ b/config/clusters/maap/cluster.yaml @@ -0,0 +1,37 @@ +name: maap +provider: aws # https://916098889494.signin.aws.amazon.com/console +aws: + key: enc-deployer-credentials.secret.json + clusterType: eks + clusterName: maap + region: us-west-2 + billing: + paid_by_us: false +support: + helm_chart_values_files: + - support.values.yaml + - enc-support.secret.values.yaml +hubs: + [] + # Uncomment the lines below once the support infrastructure has been deployed and + # you are ready to add the first hub + + # - name: staging + # # Tip: consider changing this to something more human friendly + # display_name: "maap - staging" + # domain: staging.maap.2i2c.cloud + # helm_chart: basehub + # helm_chart_values_files: + # - common.values.yaml + # - staging.values.yaml + # - enc-staging.secret.values.yaml + + # - name: prod + # # Tip: consider changing this to something more human friendly + # display_name: "maap - prod" + # domain: prod.maap.2i2c.cloud + # helm_chart: basehub + # helm_chart_values_files: + # - common.values.yaml + # - prod.values.yaml + # - enc-prod.secret.values.yaml diff --git a/config/clusters/maap/enc-deployer-credentials.secret.json b/config/clusters/maap/enc-deployer-credentials.secret.json new file mode 100644 index 000000000..26f2d8e9a --- /dev/null +++ b/config/clusters/maap/enc-deployer-credentials.secret.json @@ -0,0 +1,25 @@ +{ + "AccessKey": { 
+ "AccessKeyId": "ENC[AES256_GCM,data:JMiFl1UnzusCQNlEOBsYvHa+9Uo=,iv:CC0kCAIAbQXtJE4aWfvXd63FWVSuO9To2L8aKkHRgo4=,tag:r2ZlXvm+UtsVyim0WI0M9Q==,type:str]", + "SecretAccessKey": "ENC[AES256_GCM,data:w6Agme4BM109uRDH2CXIp9ffqeD6xXe/Rw6ed2X8uN42CecK1vamNQ==,iv:7eEROA5OrThNMgq9dsHeVyFFsSUbksmt1kA0f5dBDXA=,tag:5UD9cGGNEKvw20Cril4evw==,type:str]", + "UserName": "ENC[AES256_GCM,data:GcAK1BJTZVmJGoVxeRb4zErA7RA371Y=,iv:6udAmDeSfJ2DO8j+/aINVF4PSjhQs+j5BxBSA2llB9Y=,tag:zYLlltSLTCH01wxrr5mffg==,type:str]" + }, + "sops": { + "kms": null, + "gcp_kms": [ + { + "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", + "created_at": "2024-12-04T12:21:40Z", + "enc": "CiUA4OM7eOtAu8gt5nq+Tr+m64LsqMU7YruHfYzFWFswrGfKO5SgEkkAnGhyNghFbi9rWO0BUsWs199nUCTeQOOebtO8KFEMrbH5bejuZDyjRar2fU3WyUKxlBRuywgZySqZgJ9Ut+LDL+c2LdWZD+Qz" + } + ], + "azure_kv": null, + "hc_vault": null, + "age": null, + "lastmodified": "2024-12-04T12:21:41Z", + "mac": "ENC[AES256_GCM,data:kuyRynza4+RG2CGJyYQgUqjLAEZiCrjRvTpR/ciO0yKoRhFzykkbg12J/1y4M4eqlsezvUfyqE+EUtsBaISH1mg8nIuchHi6sRz9XAjQeLX3cwrEPlItH7sUjjGOTbRhcHna+zXVoM2q6gxIpEdNaNq/vPtAKs9TGCRRkw1NfSQ=,iv:RvP7hU6/6kJOBStTO5FEACDPwDA5tBYvjEptdGDRcOA=,tag:as4VS4owv5yZ2c0s+lbZ8A==,type:str]", + "pgp": null, + "unencrypted_suffix": "_unencrypted", + "version": "3.8.1" + } +} \ No newline at end of file diff --git a/config/clusters/maap/enc-support.secret.values.yaml b/config/clusters/maap/enc-support.secret.values.yaml new file mode 100644 index 000000000..b1ae64f29 --- /dev/null +++ b/config/clusters/maap/enc-support.secret.values.yaml @@ -0,0 +1,17 @@ +prometheusIngressAuthSecret: + username: ENC[AES256_GCM,data:1Fs5zwh1wn4/8KWnSoswC/KiW/1jw8CJxUSnOLne6KRI1W9uftsJt43FmRdzQMqsiadc291Jo74/YWBFBC1khw==,iv:ouHNVDQcyfsHQ7zj144fVEfqQX7oIez0uLmCDeO47dw=,tag:MxKMSNP+DVTBdQbBRIxA+Q==,type:str] + password: 
ENC[AES256_GCM,data:qtItFIiARguwpejHWHBDSoKOl4uilmXgEkC4nBonqqWoCkMBHBDFCAr7qbH+fwep+1+yNUkuDXKJE6l0zp/gqw==,iv:8Pcbr2lulRPc0wPYOtgLez2lBLa+PKfxmd/SA75VLpY=,tag:mzZukJ3yv+IPxxDO22O9Sg==,type:str] +sops: + kms: [] + gcp_kms: + - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs + created_at: "2024-12-04T11:17:12Z" + enc: CiUA4OM7eInxKKOnVMfm7f3ZEMUF8+vdF7TSx3WQo65HugraH6wMEkkAnGhyNpIACP7jUyAu/WPOXEmSwhwAXVaQGCMbgWbeuh0A+qvSUieMHE53t/VCgGa5n0Dnitr/jqchmhNaJQfs4GyoxgF3RbAp + azure_kv: [] + hc_vault: [] + age: [] + lastmodified: "2024-12-04T11:17:12Z" + mac: ENC[AES256_GCM,data:9hrfgDF4tkpynItWcIkFTIGF8GRxeCXm0vcdMwcuNAx4E/vC/WMKxES3LFK2ygNzSljKZ3C76F3ipHjEioognquZQoEZWF22tAcJHFfc1VGa9iR6Dh22z4X33UcEZFELXBDJUPI01YWEOybqx74Khd13Yo8ht61vnUsDEbvEPTY=,iv:EwWG5H90WIEoX1T46DDaSvascSafppbtRvQPW9byerY=,tag:wDIatpNvUyHBzLSqzhabkQ==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.8.1 diff --git a/config/clusters/maap/support.values.yaml b/config/clusters/maap/support.values.yaml new file mode 100644 index 000000000..a80ae4598 --- /dev/null +++ b/config/clusters/maap/support.values.yaml @@ -0,0 +1,42 @@ +prometheusIngressAuthSecret: + enabled: true + +prometheus: + server: + ingress: + enabled: true + hosts: + - prometheus.maap.2i2c.cloud + tls: + - secretName: prometheus-tls + hosts: + - prometheus.maap.2i2c.cloud + +grafana: + grafana.ini: + server: + root_url: https://grafana.maap.2i2c.cloud/ + auth.github: + enabled: true + allowed_organizations: 2i2c-org + ingress: + hosts: + - grafana.maap.2i2c.cloud + tls: + - secretName: grafana-tls + hosts: + - grafana.maap.2i2c.cloud + +aws-ce-grafana-backend: + enabled: true + envBasedConfig: + clusterName: maap + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::916098889494:role/aws_ce_grafana_backend_iam_role + +cluster-autoscaler: + enabled: true + autoDiscovery: + clusterName: maap + awsRegion: us-west-2 diff --git a/eksctl/maap.jsonnet 
b/eksctl/maap.jsonnet new file mode 100644 index 000000000..ce9719f6a --- /dev/null +++ b/eksctl/maap.jsonnet @@ -0,0 +1,262 @@ +/* + This file is a jsonnet template of an eksctl cluster configuration file, + that is used with the eksctl CLI to both update and initialize an AWS EKS + based cluster. + + This file has in turn been generated from eksctl/template.jsonnet which is + relevant to compare with for changes over time. + + To use jsonnet to generate an eksctl configuration file from this, do: + + jsonnet maap.jsonnet > maap.eksctl.yaml + + References: + - https://eksctl.io/usage/schema/ +*/ +local ng = import "./libsonnet/nodegroup.jsonnet"; + +// place all cluster nodes here +local clusterRegion = "us-west-2"; +local masterAzs = ["us-west-2a", "us-west-2b", "us-west-2c"]; +local nodeAz = "us-west-2a"; + +// Node definitions for notebook nodes. Config here is merged +// with our notebook node definition. +// A `node.kubernetes.io/instance-type` label is added, so pods +// can request a particular kind of node with a nodeSelector +local notebookNodes = [ + // staging + { + instanceType: "r5.xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + // prod + { + instanceType: "r5.xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + // gpus + { 
instanceType: "g4dn.xlarge", + namePrefix: "gpu-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { + "2i2c:hub-name": "staging", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent a situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, + }, + { + instanceType: "g4dn.xlarge", + namePrefix: "gpu-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { + "2i2c:hub-name": "prod", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent a situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, + }, +]; + +local daskNodes = [ + // Node definitions for dask worker nodes. Config here is merged + // with our dask worker node definition, which uses spot instances. + // A `node.kubernetes.io/instance-type` label is set to the name of the + // *first* item in instancesDistribution.instanceTypes, to match + // what we do with notebook nodes. Pods can request a particular + // kind of node with a nodeSelector + // + // A not yet fully established policy is being developed about using a single + // node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. 
+ // + { + namePrefix: "dask-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, + { + namePrefix: "dask-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, +]; + + +{ + apiVersion: 'eksctl.io/v1alpha5', + kind: 'ClusterConfig', + metadata+: { + name: "maap", + region: clusterRegion, + version: "1.30", + tags+: { + "ManagedBy": "2i2c", + "2i2c.org/cluster-name": $.metadata.name, + }, + }, + availabilityZones: masterAzs, + iam: { + withOIDC: true, + }, + // If you add an addon to this config, run the create addon command. + // + // eksctl create addon --config-file=maap.eksctl.yaml + // + addons: [ + { version: "latest", tags: $.metadata.tags } + addon + for addon in + [ + { name: "coredns" }, + { name: "kube-proxy" }, + { + // vpc-cni is an Amazon-maintained container networking interface + // (CNI), where a CNI is required for k8s networking. The aws-node + // DaemonSet in kube-system stems from installing this. + // + // Related docs: https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/ + // https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html + // + name: "vpc-cni", + attachPolicyARNs: ["arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"], + # FIXME: enabling network policy enforcement didn't work as of + # August 2024, what's wrong isn't clear. + # + # configurationValues ref: https://github.com/aws/amazon-vpc-cni-k8s/blob/HEAD/charts/aws-vpc-cni/values.yaml + configurationValues: ||| + enableNetworkPolicy: "false" + |||, + }, + { + // aws-ebs-csi-driver ensures that our PVCs are bound to PVs that + // couple to AWS EBS based storage; without it, expect to see pods + // mounting a PVC failing to schedule and PVC resources that are + // unbound. 
+ // + // Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html + // + name: "aws-ebs-csi-driver", + wellKnownPolicies: { + ebsCSIController: true, + }, + # configurationValues ref: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/HEAD/charts/aws-ebs-csi-driver/values.yaml + configurationValues: ||| + defaultStorageClass: + enabled: true + |||, + }, + ] + ], + nodeGroups: [ + n + {clusterName: $.metadata.name} for n in + [ + ng + { + namePrefix: 'core', + nameSuffix: 'a', + nameIncludeInstanceType: false, + availabilityZones: [nodeAz], + ssh: { + publicKeyPath: 'ssh-keys/maap.key.pub' + }, + instanceType: "r5.xlarge", + minSize: 1, + maxSize: 6, + labels+: { + "hub.jupyter.org/node-purpose": "core", + "k8s.dask.org/node-purpose": "core", + }, + tags+: { + "2i2c:node-purpose": "core" + }, + }, + ] + [ + ng + { + namePrefix: 'nb', + availabilityZones: [nodeAz], + minSize: 0, + maxSize: 500, + instanceType: n.instanceType, + ssh: { + publicKeyPath: 'ssh-keys/maap.key.pub' + }, + labels+: { + "hub.jupyter.org/node-purpose": "user", + "k8s.dask.org/node-purpose": "scheduler" + }, + taints+: { + "hub.jupyter.org_dedicated": "user:NoSchedule", + "hub.jupyter.org/dedicated": "user:NoSchedule", + }, + tags+: { + "2i2c:node-purpose": "user" + }, + } + n for n in notebookNodes + ] + ( if daskNodes != null then + [ + ng + { + namePrefix: 'dask', + availabilityZones: [nodeAz], + minSize: 0, + maxSize: 500, + ssh: { + publicKeyPath: 'ssh-keys/maap.key.pub' + }, + labels+: { + "k8s.dask.org/node-purpose": "worker" + }, + taints+: { + "k8s.dask.org_dedicated" : "worker:NoSchedule", + "k8s.dask.org/dedicated" : "worker:NoSchedule", + }, + tags+: { + "2i2c:node-purpose": "worker" + }, + instancesDistribution+: { + onDemandBaseCapacity: 0, + onDemandPercentageAboveBaseCapacity: 0, + spotAllocationStrategy: "capacity-optimized", + }, + } + n for n in daskNodes + ] else [] + ) + ] +} \ No newline at end of file diff --git 
a/eksctl/ssh-keys/maap.key.pub b/eksctl/ssh-keys/maap.key.pub new file mode 100644 index 000000000..d4ea840e8 --- /dev/null +++ b/eksctl/ssh-keys/maap.key.pub @@ -0,0 +1 @@ +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEKAnc9uvG/u94tT0iBOzgpcIbtzYqn18Mrm0MGGscJc georgiana@192.168.1.6 diff --git a/eksctl/ssh-keys/secret/maap.key b/eksctl/ssh-keys/secret/maap.key new file mode 100644 index 000000000..356a70a11 --- /dev/null +++ b/eksctl/ssh-keys/secret/maap.key @@ -0,0 +1,21 @@ +{ + "data": "ENC[AES256_GCM,data:kOse6sTNtKyOZtSupRNI82VW4hgcgX43EsbKIdw7kWjfJur1HBJGIz8CyaUb9K67A2pc8kUeREjuICraQHRVAnEw13r+kaapiDdILsKXLm05UFi02MxGPnXCEb0ZBMrycc6ath5SsC30ospw0ZawmfYW6C4g6U6qy0XevaMRQEs/8cdRu4RgfqZP276XOKAgShp3a6gR+p45xhu+fEMNViUd3hui51P3FMV+sCdsi1nuXNNYiiqI0feDMwRFihLQ4tRQQxSl3GqGqWTS+5YGpZPkvAsZZpZNb2o63oyISyfpz5KiJ67SN5zcunJPdC7ZyZ5dadsYBhn0SzieaUl7YDTjyMmem1o79LuK+bghnRZhDhFs39bM5LknHqDDn2P+NwiUOet+ptajR191zuPqeuM3oqY2vM2MXKpHuXGG/YaKeN2Lx86GDzCkAC57F4TfFEW5EzVeAdC6ip4oDOYXhqeXb30RauPrVfO47Rpmh6Fu9d5/Y9U+rnUj8PsIxZL/objVPFbWDeRaC0KPt0QOnV2chIDD6GBR0aK1,iv:JKpDh+iugNgU+leECE0bXeVWiC8utmf7bO7u9B/lub4=,tag:+IppJJWeAEcjmirMx1wkaw==,type:str]", + "sops": { + "kms": null, + "gcp_kms": [ + { + "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", + "created_at": "2024-12-04T11:17:11Z", + "enc": "CiUA4OM7eMlyhS/Ha7ZD8ATXgvib6xW8bOQRpKyHJXMOCPD7QL4XEkkAnGhyNleyQZIcuBZWbQUt7phhQNJDHgFQZ+60gjT5i/aTMHVAEXPzMecu8yLLH8jkN6U0fdmK5/8fNEIhF8arXPnSPh6fHQzh" + } + ], + "azure_kv": null, + "hc_vault": null, + "age": null, + "lastmodified": "2024-12-04T11:17:12Z", + "mac": "ENC[AES256_GCM,data:xeeq5ae6rr1nrv0QuQv2T7BYP0bdSbh1wnPtInwM6EyLDwVvcqk9KO9NeSFXx+ncSUE+cQla/F5OrSMAetdyhn0v2grZn5si1D4AFN2d1BRbO7nFqeDm+cJrg3IXhx3O3eIxa9QtmeB6zPjEekCvNY57qnxRvCOEZZEsCgQv7jk=,iv:/qdkEpXZ5/g8VmYSOOtaqcu9f7QVe2cs2RoAWfkLCS4=,tag:fHOqBGi4Gsn3vt4iQ8pgkg==,type:str]", + "pgp": null, + "unencrypted_suffix": "_unencrypted", + "version": "3.8.1" + } +} \ No newline at 
end of file diff --git a/terraform/aws/projects/maap.tfvars b/terraform/aws/projects/maap.tfvars new file mode 100644 index 000000000..c6235fcf7 --- /dev/null +++ b/terraform/aws/projects/maap.tfvars @@ -0,0 +1,46 @@ +region = "us-west-2" +cluster_name = "maap" +cluster_nodes_location = "us-west-2a" + +default_budget_alert = { + "enabled" : false, +} + +enable_aws_ce_grafana_backend_iam = true + +filestores = { + "staging" = { + name_suffix = "staging", + tags = { "2i2c:hub-name" : "staging" }, + }, + "prod" = { + name_suffix = "prod", + tags = { "2i2c:hub-name" : "prod" }, + }, +} + +user_buckets = { + "scratch-staging" : { + "delete_after" : 7, + "tags" : { "2i2c:hub-name" : "staging" }, + }, + + "scratch-prod" : { + "delete_after" : 7, + "tags" : { "2i2c:hub-name" : "prod" }, + }, +} + +hub_cloud_permissions = { + "staging" : { + "user-sa" : { + bucket_admin_access : ["scratch-staging"], + }, + }, + + "prod" : { + "user-sa" : { + bucket_admin_access : ["scratch-prod"], + }, + }, +}