diff --git a/.github/workflows/deploy-grafana-dashboards.yaml b/.github/workflows/deploy-grafana-dashboards.yaml index 5c0be6028..c8d965681 100644 --- a/.github/workflows/deploy-grafana-dashboards.yaml +++ b/.github/workflows/deploy-grafana-dashboards.yaml @@ -28,7 +28,6 @@ jobs: - cluster_name: cloudbank - cluster_name: dubois - cluster_name: earthscope - - cluster_name: gridsst - cluster_name: hhmi - cluster_name: jupyter-health - cluster_name: jupyter-meets-the-earth diff --git a/.github/workflows/deploy-hubs.yaml b/.github/workflows/deploy-hubs.yaml index 06187e183..2669b224f 100644 --- a/.github/workflows/deploy-hubs.yaml +++ b/.github/workflows/deploy-hubs.yaml @@ -215,7 +215,6 @@ jobs: failure_cloudbank: ${{ steps.declare-failure.outputs.failure_cloudbank }} failure_dubois: ${{ steps.declare-failure.outputs.failure_dubois }} failure_earthscope: ${{ steps.declare-failure.outputs.failure_earthscope }} - failure_gridsst: ${{ steps.declare-failure.outputs.failure_gridsst }} failure_hhmi: ${{ steps.declare-failure.outputs.failure_hhmi }} failure_jupyter-health: ${{ steps.declare-failure.outputs.failure_jupyter-health }} failure_jupyter-meets-the-earth: ${{ steps.declare-failure.outputs.failure_jupyter-meets-the-earth }} @@ -454,7 +453,6 @@ jobs: failure_cloudbank_staging: ${{ steps.declare-failure.outputs.failure_cloudbank_staging }} failure_dubois_staging: ${{ steps.declare-failure.outputs.failure_dubois_staging }} failure_earthscope_staging: ${{ steps.declare-failure.outputs.failure_earthscope_staging }} - failure_gridsst_staging: ${{ steps.declare-failure.outputs.failure_gridsst_staging }} failure_hhmi_staging: ${{ steps.declare-failure.outputs.failure_hhmi_staging }} failure_jupyter-health_staging: ${{ steps.declare-failure.outputs.failure_jupyter-health_staging }} failure_jupyter-meets-the-earth_staging: ${{ steps.declare-failure.outputs.failure_jupyter-meets-the-earth_staging }} diff --git a/config/clusters/gridsst/cluster.yaml b/config/clusters/gridsst/cluster.yaml deleted file mode 100644 index 11714a119..000000000 --- a/config/clusters/gridsst/cluster.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: gridsst -provider: aws # https://2i2c.awsapps.com/start#/ -aws: - key: enc-deployer-credentials.secret.json - clusterType: eks - clusterName: gridsst - region: us-west-2 - billing: - paid_by_us: true -support: - helm_chart_values_files: - - support.values.yaml - - enc-support.secret.values.yaml -hubs: - - name: staging - display_name: "gridSST Hack-A-Thon(staging)" - domain: staging.gridsst.2i2c.cloud - helm_chart: daskhub - helm_chart_values_files: - # The order in which you list files here is the order the will be passed - # to the helm upgrade command in, and that has meaning. Please check - # that you intend for these files to be applied in this order. - - common.values.yaml - - staging.values.yaml - - enc-staging.secret.values.yaml - - name: prod - display_name: "gridSST Hack-A-Thon (prod)" - domain: gridsst.2i2c.cloud - helm_chart: daskhub - helm_chart_values_files: - # The order in which you list files here is the order the will be passed - # to the helm upgrade command in, and that has meaning. Please check - # that you intend for these files to be applied in this order. - - common.values.yaml - - prod.values.yaml - - enc-prod.secret.values.yaml diff --git a/config/clusters/gridsst/common.values.yaml b/config/clusters/gridsst/common.values.yaml deleted file mode 100644 index f7122b59b..000000000 --- a/config/clusters/gridsst/common.values.yaml +++ /dev/null @@ -1,157 +0,0 @@ -basehub: - nfs: - enabled: true - volumeReporter: - enabled: false - pv: - enabled: true - # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html - mountOptions: - - rsize=1048576 - - wsize=1048576 - - timeo=600 - - soft # We pick soft over hard, so NFS lockups don't lead to hung processes - - retrans=2 - - noresvport - serverIP: fs-05f68d7e096d7cf16.efs.us-west-2.amazonaws.com - baseShareName: / - dask-gateway: - enabled: true - jupyterhub: - custom: - daskhubSetup: - enabled: true - 2i2c: - add_staff_user_ids_to_admin_users: true - add_staff_user_ids_of_type: "github" - jupyterhubConfigurator: - enabled: false - homepage: - templateVars: - org: - name: gridSST Hack-A-Thon - logo_url: https://gridsst-hackathon.github.io/_static/logo.png - url: https://gridsst-hackathon.github.io/ # todo: find the correct link here - designed_by: - name: 2i2c - url: https://2i2c.org - operated_by: - name: 2i2c - url: https://2i2c.org - funded_by: - name: "NASA Physical Oceanography Program" - url: https://science.nasa.gov/earth-science/focus-areas/climate-variability-and-change/ocean-physics - hub: - config: - JupyterHub: - authenticator_class: github - OAuthenticator: - # WARNING: Don't use allow_existing_users with config to allow an - # externally managed group of users, such as - # GitHubOAuthenticator.allowed_organizations, as it breaks a - # common expectations for an admin user. - # - # The broken expectation is that removing a user from the - # externally managed group implies that the user won't have - # access any more. In practice the user will still have - # access if it had logged in once before, as it then exists - # in JupyterHub's database of users. - # - allow_existing_users: True - Authenticator: - # WARNING: Removing a user from admin_users or allowed_users doesn't - # revoke admin status or access. - # - # OAuthenticator.allow_existing_users allows any user in the - # JupyterHub database of users able to login. This includes - # any previously logged in user or user previously listed in - # allowed_users or admin_users, as such users are added to - # JupyterHub's database on startup. - # - # To revoke admin status or access for a user when - # allow_existing_users is enabled, first remove the user from - # admin_users or allowed_users, then deploy the change, and - # finally revoke the admin status or delete the user via the - # /hub/admin panel. - # - admin_users: - - alisonrgray - - nikki-t - - dgumustel - singleuser: - cloudMetadata: - blockWithIptables: false - profileList: - # The mem-guarantees are here so k8s doesn't schedule other pods - # on these nodes. - - display_name: "Small: m5.large" - description: "~2 CPU, ~8G RAM" - # default set to small because that node pool is configured with - # min_nodes 1, so we should make use of it. - default: true - kubespawner_override: - # Expllicitly unset mem_limit, so it overrides the default memory limit we set in - # basehub/values.yaml - mem_limit: null - mem_guarantee: 6.5G - node_selector: - node.kubernetes.io/instance-type: m5.large - - display_name: "Medium: m5.xlarge" - description: "~4 CPU, ~15G RAM" - kubespawner_override: - mem_limit: null - mem_guarantee: 12G - node_selector: - node.kubernetes.io/instance-type: m5.xlarge - - display_name: "Large: m5.2xlarge" - description: "~8 CPU, ~30G RAM" - kubespawner_override: - mem_limit: null - mem_guarantee: 26G - node_selector: - node.kubernetes.io/instance-type: m5.2xlarge - - display_name: "Huge: m5.8xlarge" - description: "~32 CPU, ~128G RAM" - kubespawner_override: - mem_limit: null - mem_guarantee: 115G - node_selector: - node.kubernetes.io/instance-type: m5.8xlarge - - display_name: "GPU" - # P2.xlarge has 64G of RAM per GPU while g4dn has 16?! - description: | - ~4CPUs, Nvidia T4 GPU, 14G of RAM. - profile_options: - image: - display_name: Image - choices: - tensorflow: - display_name: Pangeo Tensorflow ML Notebook - slug: "tensorflow" - kubespawner_override: - image: "pangeo/ml-notebook:2022.10.18" - pytorch: - display_name: Pangeo PyTorch ML Notebook - default: true - slug: "pytorch" - kubespawner_override: - image: "pangeo/pytorch-notebook:2022.10.18" - kubespawner_override: - environment: - NVIDIA_DRIVER_CAPABILITIES: compute,utility - mem_limit: null - extra_resource_limits: - nvidia.com/gpu: "1" - mem_guarantee: 14G - node_selector: - node.kubernetes.io/instance-type: g4dn.xlarge - nodeSelector: - node.kubernetes.io/instance-type: m5.8xlarge - defaultUrl: /lab - # User image: https://quay.io/repository/uwhackweek/snowex?tab=tags - image: - name: quay.io/uwhackweek/snowex - tag: "2022.07.07" - scheduling: - userScheduler: - enabled: true diff --git a/config/clusters/gridsst/enc-deployer-credentials.secret.json b/config/clusters/gridsst/enc-deployer-credentials.secret.json deleted file mode 100644 index 83c106f1b..000000000 --- a/config/clusters/gridsst/enc-deployer-credentials.secret.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "AccessKey": { - "AccessKeyId": "ENC[AES256_GCM,data:UdRZwOuJVI2Cdb7UPTK3DO44yhU=,iv:97iisBOrEtkxT4JxuBhtnicImsmHDHxChDGKufedjsw=,tag:6cBrBsesFGQh/CPiBV6ghQ==,type:str]", - "SecretAccessKey": "ENC[AES256_GCM,data:c22BD9TsmXUCpXpKyTBalraWPatTxJRTNsAtvz8lYk51OKlZAUf6yg==,iv:89FLCZsZ74GZyur7zbDv0lmfSwOecw0en0P0hcYwBg8=,tag:J/D6xBRkGgoYvjp5xKSzDQ==,type:str]", - "UserName": "ENC[AES256_GCM,data:HIb2Vg/lNanoeGrjNgmQ3VEjbbTGaAA=,iv:U9A85tiXog1IGdNAtHpDLM62y5YAgpj6YEKcznzhNEU=,tag:GLrYw76wrjOD+bBwDbrAbw==,type:str]" - }, - "sops": { - "kms": null, - "gcp_kms": [ - { - "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", - "created_at": "2022-10-28T09:50:27Z", - "enc": "CiQA4OM7eEEiMA1xNIk1CyWIxi61yYo23OwltlZHXbvXtxJ1ZD4SSADuy/p8XlaHS6IMF22u4jhnzfS6L1eJbyQV6h5VL1FDqNfezzFMZ6HvOW1PsmwdEnfjy8OadLyVbaL0/BHYq/dqEmLBCK/1Xg==" - } - ], - "azure_kv": null, - "hc_vault": null, - "age": null, - "lastmodified": "2022-10-28T09:50:28Z", - "mac": "ENC[AES256_GCM,data:c7Q8k4Y8POzT+VszAHQNviIqREwcA0neekiomaEHXm2aQC0m/WYaRi6dJ0gww7p5ndj2jRe4uh+mD6ZmyZcXRJnY+dehH3w5mn2zjVSTNY+Mt7YKPC8A/jRNWCHqRbIAfwpznJDANKVAKXphrmAcKsqZfbF0ITMKQV4ZajRbpms=,iv:yZyjUHMY77tk112PLX4yjZlUTt+qCEQqcFtJ1ETr5jk=,tag:jpBLgcM0rUgRImoPt1Tw+Q==,type:str]", - "pgp": null, - "unencrypted_suffix": "_unencrypted", - "version": "3.7.3" - } -} \ No newline at end of file diff --git a/config/clusters/gridsst/enc-grafana-token.secret.yaml b/config/clusters/gridsst/enc-grafana-token.secret.yaml deleted file mode 100644 index 96dd02027..000000000 --- a/config/clusters/gridsst/enc-grafana-token.secret.yaml +++ /dev/null @@ -1,15 +0,0 @@ -grafana_token: ENC[AES256_GCM,data:4PXyzxdS32EdQc5lKO5UHn3V+6Oi1fXrbcEXWVc8N0Az12909E0s/foWTbJwIQ==,iv:nYrrXx0tGEaZirJRYZR82hniwk55YFtZPMwGIJb78fc=,tag:go7ZP6vsjuAgs9f7Umnb3g==,type:str] -sops: - kms: [] - gcp_kms: - - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs - created_at: "2022-11-24T12:28:23Z" - enc: CiUA4OM7eE5VPcybglzROm1ID6hz+ivVsToD4pW9f2ku/Pyd8xTgEkkA+0T9hZGPqlSzCHKMrHiSVJGzduMU0/mxpcvV2O1EqRl3MRtlWPdgzWncIb65HpZXjWjsWiNCPyViIPFhxCKTT2hazVRVVXzX - azure_kv: [] - hc_vault: [] - age: [] - lastmodified: "2022-11-24T12:28:24Z" - mac: ENC[AES256_GCM,data:/yNECmSRZL/lgdQAPr8fCDrlc5cRJAMx/GhNCdBKR3gGAH4348XgmtBB6ca6SDdu3bcFVZL7wytwYRyU6mPv3XbxmRGzUEjb9eySyuHKfbGn/7hVTGDsddPSzRDgP2tsko3dZch9z4rIbxul4dKSD6sGYRZGvMnRYwxOMBEV8T8=,iv:YQ+tl8Hti0m1mmpTvnEKC5+bx9+u+J+o44mLC/aYblY=,tag:NgDXtfWmcIWhZtJqiYkwxA==,type:str] - pgp: [] - unencrypted_suffix: _unencrypted - version: 3.7.3 diff --git a/config/clusters/gridsst/enc-prod.secret.values.yaml b/config/clusters/gridsst/enc-prod.secret.values.yaml deleted file mode 100644 index dd0962232..000000000 --- a/config/clusters/gridsst/enc-prod.secret.values.yaml +++ /dev/null @@ -1,21 +0,0 @@ -basehub: - jupyterhub: - hub: - config: - GitHubOAuthenticator: - client_id: ENC[AES256_GCM,data:w//38VVSPmtB8VDgIqJTH8hxOJ4=,iv:Av/PJHoYdxLVCE+gAgNt4wKW/ur/1lyjbHjyYzzvN+k=,tag:CvoqSqtLArATODrIC+CJlw==,type:str] - client_secret: ENC[AES256_GCM,data:A8UnPXvzOCn2nCZOdHDB6Lu+72uS/wCBW3I05j92JqCiCUJYL4TyIA==,iv:AE8AdoxEZ/UknpTUajsMVuq0+POmXcpglfQB6MrD06k=,tag:AVLtaFpGKJP8m7FQf9+rwA==,type:str] -sops: - kms: [] - gcp_kms: - - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs - created_at: "2022-10-27T14:03:16Z" - enc: CiQA4OM7ePBq3xoyJ+cqQzQAMRzdedwvl7aB8Xvb9MuuuJ7gEaoSSQDuy/p8F597q4v2lvFBC9j9laAaX/r+KoeNhpgOlhTim7pP0ORGKcMjdZwSOd7f5p9msi49+0h+TdVTl87xjNoHEUVY1KW98Ig= - azure_kv: [] - hc_vault: [] - age: [] - lastmodified: "2022-10-27T14:04:31Z" - mac: ENC[AES256_GCM,data:JmW3XxKGIJsw+IOgXd+p+xXu/P/jRvnTlf0Owik9ttup+coQJ331FdFUd/3BIgIEYk5YvfQQLyu563Xd4euYdGzjeYOjxhOAn8+8vAvidqhZ/CtZOlXxMVMRLcU6GbrHYBI8vG1VPvOOrYbPfH1vjIafM/3f1X53K//pfwb1vLs=,iv:ygT82JPcDs4721odhbOgQq9g6gJ6nIrbTxWcilOVNIw=,tag:u7dqkrdHpRsv59tRgVQPyQ==,type:str] - pgp: [] - unencrypted_suffix: _unencrypted - version: 3.7.3 diff --git a/config/clusters/gridsst/enc-staging.secret.values.yaml b/config/clusters/gridsst/enc-staging.secret.values.yaml deleted file mode 100644 index b359f09af..000000000 --- a/config/clusters/gridsst/enc-staging.secret.values.yaml +++ /dev/null @@ -1,21 +0,0 @@ -basehub: - jupyterhub: - hub: - config: - GitHubOAuthenticator: - client_id: ENC[AES256_GCM,data:5sESntidozbPU2Wjb3KexrqKSuU=,iv:ousZL5t+Iu0CGB0rknzTo+M6073cOlgETlb5bgs5+ME=,tag:Ety59GgKeBot04QwifgjGw==,type:str] - client_secret: ENC[AES256_GCM,data:dm3Zg3t/1GWoE7RtxFwIpFQjoUjfRhRLN1IYvQI8G31R/6BTf4D11g==,iv:4Ffe+k6J0LUnzJWKj/fPkUz6Nxh+6wtirTI1y9o7sqg=,tag:aFnLkqT/UYgYUyo2zChiFQ==,type:str] -sops: - kms: [] - gcp_kms: - - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs - created_at: "2022-10-27T13:48:39Z" - enc: CiQA4OM7eCe5Z4OXrimcHrJLaV6PvjKhd9DdOe0RZfg24j1wIRgSSQDuy/p8pF1vb4Y2QfNFC00np51If00lMog4tOCYg3OO8w4mvRazf9PRv6IvJKJ5ZDiyUETc2p5HJWmF3ltHcyOYhNgNKUrQ2L4= - azure_kv: [] - hc_vault: [] - age: [] - lastmodified: "2022-10-27T13:49:51Z" - mac: ENC[AES256_GCM,data:uHLaOG3zuG+tunCdLwumxQvt7uOlyNxa2Tt9XywBTVsAueBqM/3OCAyBRYgzpnkPFldd8BS/XEvCH6K8eox6ysxJYwQX4I8Vjna336lmDFCTp9HmBE28zWJHbvuvwKs94x5j6LeLtVyTY7mVWtWkG2WiqbZSdt/lYePmvuSuT/c=,iv:35fe//uT0uy1xWHY0fmkq0jnedU+2IUK9V6xNgz41EM=,tag:nEs1DUsz/iUCQ48QU19oDg==,type:str] - pgp: [] - unencrypted_suffix: _unencrypted - version: 3.7.3 diff --git a/config/clusters/gridsst/enc-support.secret.values.yaml b/config/clusters/gridsst/enc-support.secret.values.yaml deleted file mode 100644 index 7341d4c98..000000000 --- a/config/clusters/gridsst/enc-support.secret.values.yaml +++ /dev/null @@ -1,22 +0,0 @@ -prometheusIngressAuthSecret: - username: ENC[AES256_GCM,data:RgttnPfuHqciM3qaxAwUgmvZjm3AkYvMveK9X5QqQLcwPjDOtpsFo98iqXs/IgJSrfDFs7vNM3F/U5iYjP77Rg==,iv:R7tmVYIh+1YDdQ1qTchaYNJ5OD727s9H0E/7R0kzU7w=,tag:zp+ctVogJG7jyEhu6xRcuA==,type:str] - password: ENC[AES256_GCM,data:M/AswbGMmcNN6bIwMKyRDsW6sdaQvcP3DQbL2f7Wn1yOcY59cVE8avSEqpJAg+d52XzkPMG6Z/RmYDMN4LQi7g==,iv:FK64RqgZUJRpkInbqdIJtwL4rol5MXGHN6Y+7B4XTPw=,tag:7d98TTIiTaXc/Ssiy0vvuA==,type:str] -grafana: - grafana.ini: - auth.github: - client_id: ENC[AES256_GCM,data:R72hgUbGv5yycxUbvq3kfU9dZk8=,iv:WIVPG1dP1IEWoTAdwJcgonRA7tJhvMhX+JgiPHA+zNw=,tag:K5uvA+YXLCPGIkbxURLT2A==,type:str] - client_secret: ENC[AES256_GCM,data:W4U5P9QoEHOsLKyg3RIEPie40hBXR6dubhF3JLIiC3UyUfbonVWVrQ==,iv:f5tNG0m7auKQMMY0GqHtqCuFzvTAhQlvmuWGpa3nhgw=,tag:WbtHXKAkxWFdjBwQ5VTFhA==,type:str] -sops: - kms: [] - gcp_kms: - - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs - created_at: "2022-10-27T14:20:03Z" - enc: CiQA4OM7eBc9gpUlEqbb7wK4oJAoTppNWimU2JLvSHCVy5t1X8kSSQDuy/p8uib7u3xBJ4LX7n3Zdbnruc8XQfBVA1HvCfQHpqodVbAFpoDa+ZSyBLz9znpNESKGIDl3wF/eIZh+Bk/BiMnKS3DXFwU= - azure_kv: [] - hc_vault: [] - age: [] - lastmodified: "2023-02-08T14:02:02Z" - mac: ENC[AES256_GCM,data:VHw69wu12NEi1Nke67jsF2IYoycuVWJLpenkEaBOPE8z2QZSDL+JMniLpT9RSpNlQgVFWinfQU4pPEv66BfJDm7yzVLkMDjPX62R2M/mP0Gtuc7JjcS71KHcTR0tELvF0zpzny6FHbaeYUpzwy3z0i/HaBFMIzAicp8nEbCG7NM=,iv:vUlhDDs8IL9QMaCeP6Bogz7Rle90ZlWU/JbSo0MoPC8=,tag:cckV05WOGEYsNuWu1hK4nw==,type:str] - pgp: [] - unencrypted_suffix: _unencrypted - version: 3.7.3 diff --git a/config/clusters/gridsst/prod.values.yaml b/config/clusters/gridsst/prod.values.yaml deleted file mode 100644 index 39ed752f7..000000000 --- a/config/clusters/gridsst/prod.values.yaml +++ /dev/null @@ -1,28 +0,0 @@ -basehub: - userServiceAccount: - annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::993998403439:role/gridsst-prod - jupyterhub: - ingress: - hosts: [gridsst.2i2c.cloud] - tls: - - hosts: [gridsst.2i2c.cloud] - secretName: https-auto-tls - hub: - config: - GitHubOAuthenticator: - oauth_callback_url: https://gridsst.2i2c.cloud/hub/oauth_callback - singleuser: - nodeSelector: - 2i2c/hub-name: prod - dask-gateway: - gateway: - backend: - scheduler: - extraPodConfig: - nodeSelector: - 2i2c/hub-name: prod - worker: - extraPodConfig: - nodeSelector: - 2i2c/hub-name: prod diff --git a/config/clusters/gridsst/staging.values.yaml b/config/clusters/gridsst/staging.values.yaml deleted file mode 100644 index bc0b3293b..000000000 --- a/config/clusters/gridsst/staging.values.yaml +++ /dev/null @@ -1,28 +0,0 @@ -basehub: - userServiceAccount: - annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::993998403439:role/gridsst-staging - jupyterhub: - ingress: - hosts: [staging.gridsst.2i2c.cloud] - tls: - - hosts: [staging.gridsst.2i2c.cloud] - secretName: https-auto-tls - hub: - config: - GitHubOAuthenticator: - oauth_callback_url: https://staging.gridsst.2i2c.cloud/hub/oauth_callback - singleuser: - nodeSelector: - 2i2c/hub-name: staging - dask-gateway: - gateway: - backend: - scheduler: - extraPodConfig: - nodeSelector: - 2i2c/hub-name: staging - worker: - extraPodConfig: - nodeSelector: - 2i2c/hub-name: staging diff --git a/config/clusters/gridsst/support.values.yaml b/config/clusters/gridsst/support.values.yaml deleted file mode 100644 index e5e5fb5fb..000000000 --- a/config/clusters/gridsst/support.values.yaml +++ /dev/null @@ -1,42 +0,0 @@ -prometheusIngressAuthSecret: - enabled: true - -cluster-autoscaler: - enabled: true - autoDiscovery: - clusterName: gridsst - awsRegion: us-west-2 - -grafana: - grafana.ini: - server: - root_url: https://grafana.gridsst.2i2c.cloud/ - auth.github: - enabled: true - allowed_organizations: 2i2c-org - ingress: - hosts: - - grafana.gridsst.2i2c.cloud - tls: - - secretName: grafana-tls - hosts: - - grafana.gridsst.2i2c.cloud - -prometheus: - server: - ingress: - enabled: true - hosts: - - prometheus.gridsst.2i2c.cloud - tls: - - secretName: prometheus-tls - hosts: - - prometheus.gridsst.2i2c.cloud - -aws-ce-grafana-backend: - enabled: true - envBasedConfig: - clusterName: gridsst - serviceAccount: - annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::993998403439:role/aws_ce_grafana_backend_iam_role diff --git a/eksctl/gridsst.jsonnet b/eksctl/gridsst.jsonnet deleted file mode 100644 index cd575a072..000000000 --- a/eksctl/gridsst.jsonnet +++ /dev/null @@ -1,323 +0,0 @@ -/* - This file is a jsonnet template of a eksctl's cluster configuration file, - that is used with the eksctl CLI to both update and initialize an AWS EKS - based cluster. - - This file has in turn been generated from eksctl/template.jsonnet which is - relevant to compare with for changes over time. - - To use jsonnet to generate an eksctl configuration file from this, do: - - jsonnet gridsst.jsonnet > gridsst.eksctl.yaml - - References: - - https://eksctl.io/usage/schema/ -*/ -local ng = import "./libsonnet/nodegroup.jsonnet"; - -// place all cluster nodes here -local clusterRegion = "us-west-2"; -local masterAzs = ["us-west-2a", "us-west-2b", "us-west-2c"]; -local nodeAz = "us-west-2a"; - -// Node definitions for notebook nodes. Config here is merged -// with our notebook node definition. -// A `node.kubernetes.io/instance-type label is added, so pods -// can request a particular kind of node with a nodeSelector -local notebookNodes = [ - # FIXME: Ensure gridsst wants minSize 1. Before an event it was set to 0, - # but as part of scaling down after the event it stayed at one. - # - # scale up: https://github.com/2i2c-org/infrastructure/pull/1836 - # scale down: https://github.com/2i2c-org/infrastructure/pull/1844 - # - { - instanceType: "m5.large", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "m5.large", - minSize: 1, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "m5.xlarge", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "m5.xlarge", - minSize: 0, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "m5.2xlarge", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "m5.2xlarge", - minSize: 0, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "m5.8xlarge", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "m5.8xlarge", - minSize: 0, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "r5.xlarge", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "r5.xlarge", - minSize: 0, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "r5.4xlarge", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "r5.4xlarge", - minSize: 0, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "r5.16xlarge", - minSize: 0, - namePrefix: "nb-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - }, - { - instanceType: "r5.16xlarge", - minSize: 0, - namePrefix: "nb-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - }, - { - instanceType: "g4dn.xlarge", - minSize: 0, - namePrefix: "gpu-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { - "2i2c:hub-name": "staging", - "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" - }, - taints+: { - "nvidia.com/gpu": "present:NoSchedule" - }, - // Allow provisioning GPUs across all AZs, to prevent situation where all - // GPUs in a single AZ are in use and no new nodes can be spawned - availabilityZones: masterAzs, - }, - { - instanceType: "g4dn.xlarge", - minSize: 0, - namePrefix: "gpu-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { - "2i2c:hub-name": "prod", - "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" - }, - taints+: { - "nvidia.com/gpu": "present:NoSchedule" - }, - // Allow provisioning GPUs across all AZs, to prevent situation where all - // GPUs in a single AZ are in use and no new nodes can be spawned - availabilityZones: masterAzs, - }, -]; - -local daskNodes = [ - // Node definitions for dask worker nodes. Config here is merged - // with our dask worker node definition, which uses spot instances. - // A `node.kubernetes.io/instance-type label is set to the name of the - // *first* item in instanceDistribution.instanceTypes, to match - // what we do with notebook nodes. Pods can request a particular - // kind of node with a nodeSelector - // - // A not yet fully established policy is being developed about using a single - // node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. - // - { - namePrefix: "dask-staging", - labels+: { "2i2c/hub-name": "staging" }, - tags+: { "2i2c:hub-name": "staging" }, - instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } - }, - { - namePrefix: "dask-prod", - labels+: { "2i2c/hub-name": "prod" }, - tags+: { "2i2c:hub-name": "prod" }, - instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } - }, -]; - - -{ - apiVersion: 'eksctl.io/v1alpha5', - kind: 'ClusterConfig', - metadata+: { - name: "gridsst", - region: clusterRegion, - version: "1.30", - tags+: { - "ManagedBy": "2i2c", - "2i2c.org/cluster-name": $.metadata.name, - }, - }, - availabilityZones: masterAzs, - iam: { - withOIDC: true, - }, - // If you add an addon to this config, run the create addon command. - // - // eksctl create addon --config-file=gridsst.eksctl.yaml - // - addons: [ - { version: "latest", tags: $.metadata.tags } + addon - for addon in - [ - { name: "coredns" }, - { name: "kube-proxy" }, - { - // vpc-cni is a Amazon maintained container networking interface - // (CNI), where a CNI is required for k8s networking. The aws-node - // DaemonSet in kube-system stems from installing this. - // - // Related docs: https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/ - // https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html - // - name: "vpc-cni", - attachPolicyARNs: ["arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"], - # FIXME: enabling network policy enforcement didn't work as of - # August 2024, what's wrong isn't clear. - # - # configurationValues ref: https://github.com/aws/amazon-vpc-cni-k8s/blob/HEAD/charts/aws-vpc-cni/values.yaml - configurationValues: ||| - enableNetworkPolicy: "false" - |||, - }, - { - // aws-ebs-csi-driver ensures that our PVCs are bound to PVs that - // couple to AWS EBS based storage, without it expect to see pods - // mounting a PVC failing to schedule and PVC resources that are - // unbound. - // - // Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html - // - name: "aws-ebs-csi-driver", - wellKnownPolicies: { - ebsCSIController: true, - }, - # configurationValues ref: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/HEAD/charts/aws-ebs-csi-driver/values.yaml - configurationValues: ||| - defaultStorageClass: - enabled: true - |||, - }, - ] - ], - nodeGroups: [ - n + {clusterName: $.metadata.name} for n in - [ - ng + { - namePrefix: 'core', - nameSuffix: 'b', - nameIncludeInstanceType: false, - availabilityZones: [nodeAz], - ssh: { - publicKeyPath: 'ssh-keys/gridsst.key.pub' - }, - instanceType: "r5.xlarge", - minSize: 1, - maxSize: 6, - labels+: { - "hub.jupyter.org/node-purpose": "core", - "k8s.dask.org/node-purpose": "core" - }, - tags+: { "2i2c:node-purpose": "core" }, - }, - ] + [ - ng + { - namePrefix: "nb", - availabilityZones: [nodeAz], - minSize: n.minSize, - maxSize: 500, - instanceType: n.instanceType, - ssh: { - publicKeyPath: 'ssh-keys/gridsst.key.pub' - }, - labels+: { - "hub.jupyter.org/node-purpose": "user", - "k8s.dask.org/node-purpose": "scheduler" - }, - tags+: { "2i2c:node-purpose": "user" }, - taints+: { - "hub.jupyter.org_dedicated": "user:NoSchedule", - "hub.jupyter.org/dedicated": "user:NoSchedule" - }, - } + n for n in notebookNodes - ] + [ - ng + { - namePrefix: "dask", - availabilityZones: [nodeAz], - minSize: 0, - maxSize: 500, - ssh: { - publicKeyPath: 'ssh-keys/gridsst.key.pub' - }, - labels+: { - "k8s.dask.org/node-purpose": "worker" - }, - tags+: { - "2i2c:node-purpose": "worker" - }, - taints+: { - "k8s.dask.org_dedicated" : "worker:NoSchedule", - "k8s.dask.org/dedicated" : "worker:NoSchedule" - }, - instancesDistribution+: { - onDemandBaseCapacity: 0, - onDemandPercentageAboveBaseCapacity: 0, - spotAllocationStrategy: "capacity-optimized", - }, - } + n for n in daskNodes - ] - ] -} diff --git a/terraform/aws/projects/gridsst.tfvars b/terraform/aws/projects/gridsst.tfvars deleted file mode 100644 index e3303e0af..000000000 --- a/terraform/aws/projects/gridsst.tfvars +++ /dev/null @@ -1,26 +0,0 @@ -region = "us-west-2" -cluster_name = "gridsst" -cluster_nodes_location = "us-west-2a" - -enable_aws_ce_grafana_backend_iam = true -disable_cluster_wide_filestore = false - -user_buckets = { - "scratch-staging" : { - "delete_after" : 7, - "tags" : { "2i2c:hub-name" : "staging" }, - }, - "scratch" : { - "delete_after" : 7, - "tags" : { "2i2c:hub-name" : "prod" }, - }, -} - -hub_cloud_permissions = { - "staging" : { - bucket_admin_access : ["scratch-staging"], - }, - "prod" : { - bucket_admin_access : ["scratch"], - }, -}