From 735a984506825118c67d7ade6ae45fdfcdf12838 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 14 Nov 2024 14:30:09 +0000 Subject: [PATCH 1/6] jmte: create hub-specific nodegroups --- eksctl/jupyter-meets-the-earth.jsonnet | 148 +++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 10 deletions(-) diff --git a/eksctl/jupyter-meets-the-earth.jsonnet b/eksctl/jupyter-meets-the-earth.jsonnet index 5ceec54f03..27edb8c4ad 100644 --- a/eksctl/jupyter-meets-the-earth.jsonnet +++ b/eksctl/jupyter-meets-the-earth.jsonnet @@ -25,16 +25,110 @@ local nodeAz = "us-west-2a"; // A `node.kubernetes.io/instance-type label is added, so pods // can request a particular kind of node with a nodeSelector local notebookNodes = [ - { instanceType: "r5.xlarge" }, - { instanceType: "r5.4xlarge" }, - { instanceType: "r5.16xlarge" }, - { instanceType: "m5.xlarge" }, - { instanceType: "m5.4xlarge" }, - { instanceType: "m5.16xlarge" }, - { instanceType: "x1.16xlarge" }, + { + instanceType: "r5.xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "r5.xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "m5.xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "m5.xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "m5.4xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "m5.4xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "m5.16xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "m5.16xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "x1.16xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + }, + { + instanceType: "x1.16xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + }, + { + instanceType: "g4dn.xlarge", + minSize: 0, + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { + "2i2c:hub-name": "staging", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "NoSchedule" + } + }, { - instanceType: "g4dn.xlarge", minSize: 0, + instanceType: "g4dn.xlarge", + minSize: 0, + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, tags+: { + "2i2c:hub-name": "prod", "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" }, taints+: { @@ -42,8 +136,12 @@ local notebookNodes = [ } }, { - instanceType: "g4dn.4xlarge", minSize: 0, + instanceType: "g4dn.4xlarge", + minSize: 0, + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, tags+: { + "2i2c:hub-name": "staging", "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" }, taints+: { @@ -51,11 +149,41 @@ local notebookNodes = [ } }, { - instanceType: "g4dn.16xlarge", minSize: 0, + instanceType: "g4dn.4xlarge", + minSize: 0, + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { + "2i2c:hub-name": "prod", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "NoSchedule" + } + }, + { + instanceType: "g4dn.16xlarge", + minSize: 0, + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + taints+: { + "nvidia.com/gpu": "NoSchedule" + }, + tags+: { + "2i2c:hub-name": "staging", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + } + }, + { + instanceType: "g4dn.16xlarge", + minSize: 0, + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, taints+: { "nvidia.com/gpu": "NoSchedule" }, tags+: { + "2i2c:hub-name": "prod", "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" } }, From 0f2a5421dc84eefe0c2d0c1f5e222c8b200cca61 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 14 Nov 2024 14:35:21 +0000 Subject: [PATCH 2/6] jmte: add node selectors for hub-specific nodegroups --- config/clusters/jupyter-meets-the-earth/prod.values.yaml | 2 ++ config/clusters/jupyter-meets-the-earth/staging.values.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/config/clusters/jupyter-meets-the-earth/prod.values.yaml b/config/clusters/jupyter-meets-the-earth/prod.values.yaml index e284d1c8f1..c5d1379eb3 100644 --- a/config/clusters/jupyter-meets-the-earth/prod.values.yaml +++ b/config/clusters/jupyter-meets-the-earth/prod.values.yaml @@ -23,6 +23,8 @@ basehub: # FIXME: We keep a *full* spare medium node around, is this necessary? memory: 14G singleuser: + nodeSelector: + 2i2c/hub-name: staging extraEnv: # FIXME: This is a *pre-existing bucket*, not the one created by # terraform. Either import it properly into our terraform state, or diff --git a/config/clusters/jupyter-meets-the-earth/staging.values.yaml b/config/clusters/jupyter-meets-the-earth/staging.values.yaml index 26aa468678..de1f9531e2 100644 --- a/config/clusters/jupyter-meets-the-earth/staging.values.yaml +++ b/config/clusters/jupyter-meets-the-earth/staging.values.yaml @@ -13,5 +13,7 @@ basehub: - hosts: [staging.jmte.2i2c.cloud] secretName: https-auto-tls singleuser: + nodeSelector: + 2i2c/hub-name: staging extraEnv: SCRATCH_BUCKET: s3://jupyter-meets-the-earth-scratch-staging/$(JUPYTERHUB_USER) From f57e480da3c8f6c0c4ec675a61f3938b9f83840d Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 14 Nov 2024 16:13:05 +0000 Subject: [PATCH 3/6] Remove duplicated key --- config/clusters/jupyter-meets-the-earth/prod.values.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/clusters/jupyter-meets-the-earth/prod.values.yaml b/config/clusters/jupyter-meets-the-earth/prod.values.yaml index c5d1379eb3..eb946e5a9a 100644 --- a/config/clusters/jupyter-meets-the-earth/prod.values.yaml +++ b/config/clusters/jupyter-meets-the-earth/prod.values.yaml @@ -23,8 +23,6 @@ basehub: # FIXME: We keep a *full* spare medium node around, is this necessary? memory: 14G singleuser: - nodeSelector: - 2i2c/hub-name: staging extraEnv: # FIXME: This is a *pre-existing bucket*, not the one created by # terraform. Either import it properly into our terraform state, or @@ -32,4 +30,5 @@ basehub: SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER) PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER) nodeSelector: + 2i2c/hub-name: staging node.kubernetes.io/instance-type: m5.xlarge From 0199ae209fcffc9b7722c79c07ad66fa30cf9464 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 15 Nov 2024 14:34:24 +0000 Subject: [PATCH 4/6] jmte: add node-purpose tags --- eksctl/jupyter-meets-the-earth.jsonnet | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eksctl/jupyter-meets-the-earth.jsonnet b/eksctl/jupyter-meets-the-earth.jsonnet index 27edb8c4ad..16f6cb8663 100644 --- a/eksctl/jupyter-meets-the-earth.jsonnet +++ b/eksctl/jupyter-meets-the-earth.jsonnet @@ -253,6 +253,7 @@ local daskNodes = [ "hub.jupyter.org/node-purpose": "core", "k8s.dask.org/node-purpose": "core" }, + tags+: { "2i2c:node-purpose": "core" }, }, ] + [ ng + { @@ -268,6 +269,7 @@ local daskNodes = [ "hub.jupyter.org/node-purpose": "user", "k8s.dask.org/node-purpose": "scheduler" }, + tags+: { "2i2c:node-purpose": "user" }, taints+: { "hub.jupyter.org_dedicated": "user:NoSchedule", "hub.jupyter.org/dedicated": "user:NoSchedule" From 4ea882afd4d9882c7376e78d1866857cfbb96ae2 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 15 Nov 2024 14:34:39 +0000 Subject: [PATCH 5/6] jmte: cycle core nodegroup --- eksctl/jupyter-meets-the-earth.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eksctl/jupyter-meets-the-earth.jsonnet b/eksctl/jupyter-meets-the-earth.jsonnet index 16f6cb8663..0f2949f1f9 100644 --- a/eksctl/jupyter-meets-the-earth.jsonnet +++ b/eksctl/jupyter-meets-the-earth.jsonnet @@ -240,7 +240,7 @@ local daskNodes = [ [ ng + { namePrefix: 'core', - nameSuffix: 'a', + nameSuffix: 'b', nameIncludeInstanceType: false, availabilityZones: [nodeAz], ssh: { From 46cce040f99e0f3443940dd6a1df6a721c6ba6b7 Mon Sep 17 00:00:00 2001 From: Sarah Gibson <44771837+sgibson91@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:26:26 +0000 Subject: [PATCH 6/6] fix node selector label --- config/clusters/jupyter-meets-the-earth/prod.values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/clusters/jupyter-meets-the-earth/prod.values.yaml b/config/clusters/jupyter-meets-the-earth/prod.values.yaml index eb946e5a9a..b46b6aee10 100644 --- a/config/clusters/jupyter-meets-the-earth/prod.values.yaml +++ b/config/clusters/jupyter-meets-the-earth/prod.values.yaml @@ -30,5 +30,5 @@ basehub: SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER) PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER) nodeSelector: - 2i2c/hub-name: staging + 2i2c/hub-name: prod node.kubernetes.io/instance-type: m5.xlarge