From e036e95c96564443d910d7c5bd2e5f3f1fee88f9 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 28 Nov 2023 11:47:18 +0100 Subject: [PATCH 1/2] terraform, gcp: fix bug for use of default node version --- terraform/gcp/cluster.tf | 2 +- terraform/gcp/variables.tf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/terraform/gcp/cluster.tf b/terraform/gcp/cluster.tf index 050aa1364c..3b1472c7b6 100644 --- a/terraform/gcp/cluster.tf +++ b/terraform/gcp/cluster.tf @@ -251,7 +251,7 @@ resource "google_container_node_pool" "notebook" { cluster = google_container_cluster.cluster.name project = google_container_cluster.cluster.project location = google_container_cluster.cluster.location - version = try(each.value.node_version, var.k8s_versions.notebook_nodes_version) + version = coalesce(each.value.node_version, var.k8s_versions.notebook_nodes_version) # terraform treats null same as unset, so we only set the node_locations # here if it is explicitly overriden. If not, it will just inherit whatever diff --git a/terraform/gcp/variables.tf b/terraform/gcp/variables.tf index df5d6095b6..0acd6963f3 100644 --- a/terraform/gcp/variables.tf +++ b/terraform/gcp/variables.tf @@ -133,6 +133,7 @@ variable "dask_nodes" { temp_opt_out_node_purpose_label : optional(bool, false), resource_labels : optional(map(string), {}), zones : optional(list(string), []) + node_version : optional(string, ""), })) description = "Dask node pools to create. 
Defaults to notebook_nodes" default = {} From 2026a4ffc6486f995abe4205a9919e5bbefa11fb Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Tue, 28 Nov 2023 11:47:44 +0100 Subject: [PATCH 2/2] terraform, gcp: misc cleanup and node pool upgrades --- terraform/gcp/projects/2i2c-uk.tfvars | 7 +++- terraform/gcp/projects/callysto.tfvars | 7 +++- terraform/gcp/projects/cloudbank.tfvars | 3 +- .../gcp/projects/daskhub-template.tfvars | 2 +- terraform/gcp/projects/hhmi.tfvars | 18 ++++++---- terraform/gcp/projects/leap.tfvars | 32 +++++------------ terraform/gcp/projects/linked-earth.tfvars | 9 ++--- terraform/gcp/projects/m2lines.tfvars | 2 +- terraform/gcp/projects/meom-ige.tfvars | 2 +- terraform/gcp/projects/pilot-hubs.tfvars | 36 ++++++++++++++----- terraform/gcp/projects/qcl.tfvars | 7 ---- 11 files changed, 66 insertions(+), 59 deletions(-) diff --git a/terraform/gcp/projects/2i2c-uk.tfvars b/terraform/gcp/projects/2i2c-uk.tfvars index 2e8bc5e603..091fed4b98 100644 --- a/terraform/gcp/projects/2i2c-uk.tfvars +++ b/terraform/gcp/projects/2i2c-uk.tfvars @@ -19,12 +19,17 @@ enable_filestore = true filestore_capacity_gb = 1024 notebook_nodes = { - # FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running + # FIXME: Delete this node pool when it's empty, it's replaced by n2-highmem-4 "user" : { min : 0, max : 100, machine_type : "n2-highmem-4", }, + "n2-highmem-4" : { + min : 0, + max : 100, + machine_type : "n2-highmem-4", + }, "n2-highmem-16" : { min : 0, max : 100, diff --git a/terraform/gcp/projects/callysto.tfvars b/terraform/gcp/projects/callysto.tfvars index ec3a021a2d..7dd737e961 100644 --- a/terraform/gcp/projects/callysto.tfvars +++ b/terraform/gcp/projects/callysto.tfvars @@ -19,12 +19,17 @@ enable_filestore = true filestore_capacity_gb = 1024 notebook_nodes = { - # FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running + # FIXME: Delete this node pool when it's empty, it's replaced by n2-highmem-4 
"user" : { min : 0, max : 100, machine_type : "n2-highmem-4", }, + "n2-highmem-4" : { + min : 0, + max : 100, + machine_type : "n2-highmem-4", + }, "n2-highmem-16" : { min : 0, max : 100, diff --git a/terraform/gcp/projects/cloudbank.tfvars b/terraform/gcp/projects/cloudbank.tfvars index 057aa9de2b..36fa4075d4 100644 --- a/terraform/gcp/projects/cloudbank.tfvars +++ b/terraform/gcp/projects/cloudbank.tfvars @@ -23,8 +23,7 @@ enable_filestore = true filestore_capacity_gb = 1024 notebook_nodes = { - # FIXME: Remove this node pool when unused, its been replaced by the - # n2-highmem-4 node pool + # FIXME: Delete this node pool when it's empty, it's replaced by n2-highmem-4 "user" : { min : 0, max : 100, diff --git a/terraform/gcp/projects/daskhub-template.tfvars b/terraform/gcp/projects/daskhub-template.tfvars index 98fc4adfb3..1bcf3e668a 100644 --- a/terraform/gcp/projects/daskhub-template.tfvars +++ b/terraform/gcp/projects/daskhub-template.tfvars @@ -65,7 +65,7 @@ notebook_nodes = { # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. 
# dask_nodes = { - "worker" : { + "n2-highmem-16" : { min : 0, max : 200, machine_type : "n2-highmem-16", diff --git a/terraform/gcp/projects/hhmi.tfvars b/terraform/gcp/projects/hhmi.tfvars index 97ab5049c2..71e7431642 100644 --- a/terraform/gcp/projects/hhmi.tfvars +++ b/terraform/gcp/projects/hhmi.tfvars @@ -9,6 +9,13 @@ regional_cluster = true core_node_machine_type = "n2-highmem-4" +k8s_versions = { + min_master_version : "1.27.5-gke.200", + core_nodes_version : "1.27.5-gke.200", + notebook_nodes_version : "1.27.5-gke.200", + dask_nodes_version : "1.27.5-gke.200", +} + # Network policy is required to enforce separation between hubs on multi-tenant clusters # Tip: uncomment the line below if this cluster will be multi-tenant # enable_network_policy = true @@ -22,16 +29,15 @@ hub_cloud_permissions = {} # Setup notebook node pools notebook_nodes = { - # FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running - "medium" : { + "n2-highmem-4" : { min : 0, max : 100, - machine_type : "n2-highmem-16", + machine_type : "n2-highmem-4", }, - "n2-highmem-4" : { + "n2-highmem-16" : { min : 0, max : 100, - machine_type : "n2-highmem-4", + machine_type : "n2-highmem-16", }, "n2-highmem-64" : { min : 0, @@ -46,7 +52,7 @@ notebook_nodes = { # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. # dask_nodes = { - "worker" : { + "n2-highmem-16" : { min : 0, max : 200, machine_type : "n2-highmem-16", diff --git a/terraform/gcp/projects/leap.tfvars b/terraform/gcp/projects/leap.tfvars index b154c4dd2b..a4cb7c439d 100644 --- a/terraform/gcp/projects/leap.tfvars +++ b/terraform/gcp/projects/leap.tfvars @@ -11,10 +11,6 @@ k8s_versions = { dask_nodes_version : "1.27.4-gke.900", } -# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be -# done. See https://github.com/2i2c-org/infrastructure/issues/3405. 
-temp_opt_out_node_purpose_label_core_nodes = true - # GPUs not available in us-central1-b zone = "us-central1-c" region = "us-central1" @@ -79,44 +75,35 @@ hub_cloud_permissions = { # Setup notebook node pools notebook_nodes = { - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. "n2-highmem-4" : { min : 0, max : 100, machine_type : "n2-highmem-4", - temp_opt_out_node_purpose_label : true, }, - # FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running - # FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. + # FIXME: Delete this node pool when it's empty, it's replaced by n2-highmem-16 "medium" : { + min : 0, + max : 100, + machine_type : "n2-highmem-16", + node_version : "1.25.6-gke.1000", + temp_opt_out_node_purpose_label : true + }, + "n2-highmem-16" : { # A minimum of one is configured for LEAP to ensure quick startups at all # time. Cost is not a greater concern than optimizing startup times. min : 1, max : 100, machine_type : "n2-highmem-16", - node_version : "1.25.6-gke.1000", - temp_opt_out_node_purpose_label : true }, - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. "n2-highmem-64" : { min : 0, max : 100, machine_type : "n2-highmem-64" - temp_opt_out_node_purpose_label : true } - # FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. 
"gpu-t4" : { min : 0, max : 100, machine_type : "n1-standard-8", - node_version : "1.25.6-gke.1000", - temp_opt_out_node_purpose_label : true gpu : { enabled : true, type : "nvidia-tesla-t4", @@ -139,8 +126,6 @@ notebook_nodes = { # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. # dask_nodes = { - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. "n2-highmem-16" : { min : 0, max : 200, @@ -149,6 +134,5 @@ dask_nodes = { # See https://github.com/2i2c-org/infrastructure/issues/2396 preemptible : false, machine_type : "n2-highmem-16" - temp_opt_out_node_purpose_label : true }, } diff --git a/terraform/gcp/projects/linked-earth.tfvars b/terraform/gcp/projects/linked-earth.tfvars index 351be2a025..4234fb37a8 100644 --- a/terraform/gcp/projects/linked-earth.tfvars +++ b/terraform/gcp/projects/linked-earth.tfvars @@ -29,19 +29,16 @@ user_buckets = { # Setup notebook node pools notebook_nodes = { - # FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running - "small" : { + "n2-highmem-4" : { min : 0, max : 100, machine_type : "n2-highmem-4", }, - # FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running - "medium" : { + "n2-highmem-16" : { min : 0, max : 100, machine_type : "n2-highmem-16", }, - # FIXME: Rename this to "n2-highmem-64" when given the chance and no such nodes are running "n2-highmem-64" : { min : 0, max : 100, @@ -55,7 +52,7 @@ notebook_nodes = { # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. 
# dask_nodes = { - "worker" : { + "n2-highmem-16" : { min : 0, max : 100, machine_type : "n2-highmem-16", diff --git a/terraform/gcp/projects/m2lines.tfvars b/terraform/gcp/projects/m2lines.tfvars index 10357ef41c..837160bc0f 100644 --- a/terraform/gcp/projects/m2lines.tfvars +++ b/terraform/gcp/projects/m2lines.tfvars @@ -100,7 +100,7 @@ notebook_nodes = { # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. # dask_nodes = { - "worker" : { + "n2-highmem-16" : { min : 0, max : 100, machine_type : "n2-highmem-16", diff --git a/terraform/gcp/projects/meom-ige.tfvars b/terraform/gcp/projects/meom-ige.tfvars index dff374140e..3c25ebda9a 100644 --- a/terraform/gcp/projects/meom-ige.tfvars +++ b/terraform/gcp/projects/meom-ige.tfvars @@ -63,7 +63,7 @@ notebook_nodes = { # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. # dask_nodes = { - "worker" : { + "n2-highmem-16" : { min : 0, max : 100, machine_type : "n2-highmem-16", diff --git a/terraform/gcp/projects/pilot-hubs.tfvars b/terraform/gcp/projects/pilot-hubs.tfvars index cb64c46e84..458acd3af4 100644 --- a/terraform/gcp/projects/pilot-hubs.tfvars +++ b/terraform/gcp/projects/pilot-hubs.tfvars @@ -12,10 +12,6 @@ k8s_versions = { dask_nodes_version : "1.27.4-gke.900", } -# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be -# done. See https://github.com/2i2c-org/infrastructure/issues/3405. -temp_opt_out_node_purpose_label_core_nodes = true - core_node_machine_type = "n2-highmem-4" enable_network_policy = true @@ -23,14 +19,18 @@ enable_filestore = true filestore_capacity_gb = 5120 notebook_nodes = { - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. 
+ # FIXME: Delete this node pool when it's empty, it's replaced by n2-highmem-4-b "n2-highmem-4" : { min : 0, max : 100, machine_type : "n2-highmem-4", temp_opt_out_node_purpose_label : true, }, + "n2-highmem-4-b" : { + min : 0, + max : 100, + machine_type : "n2-highmem-4", + }, "n2-highmem-16" : { min : 0, max : 100, @@ -60,9 +60,7 @@ notebook_nodes = { }, }, # Nodepool for temple university. https://github.com/2i2c-org/infrastructure/issues/3158 - # FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. + # FIXME: Delete this node pool when it's empty, it's replaced by temple-b "temple" : { # Expecting upto ~120 users at a time min : 0, @@ -85,6 +83,26 @@ notebook_nodes = { "community" : "temple" }, }, + "temple-b" : { + # Expecting up to ~120 users at a time + min : 0, + max : 100, + # Everyone gets a 256M guarantee, and n2-highmem-8 has about 60GB of RAM. + # This fits up to 100 users on the node, as memory guarantee isn't the constraint. + # This works ok. + machine_type : "n2-highmem-8", + labels : { + "2i2c.org/community" : "temple" + }, + taints : [{ + key : "2i2c.org/community", + value : "temple", + effect : "NO_SCHEDULE" + }], + resource_labels : { + "community" : "temple" + }, + }, # Nodepool for jackeddy symposium. https://github.com/2i2c-org/infrastructure/issues/3166 "jackeddy" : { min : 0, max : 100, diff --git a/terraform/gcp/projects/qcl.tfvars b/terraform/gcp/projects/qcl.tfvars index 42d37235dd..369de24c76 100644 --- a/terraform/gcp/projects/qcl.tfvars +++ b/terraform/gcp/projects/qcl.tfvars @@ -11,10 +11,6 @@ k8s_versions = { notebook_nodes_version : "1.27.4-gke.900", } -# FIXME: Remove temp_opt_out_node_purpose_label_core_nodes when a node upgrade can be -# done. See https://github.com/2i2c-org/infrastructure/issues/3405. 
-temp_opt_out_node_purpose_label_core_nodes = true - core_node_machine_type = "n2-highmem-2" enable_network_policy = true @@ -31,13 +27,10 @@ user_buckets = { } notebook_nodes = { - # FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be - # done. See https://github.com/2i2c-org/infrastructure/issues/3405. "n2-highmem-4" : { min : 0, max : 100, machine_type : "n2-highmem-4", - temp_opt_out_node_purpose_label : true, }, "n2-highmem-16" : { min : 0,