Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

terraform, gcp: fix node version bug, do misc cleanup, and get some node pool upgraded #3468

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion terraform/gcp/cluster.tf
Original file line number Diff line number Diff line change
@@ -251,7 +251,7 @@ resource "google_container_node_pool" "notebook" {
cluster = google_container_cluster.cluster.name
project = google_container_cluster.cluster.project
location = google_container_cluster.cluster.location
version = try(each.value.node_version, var.k8s_versions.notebook_nodes_version)
version = coalesce(each.value.node_version, var.k8s_versions.notebook_nodes_version)

# terraform treats null same as unset, so we only set the node_locations
# here if it is explicitly overridden. If not, it will just inherit whatever
7 changes: 6 additions & 1 deletion terraform/gcp/projects/2i2c-uk.tfvars
Original file line number Diff line number Diff line change
@@ -19,12 +19,17 @@ enable_filestore = true
filestore_capacity_gb = 1024

notebook_nodes = {
# FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running
# FIXME: Delete this node pool when it's empty; it has been replaced by n2-highmem-4
"user" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
"n2-highmem-16" : {
min : 0,
max : 100,
7 changes: 6 additions & 1 deletion terraform/gcp/projects/callysto.tfvars
Original file line number Diff line number Diff line change
@@ -19,12 +19,17 @@ enable_filestore = true
filestore_capacity_gb = 1024

notebook_nodes = {
# FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running
# FIXME: Delete this node pool when it's empty; it has been replaced by n2-highmem-4
"user" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
"n2-highmem-16" : {
min : 0,
max : 100,
3 changes: 1 addition & 2 deletions terraform/gcp/projects/cloudbank.tfvars
Original file line number Diff line number Diff line change
@@ -23,8 +23,7 @@ enable_filestore = true
filestore_capacity_gb = 1024

notebook_nodes = {
# FIXME: Remove this node pool when unused, its been replaced by the
# n2-highmem-4 node pool
# FIXME: Delete this node pool when it's empty; it has been replaced by n2-highmem-4
"user" : {
min : 0,
max : 100,
2 changes: 1 addition & 1 deletion terraform/gcp/projects/daskhub-template.tfvars
Original file line number Diff line number Diff line change
@@ -65,7 +65,7 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"worker" : {
"n2-highmem-16" : {
min : 0,
max : 200,
machine_type : "n2-highmem-16",
18 changes: 12 additions & 6 deletions terraform/gcp/projects/hhmi.tfvars
Original file line number Diff line number Diff line change
@@ -9,6 +9,13 @@ regional_cluster = true

core_node_machine_type = "n2-highmem-4"

# GKE version pins for this cluster, one entry per component class.
# notebook_nodes_version is the fallback applied to any notebook node pool
# that does not set its own node_version.
# NOTE(review): presumably these four values should be bumped together — confirm.
k8s_versions = {
min_master_version : "1.27.5-gke.200",
core_nodes_version : "1.27.5-gke.200",
notebook_nodes_version : "1.27.5-gke.200",
dask_nodes_version : "1.27.5-gke.200",
}

# Network policy is required to enforce separation between hubs on multi-tenant clusters
# Tip: uncomment the line below if this cluster will be multi-tenant
# enable_network_policy = true
@@ -22,16 +29,15 @@ hub_cloud_permissions = {}

# Setup notebook node pools
notebook_nodes = {
# FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running
"medium" : {
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
machine_type : "n2-highmem-4",
},
"n2-highmem-4" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
machine_type : "n2-highmem-16",
},
"n2-highmem-64" : {
min : 0,
@@ -46,7 +52,7 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"worker" : {
"n2-highmem-16" : {
min : 0,
max : 200,
machine_type : "n2-highmem-16",
32 changes: 8 additions & 24 deletions terraform/gcp/projects/leap.tfvars
Original file line number Diff line number Diff line change
@@ -11,10 +11,6 @@ k8s_versions = {
dask_nodes_version : "1.27.4-gke.900",
}

# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label_core_nodes = true

# GPUs not available in us-central1-b
zone = "us-central1-c"
region = "us-central1"
@@ -79,44 +75,35 @@ hub_cloud_permissions = {

# Setup notebook node pools
notebook_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
temp_opt_out_node_purpose_label : true,
},
# FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running
# FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
# FIXME: Delete this node pool when it's empty; it has been replaced by n2-highmem-16
"medium" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
node_version : "1.25.6-gke.1000",
temp_opt_out_node_purpose_label : true
},
"n2-highmem-16" : {
# A minimum of one is configured for LEAP to ensure quick startups at all
# times. Cost is not a greater concern than optimizing startup times.
min : 1,
max : 100,
machine_type : "n2-highmem-16",
node_version : "1.25.6-gke.1000",
temp_opt_out_node_purpose_label : true
},
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-64" : {
min : 0,
max : 100,
machine_type : "n2-highmem-64"
temp_opt_out_node_purpose_label : true
}
# FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"gpu-t4" : {
min : 0,
max : 100,
machine_type : "n1-standard-8",
node_version : "1.25.6-gke.1000",
temp_opt_out_node_purpose_label : true
gpu : {
enabled : true,
type : "nvidia-tesla-t4",
@@ -139,8 +126,6 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-16" : {
min : 0,
max : 200,
@@ -149,6 +134,5 @@ dask_nodes = {
# See https://github.com/2i2c-org/infrastructure/issues/2396
preemptible : false,
machine_type : "n2-highmem-16"
temp_opt_out_node_purpose_label : true
},
}
9 changes: 3 additions & 6 deletions terraform/gcp/projects/linked-earth.tfvars
Original file line number Diff line number Diff line change
@@ -29,19 +29,16 @@ user_buckets = {

# Setup notebook node pools
notebook_nodes = {
# FIXME: Rename this to "n2-highmem-4" when given the chance and no such nodes are running
"small" : {
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
# FIXME: Rename this to "n2-highmem-16" when given the chance and no such nodes are running
"medium" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
},
# FIXME: Rename this to "n2-highmem-64" when given the chance and no such nodes are running
"n2-highmem-64" : {
min : 0,
max : 100,
@@ -55,7 +52,7 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"worker" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
2 changes: 1 addition & 1 deletion terraform/gcp/projects/m2lines.tfvars
Original file line number Diff line number Diff line change
@@ -100,7 +100,7 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"worker" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
2 changes: 1 addition & 1 deletion terraform/gcp/projects/meom-ige.tfvars
Original file line number Diff line number Diff line change
@@ -63,7 +63,7 @@ notebook_nodes = {
# node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
#
dask_nodes = {
"worker" : {
"n2-highmem-16" : {
min : 0,
max : 100,
machine_type : "n2-highmem-16",
36 changes: 27 additions & 9 deletions terraform/gcp/projects/pilot-hubs.tfvars
Original file line number Diff line number Diff line change
@@ -12,25 +12,25 @@ k8s_versions = {
dask_nodes_version : "1.27.4-gke.900",
}

# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label_core_nodes = true

core_node_machine_type = "n2-highmem-4"
enable_network_policy = true

enable_filestore = true
filestore_capacity_gb = 5120

notebook_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
# FIXME: Delete this node pool when it's empty; it has been replaced by n2-highmem-4-b
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
temp_opt_out_node_purpose_label : true,
},
"n2-highmem-4-b" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
},
"n2-highmem-16" : {
min : 0,
max : 100,
@@ -60,9 +60,7 @@ notebook_nodes = {
},
},
# Nodepool for temple university. https://github.com/2i2c-org/infrastructure/issues/3158
# FIXME: Remove node pool specific node_version pin when given the chance and no such nodes are running
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
# FIXME: Delete this node pool when it's empty; it has been replaced by temple-b
"temple" : {
# Expecting up to ~120 users at a time
min : 0,
@@ -85,6 +83,26 @@ notebook_nodes = {
"community" : "temple"
},
},
"temple-b" : {
# Expecting up to ~120 users at a time
min : 0,
max : 100,
# Everyone gets a 256M guarantee, and n2-highmem-8 has about 60GB of RAM.
# This fits up to 100 users on the node, as memory guarantee isn't the constraint.
# This works ok.
machine_type : "n2-highmem-8",
labels : {
"2i2c.org/community" : "temple"
},
taints : [{
key : "2i2c.org/community",
value : "temple",
effect : "NO_SCHEDULE"
}],
resource_labels : {
"community" : "temple"
},
},
# Nodepool for jackeddy symposium. https://github.com/2i2c-org/infrastructure/issues/3166
"jackeddy" : {
min : 0,
7 changes: 0 additions & 7 deletions terraform/gcp/projects/qcl.tfvars
Original file line number Diff line number Diff line change
@@ -11,10 +11,6 @@ k8s_versions = {
notebook_nodes_version : "1.27.4-gke.900",
}

# FIXME: Remove temp_opt_out_node_purpose_label_core_nodes when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
temp_opt_out_node_purpose_label_core_nodes = true

core_node_machine_type = "n2-highmem-2"
enable_network_policy = true

@@ -31,13 +27,10 @@ user_buckets = {
}

notebook_nodes = {
# FIXME: Remove temp_opt_out_node_purpose_label when a node upgrade can be
# done. See https://github.com/2i2c-org/infrastructure/issues/3405.
"n2-highmem-4" : {
min : 0,
max : 100,
machine_type : "n2-highmem-4",
temp_opt_out_node_purpose_label : true,
},
"n2-highmem-16" : {
min : 0,
1 change: 1 addition & 0 deletions terraform/gcp/variables.tf
Original file line number Diff line number Diff line change
@@ -133,6 +133,7 @@ variable "dask_nodes" {
temp_opt_out_node_purpose_label : optional(bool, false),
resource_labels : optional(map(string), {}),
zones : optional(list(string), [])
node_version : optional(string, ""),
}))
description = "Dask node pools to create. Defaults to notebook_nodes"
default = {}