Skip to content

Commit 04f9369

Browse files
committed
terraform, azure and utoronto: k8s upgrade with misc variables to support it
1 parent 7eb118d commit 04f9369

File tree

3 files changed

+60
-29
lines changed

3 files changed

+60
-29
lines changed

terraform/azure/main.tf

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
9999
#
100100
# Most changes to this node pool force a replace operation on the entire
101101
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
102-
# temporary_name_for_rotation = "core-b".
102+
# temporary_name_for_rotation = "coreb".
103103
#
104104
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
105105
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
@@ -108,9 +108,11 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
108108
name = var.core_node_pool.name
109109
vm_size = var.core_node_pool.vm_size
110110
os_disk_size_gb = var.core_node_pool.os_disk_size_gb
111-
enable_auto_scaling = true
112-
min_count = var.core_node_pool.min
113-
max_count = var.core_node_pool.max
111+
enable_auto_scaling = var.core_node_pool.enable_auto_scaling
112+
min_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.min : null
113+
max_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.max : null
114+
node_count = var.core_node_pool.node_count
115+
kubelet_disk_type = var.core_node_pool.kubelet_disk_type
114116
vnet_subnet_id = azurerm_subnet.node_subnet.id
115117
node_labels = merge({
116118
"hub.jupyter.org/node-purpose" = "core",
@@ -147,13 +149,14 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
147149

148150

149151
resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
150-
for_each = var.notebook_nodes
152+
for_each = var.user_node_pools
151153

152154
name = coalesce(each.value.name, each.key)
153155
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
154156
enable_auto_scaling = true
155-
os_disk_size_gb = 200
157+
os_disk_size_gb = each.value.os_disk_size_gb
156158
vnet_subnet_id = azurerm_subnet.node_subnet.id
159+
kubelet_disk_type = each.value.kubelet_disk_type
157160

158161
orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
159162

@@ -173,14 +176,14 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
173176
}
174177

175178
resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
176-
# If dask_nodes is set, we use that. If it isn't, we use notebook_nodes.
177-
# This lets us set dask_nodes to an empty array to get no dask nodes
178-
for_each = var.dask_nodes
179+
# A dask node pool is created for each entry in dask_node_pools, with no fallback to user_node_pools.
180+
# Setting dask_node_pools to an empty map (the default) gives no dask nodes
181+
for_each = var.dask_node_pools
179182

180183
name = "dask${each.key}"
181184
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
182185
enable_auto_scaling = true
183-
os_disk_size_gb = 200
186+
os_disk_size_gb = each.value.os_disk_size_gb
184187
vnet_subnet_id = azurerm_subnet.node_subnet.id
185188

186189
orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

terraform/azure/projects/utoronto.tfvars

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,40 @@ location = "canadacentral"
88
storage_size = 8192
99
ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana"
1010

11-
# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions
12-
# available via: az aks get-versions --location westus2 -o table
13-
#
14-
kubernetes_version = "1.27.7"
11+
# List available versions via: az aks get-versions --location canadacentral -o table
12+
kubernetes_version = "1.28.3"
1513

1614
core_node_pool = {
1715
name : "core",
16+
kubernetes_version : "1.28.3",
17+
18+
# FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and
19+
# can more cheaply handle being forced to have 2-3 replicas for silly
20+
# reasons like three calico-typha pods. See
21+
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
22+
#
23+
# Transitioning to E2s_v5 would require reducing the memory requested
24+
# by prometheus-server, but that should be okay since
25+
# prometheus has reduced its memory profile significantly enough recently.
26+
#
1827
vm_size : "Standard_E4s_v3",
19-
kubernetes_version : "1.26.3",
28+
29+
# FIXME: stop using persistent disks for the nodes, use the variable default
30+
# "Temporary" instead
31+
kubelet_disk_type : "OS",
32+
33+
# FIXME: use a larger os_disk_size_gb than 40, like the default of 100, to
34+
# avoid running low when few replicas are used
35+
os_disk_size_gb : 40,
36+
37+
# FIXME: it's nice to use autoscaling, but we end up with three replicas due to
38+
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632
39+
# and it's a waste, at least when using Standard_E4s_v3 machines.
40+
enable_auto_scaling : false,
41+
node_count : 2,
2042
}
2143

22-
notebook_nodes = {
44+
user_node_pools = {
2345
"default" : {
2446
name : "nbdefault",
2547
# NOTE: min-max below was set to 0-86 retroactively to align with
@@ -33,11 +55,12 @@ notebook_nodes = {
3355
"hub.jupyter.org/node-size" = "Standard_E8s_v3",
3456
},
3557
kubernetes_version : "1.26.3",
58+
# FIXME: stop using persistent disks for the nodes, use Temporary instead
59+
kubelet_disk_type : "OS",
3660
},
37-
#"usere8sv5" : {
38-
# min : 0,
39-
# max : 100,
40-
# vm_size : "Standard_E8s_v5",
41-
# kubernetes_version : "1.28.3",
42-
#}
61+
"usere8sv5" : {
62+
min : 0,
63+
max : 100,
64+
vm_size : "Standard_E8s_v5",
65+
}
4366
}

terraform/azure/variables.tf

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,39 +81,44 @@ variable "ssh_pub_key" {
8181
variable "core_node_pool" {
8282
type = object({
8383
name : optional(string, ""),
84+
enable_auto_scaling = optional(bool, true),
8485
min : optional(number, 1),
8586
max : optional(number, 10),
87+
node_count : optional(number),
8688
vm_size : string,
8789
labels : optional(map(string), {}),
8890
taints : optional(list(string), []),
89-
os_disk_size_gb : optional(number, 40),
90-
kubernetes_version : optional(string, "")
91+
os_disk_size_gb : optional(number, 100),
92+
kubernetes_version : optional(string, ""),
93+
kubelet_disk_type : optional(string, "Temporary"),
9194
})
9295
description = "Core node pool"
9396
}
9497

95-
variable "notebook_nodes" {
98+
variable "user_node_pools" {
9699
type = map(object({
97100
name : optional(string, ""),
98101
min : number,
99102
max : number,
100103
vm_size : string,
101104
labels : optional(map(string), {}),
102105
taints : optional(list(string), []),
103-
kubernetes_version : optional(string, "")
106+
os_disk_size_gb : optional(number, 200),
107+
kubernetes_version : optional(string, ""),
108+
kubelet_disk_type : optional(string, "Temporary"),
104109
}))
105-
description = "Notebook node pools to create"
110+
description = "User node pools to create"
106111
default = {}
107112
}
108113

109-
variable "dask_nodes" {
114+
variable "dask_node_pools" {
110115
type = map(object({
111116
min : number,
112117
max : number,
113118
vm_size : string,
114119
labels : optional(map(string), {}),
115120
taints : optional(list(string), []),
116-
kubernetes_version : optional(string, "")
121+
kubernetes_version : optional(string, ""),
117122
}))
118123
description = "Dask node pools to create"
119124
default = {}

0 commit comments

Comments
 (0)