Skip to content

Commit

Permalink
terraform, azure and utoronto: k8s upgrade with misc variables to support it

Browse files Browse the repository at this point in the history
  • Loading branch information
consideRatio committed Jan 9, 2024
1 parent 7eb118d commit 04f9369
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 29 deletions.
23 changes: 13 additions & 10 deletions terraform/azure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
#
# Most changes to this node pool forces a replace operation on the entire
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
# temporary_name_for_rotation = "core-b".
# temporary_name_for_rotation = "coreb".
#
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
Expand All @@ -108,9 +108,11 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
name = var.core_node_pool.name
vm_size = var.core_node_pool.vm_size
os_disk_size_gb = var.core_node_pool.os_disk_size_gb
enable_auto_scaling = true
min_count = var.core_node_pool.min
max_count = var.core_node_pool.max
enable_auto_scaling = var.core_node_pool.enable_auto_scaling
min_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.min : null
max_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.max : null
node_count = var.core_node_pool.node_count
kubelet_disk_type = var.core_node_pool.kubelet_disk_type
vnet_subnet_id = azurerm_subnet.node_subnet.id
node_labels = merge({
"hub.jupyter.org/node-purpose" = "core",
Expand Down Expand Up @@ -147,13 +149,14 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {


resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = var.notebook_nodes
for_each = var.user_node_pools

name = coalesce(each.value.name, each.key)
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
os_disk_size_gb = each.value.os_disk_size_gb
vnet_subnet_id = azurerm_subnet.node_subnet.id
kubelet_disk_type = each.value.kubelet_disk_type

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

Expand All @@ -173,14 +176,14 @@ resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
}

resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
# If dask_nodes is set, we use that. If it isn't, we use notebook_nodes.
# This lets us set dask_nodes to an empty array to get no dask nodes
for_each = var.dask_nodes
# If dask_node_pools is set, we use that; it defaults to an empty map.
# This lets us set dask_node_pools to an empty map to get no dask nodes.
for_each = var.dask_node_pools

name = "dask${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
os_disk_size_gb = each.value.os_disk_size_gb
vnet_subnet_id = azurerm_subnet.node_subnet.id

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
Expand Down
47 changes: 35 additions & 12 deletions terraform/azure/projects/utoronto.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,40 @@ location = "canadacentral"
storage_size = 8192
ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana"

# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions
# available via: az aks get-versions --location westus2 -o table
#
kubernetes_version = "1.27.7"
# List available versions via: az aks get-versions --location westus2 -o table
kubernetes_version = "1.28.3"

core_node_pool = {
name : "core",
kubernetes_version : "1.28.3",

# FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and
# can more cheaply handle being forced to have 2-3 replicas for silly
# reasons like three calico-typha pods. See
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
#
# Transitioning to E2s_v5 would require reducing the requested memory
# by prometheus-server though, but that should be okay since
# prometheus has reduced its memory profile significantly enough recently.
#
vm_size : "Standard_E4s_v3",
kubernetes_version : "1.26.3",

# FIXME: stop using persistent disks for the nodes, use the variable default
# "Temporary" instead
kubelet_disk_type : "OS",

# FIXME: use a larger os_disk_size_gb than 40, like the default of 100, to
# avoid running low on disk space when few replicas are used
os_disk_size_gb : 40,

# FIXME: it's nice to use autoscaling, but we end up with three replicas due to
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632
# and it's a waste, at least when using Standard_E4s_v3 machines.
enable_auto_scaling : false,
node_count : 2,
}

notebook_nodes = {
user_node_pools = {
"default" : {
name : "nbdefault",
# NOTE: min-max below was set to 0-86 retroactively to align with
Expand All @@ -33,11 +55,12 @@ notebook_nodes = {
"hub.jupyter.org/node-size" = "Standard_E8s_v3",
},
kubernetes_version : "1.26.3",
# FIXME: stop using persistent disks for the nodes, use Temporary instead
kubelet_disk_type : "OS",
},
#"usere8sv5" : {
# min : 0,
# max : 100,
# vm_size : "Standard_E8s_v5",
# kubernetes_version : "1.28.3",
#}
"usere8sv5" : {
min : 0,
max : 100,
vm_size : "Standard_E8s_v5",
}
}
19 changes: 12 additions & 7 deletions terraform/azure/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -81,39 +81,44 @@ variable "ssh_pub_key" {
variable "core_node_pool" {
type = object({
name : optional(string, ""),
enable_auto_scaling = optional(bool, true),
min : optional(number, 1),
max : optional(number, 10),
node_count : optional(number),
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
os_disk_size_gb : optional(number, 40),
kubernetes_version : optional(string, "")
os_disk_size_gb : optional(number, 100),
kubernetes_version : optional(string, ""),
kubelet_disk_type : optional(string, "Temporary"),
})
description = "Core node pool"
}

variable "notebook_nodes" {
variable "user_node_pools" {
type = map(object({
name : optional(string, ""),
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
os_disk_size_gb : optional(number, 200),
kubernetes_version : optional(string, ""),
kubelet_disk_type : optional(string, "Temporary"),
}))
description = "Notebook node pools to create"
description = "User node pools to create"
default = {}
}

variable "dask_nodes" {
variable "dask_node_pools" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
kubernetes_version : optional(string, ""),
}))
description = "Dask node pools to create"
default = {}
Expand Down

0 comments on commit 04f9369

Please sign in to comment.