terraform, azure and utoronto: an upgrade, misc to support it, and misc opportunistic details #3596

Merged: 8 commits, Jan 12, 2024
108 changes: 58 additions & 50 deletions terraform/azure/main.tf
@@ -37,6 +37,7 @@ terraform {
provider "azuread" {
tenant_id = var.tenant_id
}

provider "azurerm" {
subscription_id = var.subscription_id
features {}
@@ -93,24 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}
}

# Core node-pool
default_node_pool {
# Unfortunately, changing anything about VM type / size recreates the *whole* cluster
name = "core"
vm_size = var.core_node_vm_size
os_disk_size_gb = 40
enable_auto_scaling = true
min_count = 1
max_count = 10
vnet_subnet_id = azurerm_subnet.node_subnet.id
node_labels = {
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}

orchestrator_version = var.kubernetes_version
}

auto_scaler_profile {
skip_nodes_with_local_storage = true
}
@@ -120,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}

network_profile {
# I don't trust Azure CNI
# Azure CNI is the default, but we don't trust it to be reliable, so we've
# opted to use kubenet instead
network_plugin = "kubenet"
network_policy = "calico"
}
@@ -133,76 +117,100 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
client_secret = azuread_service_principal_password.service_principal_password[0].value
}
}
}

# default_node_pool must be set, and it must be a node pool of system type
# that can't scale to zero. Due to that we are forced to use it, and have
# decided to use it as our core node pool.
#
# Most changes to this node pool force a replace operation on the entire
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
# temporary_name_for_rotation = "coreb".
#
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
#
default_node_pool {
name = var.node_pools["core"][0].name
vm_size = var.node_pools["core"][0].vm_size
os_disk_size_gb = var.node_pools["core"][0].os_disk_size_gb
kubelet_disk_type = var.node_pools["core"][0].kubelet_disk_type
enable_auto_scaling = true
min_count = var.node_pools["core"][0].min
max_count = var.node_pools["core"][0].max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}, var.node_pools["core"][0].labels)
node_taints = concat([], var.node_pools["core"][0].taints)

resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = var.notebook_nodes
orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version)

name = "nb${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
vnet_subnet_id = azurerm_subnet.node_subnet.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
}

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

vm_size = each.value.vm_size
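To make the temporary_name_for_rotation note above concrete, here is a minimal sketch of how it could be declared once the azurerm provider is at v3.47.0 or newer. This is only an illustration, not part of this PR; the attribute values are the ones used above, and "coreb" is the placeholder name the comment itself suggests.

# Sketch only, not part of this PR: with azurerm >= 3.47.0, declaring
# temporary_name_for_rotation lets most default_node_pool changes roll through
# a temporary "coreb" pool instead of recreating the whole cluster.
default_node_pool {
  name                        = var.node_pools["core"][0].name
  temporary_name_for_rotation = "coreb"
  vm_size                     = var.node_pools["core"][0].vm_size
  # ... remaining arguments unchanged from the block above
}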
resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = { for i, v in var.node_pools["user"] : v.name => v }

name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "user",
"k8s.dask.org/node-purpose" = "scheduler"
"hub.jupyter.org/node-size" = each.value.vm_size
}, each.value.labels)

node_taints = concat([
"hub.jupyter.org_dedicated=user:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
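The for_each expression above is worth unpacking: it projects the list of user pool objects from var.node_pools["user"] into a map keyed by each pool's name, so every pool gets a stable resource address that does not shift when entries are added or removed. A small sketch with made-up pool names (not from this PR) shows the shape of the result:

# Hypothetical values, only to illustrate the for_each projection used above.
locals {
  example_user_pools = [
    { name = "usere8sv5", vm_size = "Standard_E8s_v5" },
    { name = "usere4sv5", vm_size = "Standard_E4s_v5" },
  ]

  # Same expression as in the user_pool resource. The result is a map:
  #   { "usere8sv5" = {...}, "usere4sv5" = {...} }
  # so each node pool is addressed by name rather than by list index.
  example_user_pools_by_name = { for i, v in local.example_user_pools : v.name => v }
}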

resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
# If dask_nodes is set, we use that. If it isn't, we use notebook_nodes.
# This lets us set dask_nodes to an empty array to get no dask nodes
for_each = var.dask_nodes

name = "dask${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
vnet_subnet_id = azurerm_subnet.node_subnet.id
resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
for_each = { for i, v in var.node_pools["dask"] : v.name => v }

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

node_labels = merge({
"k8s.dask.org/node-purpose" = "worker",
"hub.jupyter.org/node-size" = each.value.vm_size
}, each.value.labels)

node_taints = concat([
"k8s.dask.org_dedicated=worker:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
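Note that the user and dask pools pick their orchestrator version with an explicit empty-string check, while the core pool above uses coalesce(). In Terraform, coalesce() skips both null and empty-string arguments, so the two spellings behave the same for these variables. A small illustrative sketch with hypothetical locals:

# Hypothetical locals, only to illustrate the two equivalent spellings used above.
locals {
  pool_kubernetes_version    = ""       # per-pool override; "" means "not set"
  cluster_kubernetes_version = "1.28.3" # cluster-wide default

  # Ternary spelling, as used by the user and dask pools:
  via_ternary = local.pool_kubernetes_version == "" ? local.cluster_kubernetes_version : local.pool_kubernetes_version

  # coalesce() spelling, as used by the core pool; coalesce skips null and "":
  via_coalesce = coalesce(local.pool_kubernetes_version, local.cluster_kubernetes_version)

  # Both evaluate to "1.28.3" for these values.
}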

# Azure container registry

resource "azurerm_container_registry" "container_registry" {
# meh, only alphanumeric chars. No separators. BE CONSISTENT, AZURE
name = var.global_container_registry_name
resource_group_name = azurerm_resource_group.jupyterhub.name
location = azurerm_resource_group.jupyterhub.location
sku = "Premium"
admin_enabled = true
}
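If the naming restriction grumbled about above ever bites, it could be caught at plan time instead of at apply time. The following is only an illustrative sketch (not part of this PR) of a validation block on the existing global_container_registry_name variable, assuming Azure's documented limit of 5 to 50 alphanumeric characters.

# Illustrative sketch, not part of this PR: enforcing ACR's alphanumeric-only
# naming rule (5-50 characters, no separators) when the variable is set.
variable "global_container_registry_name" {
  type = string

  validation {
    condition     = can(regex("^[a-zA-Z0-9]{5,50}$", var.global_container_registry_name))
    error_message = "ACR names must be 5-50 alphanumeric characters with no separators."
  }
}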


locals {
registry_creds = {
"imagePullSecret" = {
65 changes: 48 additions & 17 deletions terraform/azure/projects/utoronto.tfvars
@@ -1,3 +1,11 @@
# IMPORTANT: Due to a restrictive network rule from storage.tf, we can't perform
# "terraform plan" or "terraform apply" without a workaround.
#
# One known workaround is to allow your public IP temporarily as
# discussed in https://github.com/2i2c-org/infrastructure/issues/890#issuecomment-1879072422.
# This workaround is problematic, as it may temporarily allow other actors
# with the same IP to access the storage.
#
tenant_id = "78aac226-2f03-4b4d-9037-b46d56c55210"
subscription_id = "ead3521a-d994-4a44-a68d-b16e35642d5b"
resourcegroup_name = "2i2c-utoronto-cluster"
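The workaround mentioned in the comment at the top of this file boils down to temporarily adding your own public IP to the storage account's network rules. storage.tf is not part of this diff, so the following is only a rough sketch with assumed resource names and an example IP, not the repository's actual code:

# Rough sketch with assumed names (storage.tf is not shown in this diff):
# temporarily allow-listing an operator's public IP on the storage account
# so that terraform plan/apply can reach it, then removing the entry again.
resource "azurerm_storage_account_network_rules" "example" {
  storage_account_id = azurerm_storage_account.example.id
  default_action     = "Deny"

  virtual_network_subnet_ids = [azurerm_subnet.node_subnet.id]
  ip_rules                   = ["203.0.113.7"] # temporary operator IP; remove after use
}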
@@ -8,21 +16,44 @@ location = "canadacentral"
storage_size = 8192
ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana"

# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions
# available via: az aks get-versions --location westus2 -o table
#
kubernetes_version = "1.26.3"

# FIXME: upgrade core_node_vm_size to Standard_E4s_v5
core_node_vm_size = "Standard_E4s_v3"

notebook_nodes = {
"default" : {
# NOTE: min-max below was set to 0-86 retroactively to align with
# observed state, without understanding why 0-86 was picked.
min : 0,
max : 86,
# FIXME: upgrade user nodes vm_size to Standard_E8s_v5
vm_size : "Standard_E8s_v3",
}
# List available versions via: az aks get-versions --location westus2 -o table
kubernetes_version = "1.28.3"

node_pools = {
core : [
{
name : "core",

# FIXME: Transition to "Standard_E2s_v5" nodes as they are large enough for
# the biggest workload (prometheus-server) and can handle high
# availability requirements better.
#
# We are currently forced to handle three calico-typha pods that
# can't schedule on the same node, see https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
#
vm_size : "Standard_E4s_v3",

# core nodes don't need much disk space
os_disk_size_gb : 40,

# FIXME: Stop using persistent disks for the nodes; use the variable's
# default "Temporary" instead by removing this line.
kubelet_disk_type : "OS",

min : 1,
max : 10,
},
],

user : [
{
name : "usere8sv5",
vm_size : "Standard_E8s_v5",
os_disk_size_gb : 200,
min : 0,
max : 100,
},
],

dask : [],
}
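As a usage sketch of the new node_pools structure (hypothetical pool, not something this PR adds), a second user pool is just another object appended to the user list; fields left out fall back to the variable defaults, e.g. os_disk_size_gb = 100 and kubelet_disk_type = "Temporary":

# Hypothetical sketch, not part of this PR: a second user pool appended to
# the "user" list. Omitted optional fields use the node_pools defaults.
user : [
  {
    name : "usere8sv5",
    vm_size : "Standard_E8s_v5",
    os_disk_size_gb : 200,
    min : 0,
    max : 100,
  },
  {
    name : "usere16sv5", # hypothetical additional pool
    vm_size : "Standard_E16s_v5",
    min : 0,
    max : 50,
  },
],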
74 changes: 37 additions & 37 deletions terraform/azure/variables.tf
@@ -48,20 +48,6 @@ variable "kubernetes_version" {
}


variable "core_node_vm_size" {
type = string
description = <<-EOT
VM Size to use for core nodes
Core nodes will always be on, and count as 'base cost'
for a cluster. We should try to run with as few of them
as possible.
WARNING: CHANGING THIS WILL DESTROY AND RECREATE THE CLUSTER!
EOT
}


variable "global_container_registry_name" {
type = string
description = <<-EOT
@@ -92,30 +78,44 @@ variable "ssh_pub_key" {
EOT
}

variable "notebook_nodes" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
}))
description = "Notebook node pools to create"
default = {}
}
variable "node_pools" {
type = map(
list(
object({
name : string,
vm_size : string,
os_disk_size_gb : optional(number, 100),
kubelet_disk_type : optional(string, "Temporary"),
min : number,
max : number,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, ""),
})
)
)
description = <<-EOT
Node pools to create, listed under the keys 'core', 'user', and 'dask'.
There should be exactly one core node pool. The core node pool is given
special treatment by being listed directly in the cluster resource's
'default_node_pool' field.
EOT

validation {
condition = length(var.node_pools["core"]) == 1
error_message = "The core node pool is mapped to the cluster resource's `default_node_pool`, due to this we require exactly one core node pool to be specified."
}

validation {
condition = length(setsubtract(keys(var.node_pools), ["core", "user", "dask"])) == 0
error_message = "Only three kinds of node pools supported: 'core', 'user', and 'dask'."
}

variable "dask_nodes" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
}))
description = "Dask node pools to create"
default = {}
validation {
condition = length(setintersection(keys(var.node_pools), ["core", "user", "dask"])) == 3
error_message = "All three kinds of node pools ('core', 'user', and 'dask') must be declared, even if they are empty lists of node pools."
}
}
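For readers less familiar with the set functions used in the validations of the node_pools variable above, this small sketch (hypothetical values only) shows what the two expressions check:

# Hypothetical values, only to illustrate the validation expressions above.
locals {
  example_keys = ["core", "user", "dask"]

  # Any keys other than the three supported kinds; an empty set passes the
  # "length(...) == 0" validation.
  unexpected_kinds = setsubtract(local.example_keys, ["core", "user", "dask"])

  # The supported kinds that were actually declared; a length of 3 passes the
  # "length(...) == 3" validation, i.e. none of the kinds were left out.
  declared_kinds = setintersection(local.example_keys, ["core", "user", "dask"])
}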

variable "create_service_principal" {