From d02bc52b4653c5216821b0508ff77aef624fd612 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Wed, 10 Jan 2024 09:18:39 +0100
Subject: [PATCH] terraform, azure: autoscaling core node pool and refactor to reduce duplication

---
 terraform/azure/main.tf                  | 118 +++++++++++------
 terraform/azure/projects/utoronto.tfvars |  66 +++++++------
 terraform/azure/variables.tf             |  45 ++-------
 3 files changed, 102 insertions(+), 127 deletions(-)

diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf
index 5777b68340..36e0b567d7 100644
--- a/terraform/azure/main.tf
+++ b/terraform/azure/main.tf
@@ -37,6 +37,7 @@ terraform {
 provider "azuread" {
   tenant_id = var.tenant_id
 }
+
 provider "azurerm" {
   subscription_id = var.subscription_id
   features {}
@@ -93,35 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
     }
   }
 
-  # default_node_pool must be set, and it must be a node pool of system type
-  # that can't scale to zero. Due to that we are forced to use it, and have
-  # decided to use it as our core node pool.
-  #
-  # Most changes to this node pool forces a replace operation on the entire
-  # cluster. This can be avoided with v3.47.0+ of this provider by declaring
-  # temporary_name_for_rotation = "coreb".
-  #
-  # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
-  # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
-  #
-  default_node_pool {
-    name                = var.core_node_pool.name
-    vm_size             = var.core_node_pool.vm_size
-    os_disk_size_gb     = var.core_node_pool.os_disk_size_gb
-    enable_auto_scaling = var.core_node_pool.enable_auto_scaling
-    min_count           = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.min : null
-    max_count           = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.max : null
-    node_count          = var.core_node_pool.node_count
-    kubelet_disk_type   = var.core_node_pool.kubelet_disk_type
-    vnet_subnet_id      = azurerm_subnet.node_subnet.id
-    node_labels = merge({
-      "hub.jupyter.org/node-purpose" = "core",
-      "k8s.dask.org/node-purpose"    = "core"
-    }, var.core_node_pool.labels)
-
-    orchestrator_version = coalesce(var.core_node_pool.kubernetes_version, var.kubernetes_version)
-  }
-
   auto_scaler_profile {
     skip_nodes_with_local_storage = true
   }
@@ -131,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
   }
 
   network_profile {
-    # I don't trust Azure CNI
+    # Azure CNI is the default, but we don't trust it to be reliable, so we've
+    # opted to use kubenet instead.
     network_plugin = "kubenet"
     network_policy = "calico"
   }
@@ -144,68 +117,92 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
       client_secret = azuread_service_principal_password.service_principal_password[0].value
     }
   }
-}
 
+  # default_node_pool must be set, and it must be a node pool of system type
+  # that can't scale to zero. Due to that we are forced to use it, and have
+  # decided to use it as our core node pool.
+  #
+  # Most changes to this node pool force a replace operation on the entire
+  # cluster. This can be avoided with v3.47.0+ of this provider by declaring
+  # temporary_name_for_rotation = "coreb".
+  #
+  # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
+  # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
+  #
+  default_node_pool {
+    name                = var.node_pools["core"][0].name
+    vm_size             = var.node_pools["core"][0].vm_size
+    os_disk_size_gb     = var.node_pools["core"][0].os_disk_size_gb
+    kubelet_disk_type   = var.node_pools["core"][0].kubelet_disk_type
+    enable_auto_scaling = true
+    min_count           = var.node_pools["core"][0].min
+    max_count           = var.node_pools["core"][0].max
+
+    node_labels = merge({
+      "hub.jupyter.org/node-purpose" = "core",
+      "k8s.dask.org/node-purpose"    = "core"
+    }, var.node_pools["core"][0].labels)
+    node_taints          = concat([], var.node_pools["core"][0].taints)
+    orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version)
 
-resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
-  for_each = var.user_node_pools
+    vnet_subnet_id = azurerm_subnet.node_subnet.id
+  }
+}
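For reference, the rotation mechanism the comment above points to is a single extra argument on the default node pool. A minimal sketch, assuming azurerm v3.47.0+; the resource name, resource group, and sizes are placeholders, and "coreb" is the temporary name suggested in the comment:

resource "azurerm_kubernetes_cluster" "example" {
  name                = "example-cluster"
  location            = "westus2"
  resource_group_name = "example-rg"
  dns_prefix          = "example"

  # With temporary_name_for_rotation declared, the provider can rotate the
  # default node pool through a temporary "coreb" pool when an immutable
  # attribute (e.g. vm_size) changes, instead of replacing the whole cluster.
  default_node_pool {
    name                        = "core"
    temporary_name_for_rotation = "coreb"
    vm_size                     = "Standard_E4s_v3"
    enable_auto_scaling         = true
    min_count                   = 1
    max_count                   = 10
  }

  identity {
    type = "SystemAssigned"
  }
}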
 
-  name                  = coalesce(each.value.name, each.key)
-  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
-  enable_auto_scaling   = true
-  os_disk_size_gb       = each.value.os_disk_size_gb
-  vnet_subnet_id        = azurerm_subnet.node_subnet.id
-  kubelet_disk_type     = each.value.kubelet_disk_type
-  orchestrator_version  = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
+resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
+  for_each = { for i, v in var.node_pools["user"] : v.name => v }
+
+  name                = each.value.name
+  vm_size             = each.value.vm_size
+  os_disk_size_gb     = each.value.os_disk_size_gb
+  kubelet_disk_type   = each.value.kubelet_disk_type
+  enable_auto_scaling = true
+  min_count           = each.value.min
+  max_count           = each.value.max
 
-  vm_size = each.value.vm_size
   node_labels = merge({
     "hub.jupyter.org/node-purpose" = "user",
     "k8s.dask.org/node-purpose"    = "scheduler"
   }, each.value.labels)
-
   node_taints = concat([
     "hub.jupyter.org_dedicated=user:NoSchedule"
   ], each.value.taints)
+  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
 
-  min_count = each.value.min
-  max_count = each.value.max
+  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
+  vnet_subnet_id        = azurerm_subnet.node_subnet.id
 }
 
-resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
-  # If dask_node_pools is set, we use that. If it isn't, we use user_node_pools.
-  # This lets us set dask_node_pools to an empty array to get no dask nodes
-  for_each = var.dask_node_pools
 
-  name                  = "dask${each.key}"
-  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
-  enable_auto_scaling   = true
-  os_disk_size_gb       = each.value.os_disk_size_gb
-  vnet_subnet_id        = azurerm_subnet.node_subnet.id
+resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
+  for_each = { for i, v in var.node_pools["dask"] : v.name => v }
 
-  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
+  name                = each.value.name
+  vm_size             = each.value.vm_size
+  os_disk_size_gb     = each.value.os_disk_size_gb
+  kubelet_disk_type   = each.value.kubelet_disk_type
+  enable_auto_scaling = true
+  min_count           = each.value.min
+  max_count           = each.value.max
 
-  vm_size = each.value.vm_size
   node_labels = merge({
     "k8s.dask.org/node-purpose" = "worker",
   }, each.value.labels)
-
   node_taints = concat([
     "k8s.dask.org_dedicated=worker:NoSchedule"
   ], each.value.taints)
+  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
 
-  min_count = each.value.min
-  max_count = each.value.max
+  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
+  vnet_subnet_id        = azurerm_subnet.node_subnet.id
 }
 
-# AZure container registry
 resource "azurerm_container_registry" "container_registry" {
-  # meh, only alphanumberic chars. No separators. BE CONSISTENT, AZURE
   name                = var.global_container_registry_name
   resource_group_name = azurerm_resource_group.jupyterhub.name
   location            = azurerm_resource_group.jupyterhub.location
@@ -213,6 +210,7 @@ resource "azurerm_container_registry" "container_registry" {
   admin_enabled = true
 }
 
+
 locals {
   registry_creds = {
     "imagePullSecret" = {
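The user_pool and dask_pool resources above rely on a for expression to turn each list of pool objects into a map keyed by pool name, since for_each only accepts a map or a set of strings. A standalone sketch of that projection; the pool entries here are illustrative, not from this repo:

locals {
  # Illustrative input, shaped like the lists in var.node_pools.
  user_pools = [
    { name = "usere8sv5", vm_size = "Standard_E8s_v5" },
    { name = "usere4sv5", vm_size = "Standard_E4s_v5" },
  ]

  # Same projection as in the resources above: the index i is unused, and
  # each object becomes a map entry keyed by its (unique) name.
  user_pools_by_name = { for i, v in local.user_pools : v.name => v }
}

output "user_pool_names" {
  # => ["usere4sv5", "usere8sv5"] (keys() returns lexically sorted keys)
  value = keys(local.user_pools_by_name)
}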
diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars
index fc51571045..e3c3bb8daa 100644
--- a/terraform/azure/projects/utoronto.tfvars
+++ b/terraform/azure/projects/utoronto.tfvars
@@ -11,36 +11,38 @@ ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2
 # List available versions via: az aks get-versions --location westus2 -o table
 kubernetes_version = "1.28.3"
 
-core_node_pool = {
-  name : "core",
-  kubernetes_version : "1.28.3",
-
-  # FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and
-  # can more cheaply handle being forced to have 2-3 replicas for silly
-  # reasons like three calico-typha pods. See
-  # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
-  #
-  vm_size : "Standard_E4s_v3",
-
-  # FIXME: stop using persistent disks for the nodes, use the variable default
-  # "Temporary" instead
-  kubelet_disk_type : "OS",
-
-  # FIXME: use a larger os_disk_size_gb than 40, like the default of 100, to
-  # avoid running low when few replicas are used
-  os_disk_size_gb : 40,
-
-  # FIXME: its nice to use autoscaling, but we end up with three replicas due to
-  # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632
-  # and its a waste at least using Standard_E4s_v3 machines.
-  enable_auto_scaling : false,
-  node_count : 2,
-}
-
-user_node_pools = {
-  "usere8sv5" : {
-    min : 0,
-    max : 100,
-    vm_size : "Standard_E8s_v5",
-  },
+node_pools = {
+  core : [
+    {
+      name : "core",
+
+      # FIXME: transition to "Standard_E2s_v5" nodes as they are large enough
+      # and can more cheaply handle being forced to have 2-3 replicas for
+      # silly reasons like three calico-typha pods. See
+      # https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
+      #
+      vm_size : "Standard_E4s_v3",
+
+      os_disk_size_gb : 40,
+
+      # FIXME: stop using persistent disks for the nodes, use the variable
+      # default "Temporary" instead
+      kubelet_disk_type : "OS",
+
+      min : 1,
+      max : 10,
+    },
+  ],
+
+  user : [
+    {
+      name : "usere8sv5",
+      vm_size : "Standard_E8s_v5",
+      os_disk_size_gb : 200,
+      min : 0,
+      max : 100,
+    },
+  ],
+
+  dask : []
 }
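With dask set to an empty list, no dask node pools are created for utoronto, while the dask_pool resource still creates one autoscaling pool per entry for clusters that need them. A hypothetical tfvars sketch of a cluster that opts in; the pool names and VM sizes are illustrative, not part of this change:

node_pools = {
  core : [
    {
      name : "core",
      vm_size : "Standard_E2s_v5",
      min : 1,
      max : 10,
    },
  ],

  user : [
    {
      name : "usere8sv5",
      vm_size : "Standard_E8s_v5",
      min : 0,
      max : 100,
    },
  ],

  # One autoscaling dask worker pool is created per entry in this list.
  dask : [
    {
      name : "daske8sv5",
      vm_size : "Standard_E8s_v5",
      min : 0,
      max : 100,
    },
  ],
}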
diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf
index dda5c1fab6..afeee2db6e 100644
--- a/terraform/azure/variables.tf
+++ b/terraform/azure/variables.tf
@@ -78,50 +78,25 @@ variable "ssh_pub_key" {
   EOT
 }
 
-variable "core_node_pool" {
-  type = object({
-    name : optional(string, ""),
-    enable_auto_scaling = optional(bool, true),
-    min : optional(number, 1),
-    max : optional(number, 10),
-    node_count : optional(number),
+variable "node_pools" {
+  type = map(list(object({
+    name : string,
     vm_size : string,
-    labels : optional(map(string), {}),
-    taints : optional(list(string), []),
     os_disk_size_gb : optional(number, 100),
-    kubernetes_version : optional(string, ""),
     kubelet_disk_type : optional(string, "Temporary"),
-  })
-  description = "Core node pool"
-}
-
-variable "user_node_pools" {
-  type = map(object({
-    name : optional(string, ""),
     min : number,
     max : number,
-    vm_size : string,
     labels : optional(map(string), {}),
     taints : optional(list(string), []),
-    os_disk_size_gb : optional(number, 200),
     kubernetes_version : optional(string, ""),
-    kubelet_disk_type : optional(string, "Temporary"),
-  }))
-  description = "User node pools to create"
-  default = {}
-}
+  })))
+  description = <<-EOT
+    Node pools to create, listed under the keys 'core', 'user', and 'dask'.
 
-variable "dask_node_pools" {
-  type = map(object({
-    min : number,
-    max : number,
-    vm_size : string,
-    labels : optional(map(string), {}),
-    taints : optional(list(string), []),
-    kubernetes_version : optional(string, ""),
-  }))
-  description = "Dask node pools to create"
-  default = {}
+    There should be exactly one core node pool. It is given special
+    treatment by being listed directly in the cluster resource's
+    'default_node_pool' field.
+  EOT
 }
 
 variable "create_service_principal" {
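The description says there should be exactly one core node pool, but the type constraint alone does not enforce that. A sketch of how the expectation could be enforced with a validation block; this is not part of the change, and the try() guard is only there to tolerate a missing 'core' key:

variable "node_pools" {
  type = map(list(object({
    name : string,
    vm_size : string,
    os_disk_size_gb : optional(number, 100),
    kubelet_disk_type : optional(string, "Temporary"),
    min : number,
    max : number,
    labels : optional(map(string), {}),
    taints : optional(list(string), []),
    kubernetes_version : optional(string, ""),
  })))

  validation {
    # default_node_pool can hold only one pool, so require exactly one
    # entry under the "core" key.
    condition     = length(try(var.node_pools["core"], [])) == 1
    error_message = "node_pools must declare exactly one pool under the 'core' key."
  }
}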