terraform, azure: autoscaling core node pool and refactor to reduce duplication
consideRatio committed Jan 10, 2024
1 parent 9485b3d commit d02bc52
Showing 3 changed files with 102 additions and 127 deletions.
118 changes: 58 additions & 60 deletions terraform/azure/main.tf
@@ -37,6 +37,7 @@ terraform {
provider "azuread" {
tenant_id = var.tenant_id
}

provider "azurerm" {
subscription_id = var.subscription_id
features {}
@@ -93,35 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}
}

# default_node_pool must be set, and it must be a node pool of system type
# that can't scale to zero. Due to that we are forced to use it, and have
# decided to use it as our core node pool.
#
# Most changes to this node pool forces a replace operation on the entire
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
# temporary_name_for_rotation = "coreb".
#
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
#
default_node_pool {
name = var.core_node_pool.name
vm_size = var.core_node_pool.vm_size
os_disk_size_gb = var.core_node_pool.os_disk_size_gb
enable_auto_scaling = var.core_node_pool.enable_auto_scaling
min_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.min : null
max_count = var.core_node_pool.enable_auto_scaling ? var.core_node_pool.max : null
node_count = var.core_node_pool.node_count
kubelet_disk_type = var.core_node_pool.kubelet_disk_type
vnet_subnet_id = azurerm_subnet.node_subnet.id
node_labels = merge({
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}, var.core_node_pool.labels)

orchestrator_version = coalesce(var.core_node_pool.kubernetes_version, var.kubernetes_version)
}

auto_scaler_profile {
skip_nodes_with_local_storage = true
}
@@ -131,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}

network_profile {
# I don't trust Azure CNI
# Azure CNI is the default, but we don't trust it to be reliable, so we've
# opted to use kubenet instead
network_plugin = "kubenet"
network_policy = "calico"
}
@@ -144,75 +117,100 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
client_secret = azuread_service_principal_password.service_principal_password[0].value
}
}
}

# default_node_pool must be set, and it must be a node pool of system type
# that can't scale to zero. Due to that we are forced to use it, and have
# decided to use it as our core node pool.
#
# Most changes to this node pool forces a replace operation on the entire
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
# temporary_name_for_rotation = "coreb".
#
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
#
default_node_pool {
name = var.node_pools["core"][0].name
vm_size = var.node_pools["core"][0].vm_size
os_disk_size_gb = var.node_pools["core"][0].os_disk_size_gb
kubelet_disk_type = var.node_pools["core"][0].kubelet_disk_type
enable_auto_scaling = true
min_count = var.node_pools["core"][0].min
max_count = var.node_pools["core"][0].max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}, var.node_pools["core"][0].labels)
node_taints = concat([], var.node_pools["core"][0].taints)

orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version)

resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = var.user_node_pools
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
}
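The comment block above notes that, with v3.47.0+ of the azurerm provider, declaring temporary_name_for_rotation lets most core pool changes roll through a temporary pool instead of forcing a replace of the entire cluster. The commit does not add that attribute; the following is only a sketch of how it could look, using the "coreb" name the comment itself suggests and the attribute documented at the registry link above.

  default_node_pool {
    name                        = var.node_pools["core"][0].name
    temporary_name_for_rotation = "coreb" # sketch only, not part of this commit
    vm_size                     = var.node_pools["core"][0].vm_size
    # ... remaining arguments as in the block above ...
  }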

name = coalesce(each.value.name, each.key)
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = each.value.os_disk_size_gb
vnet_subnet_id = azurerm_subnet.node_subnet.id
kubelet_disk_type = each.value.kubelet_disk_type

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = { for i, v in var.node_pools["user"] : v.name => v }

name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

vm_size = each.value.vm_size
node_labels = merge({
"hub.jupyter.org/node-purpose" = "user",
"k8s.dask.org/node-purpose" = "scheduler"
}, each.value.labels)

node_taints = concat([
"hub.jupyter.org_dedicated=user:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

min_count = each.value.min
max_count = each.value.max
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
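The for_each expression above projects the list var.node_pools["user"] into a map keyed by pool name, since for_each only accepts maps or sets and keying by name gives each pool a stable resource address even if the list is reordered. A minimal standalone sketch of the same projection, with hypothetical pool values for illustration only:

  locals {
    # Hypothetical input mirroring the shape of var.node_pools["user"]
    user_pools = [
      { name = "usere8sv5", vm_size = "Standard_E8s_v5", min = 0, max = 100 },
      { name = "usere4sv5", vm_size = "Standard_E4s_v5", min = 0, max = 50 },
    ]

    # Same projection as the for_each above: list -> map keyed by name,
    # e.g. { "usere8sv5" = { ... }, "usere4sv5" = { ... } }
    user_pools_by_name = { for i, v in local.user_pools : v.name => v }
  }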

resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
# If dask_node_pools is set, we use that. If it isn't, we use user_node_pools.
# This lets us set dask_node_pools to an empty array to get no dask nodes
for_each = var.dask_node_pools

name = "dask${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = each.value.os_disk_size_gb
vnet_subnet_id = azurerm_subnet.node_subnet.id
resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
for_each = { for i, v in var.node_pools["dask"] : v.name => v }

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

vm_size = each.value.vm_size
node_labels = merge({
"k8s.dask.org/node-purpose" = "worker",
}, each.value.labels)

node_taints = concat([
"k8s.dask.org_dedicated=worker:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

min_count = each.value.min
max_count = each.value.max
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}

# Azure container registry

resource "azurerm_container_registry" "container_registry" {
# meh, only alphanumeric chars. No separators. BE CONSISTENT, AZURE
name = var.global_container_registry_name
resource_group_name = azurerm_resource_group.jupyterhub.name
location = azurerm_resource_group.jupyterhub.location
sku = "Premium"
admin_enabled = true
}


locals {
registry_creds = {
"imagePullSecret" = {
66 changes: 34 additions & 32 deletions terraform/azure/projects/utoronto.tfvars
@@ -11,36 +11,38 @@ ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2
# List available versions via: az aks get-versions --location westus2 -o table
kubernetes_version = "1.28.3"

core_node_pool = {
name : "core",
kubernetes_version : "1.28.3",

# FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and
# can more cheaply handle being forced to have 2-3 replicas for silly
# reasons like three calico-typha pods. See
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
#
vm_size : "Standard_E4s_v3",

# FIXME: stop using persistent disks for the nodes, use the variable default
# "Temporary" instead
kubelet_disk_type : "OS",

# FIXME: use a larger os_disk_size_gb than 40, like the default of 100, to
# avoid running low when few replicas are used
os_disk_size_gb : 40,

# FIXME: its nice to use autoscaling, but we end up with three replicas due to
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632
# and its a waste at least using Standard_E4s_v3 machines.
enable_auto_scaling : false,
node_count : 2,
}

user_node_pools = {
"usere8sv5" : {
min : 0,
max : 100,
vm_size : "Standard_E8s_v5",
},
node_pools = {
core : [
{
name : "core",

# FIXME: transition to "Standard_E2s_v5" nodes as they are large enough and
# can more cheaply handle being forced to have 2-3 replicas for silly
# reasons like three calico-typha pods. See
# https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
#
vm_size : "Standard_E4s_v3",

os_disk_size_gb : 40,

# FIXME: stop using persistent disks for the nodes, use the variable default
# "Temporary" instead
kubelet_disk_type : "OS",

min : 1,
max : 10,
},
],

user : [
{
name : "usere8sv5",
vm_size : "Standard_E8s_v5",
os_disk_size_gb : 200,
min : 0,
max : 100,
},
],

dask : []
}
45 changes: 10 additions & 35 deletions terraform/azure/variables.tf
@@ -78,50 +78,25 @@ variable "ssh_pub_key" {
EOT
}

variable "core_node_pool" {
type = object({
name : optional(string, ""),
enable_auto_scaling = optional(bool, true),
min : optional(number, 1),
max : optional(number, 10),
node_count : optional(number),
variable "node_pools" {
type = map(list(object({
name : string,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
os_disk_size_gb : optional(number, 100),
kubernetes_version : optional(string, ""),
kubelet_disk_type : optional(string, "Temporary"),
})
description = "Core node pool"
}

variable "user_node_pools" {
type = map(object({
name : optional(string, ""),
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
os_disk_size_gb : optional(number, 200),
kubernetes_version : optional(string, ""),
kubelet_disk_type : optional(string, "Temporary"),
}))
description = "User node pools to create"
default = {}
}
})))
description = <<-EOT
Node pools to create to be listed under the keys 'core', 'user', and 'dask'.
variable "dask_node_pools" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, ""),
}))
description = "Dask node pools to create"
default = {}
There should be exactly one core node pool. The core node pool is given a
special treatment by being listed directly in the cluster resource's
'default_node_pool' field.
EOT
}
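The heredoc description above states that there should be exactly one core node pool, but nothing in the variable definition enforces it. If that guarantee is worth encoding, a validation block inside variable "node_pools" could express it; a sketch only, not part of this commit, written to tolerate the 'core' key being absent:

  validation {
    condition     = try(length(var.node_pools["core"]), 0) == 1
    error_message = "node_pools must define exactly one entry under the 'core' key."
  }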

variable "create_service_principal" {
