terraform, azure and utoronto: an upgrade, misc to support it, and misc opportunistic details #3596

Merged: 8 commits, Jan 12, 2024
108 changes: 58 additions & 50 deletions terraform/azure/main.tf
@@ -37,6 +37,7 @@ terraform {
provider "azuread" {
tenant_id = var.tenant_id
}

provider "azurerm" {
subscription_id = var.subscription_id
features {}
@@ -93,24 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}
}

# Core node-pool
default_node_pool {
# Unfortunately, changing anything about VM type / size recreates the *whole* cluster
name = "core"
vm_size = var.core_node_vm_size
os_disk_size_gb = 40
enable_auto_scaling = true
min_count = 1
max_count = 10
vnet_subnet_id = azurerm_subnet.node_subnet.id
node_labels = {
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}

orchestrator_version = var.kubernetes_version
}

auto_scaler_profile {
skip_nodes_with_local_storage = true
}
@@ -120,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}

network_profile {
# I don't trust Azure CNI
# Azure CNI is the default, but we don't trust it to be reliable, so we've
# opted to use kubenet instead
network_plugin = "kubenet"
network_policy = "calico"
}
@@ -133,76 +117,100 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
client_secret = azuread_service_principal_password.service_principal_password[0].value
}
}
}

# default_node_pool must be set, and it must be a node pool of system type
# that can't scale to zero. Due to that we are forced to use it, and have
# decided to use it as our core node pool.
#
# Most changes to this node pool force a replace operation on the entire
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
# temporary_name_for_rotation = "coreb".
#
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
#
default_node_pool {
name = var.node_pools["core"][0].name
vm_size = var.node_pools["core"][0].vm_size
os_disk_size_gb = var.node_pools["core"][0].os_disk_size_gb
kubelet_disk_type = var.node_pools["core"][0].kubelet_disk_type
enable_auto_scaling = true
min_count = var.node_pools["core"][0].min
max_count = var.node_pools["core"][0].max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}, var.node_pools["core"][0].labels)
node_taints = concat([], var.node_pools["core"][0].taints)

resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = var.notebook_nodes
orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version)

name = "nb${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
vnet_subnet_id = azurerm_subnet.node_subnet.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
}

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

vm_size = each.value.vm_size
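To make the temporary_name_for_rotation note above concrete, here is a minimal sketch of how it could be declared once the azurerm provider is at v3.47.0 or newer. This is only an illustration, not part of this PR; the attribute values are the ones used above, and "coreb" is the placeholder name the comment itself suggests.

# Sketch only, not part of this PR: with azurerm >= 3.47.0, declaring
# temporary_name_for_rotation lets most default_node_pool changes roll through
# a temporary "coreb" pool instead of recreating the whole cluster.
default_node_pool {
  name                        = var.node_pools["core"][0].name
  temporary_name_for_rotation = "coreb"
  vm_size                     = var.node_pools["core"][0].vm_size
  # ... remaining arguments unchanged from the block above
}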
resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = { for i, v in var.node_pools["user"] : v.name => v }

name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "user",
"k8s.dask.org/node-purpose" = "scheduler"
"hub.jupyter.org/node-size" = each.value.vm_size
}, each.value.labels)

node_taints = concat([
"hub.jupyter.org_dedicated=user:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
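The for_each expression above is worth unpacking: it projects the list of user pool objects from var.node_pools["user"] into a map keyed by each pool's name, so every pool gets a stable resource address that does not shift when entries are added or removed. A small sketch with made-up pool names (not from this PR) shows the shape of the result:

# Hypothetical values, only to illustrate the for_each projection used above.
locals {
  example_user_pools = [
    { name = "usere8sv5", vm_size = "Standard_E8s_v5" },
    { name = "usere4sv5", vm_size = "Standard_E4s_v5" },
  ]

  # Same expression as in the user_pool resource. The result is a map:
  #   { "usere8sv5" = {...}, "usere4sv5" = {...} }
  # so each node pool is addressed by name rather than by list index.
  example_user_pools_by_name = { for i, v in local.example_user_pools : v.name => v }
}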

resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
# If dask_nodes is set, we use that. If it isn't, we use notebook_nodes.
# This lets us set dask_nodes to an empty array to get no dask nodes
for_each = var.dask_nodes

name = "dask${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
vnet_subnet_id = azurerm_subnet.node_subnet.id
resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
for_each = { for i, v in var.node_pools["dask"] : v.name => v }

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

node_labels = merge({
"k8s.dask.org/node-purpose" = "worker",
"hub.jupyter.org/node-size" = each.value.vm_size
}, each.value.labels)

node_taints = concat([
"k8s.dask.org_dedicated=worker:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
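Note that the user and dask pools pick their orchestrator version with an explicit empty-string check, while the core pool above uses coalesce(). In Terraform, coalesce() skips both null and empty-string arguments, so the two spellings behave the same for these variables. A small illustrative sketch with hypothetical locals:

# Hypothetical locals, only to illustrate the two equivalent spellings used above.
locals {
  pool_kubernetes_version    = ""       # per-pool override; "" means "not set"
  cluster_kubernetes_version = "1.28.3" # cluster-wide default

  # Ternary spelling, as used by the user and dask pools:
  via_ternary = local.pool_kubernetes_version == "" ? local.cluster_kubernetes_version : local.pool_kubernetes_version

  # coalesce() spelling, as used by the core pool; coalesce skips null and "":
  via_coalesce = coalesce(local.pool_kubernetes_version, local.cluster_kubernetes_version)

  # Both evaluate to "1.28.3" for these values.
}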

# Azure container registry

resource "azurerm_container_registry" "container_registry" {
# meh, only alphanumeric chars. No separators. BE CONSISTENT, AZURE
name = var.global_container_registry_name
resource_group_name = azurerm_resource_group.jupyterhub.name
location = azurerm_resource_group.jupyterhub.location
sku = "Premium"
admin_enabled = true
}
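If the naming restriction grumbled about above ever bites, it could be caught at plan time instead of at apply time. The following is only an illustrative sketch (not part of this PR) of a validation block on the existing global_container_registry_name variable, assuming Azure's documented limit of 5 to 50 alphanumeric characters.

# Illustrative sketch, not part of this PR: enforcing ACR's alphanumeric-only
# naming rule (5-50 characters, no separators) when the variable is set.
variable "global_container_registry_name" {
  type = string

  validation {
    condition     = can(regex("^[a-zA-Z0-9]{5,50}$", var.global_container_registry_name))
    error_message = "ACR names must be 5-50 alphanumeric characters with no separators."
  }
}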


locals {
registry_creds = {
"imagePullSecret" = {
65 changes: 48 additions & 17 deletions terraform/azure/projects/utoronto.tfvars
@@ -1,3 +1,11 @@
# IMPORTANT: Due to a restrictive network rule from storage.tf, we can't perform
# "terraform plan" or "terraform apply" without a workaround.
#
# One known workaround is to allow your public IP temporarily as
# discussed in https://github.com/2i2c-org/infrastructure/issues/890#issuecomment-1879072422.
# This workaround is problematic, as it may temporarily allow other actors
# with the same IP to access the storage.
#
tenant_id = "78aac226-2f03-4b4d-9037-b46d56c55210"
subscription_id = "ead3521a-d994-4a44-a68d-b16e35642d5b"
resourcegroup_name = "2i2c-utoronto-cluster"
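The workaround mentioned in the comment at the top of this file boils down to temporarily adding your own public IP to the storage account's network rules. storage.tf is not part of this diff, so the following is only a rough sketch with assumed resource names and an example IP, not the repository's actual code:

# Rough sketch with assumed names (storage.tf is not shown in this diff):
# temporarily allow-listing an operator's public IP on the storage account
# so that terraform plan/apply can reach it, then removing the entry again.
resource "azurerm_storage_account_network_rules" "example" {
  storage_account_id = azurerm_storage_account.example.id
  default_action     = "Deny"

  virtual_network_subnet_ids = [azurerm_subnet.node_subnet.id]
  ip_rules                   = ["203.0.113.7"] # temporary operator IP; remove after use
}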
@@ -8,21 +16,44 @@ location = "canadacentral"
storage_size = 8192
ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana"

# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions
# available via: az aks get-versions --location westus2 -o table
#
kubernetes_version = "1.26.3"

# FIXME: upgrade core_node_vm_size to Standard_E4s_v5
core_node_vm_size = "Standard_E4s_v3"

notebook_nodes = {
"default" : {
# NOTE: min-max below was set to 0-86 retroactively to align with
# observed state, without understanding why 0-86 was picked.
min : 0,
max : 86,
# FIXME: upgrade user nodes vm_size to Standard_E8s_v5
vm_size : "Standard_E8s_v3",
}
# List available versions via: az aks get-versions --location westus2 -o table
kubernetes_version = "1.28.3"

node_pools = {
core : [
{
name : "core",

# FIXME: Transition to "Standard_E2s_v5" nodes as they are large enough for
# the biggest workload (prometheus-server) and can handle high
# availability requirements better.
#
# We are currently forced to handle three calico-typha pods that
# can't schedule on the same node, see https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
#
vm_size : "Standard_E4s_v3",

# core nodes don't need much disk space
os_disk_size_gb : 40,

# FIXME: Stop using persistent disks for the nodes; use the variable's
# default "Temporary" instead by removing this line.
kubelet_disk_type : "OS",

min : 1,
max : 10,
},
],

user : [
{
name : "usere8sv5",
vm_size : "Standard_E8s_v5",
os_disk_size_gb : 200,
min : 0,
max : 100,
},
],

dask : [],
}
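As a usage sketch of the new node_pools structure (hypothetical pool, not something this PR adds), a second user pool is just another object appended to the user list; fields left out fall back to the variable defaults, e.g. os_disk_size_gb = 100 and kubelet_disk_type = "Temporary":

# Hypothetical sketch, not part of this PR: a second user pool appended to
# the "user" list. Omitted optional fields use the node_pools defaults.
user : [
  {
    name : "usere8sv5",
    vm_size : "Standard_E8s_v5",
    os_disk_size_gb : 200,
    min : 0,
    max : 100,
  },
  {
    name : "usere16sv5", # hypothetical additional pool
    vm_size : "Standard_E16s_v5",
    min : 0,
    max : 50,
  },
],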
74 changes: 37 additions & 37 deletions terraform/azure/variables.tf
@@ -48,20 +48,6 @@ variable "kubernetes_version" {
}


variable "core_node_vm_size" {
type = string
description = <<-EOT
VM Size to use for core nodes
Core nodes will always be on, and count as 'base cost'
for a cluster. We should try to run with as few of them
as possible.
WARNING: CHANGING THIS WILL DESTROY AND RECREATE THE CLUSTER!
EOT
}


variable "global_container_registry_name" {
type = string
description = <<-EOT
@@ -92,30 +78,44 @@ variable "ssh_pub_key" {
EOT
}

variable "notebook_nodes" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
}))
description = "Notebook node pools to create"
default = {}
}
variable "node_pools" {
type = map(
list(
object({
name : string,
vm_size : string,
os_disk_size_gb : optional(number, 100),
kubelet_disk_type : optional(string, "Temporary"),
min : number,
max : number,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, ""),
})
)
)
description = <<-EOT
Node pools to create, listed under the keys 'core', 'user', and 'dask'.
There should be exactly one core node pool. The core node pool is given
special treatment by being listed directly in the cluster resource's
'default_node_pool' field.
EOT

validation {
condition = length(var.node_pools["core"]) == 1
error_message = "The core node pool is mapped to the cluster resource's `default_node_pool`, due to this we require exactly one core node pool to be specified."
}

validation {
condition = length(setsubtract(keys(var.node_pools), ["core", "user", "dask"])) == 0
error_message = "Only three kinds of node pools supported: 'core', 'user', and 'dask'."
}

variable "dask_nodes" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
}))
description = "Dask node pools to create"
default = {}
validation {
condition = length(setintersection(keys(var.node_pools), ["core", "user", "dask"])) == 3
error_message = "All three kinds of node pools ('core', 'user', and 'dask') must be declared, even if they are empty lists of node pools."
}
}
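For readers less familiar with the set functions used in the validations of the node_pools variable above, this small sketch (hypothetical values only) shows what the two expressions check:

# Hypothetical values, only to illustrate the validation expressions above.
locals {
  example_keys = ["core", "user", "dask"]

  # Any keys other than the three supported kinds; an empty set passes the
  # "length(...) == 0" validation.
  unexpected_kinds = setsubtract(local.example_keys, ["core", "user", "dask"])

  # The supported kinds that were actually declared; a length of 3 passes the
  # "length(...) == 3" validation, i.e. none of the kinds were left out.
  declared_kinds = setintersection(local.example_keys, ["core", "user", "dask"])
}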

variable "create_service_principal" {