Skip to content

Commit

Permalink
Merge pull request #4975 from NASA-IMPACT/feat/veda-gpu
Browse files Browse the repository at this point in the history
nasa-veda: Enable GPU instance for select users
  • Loading branch information
yuvipanda authored Oct 19, 2024
2 parents 9b885f4 + 91f8115 commit e7ce866
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
37 changes: 37 additions & 0 deletions config/clusters/nasa-veda/common.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,43 @@ basehub:
cpu_limit: 15.695
node_selector:
node.kubernetes.io/instance-type: r5.4xlarge
# GPU profile: a user server on a g4dn.xlarge node with one NVIDIA GPU attached.
- display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs
description: "Start a container on a dedicated node with a GPU"
slug: "gpu"
# NOTE(review): presumably restricts this profile to members of the listed
# groups only — confirm against the hub's authenticator configuration.
allowed_groups:
- veda-analytics-access:maap-biomass-team
profile_options:
image:
display_name: Image
# Free-form entry: the typed value is substituted for {value} in the image
# override below, after being checked against validation_regex.
unlisted_choice:
enabled: true
display_name: "Custom image"
validation_regex: "^.+:.+$"
validation_message: "Must be a publicly available docker image of form <image-name>:<tag>"
kubespawner_override:
image: "{value}"
# Predefined images; tensorflow2 is the one marked as default.
choices:
pytorch:
display_name: Pangeo PyTorch ML Notebook
default: false
slug: "pytorch"
kubespawner_override:
image: "quay.io/pangeo/pytorch-notebook:2024.08.18"
tensorflow2:
display_name: Pangeo Tensorflow2 ML Notebook
default: true
slug: "tensorflow2"
kubespawner_override:
image: "quay.io/pangeo/ml-notebook:2024.08.18"
# Spawner settings applied to this profile regardless of the image chosen.
kubespawner_override:
environment:
# NOTE(review): presumably read by the NVIDIA container runtime to decide
# which driver capabilities to expose in the container — confirm.
NVIDIA_DRIVER_CAPABILITIES: compute,utility
# No memory limit is set; only a 14G guarantee is requested.
mem_limit: null
mem_guarantee: 14G
# Select nodes labelled with the g4dn.xlarge instance type.
node_selector:
node.kubernetes.io/instance-type: g4dn.xlarge
# Set a resource limit of one nvidia.com/gpu for the user pod.
extra_resource_limits:
nvidia.com/gpu: "1"

scheduling:
userScheduler:
Expand Down
14 changes: 13 additions & 1 deletion eksctl/nasa-veda.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,19 @@ local notebookNodes = [
namePrefix: "nb-binder",
labels+: { "2i2c/hub-name": "binder" },
tags+: { "2i2c:hub-name": "binder" }
}
},
{
// GPU notebook node group backed by g4dn.xlarge instances.
instanceType: "g4dn.xlarge",
// NOTE(review): presumably this node-template tag tells the cluster
// autoscaler that a node from this group provides one nvidia.com/gpu, so it
// can scale the group up from zero — confirm against the autoscaler docs.
tags+: {
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
// Taint GPU nodes with the NoSchedule effect; presumably only pods carrying
// a matching toleration are meant to land here — verify the toleration is
// added to GPU user pods elsewhere.
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPU nodes across all AZs, to prevent the situation
// where all GPUs in a single AZ are in use and no new nodes can be spawned.
availabilityZones: masterAzs,
},
];

local daskNodes = [
Expand Down

0 comments on commit e7ce866

Please sign in to comment.