Skip to content

Commit

Permalink
add GPU instructions
Browse files Browse the repository at this point in the history
  • Loading branch information
eeholmes committed Jun 26, 2024
1 parent 049b7e0 commit 6fc324a
Show file tree
Hide file tree
Showing 24 changed files with 1,760 additions and 415 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
config/jhub/config.yaml
config/dhub/dconfig2.yaml
config/dhub/dconfig-resource.yaml
config/fish/*.yaml

config/uhub/ssh-info

Expand Down
4 changes: 3 additions & 1 deletion _quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ website:
href: posts/Set-up-centos-security.Rmd
- text: "Set-up authentication"
href: posts/set-up-authentication.Rmd
- text: "Tips"
- text: "Set-up GPU"
href: posts/Set-up-gpu.Rmd
- text: "Troubleshooting"
href: posts/tips.Rmd

format:
Expand Down
229 changes: 229 additions & 0 deletions config-template/fish-gconfig.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# The config file for the dask-enabled hub with GPU node pool
jupyterhub:
hub:
extraEnv:
DASK_DISTRIBUTED__DASHBOARD_LINK: '/user/{JUPYTERHUB_USER}/proxy/{port}/status'
config:
GitHubOAuthenticator:
client_id: <get from GitHub when you define the oauth>
client_secret: <get from GitHub when you define the oauth>
oauth_callback_url: https://<url to your hub>/hub/oauth_callback
allowed_organizations:
- <org>:<team>
- myghorg:team1
scope:
- read:org
Authenticator:
admin_users:
- <GitHub user name>
= JupyterHub:
authenticator_class: github
KubeSpawner:
working_dir: /home/jovyan
allowNamedServers: true
networkPolicy:
# FIXME: For dask gateway
enabled: false
readinessProbe:
enabled: false
initContainers:
# Optional if you want a custom landing page
# copy https://github.com/nmfs-opensci/jupyterhub-templates as example
# template repo must be PUBLIC
- name: git-clone-templates
image: alpine/git
command:
- /bin/sh
- -c
args:
- >-
git clone --branch=master https://github.com/<your GitHub org>/jupyterhub-templates.git &&
cp -r jupyterhub-templates/templates/* /templates &&
cp -r jupyterhub-templates/extra-assets/* /extra-assets
volumeMounts:
- name: custom-templates
mountPath: /templates
- name: custom-templates-static
mountPath: /extra-assets
extraVolumes:
- name: custom-templates
emptyDir: {}
- name: custom-templates-static
emptyDir: {}
extraVolumeMounts:
- name: custom-templates
mountPath: /usr/local/share/jupyterhub/custom_templates
- name: custom-templates-static
mountPath: /usr/local/share/jupyterhub/static/extra_assets
extraConfig:
templates: |
c.JupyterHub.template_paths = ['/usr/local/share/jupyterhub/custom_templates/']
proxy:
https:
enabled: true
hosts:
- fish.opensci.live
letsencrypt:
contactEmail: [email protected]
singleuser:
startTimeout: 600
defaultUrl: /lab
storage:
capacity: 50Gi
extraVolumes:
- name: jupyterhub-shared
persistentVolumeClaim:
claimName: safshub-pvc
extraVolumeMounts:
- name: jupyterhub-shared
mountPath: /home/jovyan/shared
image:
name: openscapes/python
tag: 6ee57a9
pullPolicy: Always
profileList:
- display_name: "Intel Xeon - Select image and RAM"
description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type"
default: true
slug: small
profile_options:
image: &profile_options_image
display_name: Image
choices:
pyrbase:
display_name: PyR - base image R vrs 4.4 and Python vrs 3.10
slug: pyrbase
kubespawner_override:
image: ghcr.io/nmfs-opensci/container-images/py-rocket-base:latest
pyrgeo:
display_name: PyR - Geospatial Python and R - py-rocket-geospatial latest
slug: pyrgeo
default: true
kubespawner_override:
image: ghcr.io/nmfs-opensci/container-images/py-rocket-geospatial:latest
coastwatch:
display_name: PyR - CoastWatch - nmfs-opensci coastwatch latest
slug: coastwatch
kubespawner_override:
image: ghcr.io/nmfs-opensci/container-images/coastwatch:latest
iopython:
display_name: Py - Python w tensorflow - eeholmes iopython 20230714
slug: iopython
kubespawner_override:
image: eeholmes/iopython:20230714
iorocker:
display_name: R - Geospatial w sdmTMB - r-geospatial-sdm latest
slug: rgeospatialsdm
kubespawner_override:
image: ghcr.io/nmfs-opensci/container-images/r-geospatial-sdm:latest
echopype:
display_name: Py - Echopype with pangeo nmfs-opensci echopype latest
slug: echopype
kubespawner_override:
image: ghcr.io/nmfs-opensci/container-images/echopype:latest
arcgis:
display_name: Py - ArcGIS Python 3.9
slug: arcgis
kubespawner_override:
image: ghcr.io/nmfs-opensci/container-images/arcgis:latest
requests: &profile_options_resource_allocation
display_name: Resource Allocation
choices:
mem_1_9:
display_name: 1.9 GB RAM up to 3 CPUs
kubespawner_override:
mem_guarantee: 1991244775
mem_limit: 1991244775
cpu_guarantee: 0.2328125
cpu_limit: 3
default: true
node_selector:
node.kubernetes.io/instance-type: Standard_E4s_v5
mem_3_7:
display_name: 3.7 GB RAM up to 3 CPUs
kubespawner_override:
mem_guarantee: 3982489550
mem_limit: 3982489550
cpu_guarantee: 0.465625
cpu_limit: 3
node_selector:
node.kubernetes.io/instance-type: Standard_E4s_v5
mem_7_4:
display_name: 7.4 GB RAM up to 3 CPUs
kubespawner_override:
mem_guarantee: 7964979101
mem_limit: 7964979101
cpu_guarantee: 0.93125
cpu_limit: 3
node_selector:
node.kubernetes.io/instance-type: Standard_E4s_v5
mem_14_8:
display_name: 15.1 GB RAM up to 3 CPUs
kubespawner_override:
mem_guarantee: 15929958203
mem_limit: 15929958203
cpu_guarantee: 1.8625
cpu_limit: 3
node_selector:
node.kubernetes.io/instance-type: Standard_E4s_v5
mem_28_0:
display_name: 27.3 GB RAM up to 3 CPUs
kubespawner_override:
mem_guarantee: 28673924765
mem_limit: 28673924765
cpu_guarantee: 3
cpu_limit: 3
node_selector:
node.kubernetes.io/instance-type: Standard_E4s_v5
mem_53_5:
display_name: 64 GB RAM, upto 15 CPUs
kubespawner_override:
mem_guarantee: 68576867914
mem_limit: 68576867914
cpu_guarantee: 8
cpu_limit: 15
node_selector:
node.kubernetes.io/instance-type: Standard_E16s_v5
mem_112:
display_name: 112 GB RAM, upto 15 CPUs
kubespawner_override:
mem_guarantee: 118471896889
mem_limit: 118471896889
cpu_guarantee: 15
cpu_limit: 15
node_selector:
node.kubernetes.io/instance-type: Standard_E16s_v5
- display_name: NVIDIA Tesla T4, 28 GB, 4 CPUs
description: "Start a container on a dedicated node with a GPU"
slug: "gpu"
profile_options:
image:
display_name: Image
choices:
pytorch:
display_name: Pangeo PyTorch ML Notebook
default: true
slug: "pytorch"
kubespawner_override:
image: "quay.io/pangeo/pytorch-notebook:2024.06.24"
tensorflow:
display_name: Pangeo TensorFlow ML Notebook
default: true
slug: "tensorflow"
kubespawner_override:
image: "quay.io/pangeo/ml-notebook:2024.06.24"
kubespawner_override:
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,utility
mem_limit: null
mem_guarantee: 14G
node_selector:
node.kubernetes.io/instance-type: Standard_NC4as_T4_v3
extra_resource_limits:
nvidia.com/gpu: "1"
dask-gateway:
gateway:
extraConfig:
idle: |-
# timeout after 30 minutes of inactivity
c.KubeClusterConfig.idle_timeout = 1800
4 changes: 3 additions & 1 deletion config/dhub/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
To restart hub after changing `dconfig2.yaml`
```
helm upgrade --cleanup-on-fail --render-subchart-notes dhub dask/daskhub --namespace dhub --version=2023.1.0 --values dconfig2.yaml
helm upgrade --cleanup-on-fail --render-subchart-notes dhub dask/daskhub --namespace dhub --version=2023.1.0 --values dconfig-resource.yaml
```

`dconfig-resource.yaml` has resources

To get the pods
```
kubectl get pods -n dhub
Expand Down
12 changes: 12 additions & 0 deletions config/fish/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
To restart hub after changing `dconfig2.yaml`
```
helm upgrade --cleanup-on-fail --render-subchart-notes dhub dask/daskhub --namespace dhub --version=2023.1.0 --values dconfig-resource.yaml
```

`dconfig-resource.yaml` has resources

To get the pods
```
kubectl get pods -n dhub
```

2 changes: 1 addition & 1 deletion config/fish/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ jupyterhub:
cpu_limit: 15
node_selector:
node.kubernetes.io/instance-type: Standard_E16s_v5
mem_121_:
mem_121:
display_name: 112 GB RAM, upto 15 CPUs
kubespawner_override:
mem_guarantee: 118471896889
Expand Down
8 changes: 7 additions & 1 deletion docs/posts/JHub-User-Guide.html
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,17 @@
<a href="../posts/set-up-authentication.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up authentication</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/Set-up-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/tips.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Tips</span></a>
<span class="menu-text">Troubleshooting</span></a>
</div>
</li>
</ul>
Expand Down
8 changes: 7 additions & 1 deletion docs/posts/Set-up-centos-security.html
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,17 @@
<a href="../posts/set-up-authentication.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up authentication</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/Set-up-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/tips.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Tips</span></a>
<span class="menu-text">Troubleshooting</span></a>
</div>
</li>
</ul>
Expand Down
8 changes: 7 additions & 1 deletion docs/posts/Set-up-centos-tljh.html
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,17 @@
<a href="../posts/set-up-authentication.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up authentication</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/Set-up-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/tips.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Tips</span></a>
<span class="menu-text">Troubleshooting</span></a>
</div>
</li>
</ul>
Expand Down
8 changes: 7 additions & 1 deletion docs/posts/Set-up-centos.html
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,17 @@
<a href="../posts/set-up-authentication.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up authentication</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/Set-up-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/tips.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Tips</span></a>
<span class="menu-text">Troubleshooting</span></a>
</div>
</li>
</ul>
Expand Down
8 changes: 7 additions & 1 deletion docs/posts/Set-up-daskhub.html
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,17 @@
<a href="../posts/set-up-authentication.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up authentication</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/Set-up-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/tips.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Tips</span></a>
<span class="menu-text">Troubleshooting</span></a>
</div>
</li>
</ul>
Expand Down
8 changes: 7 additions & 1 deletion docs/posts/Set-up-daskhub2.html
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,17 @@
<a href="../posts/set-up-authentication.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up authentication</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/Set-up-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Set-up GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../posts/tips.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Tips</span></a>
<span class="menu-text">Troubleshooting</span></a>
</div>
</li>
</ul>
Expand Down
Loading

0 comments on commit 6fc324a

Please sign in to comment.