Skip to content

Commit fada2c8

Browse files
committed
add administration tools : prepull-datascience-images & pause-services-unused-gpu
1 parent 45c12d6 commit fada2c8

File tree

7 files changed

+178
-0
lines changed

7 files changed

+178
-0
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# CronJob that periodically runs the pause-services-unused-gpus.py script.
# An init container clones the onyxia-ops repository into a shared emptyDir
# volume; the main container then installs the Prometheus client and runs
# the script from that clone.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: pause-services-unused-gpus
spec:
  # At minute 0 of every second hour.
  schedule: '0 */2 * * *'
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: pause-services-unused-gpus
          restartPolicy: Never
          volumes:
            # Scratch volume shared between the init and main containers.
            - name: gitops
              emptyDir: {}
          initContainers:
            # Fetches the scripts; nothing is baked into the image.
            # NOTE(review): a `latest` tag combined with IfNotPresent means a
            # node keeps whatever `latest` it pulled first -- confirm intended.
            - name: init
              image: inseefrlab/onyxia-base:latest
              imagePullPolicy: IfNotPresent
              command: ["/bin/sh", "-c", "git clone https://github.com/InseeFrLab/onyxia-ops.git /gitops"]
              volumeMounts:
                - name: gitops
                  mountPath: /gitops
          containers:
            # NOTE(review): only prometheus-api-client is installed here, while
            # the script also imports `kubernetes` and shells out to `helm`;
            # presumably both ship with the image -- confirm.
            - name: pause-services-unused-gpus
              image: inseefrlab/onyxia-python-minimal:py3.12.2
              imagePullPolicy: IfNotPresent
              command: ["/bin/sh", "-c", "pip install -q prometheus-api-client && python /gitops/pause-services-unused-gpus/pause-services-unused-gpus.py"]
              volumeMounts:
                - name: gitops
                  mountPath: /gitops
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import subprocess
2+
import logging
3+
import json
4+
5+
from prometheus_api_client import PrometheusConnect
6+
import kubernetes
7+
8+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Prometheus endpoint queried for the DCGM GPU metrics.
PROMETHEUS_URL = "http://prometheus-server.prometheus"

# Helm repository from which the paused releases are re-rendered.
HELM_REPO_NAME = "inseefrlab-datascience"
HELM_REPO_URL = "https://inseefrlab.github.io/helm-charts-interactive-services"

# Look-back window, in hours, used by both Prometheus queries.
N_HOURS_RES = 2
# Minimum number of samples in the window for a pod to count as having held
# the GPU for the whole window; 10% margin to account for measurement error.
# NOTE(review): the `* 60` presumably assumes one sample per minute -- confirm
# against the scrape interval.
N_DATA_POINT_MIN = N_HOURS_RES * 60 * 0.9

# Metric selector shared by both queries.
_GPU_UTIL_SELECTOR = ('DCGM_FI_DEV_GPU_UTIL{namespace=~".*", pod=~".*", '
                      'job!="opencost"}')

# Suffix stripped from a pod name to obtain its helm release name.
_POD_ORDINAL_SUFFIX = "-0"


def build_queries(n_hours=N_HOURS_RES, min_points=N_DATA_POINT_MIN):
    """Return the two PromQL queries used to select pods.

    Args:
        n_hours: size of the look-back window, in hours.
        min_points: minimum sample count required over the window.

    Returns:
        A ``(query_up, query_no_usage)`` tuple: the first matches series
        with more than ``min_points`` samples in the window, the second
        matches series whose utilisation summed over the window is 0.
    """
    window = f"[{n_hours}h]"
    query_up = f"count_over_time({_GPU_UTIL_SELECTOR}{window}) > {min_points}"
    query_no_usage = f"sum_over_time({_GPU_UTIL_SELECTOR}{window}) == 0"
    return query_up, query_no_usage


def pods_from_result(result):
    """Extract the set of ``(pod, namespace)`` pairs from a query result.

    Series whose labels lack ``pod`` or ``namespace`` are ignored.
    """
    return {
        (series["metric"]["pod"], series["metric"]["namespace"])
        for series in result
        if "pod" in series["metric"] and "namespace" in series["metric"]
    }


def release_name_from_pod(pod_name):
    """Return the helm release name for a pod named ``<release>-0``.

    Only a trailing ``-0`` is removed. Splitting on every ``-0`` would
    truncate release names that merely contain it, e.g. ``vscode-012345-0``
    would become ``vscode``.
    """
    if pod_name.endswith(_POD_ORDINAL_SUFFIX):
        return pod_name[:-len(_POD_ORDINAL_SUFFIX)]
    return pod_name


def split_chart(chart):
    """Split helm's ``<name>-<version>`` chart string at the last dash.

    Returns:
        A ``(chart_name, chart_version)`` tuple.
    """
    # NOTE(review): a version that itself contains a dash (e.g. a
    # pre-release tag) would be split in the wrong place.
    chart_name, _, chart_version = chart.rpartition('-')
    return chart_name, chart_version


def build_pause_command(release_name, chart_name, chart_version, namespace):
    """Return the argv list of the ``helm upgrade`` call suspending a release."""
    return [
        'helm', 'upgrade', release_name, f'{HELM_REPO_NAME}/{chart_name}',
        '--version', chart_version,
        '--history-max', '0',
        f'--namespace={namespace}',
        '--reuse-values',
        '--set', 'global.suspend=true',
    ]


def is_pod_running(core_api, pod, namespace):
    """Tell whether the pod currently exists and is in the ``Running`` phase.

    A pod can still appear in the metrics window after it has been deleted;
    the resulting 404 is treated as "not running" instead of aborting the
    run. Any other API error is re-raised.
    """
    try:
        status = core_api.read_namespaced_pod(name=pod, namespace=namespace).status
    except kubernetes.client.rest.ApiException as err:
        if err.status == 404:
            return False
        raise
    return status.phase == 'Running'


def list_releases(namespace):
    """Return the helm releases of ``namespace`` as parsed from ``helm ls``.

    Returns:
        The decoded JSON list, or ``None`` when helm fails or prints
        something that is not valid JSON (the failure is logged).
    """
    helm_ls = subprocess.run(['helm', 'ls', '-n', namespace, '--output', 'json'],
                             capture_output=True, text=True)
    if helm_ls.returncode != 0:
        logging.error(f"helm ls failed in namespace {namespace} : {helm_ls.stderr.strip()}")
        return None
    try:
        return json.loads(helm_ls.stdout)
    except json.JSONDecodeError:
        logging.error(f"Could not parse helm ls output for namespace {namespace}.")
        return None


def main():
    """Suspend helm releases whose pod held a GPU without using it."""
    # Prometheus config
    prom = PrometheusConnect(url=PROMETHEUS_URL, disable_ssl=True)

    # Kube API config
    kubernetes.config.load_incluster_config()
    kube_core_api = kubernetes.client.CoreV1Api()

    # Pods that reported GPU metrics for (almost) the whole window AND whose
    # utilisation summed over the window is zero.
    query_up, query_no_usage = build_queries()
    pods_res_up = pods_from_result(prom.custom_query(query_up))
    pods_no_usage = pods_from_result(prom.custom_query(query_no_usage))
    candidates = sorted(pods_res_up & pods_no_usage)

    # Filter out non-running pods
    pods_to_pause = [(pod, ns) for pod, ns in candidates
                     if is_pod_running(kube_core_api, pod, ns)]
    if not pods_to_pause:
        logging.info("No service to pause.")
        return

    repo_add = subprocess.run(['helm', 'repo', 'add', HELM_REPO_NAME, HELM_REPO_URL])
    if repo_add.returncode != 0:
        logging.warning(f"helm repo add exited with code {repo_add.returncode}")

    # `helm ls` is run at most once per namespace.
    releases_by_namespace = {}
    for pod, namespace in pods_to_pause:
        release_name = release_name_from_pod(pod)
        if namespace not in releases_by_namespace:
            releases_by_namespace[namespace] = list_releases(namespace)
        releases = releases_by_namespace[namespace]
        if releases is None:
            # Listing failed and was already logged; skip this pod.
            continue

        # Match corresponding helm release
        release = next((rl for rl in releases if rl['name'] == release_name), None)
        if release is None:
            logging.info(f"Pod {pod} in namespace {namespace} is not associated to any helm release.")
            continue

        # Launch pause command
        chart_name, chart_version = split_chart(release["chart"])
        cmd = build_pause_command(release_name, chart_name, chart_version, namespace)
        logging.info(f"Running command : {' '.join(cmd)}")
        upgrade = subprocess.run(cmd, stdout=subprocess.DEVNULL)
        if upgrade.returncode != 0:
            logging.warning(f"helm upgrade of {release_name} in namespace {namespace} "
                            f"exited with code {upgrade.returncode}")


if __name__ == "__main__":
    main()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
# Grants the built-in `admin` ClusterRole, cluster-wide, to the service
# account used by the pause-services-unused-gpus CronJob.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: pause-services-unused-gpus
subjects:
  - kind: ServiceAccount
    name: pause-services-unused-gpus
    namespace: pause-services-unused-gpus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: admin
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# Identity under which the pause-services-unused-gpus CronJob pods run.
# NOTE(review): no namespace is set here; the ClusterRoleBinding expects this
# account in the `pause-services-unused-gpus` namespace -- presumably applied
# by the deployment tooling, confirm.
kind: ServiceAccount
apiVersion: v1
metadata:
  name: pause-services-unused-gpus
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# CronJob that runs utils/prepull_images.sh from the
# helm-charts-interactive-services repository. An init container clones the
# repository into a shared emptyDir volume; the main container executes the
# script from that clone.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: prepull
spec:
  # At 04:00 on Mondays.
  schedule: '0 4 * * 1'
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: prepull
          restartPolicy: Never
          volumes:
            # Scratch volume shared between the init and main containers.
            - name: catalog
              emptyDir: {}
          initContainers:
            # NOTE(review): this relies on `git` being available in the
            # python-minimal image -- confirm.
            - name: init
              image: inseefrlab/onyxia-python-minimal:py3.11.10
              imagePullPolicy: IfNotPresent
              command: ["/bin/sh", "-c", "git clone https://github.com/InseeFrLab/helm-charts-interactive-services.git /catalog"]
              volumeMounts:
                - name: catalog
                  mountPath: /catalog
          containers:
            - name: prepull
              image: inseefrlab/onyxia-python-minimal:py3.11.10
              imagePullPolicy: IfNotPresent
              command: ["/bin/sh", "-c", "cd /catalog/utils && chmod +x prepull_images.sh && ./prepull_images.sh"]
              volumeMounts:
                - name: catalog
                  mountPath: /catalog
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# Grants the built-in `admin` ClusterRole to the `prepull` service account,
# scoped to the namespace this RoleBinding is created in.
# NOTE(review): neither the binding nor its subject names a namespace;
# presumably supplied by the deployment tooling -- confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: prepull
subjects:
  - kind: ServiceAccount
    name: prepull
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: admin
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# Identity under which the prepull CronJob pods run.
kind: ServiceAccount
apiVersion: v1
metadata:
  name: prepull

0 commit comments

Comments
 (0)