Skip to content

Commit

Permalink
Add support for enabling GPU access
Browse files Browse the repository at this point in the history
Signed-off-by: YISH <[email protected]>
  • Loading branch information
mokeyish committed May 7, 2024
1 parent 33d7d35 commit 79865c2
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,6 @@ venv.bak/

# mypy
.mypy_cache/


.vscode
11 changes: 11 additions & 0 deletions examples/nvidia-smi/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
services:
  test:
    image: nvidia/cuda:12.3.1-base-ubuntu20.04
    command: nvidia-smi
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
56 changes: 56 additions & 0 deletions podman_compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,62 @@ def get_secret_args(compose, cnt, secret, podman_is_building=False):


def container_to_res_args(cnt, podman_args):
    """Translate the resource settings of service ``cnt`` into podman args.

    Runs the CPU translator first and the GPU translator second; both
    receive the same ``podman_args`` list.
    """
    translators = (container_to_cpu_res_args, container_to_gpu_res_args)
    for translate in translators:
        translate(cnt, podman_args)


def container_to_gpu_res_args(cnt, podman_args):
    """Append podman CDI arguments for the NVIDIA GPUs a service reserves.

    Reads ``deploy.resources.reservations.devices`` from the service
    definition ``cnt``. For every entry whose ``driver`` is ``nvidia`` and
    whose ``capabilities`` contain ``gpu``, ``podman_args`` is extended in
    place with ``--device nvidia.com/gpu=<id>`` pairs:

    * a non-empty ``device_ids`` selects exactly those devices (a single
      scalar id is accepted as a one-element list);
    * otherwise a numeric ``count`` selects indices ``0 .. count - 1``;
    * otherwise (both missing, null, empty or ``"all"``) every GPU is
      requested with ``nvidia.com/gpu=all``.

    If at least one entry matched, ``--security-opt=label=disable`` is
    appended once after all device arguments.

    Args:
        cnt: mapping describing one compose service.
        podman_args: list of podman command-line arguments, mutated in place.

    Raises:
        ValueError: if ``count`` is neither ``"all"`` nor convertible to int.
    """
    # https://docs.docker.com/compose/gpu-support/
    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html

    # Every level may be absent or an explicit YAML null, hence ``or``.
    deploy = cnt.get("deploy") or {}
    resources = deploy.get("resources") or {}
    reservations = resources.get("reservations") or {}
    devices = reservations.get("devices") or []

    gpu_on = False
    for device in devices:
        driver = device.get("driver")
        capabilities = device.get("capabilities")
        if driver != "nvidia" or capabilities is None or "gpu" not in capabilities:
            continue

        device_ids = device.get("device_ids")
        # A lone id must not be iterated character by character.
        if isinstance(device_ids, (str, int)) and device_ids not in ("all", ""):
            device_ids = [device_ids]

        if device_ids and device_ids != "all":
            # Explicit ids take precedence over ``count``.
            selected = list(device_ids)
        else:
            count = device.get("count")
            if count is None or count == "all":
                selected = ["all"]
            else:
                # ``count`` may arrive as a string, e.g. after ${VAR}
                # interpolation in the compose file.
                try:
                    selected = range(int(count))
                except (TypeError, ValueError) as err:
                    raise ValueError(
                        f"invalid GPU count in deploy.resources.reservations.devices: {count!r}"
                    ) from err

        for gpu in selected:
            podman_args.extend((
                "--device",
                f"nvidia.com/gpu={gpu}",
            ))
        gpu_on = True

    if gpu_on:
        podman_args.append("--security-opt=label=disable")


def container_to_cpu_res_args(cnt, podman_args):
# v2: https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
# cpus, cpu_shares, mem_limit, mem_reservation
cpus_limit_v2 = try_float(cnt.get("cpus", None), None)
Expand Down
103 changes: 103 additions & 0 deletions pytests/test_container_to_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,106 @@ async def test_env_file_obj_optional(self):
"busybox",
],
)

async def test_gpu(self):
    """Check the podman arguments produced for NVIDIA GPU reservations.

    Each scenario replaces the single reserved device entry of the same
    container definition and compares the full argument list.
    """
    compose = create_compose_mock()

    container = get_minimal_container()
    container["command"] = ["nvidia-smi"]
    reserved = [{}]
    container["deploy"] = {"resources": {"reservations": {"devices": reserved}}}

    leading = [
        "--name=project_name_service_name1",
        "-d",
        "--network=bridge",
        "--network-alias=service_name",
    ]
    trailing = [
        "--security-opt=label=disable",
        "busybox",
        "nvidia-smi",
    ]

    # (GPU selector placed in the device entry, expected --device arguments)
    scenarios = (
        # count: all
        ({"count": "all"}, ["--device", "nvidia.com/gpu=all"]),
        # count: 2
        (
            {"count": 2},
            ["--device", "nvidia.com/gpu=0", "--device", "nvidia.com/gpu=1"],
        ),
        # device_ids: all
        ({"device_ids": "all"}, ["--device", "nvidia.com/gpu=all"]),
        # device_ids: 1,3
        (
            {"device_ids": [1, 3]},
            ["--device", "nvidia.com/gpu=1", "--device", "nvidia.com/gpu=3"],
        ),
    )

    for selector, device_args in scenarios:
        reserved[0] = {"driver": "nvidia", **selector, "capabilities": ["gpu"]}

        produced = await container_to_args(compose, container)
        self.assertEqual(produced, leading + device_args + trailing)

0 comments on commit 79865c2

Please sign in to comment.