Skip to content

Commit

Permalink
feat: add hardware info reporting (#273)
Browse files Browse the repository at this point in the history
This commit adds routes for `/hardware/info` and `/hardware/stats` that report the GPUs available to a pipeline/model_id, along with some basic information from torch.cuda.

---------

Co-authored-by: Rick Staa <[email protected]>
  • Loading branch information
ad-astra-video and rickstaa authored Nov 27, 2024
1 parent ee40bd6 commit 28e0095
Show file tree
Hide file tree
Showing 10 changed files with 787 additions and 99 deletions.
21 changes: 19 additions & 2 deletions runner/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import os
from contextlib import asynccontextmanager

import app
from app.routes import health, hardware
from fastapi import FastAPI
from fastapi.routing import APIRoute

from app.routes import health
from app.utils.hardware import get_gpu_info
from app.utils.nvml_manager import nvml_manager

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

Expand All @@ -14,16 +16,24 @@
async def lifespan(app: FastAPI):
    """Run application startup, hand control to the app, then tear down.

    Startup configures logging, initializes NVML, registers the health and
    hardware routers, loads the pipeline selected by the ``PIPELINE`` and
    ``MODEL_ID`` environment variables, registers that pipeline's route and
    logs the detected GPUs.

    Args:
        app: The FastAPI application being started. The loaded pipeline is
            stored on it as ``app.pipeline``.

    Raises:
        KeyError: If ``PIPELINE`` or ``MODEL_ID`` is not set in the
            environment.
    """
    config_logging()

    nvml_manager.initialize()

    app.include_router(health.router)
    app.include_router(hardware.router)

    pipeline = os.environ["PIPELINE"]
    model_id = os.environ["MODEL_ID"]

    app.pipeline = load_pipeline(pipeline, model_id)
    app.include_router(load_route(pipeline))

    print_cuda_devices()
    logger.info(f"Started up with pipeline {app.pipeline}")

    try:
        yield
    finally:
        # When this generator backs an async context manager (the decorator
        # is expected directly above this function), an error raised while
        # the app is serving is thrown in at the ``yield``. ``finally``
        # keeps the NVML teardown from being skipped in that case.
        nvml_manager.shutdown()

        logger.info("Shutting down")


Expand Down Expand Up @@ -133,6 +143,13 @@ def config_logging():
)


def print_cuda_devices():
    """Log a header line followed by one line per GPU from get_gpu_info()."""
    gpu_info = get_gpu_info()
    logger.info("Cuda devices available:")
    # Only the per-device records are logged; the index keys are not needed.
    for gpu in gpu_info.values():
        logger.info(gpu)


def use_route_names_as_operation_ids(app: FastAPI) -> None:
for route in app.routes:
if isinstance(route, APIRoute):
Expand Down
65 changes: 65 additions & 0 deletions runner/app/routes/hardware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
from typing import Dict

from app.utils.hardware import (
GpuComputeInfo,
GpuUtilizationInfo,
get_gpu_info,
get_gpu_stats,
)
from fastapi import APIRouter
from pydantic import BaseModel

# Router that collects the hardware endpoints defined in this module.
router = APIRouter()


# NOTE(review): pydantic is expected to copy the class docstring into the
# generated schema description, so the docstring text is left untouched here.
class HardwareInformation(BaseModel):
    """Response model for GPU information."""

    # Pipeline name; the route fills it from the PIPELINE environment variable.
    pipeline: str
    # Model identifier; the route fills it from the MODEL_ID environment variable.
    model_id: str
    # Per-GPU compute details keyed by device index (result of get_gpu_info()).
    gpu_info: Dict[int, GpuComputeInfo]


# NOTE(review): pydantic is expected to copy the class docstring into the
# generated schema description, so the docstring text is left untouched here.
class HardwareStats(BaseModel):
    """Response model for real-time GPU statistics."""

    # Pipeline name; the route fills it from the PIPELINE environment variable.
    pipeline: str
    # Model identifier; the route fills it from the MODEL_ID environment variable.
    model_id: str
    # Per-GPU utilization figures keyed by device index (result of get_gpu_stats()).
    gpu_stats: Dict[int, GpuUtilizationInfo]


@router.get(
    "/hardware/info",
    operation_id="hardware_info",
    response_model=HardwareInformation,
)
@router.get(
    "/hardware/info/",
    response_model=HardwareInformation,
    include_in_schema=False,
)
async def hardware_info():
    # Deliberately no docstring: FastAPI would publish it as the operation
    # description in the generated OpenAPI schema.
    # The trailing-slash variant above serves the same handler but is kept
    # out of the schema.
    pipeline = os.environ["PIPELINE"]
    model_id = os.environ["MODEL_ID"]
    gpu_info = get_gpu_info()
    return HardwareInformation(
        pipeline=pipeline,
        model_id=model_id,
        gpu_info=gpu_info,
    )


@router.get(
    "/hardware/stats",
    operation_id="hardware_stats",
    response_model=HardwareStats,
)
@router.get(
    "/hardware/stats/",
    response_model=HardwareStats,
    include_in_schema=False,
)
async def hardware_stats():
    # Deliberately no docstring: FastAPI would publish it as the operation
    # description in the generated OpenAPI schema.
    # The trailing-slash variant above serves the same handler but is kept
    # out of the schema.
    pipeline = os.environ["PIPELINE"]
    model_id = os.environ["MODEL_ID"]
    gpu_stats = get_gpu_stats()
    return HardwareStats(
        pipeline=pipeline,
        model_id=model_id,
        gpu_stats=gpu_stats,
    )
104 changes: 104 additions & 0 deletions runner/app/utils/hardware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Contains utility functions for hardware information."""

from typing import Dict
from pydantic import BaseModel
import logging
import pynvml

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Fields shared by every GPU model in this module.
class GpuBaseInfo(BaseModel):
    """Model for general GPU information."""

    # Device identifier; retrieve_cuda_info() stores the NVML UUID here.
    id: str
    # Device name as returned by nvmlDeviceGetName.
    name: str
    # Total and free device memory from nvmlDeviceGetMemoryInfo
    # (presumably bytes -- confirm against the NVML reference).
    memory_total: int
    memory_free: int


# Base GPU fields plus the CUDA compute capability.
class GpuComputeInfo(GpuBaseInfo):
    """Model for detailed GPU compute information."""

    # Major and minor parts of the value returned by
    # nvmlDeviceGetCudaComputeCapability.
    major: int
    minor: int


# Base GPU fields plus the utilization rates sampled from NVML.
class GpuUtilizationInfo(GpuBaseInfo):
    """Model for real-time GPU utilization statistics."""

    # `gpu` field of nvmlDeviceGetUtilizationRates
    # (presumably a percentage -- confirm against the NVML reference).
    utilization_compute: int
    # `memory` field of nvmlDeviceGetUtilizationRates (same caveat).
    utilization_memory: int


# Union of the compute and utilization models; adds no fields of its own.
class GpuInfo(GpuComputeInfo, GpuUtilizationInfo):
    """Model for full CUDA device information."""

    pass


def retrieve_cuda_info() -> Dict[int, GpuInfo]:
    """Query NVML for every visible device and collect the results.

    Returns:
        A mapping from NVML device index to a ``GpuInfo`` record holding the
        device UUID, name, memory totals, compute capability and current
        utilization rates.
    """
    gpus: Dict[int, GpuInfo] = {}
    device_count = pynvml.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        device_uuid = pynvml.nvmlDeviceGetUUID(handle)
        device_name = pynvml.nvmlDeviceGetName(handle)
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        capability_major, capability_minor = (
            pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        )
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        gpus[index] = GpuInfo(
            id=device_uuid,
            name=device_name,
            memory_total=memory.total,
            memory_free=memory.free,
            major=capability_major,
            minor=capability_minor,
            utilization_compute=utilization.gpu,
            utilization_memory=utilization.memory,
        )
    return gpus


def get_gpu_info() -> Dict[int, GpuComputeInfo]:
    """Return the compute-related details of each GPU.

    Returns:
        A mapping from device index to a ``GpuComputeInfo`` built from the
        matching ``retrieve_cuda_info()`` record (utilization fields are
        left out).
    """
    compute_info: Dict[int, GpuComputeInfo] = {}
    for index, gpu in retrieve_cuda_info().items():
        compute_info[index] = GpuComputeInfo(
            id=gpu.id,
            name=gpu.name,
            memory_total=gpu.memory_total,
            memory_free=gpu.memory_free,
            major=gpu.major,
            minor=gpu.minor,
        )
    return compute_info


def get_gpu_stats() -> Dict[int, GpuUtilizationInfo]:
    """Return the current utilization figures of each GPU.

    Returns:
        A mapping from device index to a ``GpuUtilizationInfo`` built from
        the matching ``retrieve_cuda_info()`` record (compute capability
        fields are left out).
    """
    utilization_info: Dict[int, GpuUtilizationInfo] = {}
    for index, gpu in retrieve_cuda_info().items():
        utilization_info[index] = GpuUtilizationInfo(
            id=gpu.id,
            name=gpu.name,
            memory_total=gpu.memory_total,
            memory_free=gpu.memory_free,
            utilization_compute=gpu.utilization_compute,
            utilization_memory=gpu.utilization_memory,
        )
    return utilization_info
37 changes: 37 additions & 0 deletions runner/app/utils/nvml_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
This module manages NVML (NVIDIA Management Library) initialization and shutdown,
ensuring efficient resource management and improved performance for GPU operations.
"""
import pynvml
import logging
import atexit

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class NVMLManager:
    """Pair NVML initialization with shutdown and remember the current state."""

    def __init__(self):
        # Start uninitialized and make sure shutdown() also runs at
        # interpreter exit.
        self._initialized = False
        atexit.register(self.shutdown)

    def initialize(self):
        """Initialize NVML unless it is already marked as initialized.

        An ``NVMLError`` is logged rather than raised; the manager then
        stays in the uninitialized state.
        """
        if self._initialized:
            return
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as e:
            logger.error(f"Failed to initialize NVML: {e}")
        else:
            self._initialized = True
            logger.info("NVML initialized successfully.")

    def shutdown(self):
        """Shut NVML down if it is currently marked as initialized.

        An ``NVMLError`` is logged rather than raised; the manager then
        stays in the initialized state.
        """
        if not self._initialized:
            return
        try:
            pynvml.nvmlShutdown()
        except pynvml.NVMLError as e:
            logger.error(f"Failed to shutdown NVML: {e}")
        else:
            self._initialized = False
            logger.info("NVML shutdown successfully.")

# Shared module-level instance. Import this instead of constructing another
# NVMLManager: each construction registers one more atexit shutdown hook.
nvml_manager = NVMLManager()
19 changes: 10 additions & 9 deletions runner/gen_openapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from app.main import app
from app.routes import (
audio_to_text,
hardware,
health,
image_to_image,
image_to_text,
Expand Down Expand Up @@ -47,7 +48,6 @@ def translate_to_gateway(openapi: dict) -> dict:
.. note::
Differences between 'runner' and 'gateway' entrypoints:
- 'health' endpoint is removed.
- 'model_id' is enforced in all endpoints.
- 'metadata' property is removed from all schemas.
- 'VideoResponse' schema is updated to match the Gateway's transcoded mp4
Expand All @@ -59,11 +59,8 @@ def translate_to_gateway(openapi: dict) -> dict:
Returns:
The translated OpenAPI schema.
"""
# Remove 'health' related endpoints and schemas.
openapi["paths"].pop("/health")
openapi["components"]["schemas"].pop("HealthCheck")

# Enforce 'model_id' in all endpoints
logger.debug("Enforcing 'model_id' in all endpoints...")
for _, methods in openapi["paths"].items():
for _, details in methods.items():
if "requestBody" in details:
Expand All @@ -86,6 +83,7 @@ def translate_to_gateway(openapi: dict) -> dict:
# Update the 'VideoResponse' schema to match the Gateway's response.
# NOTE: This is necessary because the Gateway transcodes the runner's response and
# returns an mp4 file.
logger.debug("Updating 'VideoResponse' schema...")
openapi["components"]["schemas"]["VideoResponse"] = copy.deepcopy(
openapi["components"]["schemas"]["ImageResponse"]
)
Expand All @@ -103,7 +101,9 @@ def write_openapi(fname: str, entrypoint: str = "runner"):
entrypoint: The entrypoint to generate the OpenAPI schema for, either
'gateway' or 'runner'. Default is 'runner'.
"""
app.include_router(health.router)
if entrypoint != "gateway":
app.include_router(health.router)
app.include_router(hardware.router)
app.include_router(text_to_image.router)
app.include_router(image_to_image.router)
app.include_router(image_to_video.router)
Expand Down Expand Up @@ -164,8 +164,8 @@ def write_openapi(fname: str, entrypoint: str = "runner"):
parser.add_argument(
"--entrypoint",
type=str,
choices=["runner", "gateway"],
default=["runner", "gateway"],
choices=["gateway","runner"],
default=["gateway","runner"],
nargs="+",
help=(
"The entrypoint to generate the OpenAPI schema for, options are 'runner' "
Expand All @@ -176,5 +176,6 @@ def write_openapi(fname: str, entrypoint: str = "runner"):

# Generate orchestrator and Gateway facing OpenAPI schemas.
logger.info("Generating OpenAPI schema.")
for entrypoint in args.entrypoint:
entrypoints = sorted(args.entrypoint, key=lambda x: x != "gateway")
for entrypoint in entrypoints:
write_openapi(f"openapi.{args.type.lower()}", entrypoint=entrypoint)
Loading

0 comments on commit 28e0095

Please sign in to comment.