-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add hardware info reporting (#273)
feat: add hardware info reporting (#273) This commit adds routes for `/hardware/info` and `/hardware/stats` that provide the GPUs for a pipeline/model_id and some basic information from torch.cuda. --------- Co-authored-by: Rick Staa <[email protected]>
- Loading branch information
1 parent
ee40bd6
commit 28e0095
Showing
10 changed files
with
787 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import os | ||
from typing import Dict | ||
|
||
from app.utils.hardware import ( | ||
GpuComputeInfo, | ||
GpuUtilizationInfo, | ||
get_gpu_info, | ||
get_gpu_stats, | ||
) | ||
from fastapi import APIRouter | ||
from pydantic import BaseModel | ||
|
||
router = APIRouter() | ||
|
||
|
||
class HardwareInformation(BaseModel):
    """Response model for GPU information."""

    # Name of the running pipeline (read from the PIPELINE env var by the route).
    pipeline: str
    # Identifier of the loaded model (read from the MODEL_ID env var by the route).
    model_id: str
    # Static compute info per device, keyed by GPU index.
    gpu_info: Dict[int, GpuComputeInfo]
|
||
|
||
class HardwareStats(BaseModel):
    """Response model for real-time GPU statistics."""

    # Name of the running pipeline (read from the PIPELINE env var by the route).
    pipeline: str
    # Identifier of the loaded model (read from the MODEL_ID env var by the route).
    model_id: str
    # Live utilization stats per device, keyed by GPU index.
    gpu_stats: Dict[int, GpuUtilizationInfo]
|
||
|
||
@router.get(
    "/hardware/info",
    operation_id="hardware_info",
    response_model=HardwareInformation,
)
@router.get(
    "/hardware/info/",
    response_model=HardwareInformation,
    include_in_schema=False,
)
async def hardware_info():
    """Return static GPU compute information for the configured pipeline/model.

    Reads PIPELINE and MODEL_ID from the environment (raises KeyError → 500 if
    either is unset) and attaches the per-device compute info from pynvml.
    """
    pipeline = os.environ["PIPELINE"]
    model_id = os.environ["MODEL_ID"]
    return HardwareInformation(
        pipeline=pipeline,
        model_id=model_id,
        gpu_info=get_gpu_info(),
    )
|
||
|
||
@router.get(
    "/hardware/stats",
    operation_id="hardware_stats",
    response_model=HardwareStats,
)
@router.get(
    "/hardware/stats/",
    response_model=HardwareStats,
    include_in_schema=False,
)
async def hardware_stats():
    """Return real-time GPU utilization statistics for the configured pipeline/model.

    Reads PIPELINE and MODEL_ID from the environment (raises KeyError → 500 if
    either is unset) and attaches the per-device utilization stats from pynvml.
    """
    pipeline = os.environ["PIPELINE"]
    model_id = os.environ["MODEL_ID"]
    return HardwareStats(
        pipeline=pipeline,
        model_id=model_id,
        gpu_stats=get_gpu_stats(),
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
"""Contains utility functions for hardware information.""" | ||
|
||
from typing import Dict | ||
from pydantic import BaseModel | ||
import logging | ||
import pynvml | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class GpuBaseInfo(BaseModel):
    """Model for general GPU information."""

    # Device UUID string as reported by pynvml.nvmlDeviceGetUUID.
    id: str
    # Human-readable device name from pynvml.nvmlDeviceGetName.
    name: str
    # Total device memory in bytes (nvmlDeviceGetMemoryInfo.total).
    memory_total: int
    # Free device memory in bytes (nvmlDeviceGetMemoryInfo.free).
    memory_free: int
|
||
|
||
class GpuComputeInfo(GpuBaseInfo):
    """Model for detailed GPU compute information."""

    # CUDA compute capability major version (nvmlDeviceGetCudaComputeCapability).
    major: int
    # CUDA compute capability minor version.
    minor: int
|
||
|
||
class GpuUtilizationInfo(GpuBaseInfo):
    """Model for real-time GPU utilization statistics."""

    # GPU (compute) utilization percentage (nvmlDeviceGetUtilizationRates.gpu).
    utilization_compute: int
    # Memory-bandwidth utilization percentage (nvmlDeviceGetUtilizationRates.memory).
    utilization_memory: int
|
||
|
||
class GpuInfo(GpuComputeInfo, GpuUtilizationInfo):
    """Model for full CUDA device information.

    Combines the compute-capability fields with the utilization fields;
    defines no fields of its own.
    """

    pass
|
||
|
||
def retrieve_cuda_info() -> Dict[int, GpuInfo]:
    """Retrieve CUDA device information.

    Queries pynvml for every visible device and bundles identity, memory,
    compute capability, and utilization into one record per device.

    Returns:
        Mapping of device index to its full ``GpuInfo`` record.
    """
    info_by_index: Dict[int, GpuInfo] = {}
    device_count = pynvml.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        capability = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        info_by_index[index] = GpuInfo(
            id=pynvml.nvmlDeviceGetUUID(handle),
            name=pynvml.nvmlDeviceGetName(handle),
            memory_total=memory.total,
            memory_free=memory.free,
            major=capability[0],
            minor=capability[1],
            utilization_compute=utilization.gpu,
            utilization_memory=utilization.memory,
        )
    return info_by_index
|
||
|
||
def get_gpu_info() -> Dict[int, GpuComputeInfo]:
    """Get detailed GPU compute information.

    Narrows the full per-device records down to the static compute fields
    (identity, memory, compute capability).

    Returns:
        Mapping of device index to its ``GpuComputeInfo``.
    """
    compute_info: Dict[int, GpuComputeInfo] = {}
    for index, device in retrieve_cuda_info().items():
        compute_info[index] = GpuComputeInfo(
            id=device.id,
            name=device.name,
            memory_total=device.memory_total,
            memory_free=device.memory_free,
            major=device.major,
            minor=device.minor,
        )
    return compute_info
|
||
|
||
def get_gpu_stats() -> Dict[int, GpuUtilizationInfo]:
    """Get real-time GPU utilization statistics.

    Narrows the full per-device records down to the live utilization fields
    (identity, memory, compute/memory utilization percentages).

    Returns:
        Mapping of device index to its ``GpuUtilizationInfo``.
    """
    stats: Dict[int, GpuUtilizationInfo] = {}
    for index, device in retrieve_cuda_info().items():
        stats[index] = GpuUtilizationInfo(
            id=device.id,
            name=device.name,
            memory_total=device.memory_total,
            memory_free=device.memory_free,
            utilization_compute=device.utilization_compute,
            utilization_memory=device.utilization_memory,
        )
    return stats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
""" | ||
This module manages NVML (NVIDIA Management Library) initialization and shutdown, | ||
ensuring efficient resource management and improved performance for GPU operations. | ||
""" | ||
import pynvml | ||
import logging | ||
import atexit | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
class NVMLManager:
    """A class to manage NVML initialization and shutdown.

    Initialization is deferred to an explicit ``initialize()`` call; shutdown
    is registered with ``atexit`` so NVML is released at interpreter exit.
    Both operations are idempotent and log (rather than raise) on NVML errors.
    """

    def __init__(self):
        # Not initialized until initialize() succeeds.
        self._initialized = False
        atexit.register(self.shutdown)

    def initialize(self):
        """Initialize NVML; a no-op when already initialized."""
        if self._initialized:
            return
        try:
            pynvml.nvmlInit()
            self._initialized = True
            logger.info("NVML initialized successfully.")
        except pynvml.NVMLError as e:
            # Best-effort: leave _initialized False so a retry is possible.
            logger.error(f"Failed to initialize NVML: {e}")

    def shutdown(self):
        """Shutdown NVML; a no-op when not initialized."""
        if not self._initialized:
            return
        try:
            pynvml.nvmlShutdown()
            self._initialized = False
            logger.info("NVML shutdown successfully.")
        except pynvml.NVMLError as e:
            logger.error(f"Failed to shutdown NVML: {e}")


# Module-level singleton shared by the application.
nvml_manager = NVMLManager()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.