Skip to content

Commit

Permalink
Begin adding AMD support.
Browse files Browse the repository at this point in the history
  • Loading branch information
bethune-bryant committed Jul 29, 2024
1 parent 16bb82e commit 65ba474
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 6 deletions.
15 changes: 9 additions & 6 deletions gpustat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
from blessed import Terminal

from gpustat import util
from gpustat import nvml
from gpustat.nvml import pynvml as N
from gpustat.nvml import check_driver_nvml_version
from gpustat import rocml as nvml
from gpustat import rocml as N
from gpustat.rocml import check_driver_nvml_version

NOT_SUPPORTED = 'Not Supported'
MB = 1024 * 1024
Expand Down Expand Up @@ -555,6 +555,7 @@ def _wrapped(*args, **kwargs):
processes = []
nv_comp_processes = nv_comp_processes or []
nv_graphics_processes = nv_graphics_processes or []
print(nv_comp_processes)
# A single process might run in both of graphics and compute mode,
# However we will display the process only once
seen_pids = set()
Expand Down Expand Up @@ -608,10 +609,12 @@ def _wrapped(*args, **kwargs):
handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index)
gpu_info = get_gpu_info(handle)
gpu_stat = GPUStat(gpu_info)
except N.NVMLError_Unknown as e:
except Exception as e:
gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
except N.NVMLError_GpuIsLost as e:
gpu_stat = InvalidGPU(index, "((GPU is lost))", e)
#except N.NVMLError_Unknown as e:
# gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
#except N.NVMLError_GpuIsLost as e:
# gpu_stat = InvalidGPU(index, "((GPU is lost))", e)

if isinstance(gpu_stat, InvalidGPU):
log.add_exception("GPU %d" % index, gpu_stat.exception)
Expand Down
99 changes: 99 additions & 0 deletions gpustat/rocml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Imports pynvml with sanity checks and custom patches."""

# pylint: disable=protected-access

import atexit
import functools
import os
import sys
import textwrap
import warnings

from collections import namedtuple


from pyrsmi import rocml

# Default sensor selector for nvmlDeviceGetTemperature(); it is forwarded
# unchanged as the second argument of rocml.smi_get_device_temp().
# NOTE(review): pynvml defines NVML_TEMPERATURE_GPU as 0; presumably 1 was
# chosen here to pick a specific ROCm SMI sensor -- confirm.
NVML_TEMPERATURE_GPU = 1

def nvmlDeviceGetCount():
    """Return the device count reported by ``rocml.smi_get_device_count()``."""
    device_count = rocml.smi_get_device_count()
    return device_count


def nvmlDeviceGetHandleByIndex(dev):
    """Return ``dev`` unchanged: in this shim the device index is the handle."""
    handle = dev
    return handle

def nvmlDeviceGetIndex(dev):
    """Return ``dev`` unchanged: in this shim a handle is already the index."""
    index = dev
    return index

def nvmlDeviceGetName(dev):
    """Return the result of ``rocml.smi_get_device_name`` for device ``dev``."""
    device_name = rocml.smi_get_device_name(dev)
    return device_name

def nvmlDeviceGetUUID(dev):
    """Return the result of ``rocml.smi_get_device_uuid`` for device ``dev``."""
    device_uuid = rocml.smi_get_device_uuid(dev)
    return device_uuid

def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU):
    """Return ``rocml.smi_get_device_temp(dev, loc)``.

    ``loc`` is forwarded as-is and defaults to ``NVML_TEMPERATURE_GPU``.
    """
    temperature = rocml.smi_get_device_temp(dev, loc)
    return temperature

def nvmlSystemGetDriverVersion():
    """Return the value of ``rocml.smi_get_kernel_version()`` as the driver version."""
    kernel_version = rocml.smi_get_kernel_version()
    return kernel_version

def check_driver_nvml_version(driver_version_str: str):
    """Accept any driver version string; this shim performs no check.

    Always returns ``None``.
    """
    return None

def nvmlDeviceGetFanSpeed(dev):
    """Return ``None``; the fan speed query is currently disabled in this shim."""
    # Disabled call kept for reference: rocml.smi_get_device_fan_speed(dev)
    return None

# Record returned by nvmlDeviceGetMemoryInfo(); carries only the two fields
# this shim fills in.
MemoryInfo = namedtuple('MemoryInfo', ['total', 'used'])


def nvmlDeviceGetMemoryInfo(dev):
    """Return a ``MemoryInfo`` built from rocml's total/used memory queries."""
    total_memory = rocml.smi_get_device_memory_total(dev)
    used_memory = rocml.smi_get_device_memory_used(dev)
    return MemoryInfo(total=total_memory, used=used_memory)

# Record returned by nvmlDeviceGetUtilizationRates(); only ``gpu`` is provided.
UtilizationRates = namedtuple('UtilizationRates', ['gpu'])


def nvmlDeviceGetUtilizationRates(dev):
    """Return a ``UtilizationRates`` whose ``gpu`` is ``rocml.smi_get_device_utilization(dev)``."""
    gpu_utilization = rocml.smi_get_device_utilization(dev)
    return UtilizationRates(gpu=gpu_utilization)

def nvmlDeviceGetEncoderUtilization(dev):
    """Return ``None``; encoder utilization is not provided by this shim."""
    return None

def nvmlDeviceGetDecoderUtilization(dev):
    """Return ``None``; decoder utilization is not provided by this shim."""
    return None

def nvmlDeviceGetPowerUsage(dev):
    """Return ``None``; the power usage query is currently disabled in this shim."""
    # Disabled call kept for reference: rocml.smi_get_device_average_power(dev)
    return None

def nvmlDeviceGetEnforcedPowerLimit(dev):
    """Return ``None``; the enforced power limit is not provided by this shim."""
    return None

# Record for one compute process; only the ``pid`` field is populated.
ComputeProcess = namedtuple('ComputeProcess', ['pid'])


def nvmlDeviceGetComputeRunningProcesses(dev):
    """Return a list of ``ComputeProcess`` built from rocml's process listing.

    NOTE(review): ``dev`` is not passed to
    ``rocml.smi_get_device_compute_process()``, so the result does not vary
    with the device argument here -- confirm this is intended.
    """
    reported_pids = rocml.smi_get_device_compute_process()
    return [ComputeProcess(pid=pid) for pid in reported_pids]

def nvmlDeviceGetGraphicsRunningProcesses(dev):
    """Return ``None``; graphics processes are not provided by this shim."""
    return None

# Upon importing this module, initialize the ROCm SMI library through rocml
# and keep it active for the lifespan of the Python process (until gpustat
# exits). A failure is recorded rather than raised, so that importing the
# module never crashes; ensure_initialized() re-raises it on demand.
_initialized: bool
_init_error = None
try:
    rocml.smi_initialize()
    _initialized = True

    def _shutdown():
        """Shut the ROCm SMI library down when the interpreter exits."""
        rocml.smi_shutdown()
    atexit.register(_shutdown)

except Exception as exc:  # pylint: disable=broad-except
    # The handler used to name `pynvml.NVMLError`, but `pynvml` is never
    # imported in this module, so any initialization failure surfaced as a
    # NameError at import time and `_init_error` was never populated.
    # Catching Exception is deliberate at this boundary: the error is stored
    # here and re-raised later by ensure_initialized().
    _initialized = False
    _init_error = exc


def ensure_initialized():
    """Raise the error recorded at import time if initialization failed.

    Returns ``None`` when the module-level ``_initialized`` flag is truthy.
    """
    if _initialized:
        return
    raise _init_error  # type: ignore



0 comments on commit 65ba474

Please sign in to comment.