
Commit 5edd2a2

Monitor GPU SM utilization (#35)
* feat: Add monitoring of GPU SM processes
* feat: handle error in case no process is active
* feat: update version to 0.3.0 and enhance GPU monitoring with SM and memory utilization metrics

Co-authored-by: Siddhant Sadangi <[email protected]>
Parent: 396e9c5 · Commit: 5edd2a2

File tree

2 files changed (+39, -2 lines)


utils/monitoring_tools/hardware_monitoring/README.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -8,6 +8,10 @@ An extensible Python module for logging system and process hardware metrics (CPU
 
 ## Changelog
 
+**v0.3.0** (2025-10-28)
+
+- Added support for GPU SM utilization and memory utilization.
+
 **v0.2.2** (2025-10-01)
 
 - Handled error when using DDP on non-zero ranks.
```
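
The new changelog entry corresponds to two additional per-GPU series, whose key layout is defined by the Python change below. As a purely illustrative sketch (the `prefix` value and GPU index here are placeholders, not values taken from this commit), the new keys look like:

```python
# Placeholder values for illustration only; the actual namespace prefix and
# GPU index are determined at runtime by the monitoring module.
prefix, i = "system", 0
new_metric_keys = [
    f"{prefix}/gpu/{i}/sm_utilization_percent",
    f"{prefix}/gpu/{i}/sm_memory_utilization_percent",
]
print(new_metric_keys)
```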

utils/monitoring_tools/hardware_monitoring/neptune_hardware_monitoring.py

Lines changed: 35 additions & 2 deletions
```diff
@@ -13,7 +13,7 @@
 
 logger = get_logger()
 
-__version__ = "0.2.2"
+__version__ = "0.3.0"
 
 try:
     import torch
@@ -69,7 +69,8 @@ def __init__(
         else:
             _fork_step = self.run._fork_step
         self._monitoring_step = _fork_step + 1 if _fork_step is not None else 0
-
+        # Last time stamp at which GPU SM process information was retrieved
+        self._last_process_time_stamp = 0
         self.hostname = socket.gethostname()
 
         # Prime psutil.cpu_percent to avoid initial 0.0 reading
@@ -270,6 +271,38 @@ def _collect_gpu_metrics(self, metrics: Dict[str, Any], prefix: str) -> None:
                 metrics[f"{prefix}/gpu/{i}/power_usage_watts"] = power
             except Exception as e:
                 logger.warning(f"Error getting power usage for GPU {i} on {self.hostname}: {e}")
+            # SM process utilization
+            try:
+                # SM utilization is returned as a list of samples, one per process
+                # active in the interval since the last seen time stamp.
+                # For more details, see
+                # https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1gb0ea5236f5e69e63bf53684a11c233bd
+                sm_utilization_samples: list[pynvml.c_nvmlProcessUtilizationSample_t] = (
+                    pynvml.nvmlDeviceGetProcessUtilization(handle, self._last_process_time_stamp)
+                )
+            except pynvml.nvmlExceptionClass(pynvml.NVML_ERROR_NOT_FOUND):
+                # NVML_ERROR_NOT_FOUND is returned when no valid sample entries exist since
+                # the last seen time stamp, i.e. no process was active between two calls.
+                sm_util = 0
+                mem_util = 0
+            except Exception as e:
+                logger.warning(
+                    f"Error getting process utilization for GPU {i} on {self.hostname}: {e}"
+                )
+                sm_util = None
+                mem_util = None
+            else:
+                # We assume process utilization is reported as a percentage and can be summed;
+                # we expect only one process to be active at a time during training.
+                sm_util = sum(sample.smUtil for sample in sm_utilization_samples)
+                mem_util = sum(sample.memUtil for sample in sm_utilization_samples)
+                self._last_process_time_stamp = max(
+                    sample.timeStamp for sample in sm_utilization_samples
+                )
+            finally:
+                if sm_util is not None and mem_util is not None:
+                    metrics[f"{prefix}/gpu/{i}/sm_utilization_percent"] = sm_util
+                    metrics[f"{prefix}/gpu/{i}/sm_memory_utilization_percent"] = mem_util
 
     def _collect_process_metrics(self, metrics: Dict[str, Any], prefix: str) -> None:
         """
```
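For context, here is a minimal standalone sketch of the NVML pattern the new code relies on: query the per-process utilization samples accumulated since the last seen time stamp, sum the per-process `smUtil` and `memUtil` values, and carry the newest time stamp forward for the next call. The device index, loop count, and one-second polling interval below are illustrative choices, not values from this commit.

```python
import time

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # illustrative: first GPU only
last_time_stamp = 0

try:
    for _ in range(5):  # illustrative: five one-second polls
        try:
            # One sample is returned per process active since last_time_stamp
            samples = pynvml.nvmlDeviceGetProcessUtilization(handle, last_time_stamp)
        except pynvml.nvmlExceptionClass(pynvml.NVML_ERROR_NOT_FOUND):
            # No process was active since the last call
            sm_util, mem_util = 0, 0
        else:
            sm_util = sum(s.smUtil for s in samples)
            mem_util = sum(s.memUtil for s in samples)
            last_time_stamp = max(s.timeStamp for s in samples)
        print(f"SM: {sm_util}%  memory: {mem_util}%")
        time.sleep(1)
finally:
    pynvml.nvmlShutdown()
```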
