@@ -13,7 +13,7 @@
 
 logger = get_logger()
 
-__version__ = "0.2.2"
+__version__ = "0.3.0"
 
 try:
     import torch
@@ -69,7 +69,8 @@ def __init__( |
         else:
             _fork_step = self.run._fork_step
         self._monitoring_step = _fork_step + 1 if _fork_step is not None else 0
-
+        # Last time stamp at which GPU SM process information was retrieved
+        self._last_process_time_stamp = 0
         self.hostname = socket.gethostname()
 
         # Prime psutil.cpu_percent to avoid initial 0.0 reading
@@ -270,6 +271,38 @@ def _collect_gpu_metrics(self, metrics: Dict[str, Any], prefix: str) -> None: |
                 metrics[f"{prefix}/gpu/{i}/power_usage_watts"] = power
             except Exception as e:
                 logger.warning(f"Error getting power usage for GPU {i} on {self.hostname}: {e}")
+            # SM process utilization
+            try:
+                # SM utilization is returned as a list of samples, one per process
+                # that was active in the interval since the given time stamp.
+                # For more details, see
+                # https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1gb0ea5236f5e69e63bf53684a11c233bd
+                sm_utilization_samples: list[pynvml.c_nvmlProcessUtilizationSample_t] = (
+                    pynvml.nvmlDeviceGetProcessUtilization(handle, self._last_process_time_stamp)
+                )
+            except pynvml.nvmlExceptionClass(pynvml.NVML_ERROR_NOT_FOUND):
+                # NVML_ERROR_NOT_FOUND is returned when no valid sample entries exist since
+                # the last seen time stamp; this is expected when no process was active.
+                sm_util = 0
+                mem_util = 0
+            except Exception as e:
+                logger.warning(
+                    f"Error getting process utilization for GPU {i} on {self.hostname}: {e}"
+                )
+                sm_util = None
+                mem_util = None
+            else:
+                # Per-process utilization is reported in percent, so samples can be
+                # summed; we expect a single active process at a time during training.
+                sm_util = sum(sample.smUtil for sample in sm_utilization_samples)
+                mem_util = sum(sample.memUtil for sample in sm_utilization_samples)
+                self._last_process_time_stamp = max(
+                    sample.timeStamp for sample in sm_utilization_samples
+                )
+            finally:
+                if sm_util is not None and mem_util is not None:
+                    metrics[f"{prefix}/gpu/{i}/sm_utilization_percent"] = sm_util
+                    metrics[f"{prefix}/gpu/{i}/sm_memory_utilization_percent"] = mem_util
 
     def _collect_process_metrics(self, metrics: Dict[str, Any], prefix: str) -> None:
         """
|
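For reference, the rolling-time-stamp polling pattern introduced in this diff can be exercised outside the monitor class. The following is a minimal standalone sketch, not part of the PR: it assumes pynvml (nvidia-ml-py) is installed with a working NVIDIA driver, and the device index, iteration count, and sleep interval are illustrative.

# Standalone sketch of the rolling-time-stamp polling pattern used above.
# Assumes pynvml (nvidia-ml-py) and an NVIDIA driver; all names here are
# illustrative, not part of the PR.
import time

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
last_seen = 0  # CPU time stamp in microseconds; 0 requests all buffered samples

try:
    for _ in range(5):
        try:
            samples = pynvml.nvmlDeviceGetProcessUtilization(handle, last_seen)
        except pynvml.nvmlExceptionClass(pynvml.NVML_ERROR_NOT_FOUND):
            # No process was active since `last_seen`; report zero utilization.
            print("sm=0% mem=0%")
        else:
            # One sample per active process; per-process percentages are summed.
            print(f"sm={sum(s.smUtil for s in samples)}% "
                  f"mem={sum(s.memUtil for s in samples)}%")
            # Advance the marker so the next call only sees newer samples.
            last_seen = max(s.timeStamp for s in samples)
        time.sleep(1.0)
finally:
    pynvml.nvmlShutdown()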