in optimum_benchmark/trackers/memory.py
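# The function below relies on module-level names defined elsewhere in optimum-benchmark
# (the is_nvidia_system / is_rocm_system / is_pynvml_available / is_amdsmi_available helpers
# and the optional pynvml / amdsmi bindings). A minimal sketch of the remaining names this
# excerpt assumes in scope; the logger name and sampling rate below are assumptions, not
# the library's actual values.
from logging import getLogger
from multiprocessing.connection import Connection
from typing import List

import psutil

LOGGER = getLogger("memory")
MEMORY_CONSUMPTION_SAMPLING_RATE = 0.01  # seconds between VRAM samples (assumed value)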
def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connection: Connection):
    stop = False
    max_used_global_memory = 0
    max_used_process_memory = 0
    monitored_process = psutil.Process(monitored_pid)

    # signal the parent process that the monitor is ready
    if monitored_process.is_running():
        try:
            connection.send(0)
        except Exception:
            exit(0)

    if is_nvidia_system():
        if not is_pynvml_available():
            raise ValueError(
                "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. "
                "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
            )

        pynvml.nvmlInit()
        devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids]

        # sample until the monitored process exits or the parent sends a stop signal
        while monitored_process.is_running() and not stop:
            used_global_memory = 0
            used_process_memory = 0

            # track the monitored process and all of its children
            monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]

            for device_id, device_handle in zip(device_ids, devices_handles):
                try:
                    device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
                except Exception as e:
                    LOGGER.warning(f"Could not get process list for device {device_id}: {e}.")
                    continue

                for device_process in device_processes:
                    if device_process.pid in monitored_pids:
                        used_process_memory += device_process.usedGpuMemory

                try:
                    device_memory = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
                except Exception as e:
                    LOGGER.warning(f"Could not get memory info for device {device_id}: {e}.")
                    continue

                used_global_memory += device_memory.used

            max_used_global_memory = max(max_used_global_memory, used_global_memory)
            max_used_process_memory = max(max_used_process_memory, used_process_memory)
            stop = connection.poll(MEMORY_CONSUMPTION_SAMPLING_RATE)

        pynvml.nvmlShutdown()
    elif is_rocm_system():
        if not is_amdsmi_available():
            raise ValueError(
                "The library AMD SMI is required to track process-specific memory benchmark on AMD GPUs, but is not installed. "
                "Please install the official and AMD maintained AMD SMI library from https://github.com/ROCm/amdsmi."
            )

        amdsmi.amdsmi_init()
        permission_denied = False
        devices_handles = amdsmi.amdsmi_get_processor_handles()

        while monitored_process.is_running() and not stop:
            used_global_memory = 0
            used_process_memory = 0

            # track the monitored process and all of its children
            monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]

            for device_id in device_ids:
                device_handle = devices_handles[device_id]
                try:
                    used_global_memory += amdsmi.amdsmi_get_gpu_memory_total(
                        device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM
                    )
                except Exception as e:
                    LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")

                # process-level queries need elevated permissions; skip them once denied
                if permission_denied:
                    continue

                try:
                    processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
                except Exception as e:
                    LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
                    permission_denied = "Permission Denied" in str(e)
                    continue

                for process_handle in processes_handles:
                    try:
                        gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
                    except Exception as e:
                        LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
                        permission_denied = "Permission Denied" in str(e)
                        continue

                    if gpu_process_info["pid"] in monitored_pids:
                        used_process_memory += gpu_process_info["memory_usage"]["vram_mem"]

            max_used_global_memory = max(max_used_global_memory, used_global_memory)
            max_used_process_memory = max(max_used_process_memory, used_process_memory)
            stop = connection.poll(MEMORY_CONSUMPTION_SAMPLING_RATE)

        amdsmi.amdsmi_shut_down()
    else:
        raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for VRAM tracking.")

    if monitored_process.is_running():
        try:
            connection.send(max_used_global_memory / 1e6)  # convert to MB
            connection.send(max_used_process_memory / 1e6)  # convert to MB
        except Exception:
            exit(0)

    connection.close()
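

# --- usage sketch (illustrative only, not part of optimum-benchmark) -----------------------
# The worker above follows a simple Pipe protocol: it sends `0` once it is ready, samples
# until the parent sends anything (connection.poll() returns True), then sends the peak
# global and per-process VRAM usage in MB. A parent process might drive it as below;
# `device_ids=[0]` and the sleep standing in for the measured workload are assumptions.
if __name__ == "__main__":
    import os
    import time
    from multiprocessing import Pipe, Process

    parent_connection, child_connection = Pipe()
    monitor = Process(
        target=monitor_gpu_vram_memory,
        args=(os.getpid(), [0], child_connection),
        daemon=True,
    )
    monitor.start()

    parent_connection.recv()  # wait for the readiness signal (the `0` sent above)
    time.sleep(5)  # stand-in for the workload being measured
    parent_connection.send(True)  # any message makes connection.poll() return True -> stop

    max_global_vram_mb = parent_connection.recv()
    max_process_vram_mb = parent_connection.recv()
    monitor.join()

    print(f"max global VRAM: {max_global_vram_mb:.2f} MB, max process VRAM: {max_process_vram_mb:.2f} MB")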