def monitor_gpu_vram_memory()

in optimum_benchmark/trackers/memory.py


from multiprocessing.connection import Connection
from typing import List

import psutil

# LOGGER, MEMORY_CONSUMPTION_SAMPLING_RATE, pynvml, amdsmi and the
# is_nvidia_system / is_rocm_system / is_pynvml_available / is_amdsmi_available
# helpers are expected to be available at module level in
# optimum_benchmark/trackers/memory.py.


def monitor_gpu_vram_memory(monitored_pid: int, device_ids: List[int], connection: Connection):
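    """Sample the VRAM usage of `monitored_pid` (and its child processes) on the GPUs listed in
    `device_ids`, and report the peak global and per-process usage back to the parent process
    over `connection`, in megabytes."""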
    stop = False
    max_used_global_memory = 0
    max_used_process_memory = 0
    monitored_process = psutil.Process(monitored_pid)

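    # Signal to the parent that the monitor is up; exit quietly if the pipe is already closed.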
    if monitored_process.is_running():
        try:
            connection.send(0)
        except Exception:
            exit(0)

    if is_nvidia_system():
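        # NVIDIA: read per-device memory info and per-process compute memory through pynvml.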
        if not is_pynvml_available():
            raise ValueError(
                "The pynvml library is required to run memory benchmarks on NVIDIA GPUs, but it is not installed. "
                "Please install the official, NVIDIA-maintained PyNVML library with `pip install nvidia-ml-py`."
            )

        pynvml.nvmlInit()
        devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids]

        while monitored_process.is_running() and not stop:
            used_global_memory = 0
            used_process_memory = 0

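            # Attribute per-process memory to the monitored process and all of its current children.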
            monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]

            for device_id, device_handle in zip(device_ids, devices_handles):
                try:
                    device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
                except Exception as e:
                    LOGGER.warning(f"Could not get process list for device {device_id}: {e}.")
                    continue

                for device_process in device_processes:
                    if device_process.pid in monitored_pids:
                        used_process_memory += device_process.usedGpuMemory

                try:
                    device_memory = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
                except Exception as e:
                    LOGGER.warning(f"Could not get memory info for device {device_id}: {e}.")
                    continue

                used_global_memory += device_memory.used

            max_used_global_memory = max(max_used_global_memory, used_global_memory)
            max_used_process_memory = max(max_used_process_memory, used_process_memory)
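            # poll() doubles as the sampling interval and as the stop signal from the parent.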
            stop = connection.poll(MEMORY_CONSUMPTION_SAMPLING_RATE)

        pynvml.nvmlShutdown()

    elif is_rocm_system():
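        # AMD ROCm: read per-device and per-process VRAM usage through the amdsmi bindings.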
        if not is_amdsmi_available():
            raise ValueError(
                "The AMD SMI library is required to track process-specific VRAM usage on AMD GPUs, but it is not installed. "
                "Please install the official, AMD-maintained AMD SMI library from https://github.com/ROCm/amdsmi."
            )

        amdsmi.amdsmi_init()
        permission_denied = False
        devices_handles = amdsmi.amdsmi_get_processor_handles()

        while monitored_process.is_running() and not stop:
            used_global_memory = 0
            used_process_memory = 0

            monitored_pids = [monitored_pid] + [child.pid for child in monitored_process.children(recursive=True)]

            for device_id in device_ids:
                device_handle = devices_handles[device_id]

                try:
                    used_global_memory += amdsmi.amdsmi_get_gpu_memory_usage(
                        device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM
                    )
                except Exception as e:
                    LOGGER.warning(f"Could not get memory usage for device {device_id}: {e}")

                if permission_denied:
                    continue

                try:
                    processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle)
                except Exception as e:
                    LOGGER.warning(f"Could not get process list for device {device_id}: {e}")
                    permission_denied = "Permission Denied" in str(e)
                    continue

                for process_handle in processes_handles:
                    try:
                        gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle)
                    except Exception as e:
                        LOGGER.warning(f"Could not get process info for process {process_handle}: {e}")
                        permission_denied = "Permission Denied" in str(e)
                        continue

                    if gpu_process_info["pid"] in monitored_pids:
                        used_process_memory += gpu_process_info["memory_usage"]["vram_mem"]

            max_used_global_memory = max(max_used_global_memory, used_global_memory)
            max_used_process_memory = max(max_used_process_memory, used_process_memory)
            stop = connection.poll(MEMORY_CONSUMPTION_SAMPLING_RATE)

        amdsmi.amdsmi_shut_down()

    else:
        raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for VRAM tracking.")

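    # Report the peak values back to the parent; exit quietly if the pipe is already closed.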
    if monitored_process.is_running():
        try:
            connection.send(max_used_global_memory / 1e6)  # convert to MB
            connection.send(max_used_process_memory / 1e6)  # convert to MB
        except Exception:
            exit(0)

    connection.close()
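
This function is designed to run in a dedicated monitoring process connected to the benchmark
process through a `multiprocessing.Pipe`, which is how the surrounding tracker code in memory.py
is expected to use it. A minimal standalone driver sketch, assuming a single supported GPU with
index 0; the variable names below are illustrative and not taken from memory.py:

import os
from multiprocessing import Pipe, Process

from optimum_benchmark.trackers.memory import monitor_gpu_vram_memory

if __name__ == "__main__":
    parent_connection, child_connection = Pipe()
    monitor_process = Process(
        target=monitor_gpu_vram_memory,
        # track this process (and its children) on GPU 0
        args=(os.getpid(), [0], child_connection),
        daemon=True,
    )
    monitor_process.start()
    parent_connection.recv()  # wait for the readiness signal (the initial `connection.send(0)`)

    # ... run the workload whose VRAM usage should be measured ...

    parent_connection.send(0)  # any message makes `connection.poll()` return True and end the sampling loop
    max_global_vram_mb = parent_connection.recv()   # peak VRAM used on the tracked device(s), in MB
    max_process_vram_mb = parent_connection.recv()  # peak VRAM attributed to the monitored PIDs, in MB
    monitor_process.join()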