optimum_benchmark/launchers/device_isolation_utils.py (126 lines of code) (raw):
import os
import signal
import sys
import time
from logging import getLogger
from typing import Set
from ..import_utils import is_amdsmi_available, is_psutil_available, is_pynvml_available
from ..logging_utils import setup_logging
from ..system_utils import is_nvidia_system, is_rocm_system
if is_psutil_available():
import psutil
if is_pynvml_available():
import pynvml
if is_amdsmi_available():
import amdsmi # type: ignore
LOGGER = getLogger("device-isolation")
class DeviceIsolationError(Exception):
pass
def isolation_error_signal_handler(signum, frame):
raise DeviceIsolationError("Received an error signal from the device isolation process")
if sys.platform == "linux":
signal.signal(signal.SIGUSR1, isolation_error_signal_handler)
def get_nvidia_devices_pids(device_ids: str) -> Set[int]:
if not is_pynvml_available():
raise ValueError(
"The library pynvml is required to get the pids running on NVIDIA GPUs, but is not installed. "
"Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
)
pynvml.nvmlInit()
devices_pids = set()
devices_ids = list(map(int, device_ids.split(",")))
for device_id in devices_ids:
device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
for device_process in device_processes:
devices_pids.add(device_process.pid)
pynvml.nvmlShutdown()
return devices_pids
def get_amd_devices_pids(device_ids: str) -> Set[int]:
if not is_amdsmi_available():
raise ValueError(
"The library amdsmi is required to get the pids running on AMD GPUs, but is not installed. "
"Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi."
)
amdsmi.amdsmi_init()
permission_denied = False
devices_pids = set()
devices_ids = list(map(int, device_ids.split(",")))
processor_handles = amdsmi.amdsmi_get_processor_handles()
for device_id in devices_ids:
processor_handle = processor_handles[device_id]
if permission_denied:
continue
try:
# these functions fail a lot for no apparent reason
processes_handles = amdsmi.amdsmi_get_gpu_process_list(processor_handle)
except Exception as e:
permission_denied = "Permission denied" in str(e)
continue
for process_handle in processes_handles:
try:
# these functions fail a lot for no apparent reason
info = amdsmi.amdsmi_get_gpu_process_info(processor_handle, process_handle)
except Exception as e:
permission_denied = "Permission denied" in str(e)
continue
if info["memory_usage"]["vram_mem"] == 4096:
# not sure why these processes are always present
continue
devices_pids.add(info["pid"])
amdsmi.amdsmi_shut_down()
return devices_pids
def get_pids_running_on_system_devices(device_ids: str) -> Set[int]:
"""Returns the set of pids running on the system device(s)."""
if is_nvidia_system():
devices_pids = get_nvidia_devices_pids(device_ids)
elif is_rocm_system():
devices_pids = get_amd_devices_pids(device_ids)
else:
raise ValueError("get_pids_running_on_system_device is only supported on NVIDIA and AMD GPUs")
return devices_pids
def get_children_pids(pid: int) -> Set[int]:
"""Returns the set of pids of the children of the given process."""
if not is_psutil_available():
raise ValueError(
"The library psutil is required to get the children pids of a process, but is not installed. "
"Please install the official and cross-platform psutil library through `pip install psutil`."
)
if not psutil.pid_exists(pid):
LOGGER.warn(f"Process with pid [{pid}] does not exist.")
return set()
process = psutil.Process(pid)
children = process.children(recursive=True)
children_pids = {child.pid for child in children}
return children_pids
def assert_device_isolation(pid: int, device_ids: str, action: str):
log_level = os.environ.get("LOG_LEVEL", "INFO")
log_to_file = os.environ.get("LOG_TO_FILE", "1") == "1"
setup_logging(log_level, to_file=log_to_file, prefix="DEVICE-ISOLATION-PROCESS")
device_isolation_pid = os.getpid()
permitted_parent_pids = {pid, device_isolation_pid}
while any(psutil.pid_exists(p) for p in permitted_parent_pids):
device_pids = get_pids_running_on_system_devices(device_ids=device_ids)
device_pids = {p for p in device_pids if psutil.pid_exists(p)}
permitted_children_pids = set()
for pid in permitted_parent_pids:
permitted_children_pids |= get_children_pids(pid)
permitted_pids = permitted_parent_pids | permitted_children_pids
permitted_pids = {p for p in permitted_pids if psutil.pid_exists(p)}
non_permitted_pids = device_pids - permitted_pids
if len(non_permitted_pids) > 0:
LOGGER.warn(
f"Found process(es) [{non_permitted_pids}] running on device(s) [{device_ids}], "
f"other than the isolated process [{pid}], the device isolation process [{device_isolation_pid}] "
f"and their children [{permitted_children_pids}]."
)
if action == "warn":
LOGGER.warn("Make sure no other process is running on the device(s) while benchmarking.")
elif action == "error":
LOGGER.error("Signaling the isolated process to error out.")
if sys.platform == "linux":
os.kill(pid, signal.SIGUSR1)
else:
LOGGER.error("Sending an error signal is only supported on Linux. Killing the isolated process.")
os.kill(pid, signal.SIGKILL)
elif action == "kill":
LOGGER.error("Killing the isolated process.")
os.kill(pid, signal.SIGKILL)
LOGGER.warn("Exiting device isolation process.")
exit(0)
time.sleep(1)