optimum_benchmark/system_utils.py (174 lines of code) (raw):

import functools
import os
import platform
import re
import subprocess
from typing import List, Optional

import psutil

from .import_utils import is_amdsmi_available, is_pynvml_available, is_pyrsmi_available


# Network related stuff
def get_socket_ifname() -> Optional[str]:
    for interface in psutil.net_if_addrs():
        if interface.startswith("e"):
            return interface

    return None


## CPU related stuff
def get_cpu() -> Optional[str]:
    if platform.system() == "Windows":
        return platform.processor()

    elif platform.system() == "Darwin":
        command = "sysctl -n machdep.cpu.brand_string"
        return str(subprocess.check_output(command, shell=True).decode().strip())

    elif platform.system() == "Linux":
        command = "cat /proc/cpuinfo"
        all_info = subprocess.check_output(command, shell=True).decode().strip()
        for line in all_info.split("\n"):
            if "model name" in line:
                return re.sub(".*model name.*:", "", line, count=1)
        return "Could not find device name"

    else:
        raise ValueError(f"Unknown system '{platform.system()}'")


def get_cpu_ram_mb():
    return psutil.virtual_memory().total / 1e6


## GPU related stuff
@functools.lru_cache(maxsize=1)
def is_nvidia_system():
    return subprocess.call("nvidia-smi", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) == 0


@functools.lru_cache(maxsize=1)
def is_rocm_system():
    return subprocess.call("rocm-smi", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) == 0


if is_nvidia_system() and is_pynvml_available():
    import pynvml

if is_rocm_system() and is_amdsmi_available():
    import amdsmi  # type: ignore

if is_rocm_system() and is_pyrsmi_available():
    from pyrsmi import rocml


def get_rocm_version():
    for folder in os.listdir("/opt/"):
        if "rocm" in folder and "rocm" != folder:
            return folder.split("-")[-1]

    raise ValueError("Could not find ROCm version.")


def get_gpus():
    if is_nvidia_system():
        if not is_pynvml_available():
            raise ValueError(
                "The library PyNVML is required to get available GPUs, but is not installed. "
                "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
            )

        gpus = []
        pynvml.nvmlInit()
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu = pynvml.nvmlDeviceGetName(handle)
            # Older pynvml versions may return bytes instead of str
            gpu = gpu.decode("utf-8") if isinstance(gpu, bytes) else gpu
            gpus.append(gpu)
        pynvml.nvmlShutdown()

    elif is_rocm_system():
        if not is_amdsmi_available() and not is_pyrsmi_available():
            raise ValueError(
                "Either the library AMD SMI or PyRSMI is required to get available GPUs, but neither is installed. "
                "Please install the official and AMD maintained AMD SMI library from https://github.com/ROCm/amdsmi "
                "or PyRSMI library from https://github.com/ROCm/pyrsmi."
            )

        gpus = []

        if is_amdsmi_available():
            amdsmi.amdsmi_init()
            for processor_handles in amdsmi.amdsmi_get_processor_handles():
                gpus.append(amdsmi.amdsmi_get_gpu_vendor_name(processor_handles))
            amdsmi.amdsmi_shut_down()

        elif is_pyrsmi_available():
            rocml.smi_initialize()
            for i in range(rocml.smi_get_device_count()):
                gpus.append(rocml.smi_get_device_name(i))
            rocml.smi_shutdown()

    else:
        raise ValueError("No NVIDIA or ROCm GPUs found.")

    return gpus


def get_gpu_vram_mb() -> int:
    if is_nvidia_system():
        if not is_pynvml_available():
            raise ValueError(
                "The library PyNVML is required to get GPU VRAM, but is not installed. "
                "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
            )

        pynvml.nvmlInit()
        vrams = [
            pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(i)).total
            for i in range(pynvml.nvmlDeviceGetCount())
        ]
        pynvml.nvmlShutdown()

    elif is_rocm_system():
        if not is_amdsmi_available() and not is_pyrsmi_available():
            raise ValueError(
                "Either the library AMD SMI or PyRSMI is required to get GPU VRAM, but neither is installed. "
                "Please install the official and AMD maintained AMD SMI library from https://github.com/ROCm/amdsmi "
                "or PyRSMI library from https://github.com/ROCm/pyrsmi."
            )

        if is_amdsmi_available():
            amdsmi.amdsmi_init()
            vrams = [
                amdsmi.amdsmi_get_gpu_memory_total(processor_handles, mem_type=amdsmi.AmdSmiMemoryType.VRAM)
                for processor_handles in amdsmi.amdsmi_get_processor_handles()
            ]
            amdsmi.amdsmi_shut_down()

        elif is_pyrsmi_available():
            rocml.smi_initialize()
            vrams = [rocml.smi_get_device_memory_total(i) for i in range(rocml.smi_get_device_count())]
            rocml.smi_shutdown()

    else:
        raise ValueError("No NVIDIA or ROCm GPUs found.")

    return sum(vrams)


def get_gpu_device_ids() -> str:
    if is_nvidia_system():
        if os.environ.get("NVIDIA_VISIBLE_DEVICES", None) is not None:
            device_ids = os.environ["NVIDIA_VISIBLE_DEVICES"]
        elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
            device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
        else:
            if not is_pynvml_available():
                raise ValueError(
                    "The library PyNVML is required to get GPU device ids, but is not installed. "
                    "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`."
                )

            pynvml.nvmlInit()
            device_ids = list(range(pynvml.nvmlDeviceGetCount()))
            device_ids = ",".join(str(i) for i in device_ids)
            pynvml.nvmlShutdown()

    elif is_rocm_system():
        if os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None:
            device_ids = os.environ["ROCR_VISIBLE_DEVICES"]
        elif os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None:
            device_ids = os.environ["CUDA_VISIBLE_DEVICES"]
        else:
            if not is_amdsmi_available() and not is_pyrsmi_available():
                raise ValueError(
                    "Either the library AMD SMI or PyRSMI is required to get GPU device ids, but neither is installed. "
                    "Please install the official and AMD maintained AMD SMI library from https://github.com/ROCm/amdsmi "
                    "or PyRSMI library from https://github.com/ROCm/pyrsmi."
                )

            if is_pyrsmi_available():
                rocml.smi_initialize()
                device_ids = list(range(rocml.smi_get_device_count()))
                device_ids = ",".join(str(i) for i in device_ids)
                rocml.smi_shutdown()

            elif is_amdsmi_available():
                amdsmi.amdsmi_init()
                device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles())))
                device_ids = ",".join(str(i) for i in device_ids)
                amdsmi.amdsmi_shut_down()

    else:
        raise ValueError("Couldn't infer GPU device ids.")

    return device_ids


## System related stuff
def get_system_info() -> dict:
    system_dict = {
        "cpu": get_cpu(),
        "cpu_count": os.cpu_count(),
        "cpu_ram_mb": get_cpu_ram_mb(),
        "system": platform.system(),
        "machine": platform.machine(),
        "platform": platform.platform(),
        "processor": platform.processor(),
        "python_version": platform.python_version(),
    }

    if is_nvidia_system() or is_rocm_system():
        system_dict["gpu"] = get_gpus()
        system_dict["gpu_count"] = len(get_gpus())
        system_dict["gpu_vram_mb"] = get_gpu_vram_mb()

    return system_dict
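

# --- Example usage (not part of the original module) ---
# A minimal sketch of how the helpers above can be exercised: running
# `python -m optimum_benchmark.system_utils` (so the relative import resolves)
# would print the collected system information. The __main__ guard keeps this
# from affecting normal imports of the module.
if __name__ == "__main__":
    from pprint import pprint

    pprint(get_system_info())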