optimum/amd/cli.py

import logging
import os
import subprocess
import sys


logger = logging.getLogger(__name__)

# Probe for a usable ROCm installation: `rocm-smi` must exist and exit with 0.
# Any failure (missing binary, non-zero exit, ...) simply disables the ROCm path.
try:
    subprocess.run(["rocm-smi"], check=True)
    ROCM_AVAILABLE = True
except Exception:
    ROCM_AVAILABLE = False

# Probe for the ZenTorch plugin; any import failure disables the ZenTorch path.
try:
    import zentorch  # noqa: F401 # type: ignore

    ZENTORCH_AVAILABLE = True
except Exception:
    ZENTORCH_AVAILABLE = False

# Command line flags whose value is read as the number of devices, in priority order.
_DEVICE_COUNT_FLAGS = ("--nproc_per_node", "--nproc-per-node", "--ngpus")


def _split_leading_env_assignments(args):
    """
    Splits `args` into leading `NAME=value` assignments and the remaining command.

    Only the assignments that come *before* the command are consumed (like the `env` utility does),
    so script arguments such as `--lr=0.1` are never mistaken for environment variables.

    Returns a `(overrides, command)` tuple where `overrides` is a dict and `command` is a list.
    """
    overrides = {}
    command = list(args)

    while command:
        name, separator, value = command[0].partition("=")
        if not separator or not name.isidentifier():
            break
        overrides[name] = value
        command.pop(0)

    return overrides, command


def _get_cli_option(argv, flag):
    """
    Returns the raw value given to `flag` in `argv`, or `None` if the flag is absent or has no value.

    Both the `--flag value` and the `--flag=value` spellings are supported.
    """
    prefix = flag + "="

    for index, arg in enumerate(argv):
        if arg == flag:
            # the flag may be the very last argument, in which case it has no value
            return argv[index + 1] if index + 1 < len(argv) else None
        if arg.startswith(prefix):
            return arg[len(prefix) :]

    return None


def get_env_vars_overrides():
    """
    Returns a dictionary of environment variables that are set in the command line arguments.

    Every argument of `sys.argv` containing a `=` is split on its *first* `=` into a key and a value,
    so values that themselves contain `=` (e.g. `MALLOC_CONF=a:1,b=2`) are kept intact.
    """
    env = {}

    for arg in sys.argv:
        key, separator, value = arg.partition("=")
        if separator:
            env[key] = value

    return env


def get_amd_zentorch_env():
    """
    Returns a dictionary of environment variables that are optimized for the AMD ZenTorch plugin.

    The target environment variables are:
    - `OMP_NUM_THREADS`: The number of OpenMP threads to use.
    - `OMP_DYNAMIC`: Whether or not OpenMP threads are dynamically allocated.
    - `OMP_WAIT_POLICY`: The OpenMP wait policy.
    - `ZENDNN_GEMM_ALGO`: The GEMM algorithm to use.
    - `GOMP_CPU_AFFINITY`: The CPU affinity for OpenMP threads.
    - `LD_PRELOAD`: The path to the Jemalloc library.
    - `MALLOC_CONF`: The Jemalloc configuration.

    A warning is logged when Jemalloc is not found at the default path and no `LD_PRELOAD`
    override is present in the command line arguments.
    """
    # TODO: how to handle NUMA nodes and socket affinity?

    # `os.cpu_count()` returns None when the count can't be determined; fall back to a single CPU
    cpu_count = os.cpu_count() or 1
    ld_preload = "/usr/lib/x86_64-linux-gnu/libjemalloc.so"

    env = {
        "OMP_NUM_THREADS": f"{cpu_count}",
        "OMP_DYNAMIC": "False",
        "OMP_WAIT_POLICY": "ACTIVE",
        "ZENDNN_GEMM_ALGO": "4",
        "GOMP_CPU_AFFINITY": f"0-{cpu_count - 1}",
        "LD_PRELOAD": ld_preload,
        "MALLOC_CONF": "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1",
    }

    if not os.path.exists(ld_preload) and "LD_PRELOAD" not in get_env_vars_overrides():
        logger.warning(
            f"Jemalloc not found at {ld_preload} either because it's not installed or because the path is incorrect. "
            "Make sure it's installed and/or override `LD_PRELOAD` manually: "
            "`amdrun LD_PRELOAD=/path/to/libjemalloc.so python script.py script_args`"
        )

    logger.info("AMD ZenTorch environment variables:")
    for name, value in env.items():
        logger.info("- %s: %s", name, value)

    return env


def get_amd_rocm_env():
    """
    Returns a dictionary of environment variables that are optimized for AMD's ROCm platform.

    The target environment variables are:
    - `ROCR_VISIBLE_DEVICES`: The list of devices to use (maximizing the average bandwidth between them).

    The number of devices is read from the command line (`--nproc_per_node` / `--nproc-per-node` or `--ngpus`,
    given either as `--flag N` or `--flag=N`). An empty dictionary is returned when it can't be determined,
    i.e. when no such flag is present or when its value is not an integer (e.g. `--nproc_per_node gpu`).
    """
    # extract the number of devices to use
    raw_num_devices = None
    for flag in _DEVICE_COUNT_FLAGS:
        raw_num_devices = _get_cli_option(sys.argv, flag)
        if raw_num_devices is not None:
            break

    if raw_num_devices is None:
        # early exit if we can't find the number of devices
        return {}

    try:
        num_devices = int(raw_num_devices)
    except ValueError:
        logger.warning(
            f"Could not infer the number of devices from `{raw_num_devices}`, skipping ROCm device selection."
        )
        return {}

    # imported lazily, only once we know that a device selection is actually needed
    from .topology_utils import extract_max_avg_bandwidth_cluster, get_bandwidth_matrix

    bandwidth_matrix = get_bandwidth_matrix()
    max_avg_bandwidth_cluster, _ = extract_max_avg_bandwidth_cluster(bandwidth_matrix, num_devices)

    # lowest level isolation env var on AMD GPUs
    rocr_visible_devices = ",".join(map(str, max_avg_bandwidth_cluster))

    logger.info("AMD ROCm environment variables:")
    logger.info("- ROCR_VISIBLE_DEVICES: %s", rocr_visible_devices)

    return {"ROCR_VISIBLE_DEVICES": rocr_visible_devices}


def amdrun():
    """
    A cli command that sets a couple of ZenTorch & ROCm environment variables to maximize performance.

    Usage: amdrun [ENV_VAR=value ...] <script> <script_args>

    Example: amdrun torchrun --nproc_per_node 4 train.py
    Example: amdrun LD_PRELOAD=/path/to/libjemalloc.so python script.py script_args

    Leading `ENV_VAR=value` arguments are not part of the launched command: they are applied on top of
    the tuned environment, so the user always has the last word. The process then exits with the
    return code of the launched command.
    """
    overrides, command = _split_leading_env_assignments(sys.argv[1:])

    if not command:
        # nothing to launch; a string argument makes `sys.exit` print it to stderr and exit with code 1
        sys.exit("usage: amdrun [ENV_VAR=value ...] <script> <script_args>")

    env = os.environ.copy()

    if ROCM_AVAILABLE:
        env.update(get_amd_rocm_env())

    if ZENTORCH_AVAILABLE:
        env.update(get_amd_zentorch_env())

    # explicit user overrides take precedence over the tuned defaults
    env.update(overrides)

    exit_code = subprocess.run(command, env=env).returncode
    sys.exit(exit_code)

optimum/amd/cli.py (77 lines of code) (raw):