in src/job-exporter/src/collector.py [0:0]
def convert_nvidia_gpu_info_to_metrics(gpu_info, zombie_info, pid_to_cid_fn, mem_leak_thrashold, node_name=os.environ.get("NODE_NAME")):
    """Build the NVIDIA GPU metric families from already-collected data.

    The collected data is passed in as plain arguments (rather than being
    gathered here) so that this conversion is easy to unit test.

    Args:
        gpu_info: mapping of GPU key to a status object. Only keys made up
            entirely of digits (the GPU minor number) are reported; other
            keys (UUIDs, per the original author) are skipped. Each status
            object is read for ``gpu_util``, ``gpu_mem_util``,
            ``temperature``, ``ecc_errors.single``, ``ecc_errors.double``,
            ``clocks_throttle_reasons``, ``performance_state`` and ``pids``.
        zombie_info: iterable of short docker container ids of zombie
            containers, or None (treated as empty).
        pid_to_cid_fn: callable taking a pid and returning a
            ``(found, container_id)`` pair.
        mem_leak_thrashold: value ``gpu_mem_util`` is compared against when
            no process is using the GPU.
        node_name: label value attached to the ECC and performance-state
            samples. The default is the ``NODE_NAME`` environment variable
            as read when this function is defined.

    Returns:
        A list of ten metric families in this order: nvidia core util,
        nvidia mem util, ecc errors, memory leak, gpu used by external
        process, gpu used by zombie container, temperature, common core
        util, common mem util, performance state.
    """
    # common (vendor-labelled) gpu gauges
    common_core_gauge, common_mem_gauge = GpuCollector.gen_common_gpu_gauge()

    # nvidia-specific metric families
    core_gauge = gen_nvidia_gpu_util_gauge()
    mem_gauge = gen_nvidia_gpu_mem_util_gauge()
    temperature_gauge = gen_nvidia_gpu_temperature_gauge()
    ecc_counter = gen_nvidia_gpu_ecc_counter()
    mem_leak_counter = gen_nvidia_gpu_memory_leak_counter()
    perf_state_gauge = gen_nvidia_gpu_performance_state()
    external_process = gen_gpu_used_by_external_process_counter()
    zombie_container = gen_gpu_used_by_zombie_container_counter()

    # gpu minor -> pids currently using that gpu (only gpus with >= 1 pid)
    pids_by_minor = {}

    for minor, info in gpu_info.items():
        if not minor.isdigit():
            # non-numeric keys are UUID entries; skip them
            continue

        vendor = GpuVendor.NVIDIA.value
        common_core_gauge.add_metric([minor, vendor], info.gpu_util)
        common_mem_gauge.add_metric([minor, vendor], info.gpu_mem_util)

        core_gauge.add_metric([minor], info.gpu_util)
        mem_gauge.add_metric([minor], info.gpu_mem_util)

        if info.temperature is not None:
            temperature_gauge.add_metric([minor], info.temperature)

        for ecc_kind in ("single", "double"):
            ecc_counter.add_metric(
                [node_name, minor, ecc_kind],
                getattr(info.ecc_errors, ecc_kind))

        throttle_reasons = ",".join(info.clocks_throttle_reasons)
        perf_state_gauge.add_metric(
            [node_name, minor, throttle_reasons], info.performance_state)

        # TODO: this check does not look correct: gpu_mem_util is a
        # percentage while mem_leak_thrashold is a memory size. Needs a fix.
        if info.gpu_mem_util > mem_leak_thrashold and len(info.pids) == 0:
            # original author's note: leaks smaller than 20M were found to
            # be mitigated automatically
            mem_leak_counter.add_metric([minor], 1)

        if len(info.pids) > 0:
            pids_by_minor[minor] = info.pids

    logger.debug("pids_use_gpu is %s, zombie_info is %s", pids_by_minor, zombie_info)

    # A missing zombie list simply means there is nothing to match against.
    known_zombies = [] if zombie_info is None else zombie_info

    for minor, pids in pids_by_minor.items():
        for pid in pids:
            found, z_id = pid_to_cid_fn(pid)
            logger.debug("pid %s has found %s, z_id %s", pid, found, z_id)

            if not found:
                # pid does not map to any container
                external_process.add_metric([minor, str(pid)], 1)
                continue

            # NOTE: known_zombies holds short docker container ids while
            # z_id is the full id, hence the prefix match.
            for zombie_id in known_zombies:
                if z_id.startswith(zombie_id):
                    zombie_container.add_metric([minor, zombie_id], 1)

    if len(zombie_container.samples) > 0 or len(external_process.samples) > 0:
        logger.warning("found gpu used by external %s, zombie container %s",
                       external_process, zombie_container)

    return [
        core_gauge,
        mem_gauge,
        ecc_counter,
        mem_leak_counter,
        external_process,
        zombie_container,
        temperature_gauge,
        common_core_gauge,
        common_mem_gauge,
        perf_state_gauge,
    ]