def convert_nvidia_gpu_info_to_metrics()

in src/job-exporter/src/collector.py [0:0]


    def convert_nvidia_gpu_info_to_metrics(gpu_info, zombie_info, pid_to_cid_fn, mem_leak_thrashold, node_name=os.environ.get("NODE_NAME")):
        """ Build the nvidia gpu metrics out of gpu_info & zombie_info.

        Kept as a standalone fn working only on its arguments so it is easy
        to unit test.

        gpu_info: mapping of gpu key -> info object. Only keys made of digits
            (gpu minor numbers) are reported, other keys are skipped.
        zombie_info: collection of short docker container ids of zombie
            containers, None is handled like an empty collection.
        pid_to_cid_fn: called with a pid, returns a (found, container_id) pair.
        mem_leak_thrashold: value info.gpu_mem_util is compared against when
            no process is using the gpu.
        node_name: label value used for the ecc error and performance state
            metrics, the default is the NODE_NAME environment variable read
            when this fn was defined.

        Returns the list of metric objects that were filled in. """
        # vendor neutral gpu metrics
        common_core_gauge, common_mem_gauge = GpuCollector.gen_common_gpu_gauge()
        # nvidia specific metrics
        core_gauge = gen_nvidia_gpu_util_gauge()
        mem_gauge = gen_nvidia_gpu_mem_util_gauge()
        temp_gauge = gen_nvidia_gpu_temperature_gauge()
        ecc_counter = gen_nvidia_gpu_ecc_counter()
        leak_counter = gen_nvidia_gpu_memory_leak_counter()
        perf_state = gen_nvidia_gpu_performance_state()
        external_counter = gen_gpu_used_by_external_process_counter()
        zombie_counter = gen_gpu_used_by_zombie_container_counter()

        busy_gpus = {} # gpu minor -> pids that are using this gpu

        def report_gpu(minor, info):
            # emit all per gpu samples and remember the pids running on it
            vendor = GpuVendor.NVIDIA.value
            common_core_gauge.add_metric([minor, vendor], info.gpu_util)
            common_mem_gauge.add_metric([minor, vendor], info.gpu_mem_util)
            core_gauge.add_metric([minor], info.gpu_util)
            mem_gauge.add_metric([minor], info.gpu_mem_util)

            temperature = info.temperature
            if temperature is not None:
                temp_gauge.add_metric([minor], temperature)

            for kind in ("single", "double"):
                ecc_counter.add_metric([node_name, minor, kind], getattr(info.ecc_errors, kind))

            throttle_label = ",".join(info.clocks_throttle_reasons)
            perf_state.add_metric([node_name, minor, throttle_label], info.performance_state)

            # TODO: this check looks incorrect, gpu_mem_util is a percentage
            # number while mem_leak_thrashold is a memory size. Need to fix it.
            over_threshold = info.gpu_mem_util > mem_leak_thrashold
            pids = info.pids
            pid_count = len(pids)
            if over_threshold and pid_count == 0:
                # memory leak less than 20M was found to be mitigated
                # automatically, so only larger leaks are reported
                leak_counter.add_metric([minor], 1)

            if pid_count > 0:
                busy_gpus[minor] = pids

        for gpu_key, gpu in gpu_info.items():
            # keys that are not a minor number (UUID) are ignored
            if gpu_key.isdigit():
                report_gpu(gpu_key, gpu)

        logger.debug("pids_use_gpu is %s, zombie_info is %s", busy_gpus, zombie_info)
        if busy_gpus:
            zombie_ids = [] if zombie_info is None else zombie_info

            def attribute_pid(minor, pid):
                # decide whether pid is an external process or runs in a zombie container
                found, container_id = pid_to_cid_fn(pid)
                logger.debug("pid %s has found %s, z_id %s", pid, found, container_id)
                if not found:
                    external_counter.add_metric([minor, str(pid)], 1)
                    return

                # NOTE: zombie ids are short docker container ids while
                # container_id is the full id, hence the prefix match.
                for short_id in zombie_ids:
                    if container_id.startswith(short_id):
                        zombie_counter.add_metric([minor, short_id], 1)

            for minor, pids in busy_gpus.items():
                for pid in pids:
                    attribute_pid(minor, pid)

            if len(zombie_counter.samples) > 0 or len(external_counter.samples) > 0:
                logger.warning("found gpu used by external %s, zombie container %s",
                        external_counter, zombie_counter)

        return [
            core_gauge, mem_gauge, ecc_counter,
            leak_counter, external_counter, zombie_counter,
            temp_gauge, common_core_gauge, common_mem_gauge, perf_state
        ]