def parse_smi_xml_result()

in src/job-exporter/src/nvidia.py [0:0]


def parse_smi_xml_result(smi):
    """Parse an nvidia-smi XML report into per-GPU status objects.

    Args:
        smi: XML document text, passed straight to ``minidom.parseString``.

    Returns:
        A dict in which every reported GPU appears under two keys, its
        index (as a string) and its uuid, both mapping to the same
        NvidiaGpuStatus instance. The index is the GPU's ``minor_number``
        unless the ``LAUNCHER_TYPE`` environment variable equals ``"k8s"``,
        in which case it is the GPU's position in the report.

        A GPU is skipped when its ``gpu_util`` is ``"N/A"`` or when its
        memory utilization cannot be computed (no ``fb_memory_usage``
        element, or a total of 0).

    Temperature, performance state and clock throttle reasons are
    best-effort: a failure while reading them is logged as a warning and
    the field keeps its default (``None`` / empty list).
    """
    xmldoc = minidom.parseString(smi)
    gpus = xmldoc.getElementsByTagName("gpu")

    # The launcher type cannot change while we walk the GPUs, so read it once.
    use_sequence_index = os.getenv("LAUNCHER_TYPE") == "k8s"

    result = {}

    for index, gpu in enumerate(gpus):
        if use_sequence_index:
            # For pai k8s, the minor number doesn't match the
            # NVIDIA_VISIBLE_DEVICES number, use the nvidia-smi gpu
            # sequence index instead.
            gpu_index = index
        else:
            gpu_index = gpu.getElementsByTagName("minor_number")[0].childNodes[0].data

        utilization = gpu.getElementsByTagName("utilization")[0]
        gpu_util = utilization.getElementsByTagName("gpu_util")[0].childNodes[0].data.replace("%", "").strip()

        gpu_mem_util = "N/A"
        memory_usage_list = gpu.getElementsByTagName("fb_memory_usage")
        if memory_usage_list:
            memory_usage = memory_usage_list[0]
            mem_used = convert_to_byte(memory_usage.getElementsByTagName("used")[0].childNodes[0].data)
            mem_total = convert_to_byte(memory_usage.getElementsByTagName("total")[0].childNodes[0].data)

            # Leave the value as "N/A" rather than dividing by zero.
            if mem_total != 0:
                gpu_mem_util = mem_used / mem_total * 100

        # Without both utilization figures there is nothing useful to report.
        if gpu_util == "N/A" or gpu_mem_util == "N/A":
            continue

        pids = [
            int(process.getElementsByTagName("pid")[0].childNodes[0].data)
            for process in gpu.getElementsByTagName("process_info")
        ]

        # Try to read the volatile ECC error counters. A missing
        # single_bit/double_bit/total tag raises IndexError below, which is
        # taken to mean the GPU does not report ECC errors; the counters
        # then keep whatever value was assigned so far (default 0).
        ecc_single = ecc_double = 0
        try:
            ecc_errors = gpu.getElementsByTagName("ecc_errors")
            if ecc_errors:
                volatile = ecc_errors[0].getElementsByTagName("volatile")
                if volatile:
                    volatile = volatile[0]
                    single = volatile.getElementsByTagName("single_bit")[0].getElementsByTagName("total")[0]
                    double = volatile.getElementsByTagName("double_bit")[0].getElementsByTagName("total")[0]
                    single = single.childNodes[0].data
                    double = double.childNodes[0].data
                    if single != "N/A":
                        ecc_single = int(single)
                    if double != "N/A":
                        ecc_double = int(double)
        except IndexError:
            pass

        uuid = gpu.getElementsByTagName("uuid")[0].childNodes[0].data

        temperature = None
        try:
            temperature_nodes = gpu.getElementsByTagName("temperature")
            if temperature_nodes:
                temp_s = temperature_nodes[0].getElementsByTagName("gpu_temp")[0].childNodes[0].data
                # Keep only the first numeric run of the reported text.
                temperature = float(re.findall(r"[0-9.]+", temp_s)[0])
        except Exception:
            logger.warning("Failed to get GPU temperature", exc_info=True)

        performance_state = None
        try:
            state_nodes = gpu.getElementsByTagName("performance_state")
            if state_nodes:
                performance_state = int(re.findall(r"\d+", state_nodes[0].childNodes[0].data)[0])
        except Exception:
            # Was a bare ``except:``, which would also swallow
            # KeyboardInterrupt / SystemExit.
            logger.warning("Failed to get GPU performance status", exc_info=True)

        throttle_reasons = []
        try:
            throttle_nodes = gpu.getElementsByTagName("clocks_throttle_reasons")
            # Like temperature and performance state, an absent element is
            # not an error: just report no throttle reasons.
            if throttle_nodes:
                for node in throttle_nodes[0].childNodes:
                    if node.nodeType != node.ELEMENT_NODE:
                        continue
                    # An empty element has no text to inspect; skip it
                    # instead of discarding the remaining reasons.
                    if not node.childNodes:
                        continue
                    if node.childNodes[0].data.lower() == "active":
                        throttle_reasons.append(node.tagName)
        except Exception:
            logger.warning("Failed to get GPU clock throttle reasons", exc_info=True)

        status = NvidiaGpuStatus(
                float(gpu_util),
                float(gpu_mem_util),
                pids,
                EccError(single=ecc_single, double=ecc_double),
                str(gpu_index),
                uuid,
                temperature,
                performance_state,
                throttle_reasons)

        # The same status object is reachable by index and by uuid.
        result[str(gpu_index)] = result[uuid] = status

    return result