in src/job-exporter/src/nvidia.py [0:0]
def parse_smi_xml_result(smi):
    """Parse an nvidia-smi XML report into per-GPU status objects.

    Args:
        smi: XML document text, passed straight to ``minidom.parseString``.

    Returns:
        A dict in which every reported GPU appears twice, once under
        ``str(gpu_index)`` and once under its uuid; both keys reference the
        same ``NvidiaGpuStatus``. ``gpu_index`` is the position of the
        ``<gpu>`` element in the document when the ``LAUNCHER_TYPE``
        environment variable equals ``"k8s"``, otherwise the text of the
        ``<minor_number>`` tag.

        A GPU is left out of the result when its ``gpu_util`` is ``"N/A"``,
        when it has no ``<fb_memory_usage>`` element, or when its total
        memory converts to 0.

    Raises:
        IndexError: if a mandatory tag (``minor_number`` outside k8s,
            ``utilization``, ``gpu_util``, ``uuid`` or the memory
            ``used``/``total`` tags) is missing or empty.
        Whatever ``minidom.parseString`` raises for malformed XML.
    """
    xmldoc = minidom.parseString(smi)
    gpus = xmldoc.getElementsByTagName("gpu")

    # Loop-invariant: read the launcher type once rather than once per GPU.
    use_sequence_index = os.getenv("LAUNCHER_TYPE") == "k8s"

    result = {}
    for index, gpu in enumerate(gpus):
        if use_sequence_index:
            # For pai k8s, the minor number doesn't match the
            # NVIDIA_VISIBLE_DEVICES number, use the nvidia-smi gpu sequence
            # index instead.
            gpu_index = index
        else:
            gpu_index = gpu.getElementsByTagName("minor_number")[0].childNodes[0].data

        utilization = gpu.getElementsByTagName("utilization")[0]
        gpu_util = utilization.getElementsByTagName("gpu_util")[0].childNodes[0].data.replace("%", "").strip()

        # Memory utilisation is derived as used / total * 100; it stays
        # "N/A" when the memory section is absent or total is 0.
        gpu_mem_util = "N/A"
        memory_usage_list = gpu.getElementsByTagName("fb_memory_usage")
        if memory_usage_list:
            memory_usage = memory_usage_list[0]
            mem_used = convert_to_byte(memory_usage.getElementsByTagName("used")[0].childNodes[0].data)
            mem_total = convert_to_byte(memory_usage.getElementsByTagName("total")[0].childNodes[0].data)
            if mem_total != 0:
                gpu_mem_util = mem_used / mem_total * 100

        # Without both utilisation figures there is nothing to report.
        if gpu_util == "N/A" or gpu_mem_util == "N/A":
            continue

        pids = [
            int(process.getElementsByTagName("pid")[0].childNodes[0].data)
            for process in gpu.getElementsByTagName("process_info")
        ]

        # Try to read the volatile ECC error totals. A missing single_bit /
        # double_bit / total tag surfaces as IndexError and means this GPU
        # does not report these counters, so both counts stay at 0.
        ecc_single = ecc_double = 0
        try:
            ecc_errors = gpu.getElementsByTagName("ecc_errors")
            if ecc_errors:
                volatile = ecc_errors[0].getElementsByTagName("volatile")
                if volatile:
                    volatile = volatile[0]
                    single = volatile.getElementsByTagName("single_bit")[0].getElementsByTagName("total")[0]
                    double = volatile.getElementsByTagName("double_bit")[0].getElementsByTagName("total")[0]
                    single = single.childNodes[0].data
                    double = double.childNodes[0].data
                    if single != "N/A":
                        ecc_single = int(single)
                    if double != "N/A":
                        ecc_double = int(double)
        except IndexError:
            # Deliberate best effort: keep the zero defaults.
            pass

        uuid = gpu.getElementsByTagName("uuid")[0].childNodes[0].data

        # The remaining fields are optional: a parse failure is logged and
        # the field keeps its default instead of dropping the whole GPU.
        temperature = None
        try:
            temp_node = gpu.getElementsByTagName("temperature")
            if temp_node:
                temp_s = temp_node[0].getElementsByTagName("gpu_temp")[0].childNodes[0].data
                # Keep only the first numeric run of the reading.
                temperature = float(re.findall(r"[0-9.]+", temp_s)[0])
        except Exception:
            logger.warning("Failed to get GPU temperature", exc_info=True)

        performance_state = None
        try:
            state_node = gpu.getElementsByTagName("performance_state")
            if state_node:
                # Keep only the first run of digits in the tag text.
                performance_state = int(re.findall(r"\d+", state_node[0].childNodes[0].data)[0])
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt / SystemExit must not
            # be swallowed here.
            logger.warning("Failed to get GPU performance status", exc_info=True)

        throttle_reasons = []
        try:
            throttle_node = gpu.getElementsByTagName("clocks_throttle_reasons")
            # Guarded like the sections above so that a report without this
            # tag is not logged as a failure for every GPU.
            if throttle_node:
                for node in throttle_node[0].childNodes:
                    # Skip whitespace text nodes between the reason elements.
                    if node.nodeType != node.ELEMENT_NODE:
                        continue
                    if node.childNodes[0].data.lower() == "active":
                        throttle_reasons.append(node.tagName)
        except Exception:
            logger.warning("Failed to get GPU clock throttle reasons", exc_info=True)

        status = NvidiaGpuStatus(
            float(gpu_util),
            float(gpu_mem_util),
            pids,
            EccError(single=ecc_single, double=ecc_double),
            str(gpu_index),
            uuid,
            temperature,
            performance_state,
            throttle_reasons)

        # Make the same status reachable by index and by uuid.
        result[str(gpu_index)] = result[uuid] = status

    return result