in src/gpu_healthcheck/gpu_healthcheck.py
def run_reboot_required_check(node_name: str) -> bool:
  """Checks per-GPU nvidia-smi output to decide whether the node needs a reboot."""
  smi_output = checker_common.run_command(NVIDIA_SMI_COMMAND)
  total_errors = 0
  reboot_required = False
  smi_lines = smi_output.stdout.splitlines()
  # Expect one output line per GPU; anything other than 8 means GPUs are missing.
  if len(smi_lines) != 8:
    reboot_required = True
    print("vm '%s' has only %d GPUs instead of 8" % (node_name, len(smi_lines)))
  for line in smi_lines:
    # If any GPU reports a non-zero count, the node needs a reboot.
    try:
      total_errors += int(line)
    except ValueError:
      # If the line is not a valid integer, count it as an error.
      print(f"Error: cannot parse '{line}' to a number.")
      # This taints the node and lets the DCGM test run.
      total_errors += 1
  # If there were any errors, label the node as requiring a reboot.
  if reboot_required or total_errors > 0:
    print(
        "adding reboot required label for node %s due to %s errors"
        % (node_name, total_errors),
    )
    checker_common.add_label(
        node_name, REBOOT_REQUIRED_LABEL_KEY, "true", K_ADD_LABEL_FORMAT
    )
    taint_node(node_name, TAINT_KEY, TAINT_VALUE, TAINT_EFFECT)
    return True
  else:
    print("reboot is not required for node %s" % node_name)
    remove_label(node_name, REBOOT_REQUIRED_LABEL_KEY)
    return False
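
The function relies on module-level constants and helpers that are not shown in this excerpt. The sketch below is a minimal, hypothetical reconstruction of what they could look like; NVIDIA_SMI_COMMAND, the label/taint constant values, and the kubectl-based helper bodies are assumptions for illustration, not the actual definitions from the module.

# Hypothetical constants and helpers assumed by run_reboot_required_check.
# The real definitions live elsewhere in gpu_healthcheck.py and may differ.
import subprocess

# Assumed query: prints one integer per GPU (e.g. pending remapped rows), so
# splitlines() yields one line per device. The exact query is an assumption.
NVIDIA_SMI_COMMAND = (
    "nvidia-smi --query-remapped-rows=remapped_rows.pending --format=csv,noheader"
)

REBOOT_REQUIRED_LABEL_KEY = "gpu-healthcheck/reboot-required"  # assumed label key
TAINT_KEY = "gpu-healthcheck"                                  # assumed
TAINT_VALUE = "reboot-required"                                # assumed
TAINT_EFFECT = "NoSchedule"
K_ADD_LABEL_FORMAT = "kubectl label node %s %s=%s --overwrite"  # assumed format string


def taint_node(node_name: str, key: str, value: str, effect: str) -> None:
  """Taints the node so new pods are not scheduled onto it."""
  subprocess.run(
      ["kubectl", "taint", "node", node_name,
       f"{key}={value}:{effect}", "--overwrite"],
      check=False,
  )


def remove_label(node_name: str, label_key: str) -> None:
  """Removes the reboot-required label (a trailing '-' deletes a label)."""
  subprocess.run(
      ["kubectl", "label", "node", node_name, f"{label_key}-"],
      check=False,
  )

In a node-level health-check job, a caller would typically pass the node's own name (for example from a NODE_NAME environment variable injected via the downward API) and act on the boolean result; that wiring is not shown in this excerpt.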