def run_reboot_required_check()

in src/gpu_healthcheck/gpu_healthcheck.py [0:0]


def run_reboot_required_check(node_name: str) -> bool:
  """run reboot required check."""
  smi_output = checker_common.run_command(NVIDIA_SMI_COMMAND)

  total_errors = 0
  reboot_required = False

  smi_lines = smi_output.stdout.splitlines()
  if len(smi_lines) != 8:
    reboot_required = True
    print("vm '%s' has only %d GPUs instead of 8" % (node_name, len(smi_lines)))

  for line in smi_lines:
    # Each line is expected to be a per-GPU error count; any non-zero value
    # means the node needs a reboot.
    try:
      total_errors += int(line)
    except ValueError:
      # If the line is not a valid integer, count it as one error.
      print(f"Error: cannot parse '{line}' to a number.")
      # This taints the node and lets the DCGM test run.
      total_errors += 1

  # If there were any errors, label the node as requiring a reboot.
  if reboot_required or total_errors > 0:
    print(
        "adding reboot required label for node %s due to %s errors"
        % (node_name, total_errors),
    )
    checker_common.add_label(
        node_name, REBOOT_REQUIRED_LABEL_KEY, "true", K_ADD_LABEL_FORMAT
    )
    taint_node(node_name, TAINT_KEY, TAINT_VALUE, TAINT_EFFECT)
    return True
  else:
    print("reboot is not required for node %s" % node_name)
    remove_label(node_name, REBOOT_REQUIRED_LABEL_KEY)
    return False
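
For illustration only, the sketch below mimics the parsing step above on canned nvidia-smi output. The sample stdout and the helper name count_errors are assumptions (the real NVIDIA_SMI_COMMAND and checker_common.run_command live elsewhere in the module); the sketch only shows that one integer per GPU is expected and that any non-zero or unparsable line counts toward the error total.

# Hypothetical stand-in for the stdout of NVIDIA_SMI_COMMAND: one integer
# error count per GPU, eight GPUs per node.
SAMPLE_SMI_STDOUT = "0\n0\n1\n0\n0\n0\n0\n0\n"


def count_errors(smi_stdout: str) -> int:
  """Sums per-GPU error counts, treating unparsable lines as one error each."""
  total_errors = 0
  for line in smi_stdout.splitlines():
    try:
      total_errors += int(line)
    except ValueError:
      total_errors += 1
  return total_errors


if __name__ == "__main__":
  # One GPU reports a non-zero count, so the node would be labeled and tainted.
  print(count_errors(SAMPLE_SMI_STDOUT))  # -> 1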