def run_dcgm_diag()

in src/gpu_healthcheck/gpu_healthcheck.py [0:0]


def run_dcgm_diag(node_name: str, reboot_required: bool) -> None:
  """Runs the DCGM diagnostic and records the outcome on the node.

  The command built by `generate_dcgm_command()` is executed through
  `checker_common.run_command`, and its stdout is converted to a proto that
  `is_bad_node_from_proto` evaluates. Output that cannot be decoded as JSON is
  treated as a failed check rather than propagated as an exception.

  The result is then logged via `checker_common.log_results`, written as a
  "pass"/"fail" label under `_RESULT_LABEL_KEY`, and reflected in the node's
  taint: a failing node is tainted, while a passing node is un-tainted only
  when no reboot is pending.

  Args:
    node_name: Name of the node the result is logged, labeled and tainted
      against.
    reboot_required: When True, a passing node is not un-tainted.
  """
  command = generate_dcgm_command()
  diag_output = checker_common.run_command(command)

  try:
    print("Converting from json output to proto")
    output = convert_output_to_proto(diag_output.stdout)
    print(output)
    failed = is_bad_node_from_proto(output)
  except json.JSONDecodeError as e:
    # print() does not interpolate logging-style "%s" arguments, so the
    # message is formatted explicitly.
    print(f"Error deserializing JSON: {e}")
    # Unparseable diagnostic output is counted as a failure.
    failed = True

  checker_common.log_results(
      test_name="dcgm",
      passed=not failed,
      node_name=node_name,
      # NOTE(review): the original comment here read "Remove workflow id";
      # confirm whether this argument is meant to be dropped.
      workflow_id=os.environ.get("WORKFLOW_ID"),
  )
  checker_common.add_label(
      node_name,
      _RESULT_LABEL_KEY,
      "fail" if failed else "pass",
      K_ADD_LABEL_FORMAT,
  )
  if failed:
    print(f"Node {node_name} failed dcgm test")
    taint_node(node_name, TAINT_KEY, TAINT_VALUE, TAINT_EFFECT)
  else:
    print(f"Node {node_name} passed dcgm test")
    # Leave the taint in place while a reboot is still pending.
    if not reboot_required:
      un_taint_node(node_name, TAINT_KEY)