in src/gpu_healthcheck/gpu_healthcheck.py [0:0]
def run_dcgm_diag(node_name: str, reboot_required: bool) -> None:
    """Run the DCGM diagnostic and record the outcome on the node.

    The command built by ``generate_dcgm_command()`` is executed through
    ``checker_common.run_command``; its stdout is converted to a proto and
    evaluated with ``is_bad_node_from_proto``. The verdict is then logged,
    written as a result label on the node, and reflected in the node taint.

    Args:
        node_name: Name of the node, passed to the logging, labelling and
            tainting helpers.
        reboot_required: When true, a passing node is NOT un-tainted here.
            NOTE(review): presumably the taint is left for a later
            post-reboot step -- confirm against the caller.
    """
    command = generate_dcgm_command()
    diag_output = checker_common.run_command(command)
    try:
        print("Converting from json output to proto")
        output = convert_output_to_proto(diag_output.stdout)
        print(output)
        failed = is_bad_node_from_proto(output)
    except json.JSONDecodeError as e:
        # print() does not interpolate "%s" the way logging calls do, so the
        # exception has to be formatted into the message explicitly.
        print(f"Error deserializing JSON: {e}")
        # Undecodable diagnostic output is treated as a failed check.
        failed = True

    checker_common.log_results(
        test_name="dcgm",
        passed=not failed,
        node_name=node_name,
        workflow_id=os.environ.get("WORKFLOW_ID"),  # Remove workflow id
    )
    checker_common.add_label(
        node_name,
        _RESULT_LABEL_KEY,
        "fail" if failed else "pass",
        K_ADD_LABEL_FORMAT,
    )

    if failed:
        print(f"Node {node_name} failed dcgm test")
        taint_node(node_name, TAINT_KEY, TAINT_VALUE, TAINT_EFFECT)
    else:
        print(f"Node {node_name} passed dcgm test")
        # Only clear the taint on a pass when no reboot is pending.
        if not reboot_required:
            un_taint_node(node_name, TAINT_KEY)