in src/gpu_healthcheck/gpu_healthcheck.py [0:0]
def is_bad_node_from_proto(report: dcgm_pb2.DiagnosticReport) -> bool:
    """Returns True if the node is bad based on the DCGM diagnostic report.

    Every result whose status is "fail" (compared case-insensitively) is
    printed to stdout. Whether such a failure marks the node as bad depends on
    the ``STRICT_MODE`` environment variable:

    * Strict mode (the default when the variable is unset, or when its value
      is "true" ignoring case and surrounding whitespace): any failed result
      marks the node as bad.
    * Any other value: a failed result only marks the node as bad when at
      least one of its warnings has an ``error_severity`` of
      ``K_DCGM_ERROR_ISOLATE`` or ``K_DCGM_ERROR_RESET``.

    Args:
      report: The diagnostic report. The code walks
        ``report.dcgm_gpu_diagnostic.test_categories``, each category's
        ``tests``, and each test's ``results``.

    Returns:
      True if at least one failed result qualifies under the active mode,
      False otherwise (including when the report contains no results).
    """
    # Normalise the flag so "True", "TRUE" or " true " cannot silently
    # downgrade the check to lenient mode.
    strict_mode = (
        os.environ.get("STRICT_MODE", "true").strip().lower() == "true"
    )
    bad_node = False
    for category in report.dcgm_gpu_diagnostic.test_categories:
        for test in category.tests:
            for result in test.results:
                if result.status.lower() != "fail":
                    continue
                # Add GPU ID if available.
                # NOTE(review): a falsy gpu_id is omitted from the message;
                # if gpu_id is an integer field this also hides GPU 0 --
                # confirm against the proto definition.
                gpu_info = f" (GPU {result.gpu_id})" if result.gpu_id else ""
                print(
                    f"Test '{test.name}' in category '{category.category}'"
                    f" {gpu_info}: {result.status} - {result.info}",
                )
                # No early return: keep scanning so every failure is printed.
                # In strict mode any failure counts; otherwise only failures
                # carrying a critical (isolate/reset) warning do.
                if strict_mode or any(
                    warning.error_severity
                    in (K_DCGM_ERROR_ISOLATE, K_DCGM_ERROR_RESET)
                    for warning in result.warnings
                ):
                    bad_node = True
    return bad_node