def is_bad_node_from_proto()

in src/gpu_healthcheck/gpu_healthcheck.py [0:0]


def is_bad_node_from_proto(report: dcgm_pb2.DiagnosticReport) -> bool:
  """Returns True if the node is bad based on the DCGM diagnostic report.

  Every result whose status is "fail" (compared case-insensitively) is
  printed. Whether such a failure marks the node as bad depends on the
  STRICT_MODE environment variable:

  * "true" (the default; compared case-insensitively and ignoring
    surrounding whitespace): any failed result marks the node as bad.
  * Any other value: a failed result marks the node as bad only if at least
    one of its warnings has an error severity of K_DCGM_ERROR_ISOLATE or
    K_DCGM_ERROR_RESET.

  Args:
    report: The DCGM diagnostic report to inspect.

  Returns:
    True if the node should be considered bad, False otherwise.
  """
  # Normalize the flag so that values such as "True" or " TRUE " do not
  # silently disable strict mode.
  strict_mode = os.environ.get("STRICT_MODE", "true").strip().lower() == "true"
  bad_node = False
  for category in report.dcgm_gpu_diagnostic.test_categories:
    for test in category.tests:
      for result in test.results:
        if result.status.lower() != "fail":
          continue

        # NOTE(review): a gpu_id of 0 is falsy and is therefore reported as
        # "not available" -- confirm against the proto whether 0 is a valid
        # GPU id that should be printed.
        gpu_info = f" (GPU {result.gpu_id})" if result.gpu_id else ""
        print(
            f"Test '{test.name}' in category '{category.category}'"
            f"{gpu_info}: {result.status} - {result.info}",
        )

        # In strict mode, fail on any error. Keep scanning (no early return)
        # so that every failed result is still printed.
        if strict_mode:
          bad_node = True
          continue

        # Outside strict mode, only critical error severities fail the node.
        if any(
            warning.error_severity
            in (K_DCGM_ERROR_ISOLATE, K_DCGM_ERROR_RESET)
            for warning in result.warnings
        ):
          bad_node = True
  return bad_node