def main()

in src/health_runner/health_runner.py [0:0]


def main() -> None:
  """Runs the health runner in one of two modes, chosen by `HEALTH_APP`.

  Logs the hostname at startup and obtains the sleep time and timeout from
  `setup_sleep_and_timeout()`. If the `HEALTH_APP` environment variable is
  set to a non-empty value, its lower-cased value is passed to
  `run_health_app()`. Otherwise an alarm is armed for `timeout` minutes and
  `run_health_check()` is invoked with the sleep time.
  """
  logging.info("%s health runner started", socket.gethostname())
  sleep_time, timeout_minutes = setup_sleep_and_timeout()

  # An unset variable is treated the same as an empty one; the value is
  # lower-cased before being handed on.
  selected_app = os.getenv("HEALTH_APP", "").lower()

  if selected_app:
    # App mode: note that no alarm is armed on this path.
    logging.info("Running NCCL health check via `HEALTH_APP`")
    run_health_app(selected_app)
    return

  # Helm mode: `signal.alarm` takes seconds, so convert from minutes. The
  # alarm is armed before the timeout is logged.
  signal.alarm(timeout_minutes * 60)
  logging.info("Set timeout to %s minutes", timeout_minutes)

  logging.info("Running Helm health check")
  run_health_check(sleep_time)