in src/health_runner/health_runner.py [0:0]
def main() -> None:
    """Run the health runner in the mode selected by the environment.

    If the ``HEALTH_APP`` environment variable holds a non-empty value, its
    lower-cased form is passed to ``run_health_app``. Otherwise an alarm is
    armed for the configured timeout (expressed in minutes) and the Helm
    health check is started through ``run_health_check``.
    """
    logging.info("%s health runner started", socket.gethostname())
    sleep_time, timeout_minutes = setup_sleep_and_timeout()

    # A non-empty HEALTH_APP selects the app-driven path; no alarm is armed
    # on this path.
    requested_app = os.getenv("HEALTH_APP", "").lower()
    if requested_app:
        logging.info("Running NCCL health check via `HEALTH_APP`")
        run_health_app(requested_app)
        return

    # signal.alarm() expects seconds; the configured timeout is in minutes.
    # NOTE(review): the reaction to SIGALRM depends on a handler that is not
    # visible in this function -- confirm one is installed elsewhere.
    signal.alarm(timeout_minutes * 60)
    logging.info("Set timeout to %s minutes", timeout_minutes)

    logging.info("Running Helm health check")
    run_health_check(sleep_time)