in cookbooks/aws-parallelcluster-slurm/files/default/config_slurm/scripts/health_check_manager.py [0:0]
def main():
    """Run the health check manager as a script entry point.

    Sets up default file logging, parses the command-line arguments, wraps the
    module-level ``log`` in a ``LoggerAdapter`` that carries the job id, loads
    the manager configuration, tries to apply the logging configuration it
    points to, and finally runs the health checks.

    Raises:
        SystemExit: always. On the normal path it carries the value returned
            by ``_execute_health_checks``. If an unexpected ``Exception`` is
            caught, the error is logged and the exit code is 0, so a failure
            of the manager itself exits gracefully.
    """
    # Declared first so that every read and rebind of ``log`` below refers to
    # the module-level object.
    global log  # pylint: disable=W0603

    default_log_file = "/var/log/parallelcluster/slurm_health_check.log"
    # Default logging, used until (and unless) the configured logging file is
    # applied. The format requires a ``job_id`` attribute on every record.
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - [%(filename)s:%(funcName)s] - %(levelname)s - JobID %(job_id)s - %(message)s",
    )
    try:
        args = _parse_arguments()
        # Override global log object so that records carry the job id.
        log = logging.LoggerAdapter(log, {"job_id": args.job_id})
        log.info("HealthCheckManager startup.")
        config_file = os.environ.get("CONFIG_FILE", os.path.join(CONFIG_FILE_DIR, "health_check_manager.conf"))
        health_check_manager_config = HealthCheckManagerConfig(config_file)
        try:
            # Configure root logger
            fileConfig(health_check_manager_config.logging_config, disable_existing_loggers=False)
        except Exception as err:
            # Best effort: keep the default logging set up above and carry on.
            log.warning(
                "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
                health_check_manager_config.logging_config,
                default_log_file,
                getattr(err, "message", err),
            )
        log.info("HealthCheckManager config: %s", health_check_manager_config)
        exit_code = _execute_health_checks(health_check_manager_config, args)
        log.info("HealthCheckManager finished with exit code '%s'.", exit_code)
        # SystemExit is not an Exception subclass, so it is not intercepted by
        # the handler below.
        raise SystemExit(exit_code)
    except Exception as err:
        if not isinstance(log, logging.LoggerAdapter):
            # The failure happened before the job id was known. Without a
            # ``job_id`` attribute the default format cannot be rendered and
            # the record would be dropped, so supply a placeholder.
            log = logging.LoggerAdapter(log, {"job_id": "unknown"})
        log.exception(
            "Encountered exception when running Health Check Manager, exiting gracefully: %s",
            getattr(err, "message", err),
        )
        raise SystemExit(0)