in cookbooks/aws-parallelcluster-slurm/files/default/config_slurm/scripts/health_check_manager.py [0:0]
def _execute_health_checks(health_check_manager_config: HealthCheckManagerConfig, args: argparse.Namespace) -> int:
    """
    Execute every enabled Health Check and aggregate their exit codes.

    Each enabled check listed in the loaded configuration is run as a subprocess (without a shell),
    bounded by ``health_check_manager_config.health_check_timeout``. Its combined stdout/stderr is
    logged and the outcome is published through the event publisher. Checks that cannot be executed
    (subprocess errors, timeouts, OS errors) are logged and published as exceptions; they do not
    contribute to the returned value.

    :param health_check_manager_config: manager configuration; provides the per-check timeout and is
        passed to the configuration loader
    :param args: parsed command line arguments; ``args.job_id`` is attached to published events
    :return: sum of the absolute values of the return codes of the executed checks, 0 when every
        executed check returned 0 (or none was executed)
    """
    health_check_conf = HealthCheckConfigLoader().load_configuration(health_check_manager_config, args)
    event_publisher = _get_event_publisher(args)
    exit_code_sum = 0
    enabled_check_found = False
    for health_check in health_check_conf.health_checks:
        if not health_check.is_enabled:
            continue
        enabled_check_found = True
        try:
            log.info(
                "Executing Health Check '%s' for queue '%s' and compute resource '%s'",
                health_check.name,
                health_check_conf.queue_name,
                health_check_conf.compute_resource_name,
            )
            # The command in this subprocess call is built as literal
            result = subprocess.run(
                health_check.check_path,
                timeout=health_check_manager_config.health_check_timeout,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                encoding="utf-8",
                check=False,
                shell=False,  # nosec B603
            )
            # A check terminated by a signal reports a negative return code on POSIX. Use the absolute
            # value so that it cannot cancel out a positive failure code of another check.
            exit_code_sum += abs(result.returncode)
            output = f":\n{result.stdout}" if result.stdout else " empty"
            log.info("Output of Health Check '%s' execution is%s", health_check.name, output)
            publish_health_check_result(
                event_publisher, args.job_id, health_check.name, result.returncode, result.stdout
            )
        except (subprocess.SubprocessError, OSError) as err:
            # Prefer the exception's "message" attribute when it has one, otherwise report the exception itself
            error_detail = getattr(err, "message", err)
            log.error(
                "Failure when executing Health Check '%s' for queue '%s' and compute resource '%s', with error: %s",
                health_check.name,
                health_check_conf.queue_name,
                health_check_conf.compute_resource_name,
                error_detail,
            )
            publish_health_check_exception(event_publisher, args.job_id, health_check.name, error_detail)
    # Also covers the case where checks are configured but all of them are disabled
    if not enabled_check_found:
        log.info("No Health Check enabled found")
    return exit_code_sum