in src/slurm_plugin/computemgtd.py [0:0]
def _run_computemgtd(config_file):
    """
    Run the computemgtd main loop.

    Every iteration: reload the daemon config once the reload countdown has run out, read the
    latest clustermgtd heartbeat, and, when that heartbeat is expired, call _self_terminate if
    _is_self_node_down reports true for the configured nodename (skipped when computemgtd actions
    are disabled in the config). The iteration ends with sleep_remaining_loop_time.

    :param config_file: config file reference handed to _load_daemon_config
    """
    # Until a heartbeat is successfully read, the daemon startup time stands in for it
    heartbeat = datetime.now(tz=timezone.utc)
    log.info("Initializing clustermgtd heartbeat to be computemgtd startup time: %s", heartbeat)
    config = _load_daemon_config(config_file)
    iterations_until_reload = RELOAD_CONFIG_ITERATIONS

    while True:
        # Timestamp shared by the expiry check and the end-of-loop sleep
        loop_start = datetime.now(tz=timezone.utc)

        # Tick the countdown down; once it is exhausted, try to refresh the config instead
        if iterations_until_reload > 0:
            iterations_until_reload -= 1
        else:
            try:
                config = _load_daemon_config(config_file)
                iterations_until_reload = RELOAD_CONFIG_ITERATIONS
            except Exception as err:
                # The countdown is not reset on failure, so the reload is attempted again next iteration
                log.warning("Unable to reload daemon config, using previous one.\nException: %s", err)

        # Read the newest heartbeat; on failure keep working with the previously known value
        try:
            heartbeat = get_clustermgtd_heartbeat(config.clustermgtd_heartbeat_file_path)
            log.info("Latest heartbeat from clustermgtd: %s", heartbeat)
        except Exception as err:
            log.warning(
                "Unable to retrieve clustermgtd heartbeat. Using last known heartbeat: %s with exception: %s",
                heartbeat,
                err,
            )

        heartbeat_expired = expired_clustermgtd_heartbeat(heartbeat, loop_start, config.clustermgtd_timeout)
        if heartbeat_expired:
            if config.disable_computemgtd_actions:
                log.info("All computemgtd actions currently disabled")
            elif _is_self_node_down(config.nodename):
                _self_terminate()

        sleep_remaining_loop_time(config.loop_time, loop_start)