in src/slurm_plugin/clustermgtd.py [0:0]
def _handle_nodes_failing_health_check(self, nodes_failing_health_check: List[SlurmNode], health_check_type: str):
    """Drain nodes that failed the given health check; this operation is idempotent.

    A node failing a health check is NOT considered unhealthy when either:
    1. a reboot has been issued for the node and is still in progress, OR
    2. slurmd was restarted less than
       ``health_check_timeout_after_slurmdstarttime`` seconds ago (grace
       period after a reboot/restart).
    Such nodes are only logged and skipped; all other failing nodes are set
    to DRAIN, and a failing-health-check event is published for them.

    :param nodes_failing_health_check: nodes reported as failing the check
    :param health_check_type: label of the health check (used in logs,
        drain reason, and published events)
    """
    nodes_name_failing_health_check = set()
    nodes_name_recently_rebooted = set()
    if nodes_failing_health_check:
        for node in nodes_failing_health_check:
            # Treat the failure as transient while a reboot is in flight or
            # slurmd started too recently for the check to be meaningful.
            recently_rebooted = node.is_reboot_issued() or not time_is_up(
                node.slurmdstarttime, self._current_time, self._config.health_check_timeout_after_slurmdstarttime
            )
            if recently_rebooted:
                nodes_name_recently_rebooted.add(node.name)
            else:
                nodes_name_failing_health_check.add(node.name)
        if nodes_name_failing_health_check:
            log.warning(
                "Setting nodes failing health check type %s to DRAIN: %s",
                health_check_type,
                nodes_name_failing_health_check,
            )
            set_nodes_drain(nodes_name_failing_health_check, reason=f"Node failing {health_check_type}")
        if nodes_name_recently_rebooted:
            log.info(
                "Ignoring health check failure due to reboot for nodes: %s",
                nodes_name_recently_rebooted,
            )
        # Published even when the set is empty so downstream consumers see a
        # per-cycle signal for this health check type.
        self._event_publisher.publish_nodes_failing_health_check_events(
            health_check_type, nodes_name_failing_health_check
        )