def _handle_nodes_failing_health_check()

in src/slurm_plugin/clustermgtd.py [0:0]


    def _handle_nodes_failing_health_check(self, nodes_failing_health_check: List[SlurmNode], health_check_type: str):
        """
        Drain nodes that fail a health check, unless the failure can be attributed to a reboot.

        A failing node is skipped (logged only, not drained) when either a reboot was issued for it or
        slurmd started less than `health_check_timeout_after_slurmdstarttime` seconds ago.
        Every other failing node is set to DRAIN with a reason naming the health check type.
        An event with the names of the drained nodes is always published, even when that set is empty.

        :param nodes_failing_health_check: nodes reported as failing the health check; may be empty
        :param health_check_type: label of the failed health check, used in log lines, drain reason and event
        """
        drain_candidates = set()
        excused_by_reboot = set()

        # A falsy input (e.g. an empty list) simply results in no iterations.
        for node in nodes_failing_health_check or ():
            # The reboot check comes first so that the slurmd start time is only inspected
            # for nodes with no reboot issued.
            within_grace_period = node.is_reboot_issued() or not time_is_up(
                node.slurmdstarttime,
                self._current_time,
                self._config.health_check_timeout_after_slurmdstarttime,
            )
            bucket = excused_by_reboot if within_grace_period else drain_candidates
            bucket.add(node.name)

        if drain_candidates:
            # Placing a node into drain is an idempotent operation, so repeating it is harmless
            log.warning(
                "Setting nodes failing health check type %s to DRAIN: %s",
                health_check_type,
                drain_candidates,
            )
            set_nodes_drain(drain_candidates, reason=f"Node failing {health_check_type}")

        if excused_by_reboot:
            log.info(
                "Ignoring health check failure due to reboot for nodes: %s",
                excused_by_reboot,
            )

        self._event_publisher.publish_nodes_failing_health_check_events(health_check_type, drain_candidates)