def _find_unhealthy_slurm_nodes()

in src/slurm_plugin/clustermgtd.py [0:0]


    def _find_unhealthy_slurm_nodes(self, slurm_nodes):
        """
        Find unhealthy static slurm nodes and dynamic slurm nodes.

        Check the scheduler state of every node in slurm_nodes through node.is_healthy() and group the
        unhealthy ones by node type. Unless Capacity Block management is disabled in the configuration,
        the nodes reported as reserved by the Capacity Block manager are never classified as unhealthy.
        All the nodes classified as unhealthy are finally passed to the event publisher.

        :param slurm_nodes: iterable of slurm nodes to check
        :return: tuple (unhealthy_dynamic_nodes, unhealthy_static_nodes, ice_compute_resources_and_nodes_map).
            The map only gets populated when disable_nodes_on_insufficient_capacity is set and groups the
            unhealthy non-static nodes for which is_ice() is true as
            {queue_name: {compute_resource_name: [node, ...]}}
        """
        unhealthy_static_nodes = []
        unhealthy_dynamic_nodes = []
        ice_compute_resources_and_nodes_map = {}
        all_unhealthy_nodes = []

        # Remove the nodes part of inactive Capacity Blocks from the list of unhealthy nodes.
        # Nodes from active Capacity Blocks will be instead managed as unhealthy instances.
        reserved_nodenames = []
        if not self._config.disable_capacity_blocks_management:
            reserved_nodenames = self._capacity_block_manager.get_reserved_nodenames(slurm_nodes)
            if reserved_nodenames:
                log.info(
                    "The nodes associated with inactive Capacity Blocks and not considered as unhealthy nodes are: %s",
                    ",".join(reserved_nodenames),
                )
            else:
                log.debug("No nodes found associated with inactive Capacity Blocks.")

        # Build the lookup once, so that each membership test in the loop below is O(1) rather than a scan
        # of the whole list. It is empty when Capacity Block management is disabled, hence no node is
        # treated as reserved in that case and no additional check on the configuration is required.
        reserved_nodenames_lookup = set(reserved_nodenames)

        for node in slurm_nodes:
            is_reserved = node.name in reserved_nodenames_lookup

            # is_healthy is invoked for reserved nodes too, it is only told not to log warnings for them
            if node.is_healthy(
                consider_drain_as_unhealthy=self._config.terminate_drain_nodes,
                consider_down_as_unhealthy=self._config.terminate_down_nodes,
                ec2_instance_missing_max_count=self._config.ec2_instance_missing_max_count,
                nodes_without_backing_instance_count_map=self._nodes_without_backing_instance_count_map,
                log_warn_if_unhealthy=not is_reserved,
            ):
                continue

            if is_reserved:
                # do not consider as unhealthy the nodes reserved for capacity blocks
                continue

            all_unhealthy_nodes.append(node)

            if isinstance(node, StaticNode):
                unhealthy_static_nodes.append(node)
            elif self._config.disable_nodes_on_insufficient_capacity and node.is_ice():
                ice_compute_resources_and_nodes_map.setdefault(node.queue_name, {}).setdefault(
                    node.compute_resource_name, []
                ).append(node)
            else:
                unhealthy_dynamic_nodes.append(node)

        self._event_publisher.publish_unhealthy_node_events(
            all_unhealthy_nodes,
            self._config.ec2_instance_missing_max_count,
            self._nodes_without_backing_instance_count_map,
        )
        return (
            unhealthy_dynamic_nodes,
            unhealthy_static_nodes,
            ice_compute_resources_and_nodes_map,
        )