in src/slurm_plugin/clustermgtd.py [0:0]
def _find_unhealthy_slurm_nodes(self, slurm_nodes):
    """
    Find unhealthy static slurm nodes and dynamic slurm nodes.

    Check every node in slurm_nodes with node.is_healthy(...) and group the unhealthy ones
    by node type (static/dynamic). Unhealthy nodes reported by the Capacity Block manager as
    reserved are skipped (not considered unhealthy) and their unhealthy warning is suppressed.

    As a side effect, all the nodes classified as unhealthy are passed to
    self._event_publisher.publish_unhealthy_node_events.

    :param slurm_nodes: iterable of slurm node objects exposing name, queue_name,
        compute_resource_name, is_healthy() and is_ice()
    :return: tuple (unhealthy_dynamic_nodes, unhealthy_static_nodes, ice_compute_resources_and_nodes_map)
        where the map has the shape {queue_name: {compute_resource_name: [node, ...]}} and is only
        populated when self._config.disable_nodes_on_insufficient_capacity is set and node.is_ice()
        is true for a non-static node
    """
    unhealthy_static_nodes = []
    unhealthy_dynamic_nodes = []
    ice_compute_resources_and_nodes_map = {}
    all_unhealthy_nodes = []

    # Remove the nodes part of inactive Capacity Blocks from the list of unhealthy nodes.
    # Nodes from active Capacity Blocks will be instead managed as unhealthy instances.
    reserved_nodenames = []
    if not self._config.disable_capacity_blocks_management:
        reserved_nodenames = self._capacity_block_manager.get_reserved_nodenames(slurm_nodes)
        if reserved_nodenames:
            log.info(
                "The nodes associated with inactive Capacity Blocks and not considered as unhealthy nodes are: %s",
                ",".join(reserved_nodenames),
            )
        else:
            log.debug("No nodes found associated with inactive Capacity Blocks.")

    # Build the lookup once: membership is tested for every node, so a hashed container avoids
    # rescanning the reserved list on each iteration. A falsy result is treated as "no reserved nodes".
    # The lookup is empty whenever Capacity Blocks management is disabled, so no extra config check
    # is needed inside the loop.
    reserved_lookup = frozenset(reserved_nodenames or ())

    for node in slurm_nodes:
        is_reserved = node.name in reserved_lookup
        # is_healthy is invoked for every node (reserved ones included), only the warning is silenced
        # for reserved nodes.
        if node.is_healthy(
            consider_drain_as_unhealthy=self._config.terminate_drain_nodes,
            consider_down_as_unhealthy=self._config.terminate_down_nodes,
            ec2_instance_missing_max_count=self._config.ec2_instance_missing_max_count,
            nodes_without_backing_instance_count_map=self._nodes_without_backing_instance_count_map,
            log_warn_if_unhealthy=not is_reserved,
        ):
            continue
        if is_reserved:
            # do not consider as unhealthy the nodes reserved for capacity blocks
            continue

        all_unhealthy_nodes.append(node)
        if isinstance(node, StaticNode):
            unhealthy_static_nodes.append(node)
        elif self._config.disable_nodes_on_insufficient_capacity and node.is_ice():
            # Group ICE nodes by queue and then by compute resource
            ice_compute_resources_and_nodes_map.setdefault(node.queue_name, {}).setdefault(
                node.compute_resource_name, []
            ).append(node)
        else:
            unhealthy_dynamic_nodes.append(node)

    self._event_publisher.publish_unhealthy_node_events(
        all_unhealthy_nodes,
        self._config.ec2_instance_missing_max_count,
        self._nodes_without_backing_instance_count_map,
    )
    return (
        unhealthy_dynamic_nodes,
        unhealthy_static_nodes,
        ice_compute_resources_and_nodes_map,
    )