in src/slurm_plugin/clustermgtd.py
def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map):
    """
    Call functions to maintain unhealthy nodes.

    This function must handle the case where two Slurm nodes share the same IP/nodeaddr:
    a list of Slurm nodes is passed in, and node maps keyed by IP/nodeaddr should be avoided.
    """
    log.info("Performing node maintenance actions")
    # Retrieve nodes from Slurm partitions in ACTIVE state
    active_nodes = self._find_active_nodes(partitions_name_map)
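    # Note: active nodes are expected to come from partitions that are not INACTIVE;
    # since a node can belong to more than one partition, downstream logic should
    # key on node names, never on IP/nodeaddr (see the docstring above).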
    # Update self._static_nodes_in_replacement by removing any node that is up or in maintenance
    self._update_static_nodes_in_replacement(active_nodes)
    log.info(
        "Following nodes are currently in replacement: %s", print_with_count(self._static_nodes_in_replacement)
    )
    # Terminate powering-down instances
    self._handle_powering_down_nodes(active_nodes)
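    # Note: for POWERING_DOWN nodes this is expected to terminate any EC2 instance
    # still backing the node, so the node is fully reclaimed before Slurm reuses it.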
    # Retrieve and manage unhealthy nodes
    (
        unhealthy_dynamic_nodes,
        unhealthy_static_nodes,
        ice_compute_resources_and_nodes_map,
    ) = self._find_unhealthy_slurm_nodes(active_nodes)
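    # Note: the unhealthy-node search is expected to apply the configured health
    # checks (e.g. EC2 instance status and scheduled events) and to return failing
    # nodes split by dynamic/static, plus a map of compute resources whose nodes
    # failed to launch due to insufficient capacity (ICE).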
    if unhealthy_dynamic_nodes:
        log.info("Found the following unhealthy dynamic nodes: %s", print_with_count(unhealthy_dynamic_nodes))
        self._handle_unhealthy_dynamic_nodes(unhealthy_dynamic_nodes)
    if unhealthy_static_nodes:
        log.info("Found the following unhealthy static nodes: %s", print_with_count(unhealthy_static_nodes))
        self._handle_unhealthy_static_nodes(unhealthy_static_nodes)
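    # Note: unhealthy static nodes are expected to be replaced in place (backing
    # instances terminated and relaunched) and tracked in
    # self._static_nodes_in_replacement until they come back up.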
    # Evaluate partitions to put in protected mode and ICE (insufficient-capacity) nodes to terminate
    if self._is_protected_mode_enabled():
        self._handle_protected_mode_process(active_nodes, partitions_name_map)
    if self._config.disable_nodes_on_insufficient_capacity:
        self._handle_ice_nodes(ice_compute_resources_and_nodes_map, compute_resource_nodes_map)
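    # Finally, drop static nodes that failed health checks from the in-replacement
    # set (presumably so the next iteration treats them as unhealthy again rather
    # than leaving them stuck "in replacement").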
    self._handle_failed_health_check_nodes_in_replacement(active_nodes)
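

# --- Illustrative sketch (not part of clustermgtd) --------------------------
# A minimal, self-contained sketch of the active-node collection step above,
# under assumed shapes for partitions and nodes: each partition exposes `state`
# and `slurm_nodes`, each node a unique `name`. The names, fields, and the
# "INACTIVE" check are assumptions for illustration, not the real plugin API.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class SlurmNodeSketch:
    name: str
    nodeaddr: str  # two nodes may share this, so never key a map on it


@dataclass
class PartitionSketch:
    state: str
    slurm_nodes: List[SlurmNodeSketch] = field(default_factory=list)


def find_active_nodes_sketch(partitions_name_map: Dict[str, PartitionSketch]) -> List[SlurmNodeSketch]:
    """Collect nodes from non-INACTIVE partitions, deduplicating by node name."""
    seen: Dict[str, SlurmNodeSketch] = {}
    for partition in partitions_name_map.values():
        if partition.state != "INACTIVE":
            for node in partition.slurm_nodes:
                # Key by node name: IP/nodeaddr may be shared by two nodes.
                seen.setdefault(node.name, node)
    return list(seen.values())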