in src/slurm_plugin/clustermgtd.py [0:0]
def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
    """
    Maintain any unhealthy static node.

    Replacement sequence: mark the node DOWN, terminate the backing EC2
    instance (if any), then launch a fresh instance for the static node.

    :param unhealthy_static_nodes: nodes to replace; each exposes ``.name``
        and an optional ``.instance`` with an ``.id``.
    """
    unhealthy_node_names = [node.name for node in unhealthy_static_nodes]

    # Mark nodes DOWN first so the scheduler can requeue their jobs right away.
    try:
        log.info("Setting unhealthy static nodes to DOWN")
        set_nodes_down(unhealthy_node_names, reason="Static node maintenance: unhealthy node is being replaced")
    except Exception as e:
        # Best-effort: replacement proceeds even if the state change failed.
        log.error("Encountered exception when setting unhealthy static nodes into down state: %s", e)

    # Tear down any instances still backing the unhealthy nodes.
    backing_instance_ids = [node.instance.id for node in unhealthy_static_nodes if node.instance]
    if backing_instance_ids:
        log.info("Terminating instances backing unhealthy static nodes")
        self._instance_manager.delete_instances(
            backing_instance_ids, terminate_batch_size=self._config.terminate_max_batch_size
        )

    # Relaunch capacity for every unhealthy static node.
    log.info("Launching new instances for unhealthy static nodes")
    self._instance_manager.add_instances_for_nodes(
        unhealthy_node_names, self._config.launch_max_batch_size, self._config.update_node_address
    )

    # Record only the nodes whose launch succeeded as "in replacement";
    # launch failures are excluded so they can be retried on a later pass.
    successfully_launched = set(unhealthy_node_names).difference(self._instance_manager.failed_nodes)
    self._static_nodes_in_replacement |= successfully_launched
    log.info(
        "After node maintenance, following nodes are currently in replacement: %s",
        print_with_count(self._static_nodes_in_replacement),
    )