def _handle_unhealthy_static_nodes()

in src/slurm_plugin/clustermgtd.py [0:0]


    def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
        """
        Replace the given unhealthy static nodes.

        Steps, in order:
          1. Mark every node DOWN (best effort: a failure is logged and the remaining steps still run).
          2. Terminate the instances currently backing the nodes, if there are any.
          3. Request new instances for all of the nodes.
          4. Record the nodes that did not fail to launch in ``self._static_nodes_in_replacement``.

        Each element of ``unhealthy_static_nodes`` is read for its ``name`` and ``instance``
        attributes; nodes whose ``instance`` is falsy are skipped when terminating.
        """
        node_names = [unhealthy_node.name for unhealthy_node in unhealthy_static_nodes]

        # Marking the nodes DOWN first lets their jobs be requeued immediately.
        try:
            log.info("Setting unhealthy static nodes to DOWN")
            set_nodes_down(node_names, reason="Static node maintenance: unhealthy node is being replaced")
        except Exception as e:
            log.error("Encountered exception when setting unhealthy static nodes into down state: %s", e)

        # Only nodes that still have a backing instance contribute an id to terminate.
        backing_instance_ids = []
        for unhealthy_node in unhealthy_static_nodes:
            if node_has_instance := bool(unhealthy_node.instance):
                backing_instance_ids.append(unhealthy_node.instance.id)

        if backing_instance_ids:
            log.info("Terminating instances backing unhealthy static nodes")
            self._instance_manager.delete_instances(
                backing_instance_ids, terminate_batch_size=self._config.terminate_max_batch_size
            )

        log.info("Launching new instances for unhealthy static nodes")
        self._instance_manager.add_instances_for_nodes(
            node_names, self._config.launch_max_batch_size, self._config.update_node_address
        )

        # Nodes reported as failed by the instance manager are not tracked as being replaced.
        successfully_launched = set(node_names).difference(self._instance_manager.failed_nodes)
        self._static_nodes_in_replacement |= successfully_launched

        log.info(
            "After node maintenance, following nodes are currently in replacement: %s",
            print_with_count(self._static_nodes_in_replacement),
        )