def _maintain_nodes()

in src/slurm_plugin/clustermgtd.py [0:0]


    def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map):
        """
        Run one pass of maintenance over the nodes that belong to active partitions.

        The steps run in this order: refresh the set of static nodes in replacement, handle
        powering-down nodes, handle unhealthy dynamic and static nodes, then (each only when enabled)
        the protected-mode process and the insufficient-capacity (ICE) node handling, and finally
        handle nodes in replacement that failed health checks.

        Two slurm nodes may share the same IP/nodeaddr, so the nodes are passed around as a list;
        a slurm node map keyed by IP/nodeaddr should be avoided.

        :param partitions_name_map: partitions map, forwarded to the active-node lookup and to the
            protected-mode process
        :param compute_resource_nodes_map: nodes map, forwarded to the ICE node handling
        """
        log.info("Performing node maintenance actions")

        # Every later step works on the nodes of partitions that are in ACTIVE state
        nodes_in_active_partitions = self._find_active_nodes(partitions_name_map)

        # Refresh self._static_nodes_in_replacement first, so the log line below reports the updated set
        self._update_static_nodes_in_replacement(nodes_in_active_partitions)
        replacement_summary = print_with_count(self._static_nodes_in_replacement)
        log.info("Following nodes are currently in replacement: %s", replacement_summary)

        # Powering-down nodes are dealt with before looking for unhealthy ones
        self._handle_powering_down_nodes(nodes_in_active_partitions)

        # One scan returns the unhealthy dynamic nodes, the unhealthy static nodes and the ICE map
        dynamic_unhealthy, static_unhealthy, ice_nodes_by_compute_resource = self._find_unhealthy_slurm_nodes(
            nodes_in_active_partitions
        )

        if dynamic_unhealthy:
            log.info("Found the following unhealthy dynamic nodes: %s", print_with_count(dynamic_unhealthy))
            self._handle_unhealthy_dynamic_nodes(dynamic_unhealthy)

        if static_unhealthy:
            log.info("Found the following unhealthy static nodes: %s", print_with_count(static_unhealthy))
            self._handle_unhealthy_static_nodes(static_unhealthy)

        # Optional steps: protected-mode evaluation of partitions, then handling of ICE nodes
        if self._is_protected_mode_enabled():
            self._handle_protected_mode_process(nodes_in_active_partitions, partitions_name_map)

        if self._config.disable_nodes_on_insufficient_capacity:
            self._handle_ice_nodes(ice_nodes_by_compute_resource, compute_resource_nodes_map)

        # Last step, always executed
        self._handle_failed_health_check_nodes_in_replacement(nodes_in_active_partitions)