in src/slurm_plugin/clustermgtd.py [0:0]
def _manage_compute_fleet_status_transitions(self):
    """
    Handle compute fleet status transitions.

    When the pcluster start/stop command runs, the fleet status is set to START_REQUESTED/STOP_REQUESTED.
    This method fetches the current fleet status and performs the following transitions:
    - START_REQUESTED -> STARTING -> RUNNING
    - STOP_REQUESTED -> STOPPING -> STOPPED
    The STARTING/STOPPING states are only used to communicate that the request is being processed by clustermgtd.
    The following actions are applied to the cluster based on the current status:
    - START_REQUESTED|STARTING: all Slurm partitions are enabled
    - STOP_REQUESTED|STOPPING|STOPPED: all Slurm partitions are disabled and EC2 instances are terminated.
      These actions are also executed when the status is STOPPED, to account for changes the user may have
      applied manually by re-activating Slurm partitions.
    """
    self._compute_fleet_status = self._compute_fleet_status_manager.get_status(fallback=self._compute_fleet_status)
    log.info("Current compute fleet status: %s", self._compute_fleet_status)
    try:
        if ComputeFleetStatus.is_stop_status(self._compute_fleet_status):
            # Since the Slurm partition status might have been manually modified, keep checking partitions
            # and EC2 instances even when the fleet is already STOPPED
            if self._compute_fleet_status == ComputeFleetStatus.STOP_REQUESTED:
                self._update_compute_fleet_status(ComputeFleetStatus.STOPPING)
            # When setting partitions to INACTIVE, always try to reset nodeaddr/nodehostname to avoid issues
            partitions_deactivated_successfully = update_all_partitions(
                PartitionStatus.INACTIVE, reset_node_addrs_hostname=True
            )
            nodes_terminated = self._instance_manager.terminate_all_compute_nodes(
                self._config.terminate_max_batch_size
            )
            if partitions_deactivated_successfully and nodes_terminated:
                if self._compute_fleet_status == ComputeFleetStatus.STOPPING:
                    self._update_compute_fleet_status(ComputeFleetStatus.STOPPED)
        elif ComputeFleetStatus.is_start_in_progress(self._compute_fleet_status):
            if self._compute_fleet_status == ComputeFleetStatus.START_REQUESTED:
                self._update_compute_fleet_status(ComputeFleetStatus.STARTING)
            # When setting partitions to UP, DO NOT reset nodeaddr/nodehostname, to avoid breaking nodes
            # that are already up
            partitions_activated_successfully = update_all_partitions(
                PartitionStatus.UP, reset_node_addrs_hostname=False
            )
            resume_powering_down_nodes()
            if partitions_activated_successfully:
                self._update_compute_fleet_status(ComputeFleetStatus.RUNNING)
                # Reset the protected-mode failure count map
                self._partitions_protected_failure_count_map = {}
    except ComputeFleetStatusManager.ConditionalStatusUpdateFailed:
        log.warning(
            "Cluster status was updated while handling a transition from %s. "
            "Status transition will be retried at the next iteration",
            self._compute_fleet_status,
        )
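
The method is driven by a conditional read-modify-write against the cluster-wide status store: if another
writer changes the status while a transition is in flight, ConditionalStatusUpdateFailed is raised and the
transition is simply retried on the next daemon iteration, which is safe because each branch re-applies its
actions idempotently. Below is a minimal, self-contained sketch of that pattern. FleetStatus, FakeStatusStore,
ConditionalUpdateFailed, and advance are hypothetical stand-ins for illustration, not the real ParallelCluster
API.

from enum import Enum


class FleetStatus(Enum):
    # Hypothetical mirror of ComputeFleetStatus, for illustration only.
    STOPPED = "STOPPED"
    RUNNING = "RUNNING"
    STOPPING = "STOPPING"
    STARTING = "STARTING"
    STOP_REQUESTED = "STOP_REQUESTED"
    START_REQUESTED = "START_REQUESTED"


class ConditionalUpdateFailed(Exception):
    """Raised when the stored status changed underneath us (lost compare-and-swap)."""


class FakeStatusStore:
    """Stand-in for the shared status store that clustermgtd reads and conditionally updates."""

    def __init__(self, status):
        self._status = status

    def get_status(self, fallback):
        # Keep the caller's last known status if nothing is stored.
        return self._status if self._status is not None else fallback

    def put_status(self, current, new):
        # Conditional write: only succeed if nobody else changed the status in the meantime.
        if self._status != current:
            raise ConditionalUpdateFailed()
        self._status = new


def advance(store, status):
    """One iteration of the transition chains described in the docstring above."""
    if status in (FleetStatus.STOP_REQUESTED, FleetStatus.STOPPING, FleetStatus.STOPPED):
        if status == FleetStatus.STOP_REQUESTED:
            store.put_status(status, FleetStatus.STOPPING)
            status = FleetStatus.STOPPING
        # ... disable partitions and terminate instances here; re-run even when STOPPED ...
        if status == FleetStatus.STOPPING:
            store.put_status(status, FleetStatus.STOPPED)
            status = FleetStatus.STOPPED
    elif status in (FleetStatus.START_REQUESTED, FleetStatus.STARTING):
        if status == FleetStatus.START_REQUESTED:
            store.put_status(status, FleetStatus.STARTING)
            status = FleetStatus.STARTING
        # ... enable partitions here ...
        store.put_status(status, FleetStatus.RUNNING)
        status = FleetStatus.RUNNING
    return status


store = FakeStatusStore(FleetStatus.START_REQUESTED)
status = store.get_status(fallback=FleetStatus.STOPPED)
try:
    status = advance(store, status)  # START_REQUESTED -> STARTING -> RUNNING
except ConditionalUpdateFailed:
    # Mirrors the except branch above: give up and retry on the next iteration.
    pass
assert status == FleetStatus.RUNNING

Losing the compare-and-swap race costs nothing but a one-iteration delay, which is why the real method logs a
warning and moves on rather than retrying inline.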