in src/slurm_plugin/clustermgtd.py [0:0]
def _manage_compute_fleet_status_transitions(self):
    """
    Handle compute fleet status transitions.

    When the pcluster start/stop command runs, the fleet status is set to START_REQUESTED/STOP_REQUESTED.
    This method fetches the current fleet status and performs the following transitions:
    - START_REQUESTED -> STARTING -> RUNNING
    - STOP_REQUESTED -> STOPPING -> STOPPED
    The STARTING/STOPPING states are only used to communicate that the request is being processed by clustermgtd.
    The following actions are applied to the cluster based on the current status:
    - START_REQUESTED|STARTING: all Slurm partitions are enabled
    - STOP_REQUESTED|STOPPING|STOPPED: all Slurm partitions are disabled and EC2 instances are terminated.
      These actions are also executed when the status is STOPPED, to account for changes the user may have
      applied manually by re-activating Slurm partitions.
    """
    self._compute_fleet_status = self._compute_fleet_status_manager.get_status(fallback=self._compute_fleet_status)
    log.info("Current compute fleet status: %s", self._compute_fleet_status)
    try:
        if ComputeFleetStatus.is_stop_status(self._compute_fleet_status):
            # Since the Slurm partition status might have been manually modified, keep checking partitions
            # and EC2 instances even when the fleet is already STOPPED
            if self._compute_fleet_status == ComputeFleetStatus.STOP_REQUESTED:
                self._update_compute_fleet_status(ComputeFleetStatus.STOPPING)
            # When setting partitions to INACTIVE, always try to reset nodeaddr/nodehostname to avoid issues
            partitions_deactivated_successfully = update_all_partitions(
                PartitionStatus.INACTIVE, reset_node_addrs_hostname=True
            )
            nodes_terminated = self._instance_manager.terminate_all_compute_nodes(
                self._config.terminate_max_batch_size
            )
            if partitions_deactivated_successfully and nodes_terminated:
                if self._compute_fleet_status == ComputeFleetStatus.STOPPING:
                    self._update_compute_fleet_status(ComputeFleetStatus.STOPPED)
        elif ComputeFleetStatus.is_start_in_progress(self._compute_fleet_status):
            if self._compute_fleet_status == ComputeFleetStatus.START_REQUESTED:
                self._update_compute_fleet_status(ComputeFleetStatus.STARTING)
            # When setting partitions to UP, DO NOT reset nodeaddr/nodehostname, to avoid breaking nodes
            # that are already up
            partitions_activated_successfully = update_all_partitions(
                PartitionStatus.UP, reset_node_addrs_hostname=False
            )
            resume_powering_down_nodes()
            if partitions_activated_successfully:
                self._update_compute_fleet_status(ComputeFleetStatus.RUNNING)
                # Reset the protected-mode failure count map
                self._partitions_protected_failure_count_map = {}
    except ComputeFleetStatusManager.ConditionalStatusUpdateFailed:
        log.warning(
            "Cluster status was updated while handling a transition from %s. "
            "Status transition will be retried at the next iteration",
            self._compute_fleet_status,
        )
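
The method is driven by a conditional read-modify-write against the cluster-wide status store: if another
writer changes the status while a transition is in flight, ConditionalStatusUpdateFailed is raised and the
transition is simply retried on the next daemon iteration, which is safe because each branch re-applies its
actions idempotently. Below is a minimal, self-contained sketch of that pattern. FleetStatus, FakeStatusStore,
ConditionalUpdateFailed, and advance are hypothetical stand-ins for illustration, not the real ParallelCluster
API.

from enum import Enum


class FleetStatus(Enum):
    # Hypothetical mirror of ComputeFleetStatus, for illustration only.
    STOPPED = "STOPPED"
    RUNNING = "RUNNING"
    STOPPING = "STOPPING"
    STARTING = "STARTING"
    STOP_REQUESTED = "STOP_REQUESTED"
    START_REQUESTED = "START_REQUESTED"


class ConditionalUpdateFailed(Exception):
    """Raised when the stored status changed underneath us (lost compare-and-swap)."""


class FakeStatusStore:
    """Stand-in for the shared status store that clustermgtd reads and conditionally updates."""

    def __init__(self, status):
        self._status = status

    def get_status(self, fallback):
        # Keep the caller's last known status if nothing is stored.
        return self._status if self._status is not None else fallback

    def put_status(self, current, new):
        # Conditional write: only succeed if nobody else changed the status in the meantime.
        if self._status != current:
            raise ConditionalUpdateFailed()
        self._status = new


def advance(store, status):
    """One iteration of the transition chains described in the docstring above."""
    if status in (FleetStatus.STOP_REQUESTED, FleetStatus.STOPPING, FleetStatus.STOPPED):
        if status == FleetStatus.STOP_REQUESTED:
            store.put_status(status, FleetStatus.STOPPING)
            status = FleetStatus.STOPPING
        # ... disable partitions and terminate instances here; re-run even when STOPPED ...
        if status == FleetStatus.STOPPING:
            store.put_status(status, FleetStatus.STOPPED)
            status = FleetStatus.STOPPED
    elif status in (FleetStatus.START_REQUESTED, FleetStatus.STARTING):
        if status == FleetStatus.START_REQUESTED:
            store.put_status(status, FleetStatus.STARTING)
            status = FleetStatus.STARTING
        # ... enable partitions here ...
        store.put_status(status, FleetStatus.RUNNING)
        status = FleetStatus.RUNNING
    return status


store = FakeStatusStore(FleetStatus.START_REQUESTED)
status = store.get_status(fallback=FleetStatus.STOPPED)
try:
    status = advance(store, status)  # START_REQUESTED -> STARTING -> RUNNING
except ConditionalUpdateFailed:
    # Mirrors the except branch above: give up and retry on the next iteration.
    pass
assert status == FleetStatus.RUNNING

Losing the compare-and-swap race costs nothing but a one-iteration delay, which is why the real method logs a
warning and moves on rather than retrying inline.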