in src/slurm_plugin/clustermgtd.py [0:0]
def _handle_protected_mode_process(self, active_nodes, partitions_name_map):
"""Handle the process of entering protected mode."""
# Handle successfully launched nodes
if self._partitions_protected_failure_count_map:
self._handle_successfully_launched_nodes(partitions_name_map)
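    # Handle bootstrap failure nodes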
self._handle_bootstrap_failure_nodes(active_nodes)
    # Enter protected mode
    # A partition is set to INACTIVE only if it satisfies all of the following:
    #   - the partition is not already INACTIVE
    #   - its total bootstrap failure count has reached the threshold
    #   - it has no running jobs
if self._partitions_protected_failure_count_map:
        log.info(
            "Partitions bootstrap failure count: %s; cluster will be set into protected mode if "
            "a partition's failure count reaches the threshold of %s",
            self._partitions_protected_failure_count_map,
            self._config.protected_failure_count,
        )
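        # Partitions whose failure count crossed the threshold, split by running-job status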
partitions_to_disable = []
bootstrap_failure_partitions_have_jobs = []
for part_name, failures in self._partitions_protected_failure_count_map.items():
part = partitions_name_map.get(part_name)
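            # Skip unknown partitions and those already INACTIVE; act only at the failure threshold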
            if (
                part
                and not part.is_inactive()
                and sum(failures.values()) >= self._config.protected_failure_count
            ):
if part.has_running_job():
bootstrap_failure_partitions_have_jobs.append(part_name)
else:
partitions_to_disable.append(part_name)
if bootstrap_failure_partitions_have_jobs:
log.info(
"Bootstrap failure partitions %s currently have jobs running, not disabling them",
bootstrap_failure_partitions_have_jobs,
)
if not partitions_to_disable:
log.info("Not entering protected mode since active jobs are running in bootstrap failure partitions")
elif partitions_to_disable:
self._enter_protected_mode(partitions_to_disable)
if ComputeFleetStatus.is_protected(self._compute_fleet_status):
log.warning(
"Cluster is in protected mode due to failures detected in node provisioning. "
"Please investigate the issue and then use 'pcluster update-compute-fleet --status START_REQUESTED' "
"command to re-enable the fleet."
)
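
The selection logic above can be exercised in isolation. Below is a minimal, self-contained sketch, assuming a hypothetical select_partitions_to_disable helper, SimpleNamespace stand-ins for the real Partition objects, and a failure-count map shaped as {partition_name: {failure_type: count}} (which is what sum(failures.values()) implies); it is an illustration, not the clustermgtd implementation.

from types import SimpleNamespace

def select_partitions_to_disable(failure_count_map, partitions_name_map, threshold):
    """Return (to_disable, have_jobs) using the same criteria as the method above."""
    to_disable, have_jobs = [], []
    for part_name, failures in failure_count_map.items():
        part = partitions_name_map.get(part_name)
        # Same guard as above: known partition, not INACTIVE, total failures at threshold
        if part and not part.is_inactive() and sum(failures.values()) >= threshold:
            (have_jobs if part.has_running_job() else to_disable).append(part_name)
    return to_disable, have_jobs

# Toy partitions standing in for the real slurm Partition objects
queue1 = SimpleNamespace(is_inactive=lambda: False, has_running_job=lambda: False)
queue2 = SimpleNamespace(is_inactive=lambda: False, has_running_job=lambda: True)

to_disable, have_jobs = select_partitions_to_disable(
    # Hypothetical failure-count shape: per-partition counts keyed by failure type
    {"queue1": {"static": 7, "dynamic": 4}, "queue2": {"static": 10, "dynamic": 0}},
    {"queue1": queue1, "queue2": queue2},
    threshold=10,
)
# queue1 crossed the threshold and is idle; queue2 crossed it but still has jobs
assert to_disable == ["queue1"] and have_jobs == ["queue2"]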