in src/slurm_plugin/slurm_resources.py [0:0]
def is_bootstrap_failure(self, ec2_instance_missing_max_count, nodes_without_backing_instance_count_map: dict):
"""Check if a slurm node has boostrap failure."""
# no backing instance + [working state]# in node state
if (self.is_configuring_job() or self.is_powering_up_idle()) and not self.is_backing_instance_valid(
ec2_instance_missing_max_count=ec2_instance_missing_max_count,
nodes_without_backing_instance_count_map=nodes_without_backing_instance_count_map,
log_warn_if_unhealthy=False,
):
logger.warning(
"Node bootstrap error: Node %s is in power up state without valid backing instance, node state: %s",
self,
self.state_string,
)
return True
# Dynamic node in DOWN+CLOUD+POWERED_DOWN+NOT_RESPONDING state
elif self.is_bootstrap_timeout():
# We need to check if nodeaddr is set to avoid counting powering up nodes as bootstrap failure nodes during
# cluster start/stop.
logger.warning(
"Node bootstrap error: Resume timeout expires for node %s, node state: %s", self, self.state_string
)
return True
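# Node failed a health check while still powering up, i.e. the failure happened during bootstrap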
elif self.is_failing_health_check and self.is_powering_up():
logger.warning(
"Node bootstrap error: Node %s failed during bootstrap when performing health check, node state: %s",
self,
self.state_string,
)
return True
# Consider the invalid registration as a bootstrap failure event, but only the first time it is registered.
# After this, clustermgtd will mark the node as unhealthy and power it down.
# This does not clear the INVALID_REG flag immediately: this will happen only when the node is fully powered
# down. Therefore we exclude the nodes that are still pending powering down from this check.
elif self.is_invalid_slurm_registration() and not (self.is_power_down() or self.is_powering_down()):
logger.warning(
"Node bootstrap error: Node %s failed to register to the Slurm management daemon, node state: %s",
self,
self.state_string,
)
return True
return False
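# Usage sketch (illustrative, not part of this module): a caller such as the cluster
# management daemon could apply is_bootstrap_failure to every node it monitors and
# collect the nodes flagged as failed. The names find_bootstrap_failure_nodes, nodes,
# ec2_instance_missing_max_count, and missing_instance_counts below are hypothetical
# placeholders, not identifiers taken from slurm_resources.py.
def find_bootstrap_failure_nodes(nodes, ec2_instance_missing_max_count, missing_instance_counts):
    """Return the subset of nodes that is_bootstrap_failure reports as failed during bootstrap."""
    return [
        node
        for node in nodes
        if node.is_bootstrap_failure(
            ec2_instance_missing_max_count=ec2_instance_missing_max_count,
            nodes_without_backing_instance_count_map=missing_instance_counts,
        )
    ]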