def is_bootstrap_failure()

in src/slurm_plugin/slurm_resources.py [0:0]


    def is_bootstrap_failure(self, ec2_instance_missing_max_count, nodes_without_backing_instance_count_map: dict):
        """
        Report whether this node matches one of the bootstrap failure conditions.

        The conditions are evaluated in order and the first one that matches wins. When a condition matches,
        a warning naming the node and its state string is logged before returning.

        :param ec2_instance_missing_max_count: passed through unchanged to is_backing_instance_valid.
        :param nodes_without_backing_instance_count_map: passed through unchanged to is_backing_instance_valid.
        :return: True if any bootstrap failure condition matches, False otherwise.
        """
        # The backing instance is only inspected for nodes that are configuring a job or powering up while idle,
        # so is_backing_instance_valid is never invoked for nodes in any other state.
        awaiting_power_up = self.is_configuring_job() or self.is_powering_up_idle()

        if awaiting_power_up and not self.is_backing_instance_valid(
            ec2_instance_missing_max_count=ec2_instance_missing_max_count,
            nodes_without_backing_instance_count_map=nodes_without_backing_instance_count_map,
            log_warn_if_unhealthy=False,
        ):
            failure_reason = (
                "Node bootstrap error: Node %s is in power up state without valid backing instance, node state: %s"
            )
        elif self.is_bootstrap_timeout():
            # Resume timeout: dynamic node in DOWN+CLOUD+POWERED_DOWN+NOT_RESPONDING state.
            # Nodes powering up during cluster start/stop must not be counted here; the nodeaddr check that
            # guards against that is presumably inside is_bootstrap_timeout() - TODO confirm.
            failure_reason = "Node bootstrap error: Resume timeout expires for node %s, node state: %s"
        elif self.is_failing_health_check and self.is_powering_up():
            failure_reason = (
                "Node bootstrap error: Node %s failed during bootstrap when performing health check, node state: %s"
            )
        elif (
            self.is_invalid_slurm_registration()
            and not self.is_power_down()
            and not self.is_powering_down()
        ):
            # An invalid registration counts as a bootstrap failure only the first time it is seen. After that,
            # clustermgtd marks the node as unhealthy and powers it down, but the INVALID_REG flag is cleared only
            # once the node is fully powered down. Nodes that are powered down or still powering down are
            # therefore excluded so the same event is not counted again.
            failure_reason = (
                "Node bootstrap error: Node %s failed to register to the Slurm management daemon, node state: %s"
            )
        else:
            # None of the bootstrap failure conditions matched.
            return False

        # Every failure condition logs the same two arguments; only the message template differs.
        logger.warning(failure_reason, self, self.state_string)
        return True