def _terminate_cluster_nodes()

in cli/src/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py [0:0]


def _terminate_cluster_nodes(event):
    """
    Terminate all EC2 instances associated with the given cluster.

    This function iterates over all EC2 instances associated with the specified cluster and attempts to terminate them.
    It handles retries for instances that fail to terminate initially and ensures that the function does not exceed
    the Lambda execution timeout.
    """
    try:
        start_time = time.time()
        max_exeution_time = 14 * 60  # Maximum allowed time for Lambda function execution (14 minutes) to avoid timeout
        logger.info("Compute fleet nodes terminate: STARTED")
        stack_name = event["ResourceProperties"]["StackName"]
        ec2 = boto3.client("ec2", config=boto3_config)

        completed_successfully = False
        while not completed_successfully:
            completed_successfully = True
            for instance_ids in _describe_instance_ids_iterator(stack_name):
                logger.info("Terminating instances %s", instance_ids)
                if instance_ids:
                    try:
                        ec2.terminate_instances(InstanceIds=instance_ids)
                    except Exception as e:
                        logger.error("Failed when terminating instances with error %s", e)
                        completed_successfully = False
                        continue
            logger.info("Sleeping for 10 seconds to allow all instances to initiate shut-down")
            time.sleep(10)

        while _has_shuttingdown_instances(stack_name):
            # This logic prevents Lambda function from timing out if instance termination exceeds 15 minutes
            # TODO: This approach may cause potential cluster deletion failure when PlacementGroups are enabled
            # and instance termination time exceeds 15 minutes simultaneously. Resolve the above potential failure.
            if time.time() - start_time > max_exeution_time:
                logger.warning(
                    "Lambda execution time has exceeded 14 minutes, approaching timeout. "
                    "Returning from Lambda after a 30-second delay; instances may still be in a shutting-down state. "
                    "Note: Instances in shutting-down state are not recoverable and are not billed during this period."
                )
                time.sleep(30)
                return
            logger.info("Waiting for all nodes terminated...")
            time.sleep(10)

        # Sleep for 30 more seconds to give PlacementGroups the time to update
        time.sleep(30)

        logger.info("Compute fleet nodes terminate: COMPLETED")
    except Exception as e:
        logger.error("Failed when terminating instances with error %s", e)
        raise