def _resume()

in src/slurm_plugin/resume.py [0:0]


def _resume(arg_nodes, resume_config):
    """
    Launch new EC2 nodes according to nodes requested by slurm.

    Nothing is launched unless a valid clustermgtd heartbeat is detected: in that case every
    requested node is handed to _handle_failed_nodes and the function returns early.
    Otherwise the requested nodes are resolved to node names, instances are requested through
    InstanceManager, and any node the manager reports as failed is handed to _handle_failed_nodes.

    :param arg_nodes: nodes requested by slurm, in whatever form get_nodes_info accepts
        (presumably a Slurm nodelist expression -- confirm against the caller)
    :param resume_config: configuration object supplying the heartbeat settings, the
        InstanceManager constructor arguments and the launch batching options
    :return: None
    """
    # Check heartbeat
    current_time = datetime.now(tz=timezone.utc)
    if not is_clustermgtd_heartbeat_valid(
        current_time, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path
    ):
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down!\n"
            "Please check clustermgtd log for error.\n"
            "Not launching nodes %s",
            arg_nodes,
        )
        _handle_failed_nodes(arg_nodes)
        return
    log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes)
    node_list = [node.name for node in get_nodes_info(arg_nodes)]
    log.debug("Retrieved nodelist: %s", node_list)

    instance_manager = InstanceManager(
        resume_config.region,
        resume_config.cluster_name,
        resume_config.boto3_config,
        table_name=resume_config.dynamodb_table,
        hosted_zone=resume_config.hosted_zone,
        dns_domain=resume_config.dns_domain,
        use_private_hostname=resume_config.use_private_hostname,
        head_node_private_ip=resume_config.head_node_private_ip,
        head_node_hostname=resume_config.head_node_hostname,
        instance_name_type_mapping=resume_config.instance_name_type_mapping,
        run_instances_overrides=resume_config.run_instances_overrides,
    )
    instance_manager.add_instances_for_nodes(
        node_list=node_list,
        launch_batch_size=resume_config.max_batch_size,
        update_node_address=resume_config.update_node_address,
        all_or_nothing_batch=resume_config.all_or_nothing_batch,
    )

    failed_nodes = instance_manager.failed_nodes
    # Build the lookup once so filtering stays linear in the number of requested nodes.
    # Assumes failed_nodes holds hashable node names, matching the entries of node_list.
    failed_lookup = set(failed_nodes)
    success_nodes = [node for node in node_list if node not in failed_lookup]
    # Only report success when at least one node actually launched; an empty
    # "Successfully launched" line is misleading when every launch failed.
    if success_nodes:
        log.info("Successfully launched nodes %s", print_with_count(success_nodes))
    if failed_nodes:
        log.error(
            "Failed to launch following nodes, setting nodes to down: %s",
            print_with_count(failed_nodes),
        )
        _handle_failed_nodes(failed_nodes)