in src/slurm_plugin/resume.py [0:0]
def _resume(arg_nodes, resume_config):
    """Launch new EC2 nodes according to nodes requested by slurm."""
    # Guard: refuse to launch anything unless clustermgtd has a recent
    # heartbeat — without a live cluster manager the new nodes would be
    # unmanaged. Failed/skipped nodes are handed to _handle_failed_nodes.
    now = datetime.now(tz=timezone.utc)
    heartbeat_ok = is_clustermgtd_heartbeat_valid(
        now, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path
    )
    if not heartbeat_ok:
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down!\n"
            "Please check clustermgtd log for error.\n"
            "Not launching nodes %s",
            arg_nodes,
        )
        _handle_failed_nodes(arg_nodes)
        return

    log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes)
    # Expand the slurm node expression into individual node names.
    node_list = [node_info.name for node_info in get_nodes_info(arg_nodes)]
    log.debug("Retrieved nodelist: %s", node_list)

    # All launch behavior (DNS, DynamoDB registration, hostname scheme,
    # per-type RunInstances overrides) is driven by the resume config.
    instance_manager = InstanceManager(
        resume_config.region,
        resume_config.cluster_name,
        resume_config.boto3_config,
        table_name=resume_config.dynamodb_table,
        hosted_zone=resume_config.hosted_zone,
        dns_domain=resume_config.dns_domain,
        use_private_hostname=resume_config.use_private_hostname,
        head_node_private_ip=resume_config.head_node_private_ip,
        head_node_hostname=resume_config.head_node_hostname,
        instance_name_type_mapping=resume_config.instance_name_type_mapping,
        run_instances_overrides=resume_config.run_instances_overrides,
    )
    instance_manager.add_instances_for_nodes(
        node_list=node_list,
        launch_batch_size=resume_config.max_batch_size,
        update_node_address=resume_config.update_node_address,
        all_or_nothing_batch=resume_config.all_or_nothing_batch,
    )

    # Nodes the manager could not launch are reported on the instance;
    # everything else counts as successfully launched.
    failed_nodes = instance_manager.failed_nodes
    launched_nodes = [node for node in node_list if node not in failed_nodes]
    log.info("Successfully launched nodes %s", print_with_count(launched_nodes))
    if failed_nodes:
        log.error(
            "Failed to launch following nodes, setting nodes to down: %s",
            print_with_count(failed_nodes),
        )
        _handle_failed_nodes(failed_nodes)