in src/slurm_plugin/instance_manager.py [0:0]
def _update_slurm_node_addrs(self, slurm_nodes, launched_instances):
    """
    Update node information in slurm with info from launched EC2 instances.

    Nodes and instances are paired positionally: the first ``len(launched_instances)``
    entries of ``slurm_nodes`` are treated as successfully launched, the remaining
    entries are treated as failed to launch and are appended to ``self.failed_nodes``.

    :param slurm_nodes: sliceable sequence of the slurm nodes requested to be launched
    :param launched_instances: instances actually launched; ``private_ip`` is read from each one,
        and ``hostname`` as well when ``self._use_private_hostname`` is truthy
    :return: dict mapping each launched node to its instance. If the slurm update raises
        ``subprocess.CalledProcessError`` every node in ``slurm_nodes`` is appended to
        ``self.failed_nodes`` and an empty dict is returned.
    """
    # There could be fewer launched instances than nodes requested to be launched if best-effort scaling
    # Group nodes into successfully launched and failed to launch based on number of launched instances
    launched_count = len(launched_instances)
    launched_nodes = slurm_nodes[:launched_count]
    fail_launch_nodes = slurm_nodes[launched_count:]

    # Only the slurm update can raise CalledProcessError, so keep the try body limited to it.
    try:
        if launched_nodes:
            # When using a cluster DNS domain we don't need to pass nodehostnames
            # because they are equal to node names.
            # It is possible to force the use of private hostnames by setting
            # use_private_hostname = "true" as extra json parameter
            node_hostnames = (
                [instance.hostname for instance in launched_instances] if self._use_private_hostname else None
            )
            update_nodes(
                launched_nodes,
                nodeaddrs=[instance.private_ip for instance in launched_instances],
                nodehostnames=node_hostnames,
            )
            logger.info(
                "Nodes are now configured with instances: %s",
                print_with_count(zip(launched_nodes, launched_instances)),
            )
    except subprocess.CalledProcessError:
        logger.error(
            "Encountered error when updating node %s with instance %s",
            print_with_count(slurm_nodes),
            print_with_count(launched_instances),
        )
        # The whole batch is considered failed, including nodes that did get an instance.
        self.failed_nodes.extend(slurm_nodes)
        # Return an empty mapping rather than an implicit None so callers can always
        # treat the result as a dict.
        return {}

    if fail_launch_nodes:
        logger.info("Failed to launch instances for following nodes: %s", print_with_count(fail_launch_nodes))
        self.failed_nodes.extend(fail_launch_nodes)

    return dict(zip(launched_nodes, launched_instances))