in src/slurm_plugin/instance_manager.py [0:0]
def _get_slurm_resume_data(self, slurm_resume: Dict[str, any], node_list: List[str]) -> SlurmResumeData:
    """
    Build a SlurmResumeData object from the Slurm resume file content.

    The jobs obtained from ``self._parse_slurm_resume(slurm_resume)`` are split in two groups:
      * jobs whose ``nodes_resume`` holds exactly one node (single node allocation)
      * every other job (multi node allocation)

    The returned SlurmResumeData carries:
      * single_node: nodes to resume for single node jobs, duplicates removed, first-seen order kept
      * multi_node: nodes to resume for multi node jobs, duplicates removed, first-seen order kept
      * jobs_single_node: the jobs with single node allocation
      * jobs_multi_node: the jobs with multi node allocation

    Entries of ``node_list`` that do not appear in the ``nodes_resume`` of any parsed job are logged
    with a warning and passed to ``self._update_failed_nodes`` with the "InvalidNodenameError" code.

    Example of Slurm Resume File (ref. https://slurm.schedmd.com/elastic_computing.html):
    {
        "all_nodes_resume": "cloud[1-3,7-8]",
        "jobs": [
            {
                "extra": "An arbitrary string from --extra",
                "features": "c1,c2",
                "job_id": 140814,
                "nodes_alloc": "queue1-st-c5xlarge-[4-5]",
                "nodes_resume": "queue1-st-c5xlarge-[1,3]",
                "oversubscribe": "OK",
                "partition": "cloud",
                "reservation": "resv_1234",
            },
            {
                "extra": None,
                "features": "c1,c2",
                "job_id": 140815,
                "nodes_alloc": "queue2-st-c5xlarge-[1-2]",
                "nodes_resume": "queue2-st-c5xlarge-[1-2]",
                "oversubscribe": "OK",
                "partition": "cloud",
                "reservation": None,
            },
            {
                "extra": None,
                "features": None,
                "job_id": 140816,
                "nodes_alloc": "queue2-st-c5xlarge-[7,8]",
                "nodes_resume": "queue2-st-c5xlarge-[7,8]",
                "oversubscribe": "NO",
                "partition": "cloud_exclusive",
                "reservation": None,
            },
        ],
    }

    :param slurm_resume: content of the Slurm resume file, as a dictionary
    :param node_list: node names to compare against the nodes found in the resume file
    :return: SlurmResumeData with nodes and jobs grouped by single/multi node allocation
    """
    single_jobs, single_nodes = [], []
    multi_jobs, multi_nodes = [], []

    for resume_job in self._parse_slurm_resume(slurm_resume):
        # Exactly one node to resume -> single node bucket; anything else -> multi node bucket.
        if len(resume_job.nodes_resume) == 1:
            target_jobs, target_nodes = single_jobs, single_nodes
        else:
            target_jobs, target_nodes = multi_jobs, multi_nodes
        target_jobs.append(resume_job)
        target_nodes.extend(resume_job.nodes_resume)

    # Requested nodes that no parsed job asks to resume are reported as failed.
    nodes_in_resume_file = set(single_nodes) | set(multi_nodes)
    unmatched_nodes = list(set(node_list) - nodes_in_resume_file)
    if unmatched_nodes:
        logger.warning(
            "Discarding NodeNames because of mismatch in Slurm Resume File Vs Nodes passed to Resume Program: %s",
            ", ".join(unmatched_nodes),
        )
        self._update_failed_nodes(set(unmatched_nodes), "InvalidNodenameError")

    # dict.fromkeys drops duplicate node names while preserving first-seen order.
    unique_single_nodes = list(dict.fromkeys(single_nodes))
    unique_multi_nodes = list(dict.fromkeys(multi_nodes))

    return SlurmResumeData(
        single_node=unique_single_nodes,
        multi_node=unique_multi_nodes,
        jobs_single_node=single_jobs,
        jobs_multi_node=multi_jobs,
    )