in src/slurm_plugin/cluster_event_publisher.py [0:0]
def _generate_launch_failure_details(self, failed_nodes: Dict[str, List[str]]) -> Iterator:
    """
    Yield one launch-failure summary per failure category (e.g. ice-failure).

    Every category listed in the values of `_LAUNCH_FAILURE_GROUPING`, plus the catch-all "other-failures",
    is reported, including categories that did not collect any node.

    :param failed_nodes: mapping of error code to the nodes that failed with that error code
    :return: generator of `(count, details)` tuples; `count` is the number of nodes in the category and
        `details` carries the "failure-type", the same "count" and, under "error-details", one entry per
        error code with its own node count and the output of `_generate_node_name_list` for its nodes
    """
    # "other-failures" is listed first so it keeps the leading position in the yielded sequence;
    # repeated category names collapse onto their first occurrence.
    failure_types = ["other-failures", *_LAUNCH_FAILURE_GROUPING.values()]
    summary_by_type = {name: {"count": 0, "error-details": {}} for name in failure_types}

    for error_code, nodes in failed_nodes.items():
        # The category is expected to be one of the pre-populated buckets above.
        summary = summary_by_type.get(ClusterEventPublisher._get_failure_type_from_error_code(error_code))
        node_total = len(nodes)
        summary["count"] = summary.get("count") + node_total
        summary["error-details"][error_code] = {
            "count": node_total,
            "nodes": self._generate_node_name_list(list(nodes)),
        }

    for failure_type, summary in summary_by_type.items():
        total = summary["count"]
        # The count is emitted both on its own and inside the payload.
        yield total, {
            "failure-type": failure_type,
            "count": total,
            "error-details": summary["error-details"],
        }