# coreference_metrics.py
from typing import Dict, List, Tuple
def get_all_metrics(
    self,
    labels: List[List[List[Tuple[int, int]]]],
    preds: List[List[List[Tuple[int, int]]]],
) -> Dict[str, Dict[str, Dict[str, float]]]:
"""
Compute all metrics for coreference resolution.
In input are given two list of mention groups, for example:
[ # this is the corpus level, with a list of documents
[ # this is the document level, with a list of mention clusters
[ # this is the cluster level, with a list of spans
(5, 7),
(11, 19),
...
],
...
]
]
"""
    assert len(labels) == len(preds)
    result = {}
    # Micro-averaged scores: pool all clusters from all documents into a single
    # corpus-level list. Each span is prefixed with its document index so that
    # identical (start, end) spans from different documents remain distinct.
    gold_clusters = [
        [(i,) + span for span in cluster]
        for i, clusters in enumerate(labels)
        for cluster in clusters
    ]
    predicted_clusters = [
        [(i,) + span for span in cluster]
        for i, clusters in enumerate(preds)
        for cluster in clusters
    ]
    result['micro'] = self._compute_coref_metrics(gold_clusters, predicted_clusters)
    # Macro-averaged scores: compute precision/recall/F1 for each document
    # separately, then average across documents (see the illustrative
    # averaging sketch after this method).
    doc_metrics = []
    for doc_gold_clusters, doc_predicted_clusters in zip(labels, preds):
        doc_metrics.append(self._compute_coref_metrics(
            doc_gold_clusters, doc_predicted_clusters
        ))
    result['macro'] = self._average_nested_dict(doc_metrics)
    return result
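

# The macro branch above relies on a helper, `_average_nested_dict`, that is not
# shown in this excerpt. The function below is a minimal, self-contained sketch
# of what that averaging step could look like, assuming each per-document result
# maps a metric name to a dict of float scores (e.g. precision/recall/f1). It is
# illustrative only and is not the class's actual helper; the name
# `_example_average_nested_dict` and the exact key layout are assumptions.
def _example_average_nested_dict(
    per_doc_results: List[Dict[str, Dict[str, float]]]
) -> Dict[str, Dict[str, float]]:
    # Average every leaf value across documents, keeping the nested key layout.
    averaged: Dict[str, Dict[str, float]] = {}
    for metric_name in per_doc_results[0]:
        averaged[metric_name] = {
            score_name: sum(doc[metric_name][score_name] for doc in per_doc_results)
            / len(per_doc_results)
            for score_name in per_doc_results[0][metric_name]
        }
    return averaged


# Example (hypothetical values, shown only to illustrate the shape; the actual
# metric keys depend on _compute_coref_metrics):
#     doc_results = [
#         {'muc': {'precision': 0.8, 'recall': 0.6, 'f1': 0.686}},
#         {'muc': {'precision': 0.7, 'recall': 0.7, 'f1': 0.7}},
#     ]
#     _example_average_nested_dict(doc_results)
#     # -> {'muc': {'precision': 0.75, 'recall': 0.65, 'f1': 0.693}}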