in src/lighteval/metrics/metrics_sample.py [0:0]
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float | dict:
    """Computes the metric(s) over the golds and predictions of one single sample.

    Args:
        doc (Doc): The document, providing the reference targets via `doc.get_golds()`.
        model_response (ModelResponse): The model response, providing the predicted strings via `model_response.text`.

    Returns:
        float | dict: Aggregated score over the current sample's items.
            If several rouge functions have been selected, returns a dict mapping each name to its score.
    """
    # Lazy import so the optional `rouge_score` dependency is only needed when this metric is used.
    from rouge_score import rouge_scorer

    golds = doc.get_golds()
    predictions = model_response.text

    # Instantiate the scorer once and cache it on the metric instance.
    if self.scorer is None:
        self.scorer = rouge_scorer.RougeScorer(self.methods, tokenizer=self.tokenizer)
    # Normalize
    if self.normalize_gold:
        golds = [self.normalize_gold(g) for g in golds]
    if self.normalize_pred:
        predictions = [self.normalize_pred(p) for p in predictions]
    if self.bootstrap:  # For t5 style rouge score
        scores = self._rouge_score_with_bootsrap(golds=golds, predictions=predictions)
    elif self.multiple_golds:
        scores = self._rouge_score_multi_golds(golds=golds, preds=predictions)
    else:
        scores = self._rouge_score(golds=golds, preds=predictions)

    # Return a bare float when a single rouge method was selected, a dict of scores otherwise.
    if len(scores) == 1:
        return list(scores.values())[0]
    return scores
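
For context, a minimal sketch of the `rouge_score` API this method delegates to; it mirrors how `compute` builds a per-method score dict before collapsing a single-method result to a float. The example strings are made up, only `RougeScorer` and `score` come from the library, and the metric's own `_rouge_score*` helpers are not shown here.

# Illustrative sketch, not part of the module.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"])
raw = scorer.score(target="the cat sat on the mat", prediction="a cat sat on a mat")

# `raw` maps each method name to a Score(precision, recall, fmeasure) tuple;
# keeping only the F-measure gives the kind of dict `compute` returns when
# several methods are selected.
scores = {method: raw[method].fmeasure for method in ["rouge1", "rougeL"]}
print(scores)  # e.g. {'rouge1': ~0.67, 'rougeL': ~0.67}; a single method would collapse to one float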