in src/lighteval/metrics/metrics_sample.py [0:0]
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
    """Computes all the requested string-distance metrics on the gold reference and the predictions.

    Args:
        doc (Doc): The document; its golds are used as the reference. If it contains more than one gold, only the first one is kept.
        model_response (ModelResponse): The model response; its text fields are the predicted strings.

    Returns:
        dict: The different scores computed.
    """
    predictions = model_response.text
    golds = doc.get_golds()
    if len(golds) > 1:
        logger.warning(
            "Provided more than one gold to compute a string distance metric. Just using the first one."
        )
    reference = golds[0]

    result = {m: [] for m in self.metric_types}
    for sequence in predictions:
        if self.strip_prediction:
            completion = sequence.strip()
        else:
            completion = sequence

        # `reference` is the entire remaining book for each instance.
        # Truncate it here to be of the same length as the completion to ensure edit-distance is meaningful.
        truncated_reference = reference[: len(completion)]

        completion_tokens = np.array(TreebankWordTokenizer().tokenize(completion))
        truncated_reference_tokens = np.array(TreebankWordTokenizer().tokenize(truncated_reference))

        if "edit_distance" in self.metric_types:
            result["edit_distance"].append(edit_distance(s1=completion_tokens, s2=truncated_reference_tokens))
        if "edit_similarity" in self.metric_types:
            result["edit_similarity"].append(
                self.edit_similarity(s1=completion_tokens, s2=truncated_reference_tokens)
            )
        if "longest_common_prefix_length" in self.metric_types:
            result["longest_common_prefix_length"].append(
                self.longest_common_prefix_length(s1=completion_tokens, s2=truncated_reference_tokens)
            )

    final_result = {}
    # We cast to float as final results can be numpy types, not JSON serializable
    for m, v in result.items():
        final_result[m] = float(self.sample_aggregations[m](v))
    return final_result
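

# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the per-sample metrics computed above.
# The bodies of `self.edit_similarity` and `self.longest_common_prefix_length`
# are not shown in this excerpt, so the two helper functions below are assumed
# (hypothetical) definitions: edit similarity as 1 - edit_distance / max length,
# and longest common prefix length as the count of leading tokens shared by both
# sequences. `edit_distance` and `TreebankWordTokenizer` are the nltk APIs the
# method above relies on.
from nltk.metrics.distance import edit_distance
from nltk.tokenize.treebank import TreebankWordTokenizer


def edit_similarity_sketch(s1, s2) -> float:
    """Normalised similarity in [0, 1]; identical token sequences score 1.0."""
    if len(s1) == 0 and len(s2) == 0:
        return 1.0
    return 1.0 - edit_distance(s1, s2) / max(len(s1), len(s2))


def longest_common_prefix_length_sketch(s1, s2) -> int:
    """Number of leading tokens the two sequences have in common."""
    length = 0
    for a, b in zip(s1, s2):
        if a != b:
            break
        length += 1
    return length


if __name__ == "__main__":
    tokenizer = TreebankWordTokenizer()
    completion = "It was the best of times, it was the worst of times"
    # Mirror the method above: truncate the reference to the completion's length
    # before tokenizing, so the edit distance compares spans of comparable size.
    reference = "It was the best of times, it was the age of wisdom"[: len(completion)]

    completion_tokens = tokenizer.tokenize(completion)
    reference_tokens = tokenizer.tokenize(reference)

    print(edit_distance(completion_tokens, reference_tokens))                        # 2 (two substituted tokens)
    print(edit_similarity_sketch(completion_tokens, reference_tokens))               # 1 - 2/13 ≈ 0.846
    print(longest_common_prefix_length_sketch(completion_tokens, reference_tokens))  # 10 leading tokens match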