in src/lighteval/pipeline.py [0:0]
def _compute_metrics(self, sampling_method_responses: dict[str, list[ModelResponse]]):
    # To compute the metrics we first group the samples by task and then by metric category.
    # This way we can batch the metric computation for each task and metric category.
    # This variable will hold the samples grouped by task and metric category, for example:
    # task_metric_category_groups = {
    #     "gsm8k_1": {
    #         "GENERATIVE": [
    #             (doc1, response1), (doc2, response2), ...
    #         ],
    #         "LOGLIKELIHOOD": [
    #             (doc1, response1), (doc2, response2), ...
    #         ],
    #     },
    # }
    logger.info("--- COMPUTING METRICS ---")
    task_metric_category_groups = collections.defaultdict(lambda: collections.defaultdict(list))
    for sampling_method, model_responses in sampling_method_responses.items():
        for doc, model_response in zip(self.sampling_docs[sampling_method], model_responses):
            task_metric_category_groups[doc.task_name][sampling_method].append((doc, model_response))
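    # Second pass: for each task, batch the metric computation per metric category
    # and log the result of every (doc, response) pair.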
    for task_name, samples_per_method in task_metric_category_groups.items():
        task: LightevalTask = self.tasks_dict[task_name]
        for sampling_method, samples in samples_per_method.items():
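            # Keep only the task's metrics that belong to the current metric category.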
            metric_category_metrics = [metric for metric in task.metrics if metric.category == sampling_method]
            docs = [doc for doc, _ in samples]
            responses = [response for _, response in samples]
            outputs = apply_metric(
                docs=docs,
                responses=responses,
                metrics=metric_category_metrics,
            )
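            # apply_metric returns one output per document, aligned with docs/responses.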
            for output, doc, response in zip(outputs, docs, responses):
                self.evaluation_tracker.metrics_logger.log(task_name, output)
                self.evaluation_tracker.details_logger.log(task_name, doc, response, output)