lmms_eval/tasks/mme/utils.py:

from collections import defaultdict
import os
import datetime
import json
import logging

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

eval_logger = logging.getLogger("lmms-eval")

dir_name = os.path.dirname(os.path.abspath(__file__))

# MME sub-tasks grouped into the two top-level splits used for scoring.
eval_type_dict = {
    "Perception": [
        "existence",
        "count",
        "position",
        "color",
        "posters",
        "celebrity",
        "scene",
        "landmark",
        "artwork",
        "OCR",
    ],
    "Cognition": [
        "commonsense_reasoning",
        "numerical_calculation",
        "text_translation",
        "code_reasoning",
    ],
}

replace_prompt = " Please answer yes or no."


def mme_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def mme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    question = doc["question"].strip()
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    # If a model-specific pre/post prompt is supplied, drop the dataset's default
    # " Please answer yes or no." suffix and wrap the question with the custom prompt.
    if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "":
        question = question.replace(replace_prompt, "")
        question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}"
    if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "":
        question = question.replace(replace_prompt, "")
        question = f"{question}{model_specific_prompt_kwargs['post_prompt']}"
    return question


def parse_pred_ans(pred_ans):
    """Brought from Otter Eval"""
    pred_ans = pred_ans.lower().strip().replace(".", "")
    pred_label = None
    if pred_ans in ["yes", "no"]:
        pred_label = pred_ans
    else:
        # Fall back to checking the first few characters of a free-form answer.
        prefix_pred_ans = pred_ans[:4]
        if "yes" in prefix_pred_ans:
            pred_label = "yes"
        elif "no" in prefix_pred_ans:
            pred_label = "no"
        else:
            pred_label = "other"
    return pred_label


def mme_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case the MME score), value: metric value
    """
    pred = results[0]
    pred_ans = parse_pred_ans(pred)
    gt_ans = doc["answer"].lower().strip().replace(".", "")
    assert gt_ans in ["yes", "no"]
    assert pred_ans in ["yes", "no", "other"]
    score = 1.0 if pred_ans == gt_ans else 0.0
    category = doc["category"]
    key_name = "mme_percetion_score" if category in eval_type_dict["Perception"] else "mme_cognition_score"
    # Note: the key name here is very important. It decides which aggregation function will receive
    # the results, so it must match the metric name registered for the task (including the
    # "percetion" spelling). We note down the question id/category to help us aggregate the results later.
    return {key_name: {"question_id": doc["question_id"], "category": category, "score": score}}


def mme_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        A score
    """
    category2score = defaultdict(dict)
    for result in results:
        question_id = result["question_id"]
        score = result["score"]
        category = result["category"]
        if question_id not in category2score[category]:
            category2score[category][question_id] = []
        category2score[category][question_id].append(score)

    category2avg_score = {}
    for category, question2scores in category2score.items():
        total_score = 0
        for question_id, scores in question2scores.items():
            # Each image (question_id) carries exactly two yes/no questions.
            assert len(scores) == 2
            acc = sum(scores) / len(scores) * 100.0
            acc_plus = (sum(scores) == 2) * 100.0
            score = acc_plus + acc
            total_score += score
        avg_score = total_score / len(question2scores)
        category2avg_score[category] = avg_score
    for category, avg_score in category2avg_score.items():
        eval_logger.info(f"{category}: {avg_score:.2f}")
    total_score = sum(category2avg_score.values())
    return total_score
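
The scoring above follows the MME protocol: each image contributes two yes/no questions that share a question_id, and a category's score is per-question accuracy (acc) plus the fraction of images with both questions answered correctly (acc_plus), each on a 0-100 scale, averaged over images. A minimal usage sketch follows; the sample docs and predictions are invented for illustration and are not part of the file above.

# Hypothetical example: two yes/no questions for the same image, both answered correctly.
docs = [
    {"question_id": "0001.jpg", "category": "existence", "answer": "Yes.",
     "question": "Is there a dog in the image? Please answer yes or no."},
    {"question_id": "0001.jpg", "category": "existence", "answer": "No.",
     "question": "Is there a cat in the image? Please answer yes or no."},
]
predictions = ["Yes, there is a dog.", "No."]

# "existence" is a Perception sub-task, so results land under "mme_percetion_score".
per_doc = [mme_process_results(doc, [pred])["mme_percetion_score"] for doc, pred in zip(docs, predictions)]
print(mme_aggregate_results(per_doc))  # 200.0 = acc (100) + acc_plus (100) for the single image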