def eval_cmmmu()

in lmms_eval/tasks/cmmmu/utils.py [0:0]


# Keyword tables used to score true/false answers (any question type other
# than multiple choice "选择" and fill-in-the-blank "填空").
_CMMMU_POSITIVE_KEYWORDS = ("正确", "对", "准确", "肯定", "对的")
_CMMMU_NEGATIVE_KEYWORDS = ("不对", "错误", "不正确", "不准确", "不合适", "否定", "错的", "错")
_CMMMU_AMBIGUOUS_KEYWORDS = ("对错", "是否正确", "否正确", "或者", "是否", "正确性", "对不")
# Longest phrases are masked first so that e.g. "错的" is removed as a unit
# rather than leaving a stray "的" behind after masking "错".
_CMMMU_NEGATIVE_MASK_ORDER = tuple(sorted(_CMMMU_NEGATIVE_KEYWORDS, key=len, reverse=True))


def _cmmmu_fill_blank_matches(parsed_pred, norm_answers):
    """Return True if any parsed prediction matches a normalized answer."""
    for pred in parsed_pred:
        if isinstance(pred, str):
            # String prediction: a string answer only has to occur inside it.
            if any(isinstance(ans, str) and ans in pred for ans in norm_answers):
                return True
        elif pred in norm_answers:
            # Non-string (numeric) prediction: must equal one of the answers.
            return True
    return False


def _cmmmu_is_positive(pred):
    """Return True if ``pred`` has a positive keyword outside a negated phrase."""
    unnegated = pred
    if isinstance(pred, str):
        # "对", "正确" and "准确" are substrings of "不对", "不正确" and
        # "不准确". Blank out negative phrases first so that a negated
        # statement is not mistaken for a positive one. A space is used as
        # filler so the surrounding characters cannot fuse into a keyword.
        for neg_word in _CMMMU_NEGATIVE_MASK_ORDER:
            unnegated = unnegated.replace(neg_word, " ")
    return any(pos_word in unnegated for pos_word in _CMMMU_POSITIVE_KEYWORDS)


def _cmmmu_judge_true_false(pred_list):
    """Vote over ``pred_list`` and return "对" or "错".

    Each prediction counts as positive, negative, or neither. The majority
    wins; a tie (including an empty list) is broken with ``random.choice``.
    """
    positive_count = 0
    negative_count = 0
    for pred in pred_list:
        if _cmmmu_is_positive(pred):
            positive_count += 1
        elif any(neg_word in pred for neg_word in _CMMMU_NEGATIVE_KEYWORDS):
            negative_count += 1

    if positive_count > negative_count:
        return "对"
    if negative_count > positive_count:
        return "错"
    return random.choice(["对", "错"])


def eval_cmmmu(entries):
    """Score CMMMU predictions and annotate each entry with a verdict.

    Every entry is a dict read through the keys ``"question_type"``,
    ``"parsed_pred"`` and ``"answer"``. Scoring depends on the question type:

    * ``"选择"`` (multiple choice): ``parsed_pred`` must equal ``answer``.
    * ``"填空"`` (fill in the blank): the answer is normalized with
      ``normalize_str`` and the entry is correct if any parsed prediction
      matches a normalized answer.
    * anything else (true/false): predictions containing an ambiguous keyword
      are dropped, the rest are voted on, and the result is compared with
      ``answer``.

    Each entry is mutated in place: ``entry["judge"]`` is set to ``"正确"``
    when it is judged correct and to ``"错误"`` otherwise.

    Note: in the true/false branch, a positive keyword that only appears
    inside a negated phrase (e.g. "不正确") is no longer counted as a positive
    vote.

    Args:
        entries: Sized iterable of entry dicts.

    Returns:
        A dict with ``"correct_num"``, ``"entries_num"`` and ``"acc"``. When
        ``entries`` is empty a warning is printed and all three are ``0``.

    Raises:
        KeyError: If an entry has no ``"answer"`` key.
    """
    correct_cnt = 0
    for entry in entries:
        parsed_pred = entry.get("parsed_pred", "")
        question_type = entry.get("question_type")

        if question_type == "选择":
            correct = parsed_pred == entry["answer"]
        elif question_type == "填空":
            norm_answers = normalize_str(entry["answer"], entry["answer"])
            correct = _cmmmu_fill_blank_matches(parsed_pred, norm_answers)
        else:
            answer = entry["answer"]
            # Discard predictions that merely restate the question
            # ("是否正确", "对错", ...) instead of committing to a verdict.
            unambiguous = [
                word
                for word in parsed_pred
                if not any(ambiguous in word for ambiguous in _CMMMU_AMBIGUOUS_KEYWORDS)
            ]
            correct = _cmmmu_judge_true_false(unambiguous) == answer

        if correct:
            correct_cnt += 1
            entry["judge"] = "正确"
        else:
            entry["judge"] = "错误"

    entries_num = len(entries)
    if entries_num == 0:
        print("entries_num == 0, please check your file")
        return {"correct_num": 0, "entries_num": 0, "acc": 0}

    return {"correct_num": correct_cnt, "entries_num": entries_num, "acc": correct_cnt / entries_num}