in lmms_eval/tasks/cmmmu/utils.py [0:0]
# Keyword tables used to judge true/false answers. They are constants, so they
# are built once here instead of once per evaluated entry.
_TF_POSITIVE_KEYWORDS = ("正确", "对", "准确", "肯定", "对的")
_TF_NEGATIVE_KEYWORDS = ("不对", "错误", "不正确", "不准确", "不合适", "否定", "错的", "错")
_TF_AMBIGUOUS_KEYWORDS = ("对错", "是否正确", "否正确", "或者", "是否", "正确性", "对不")


def _tf_polarity(pred):
    """Classify one true/false prediction string as +1, -1 or 0 (no keyword)."""
    # Several negative keywords contain a positive keyword as a substring
    # ("不对" contains "对", "不正确" contains "正确", "不准确" contains "准确").
    # Mask the negative phrases first so that the positive check only fires on
    # text that is not part of a negation. A space is used as the mask because
    # no keyword contains whitespace, so masking cannot create a new match.
    masked = pred
    for neg_word in _TF_NEGATIVE_KEYWORDS:
        masked = masked.replace(neg_word, " ")
    if any(pos_word in masked for pos_word in _TF_POSITIVE_KEYWORDS):
        return 1
    # The masked text differs from the input only if a negative keyword occurred.
    if masked != pred:
        return -1
    return 0


def _judge_true_false(pred_list):
    """Majority-vote the predictions into "对" (true) or "错" (false)."""
    balance = sum(_tf_polarity(pred) for pred in pred_list)
    if balance > 0:
        return "对"
    if balance < 0:
        return "错"
    # Tie (including no usable prediction): guess at random.
    return random.choice(["对", "错"])


def _fill_blank_matches(parsed_pred, norm_answers):
    """Return True if any parsed prediction matches any normalized answer."""
    for pred in parsed_pred:
        if isinstance(pred, str):
            # String prediction: a string answer only has to occur inside it.
            if any(isinstance(norm_ans, str) and norm_ans in pred for norm_ans in norm_answers):
                return True
        elif pred in norm_answers:
            # Non-string (numeric) prediction: must equal one of the answers.
            return True
    return False


def eval_cmmmu(entries):
    """Score a batch of CMMMU entries and annotate each one with a verdict.

    Each entry is a dict read through the keys ``"question_type"``,
    ``"parsed_pred"`` and ``"answer"``:

    * ``"选择"`` (multiple choice): correct when ``parsed_pred`` equals the answer.
    * ``"填空"`` (fill in the blank): correct when any element of ``parsed_pred``
      matches one of the answers returned by ``normalize_str``.
    * any other type is treated as true/false: predictions containing an
      ambiguous keyword are dropped, the rest are majority-voted into
      ``"对"``/``"错"`` (random on a tie) and compared with the answer.

    NOTE: negated phrases such as "不正确" are counted as negative votes. The
    earlier implementation matched the embedded positive keyword first and
    counted them as positive, so scores for such predictions can differ.

    Args:
        entries: sized iterable of entry dicts. Each entry is mutated in place:
            ``entry["judge"]`` is set to ``"正确"`` (correct) or ``"错误"`` (wrong).

    Returns:
        dict with ``"correct_num"``, ``"entries_num"`` and ``"acc"``; all zero
        when ``entries`` is empty.

    Raises:
        KeyError: if an entry has no ``"answer"`` key.
    """
    correct_cnt = 0
    for entry in entries:
        parsed_pred = entry.get("parsed_pred", "")
        question_type = entry.get("question_type")

        if question_type == "选择":
            correct = parsed_pred == entry["answer"]
        elif question_type == "填空":
            norm_answers = normalize_str(entry["answer"], entry["answer"])
            correct = _fill_blank_matches(parsed_pred, norm_answers)
        else:
            answer = entry["answer"]
            usable_preds = [
                word
                for word in parsed_pred
                if not any(ambiguous in word for ambiguous in _TF_AMBIGUOUS_KEYWORDS)
            ]
            correct = _judge_true_false(usable_preds) == answer

        if correct:
            correct_cnt += 1
            entry["judge"] = "正确"
        else:
            entry["judge"] = "错误"

    entries_num = len(entries)
    if entries_num == 0:
        print("entries_num == 0, please check your file")
        return {"correct_num": 0, "entries_num": 0, "acc": 0}
    return {"correct_num": correct_cnt, "entries_num": entries_num, "acc": correct_cnt / entries_num}