in lmms_eval/tasks/hallusion_bench/evaluate_hb.py
from collections import defaultdict


def hb_aggregation_result_intern(results, metric):
    # Map each free-form model prediction to a binary answer ("1" = yes, "0" = no)
    # and score it against the ground-truth answer.
    scores = []
    for result in results:
        ans = "1" if "yes" in result["model_prediction"].lower() else "0"
        scores.append(ans == result["gt_answer"])
        result["answer"] = ans

    if metric == "aAcc":
        # All-answer accuracy: fraction of individual predictions that are correct.
        return sum(scores) / len(scores)
    elif metric == "qAcc":
        # Question-level accuracy: a question counts as correct only if every
        # instance sharing the same (category, subcategory, set_id, question_id)
        # key is answered correctly.
        qlist = defaultdict(list)
        for r in results:
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            qlist[key].append(r["answer"] == r["gt_answer"])
        out = [min(v) for v in qlist.values()]
        return sum(out) / len(out)
    elif metric == "fAcc":
        # Figure-level accuracy: a figure counts as correct only if every
        # instance sharing the same (category, subcategory, set_id, figure_id)
        # key is answered correctly.
        qlist = defaultdict(list)
        for r in results:
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"])])
            qlist[key].append(r["answer"] == r["gt_answer"])
        out = [min(v) for v in qlist.values()]
        return sum(out) / len(out)
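
A minimal usage sketch. The record fields follow the schema the function above reads; the `sample_results` values themselves are hypothetical, purely for illustration, and not taken from the HallusionBench data.

if __name__ == "__main__":
    # Hypothetical records: two instances of the same question asked over two figures.
    sample_results = [
        {"model_prediction": "Yes, it is.", "gt_answer": "1", "category": "VD",
         "subcategory": "illusion", "set_id": 0, "question_id": 0, "figure_id": 0},
        {"model_prediction": "No.", "gt_answer": "1", "category": "VD",
         "subcategory": "illusion", "set_id": 0, "question_id": 0, "figure_id": 1},
    ]
    print(hb_aggregation_result_intern(sample_results, "aAcc"))  # 0.5 (one of two answers correct)
    print(hb_aggregation_result_intern(sample_results, "qAcc"))  # 0.0 (same question key, one instance wrong)
    print(hb_aggregation_result_intern(sample_results, "fAcc"))  # 0.5 (two figure keys, one fully correct)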