in lmms_eval/tasks/mathvista/utils.py [0:0]
def mathvista_aggregate_results(results, args, *, calculate_gain=False, random_scores=None):
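    """Aggregate per-question MathVista results into overall and per-category accuracy.

    Writes the per-question results for the split to a submission file and returns the
    average accuracy as a percentage, or None when that accuracy is 0.
    """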
    split_flag = results[0]["metadata"]["split"]
    full_pids = [result["question_id"] for result in results]
    total = len(results)
    correct = sum(1 for idx, pid in enumerate(full_pids) if results[idx]["true_false"])
    accuracy = round(correct / total * 100, 2)
    scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}
    for result in results:
        result.update(result.pop("metadata"))
    results_dict = {result["question_id"]: result for result in results}
    df = pd.DataFrame(results_dict).T
    target_keys = ["question_type", "answer_type", "language", "source", "category", "task", "context", "grade", "skills"]
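
    # Per-category accuracy: for each annotation field, compute accuracy for every value
    # via the evaluator, then sort the values by accuracy in descending order. "skills"
    # holds a list per question, so it is exploded before taking unique values.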
    for key in target_keys:
        values = df[key].explode().unique() if key == "skills" else df[key].unique()
        scores[key] = {}
        for value in values:
            correct, total, acc = mathvista_evaluator.get_acc_with_contion(df, key, value)
            if total > 0:
                scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}
        scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]["accuracy"]), reverse=True))
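
    # Optionally report the gain over a random-chance baseline; random_scores is expected
    # to mirror the structure of `scores` when calculate_gain is True.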
    if calculate_gain:
        for key in scores:
            if key == "average":
                gain = round(float(scores[key]["accuracy"]) - float(random_scores[key]["accuracy"]), 2)
                scores[key]["acc_gain"] = gain
            else:
                for sub_key in scores[key]:
                    gain = round(float(scores[key][sub_key]["accuracy"]) - float(random_scores[key][sub_key]["accuracy"]), 2)
                    scores[key][sub_key]["acc_gain"] = gain
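
    # Write the per-question results for this split to a submission file.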
    path = generate_submission_file(f"mathvista_{split_flag}_scores.json", args)
    with open(path, "w") as f:
        json.dump(results_dict, f, indent=4)
    eval_logger.info(f"Saved results to {path}")
    if scores["average"]["accuracy"] == 0:
        return None
    return scores["average"]["accuracy"]
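
# Illustrative sketch (not part of the module): the aggregator expects each entry in
# `results` to provide a "question_id", a boolean "true_false" judgement, and a
# "metadata" dict containing "split" plus the annotation fields listed in `target_keys`.
# The concrete values below are invented for illustration only.
#
#     example_results = [
#         {
#             "question_id": "1",
#             "true_false": True,
#             "metadata": {
#                 "split": "testmini",
#                 "question_type": "multi_choice",
#                 "answer_type": "text",
#                 "language": "english",
#                 "source": "TabMWP",
#                 "category": "math-targeted-vqa",
#                 "task": "math word problem",
#                 "context": "table",
#                 "grade": "elementary school",
#                 "skills": ["arithmetic reasoning"],
#             },
#         },
#     ]
#     mathvista_aggregate_results(example_results, args)  # -> 100.0 (average accuracy in %)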