in evaluate_model_outputs.py [0:0]
def process_answers(df: pd.DataFrame, gold_is_latex: bool) -> pd.DataFrame:
    """Grade each model answer against its gold answer using math_verify.

    For every row of ``df`` (which must contain ``'answer'`` and ``'gold'``
    columns), the answer is extracted/parsed and compared to the gold target.

    Args:
        df: DataFrame with string columns ``'answer'`` (model output) and
            ``'gold'`` (reference answer).
        gold_is_latex: If True, parse the gold answers as LaTeX; otherwise
            parse them as plain expressions. Predictions are always tried
            with both extractors.

    Returns:
        DataFrame with one row per input row and columns
        ``original_answer``, ``gold_answer``, ``extracted_answer``,
        ``extracted_gold``, ``is_correct`` (plus ``error`` on rows where
        verification raised). Summary stats are attached in
        ``results_df.attrs``: ``accuracy``, ``total_count``, ``correct_count``.
    """
    results = []
    correct_count = 0
    total_count = 0

    # Build the verification function once; it is reused for every row.
    # Gold extraction strategy depends on the source format; predictions
    # are attempted with both extractors and the best match is kept (max).
    verify_func = math_metric(
        gold_extraction_target=(LatexExtractionConfig() if gold_is_latex else ExprExtractionConfig(),),
        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
        aggregation_function=max,
        precision=6,
    )

    for _, row in df.iterrows():
        extracted_answers = None
        gold_answers = None
        grade = 0
        error_msg = None
        try:
            # verify_func returns (grade, parsed_pair) where parsed_pair is
            # (parsed_golds, parsed_predictions) — or None if parsing failed.
            grade, parsed = verify_func([row['gold']], [row['answer']])
            if parsed is not None:
                gold_answers = parsed[0]
                extracted_answers = parsed[1]
        except Exception as e:
            # Record the failure but keep going; a row that cannot be
            # verified is scored as incorrect, not dropped.
            error_msg = str(e)

        # Every row — including ones that raised — counts toward the total,
        # so attrs['total_count'] matches the number of result rows and
        # accuracy is not inflated by silently excluding failures.
        total_count += 1
        is_correct = grade == 1
        if is_correct:
            correct_count += 1

        result = {
            'original_answer': row['answer'],
            'gold_answer': row['gold'],
            'extracted_answer': extracted_answers,
            'extracted_gold': gold_answers,
            'is_correct': is_correct,
        }
        if error_msg is not None:
            # Preserve the original schema: 'error' appears only on failed rows.
            result['error'] = error_msg
        results.append(result)

    results_df = pd.DataFrame(results)

    # Summary statistics (guard against an empty input frame).
    accuracy = correct_count / total_count if total_count > 0 else 0
    print("\nEvaluation Results:")
    print(f"Total examples: {total_count}")
    print(f"Correct answers: {correct_count}")
    print(f"Accuracy: {accuracy:.2%}")

    # Attach summary stats as DataFrame metadata rather than extra columns.
    results_df.attrs['accuracy'] = accuracy
    results_df.attrs['total_count'] = total_count
    results_df.attrs['correct_count'] = correct_count
    return results_df