# process_answers() — extracted from evaluate_model_outputs.py

def process_answers(df: pd.DataFrame, gold_is_latex: bool) -> pd.DataFrame:
    """Verify each model answer against its gold answer using math_verify.

    Expects `df` to contain an 'answer' column (model output) and a 'gold'
    column (reference answer).

    Args:
        df: DataFrame with 'answer' and 'gold' string columns.
        gold_is_latex: If True, parse the gold answers as LaTeX; otherwise
            parse them as plain expressions. Model answers are always tried
            as both expression and LaTeX.

    Returns:
        A DataFrame with one row per input row, containing the original and
        extracted answers plus an 'is_correct' flag ('error' is set only on
        rows where verification raised). Summary statistics (accuracy,
        total_count, correct_count) are attached via `results_df.attrs`.
    """
    # Build the scoring function once; it extracts candidate expressions from
    # both strings and compares them with the requested float precision.
    verify_func = math_metric(
        gold_extraction_target=(LatexExtractionConfig() if gold_is_latex else ExprExtractionConfig(),),
        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
        aggregation_function=max,
        precision=6
    )

    results = []
    correct_count = 0
    total_count = 0

    for _, row in df.iterrows():
        extracted_answers = None
        gold_answers = None
        grade = 0
        error = None
        try:
            # Keep the try body minimal: only the call that can actually raise.
            grade, extracted_answers = verify_func([row['gold']], [row['answer']])
            if extracted_answers is not None:
                # verify_func returns (gold_extractions, pred_extractions).
                gold_answers = extracted_answers[0]
                extracted_answers = extracted_answers[1]
        except Exception as e:
            # Record the failure but keep the row; grade stays 0 so it is
            # counted as incorrect rather than silently dropped.
            error = str(e)

        # Count every row, including ones that errored, so the accuracy
        # denominator reflects the full evaluation set.
        total_count += 1
        if grade == 1:
            correct_count += 1

        result = {
            'original_answer': row['answer'],
            'gold_answer': row['gold'],
            'extracted_answer': extracted_answers,
            'extracted_gold': gold_answers,
            'is_correct': grade == 1
        }
        if error is not None:
            result['error'] = error
        results.append(result)

    results_df = pd.DataFrame(results)

    # Report aggregate accuracy; guard against an empty input frame.
    accuracy = correct_count / total_count if total_count > 0 else 0
    print("\nEvaluation Results:")
    print(f"Total examples: {total_count}")
    print(f"Correct answers: {correct_count}")
    print(f"Accuracy: {accuracy:.2%}")

    # Stash summary stats on the frame's metadata rather than as columns.
    results_df.attrs['accuracy'] = accuracy
    results_df.attrs['total_count'] = total_count
    results_df.attrs['correct_count'] = correct_count

    return results_df