evaluation/evaluate_hf.py:

import argparse
from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed

import numpy as np
from datasets import Dataset, load_dataset
from pebble import ProcessPool
from tqdm.auto import tqdm

from grader import *
from parser import *


def evaluate(
    benchmark: str,
    dataset_id: str,
    dataset_config: str = None,
    dataset_split: str = "test",
    dataset_col: str = "pred",
    max_num_samples=None,
):
    """Score one prediction column of a Hugging Face dataset against its ground truth."""
    samples = load_dataset(dataset_id, name=dataset_config, split=dataset_split)

    if "idx" not in samples.column_names:
        samples = samples.map(lambda x, idx: {"idx": idx}, with_indices=True)

    if max_num_samples:
        print(f"max_num_samples: {max_num_samples} / {len(samples)}")
        # Use select() so we keep a Dataset; slicing would return a plain dict of columns.
        samples = samples.select(range(min(max_num_samples, len(samples))))

    def parse_gt(x):
        x["gt_cot"], x["gt"] = parse_ground_truth(x, benchmark)
        return x

    samples = samples.map(parse_gt, desc="Parsing ground truth", num_proc=12, load_from_cache_file=False)
    samples = samples.map(
        extract_answer_map,
        fn_kwargs={"data_name": benchmark, "col": dataset_col},
        desc="Parsing predictions",
        num_proc=12,
        load_from_cache_file=False,
    )

    params = [(idx, pred, gt) for idx, pred, gt in zip(samples["idx"], samples["pred"], samples["gt"])]

    scores = []
    timeout_cnt = 0

    # Compare each prediction to its ground truth in worker processes so that a
    # single pathological example cannot hang the whole evaluation.
    with ProcessPool(max_workers=8) as pool:
        future = pool.map(math_equal_process, params, timeout=3)
        iterator = future.result()
        with tqdm(total=len(samples), desc="Evaluate") as progress_bar:
            while True:
                try:
                    result = next(iterator)
                    scores.append(result)
                except StopIteration:
                    break
                except TimeoutError as error:
                    print(error)
                    scores.append(False)
                    timeout_cnt += 1
                except Exception as error:
                    print(error)
                    exit()
                progress_bar.update(1)

    mean_score = np.mean(scores) * 100
    result_json = {
        "num_samples": len(samples),
        "num_scores": len(scores),
        "timeout_samples": timeout_cnt,
        "acc": mean_score,
    }
    print(result_json)
    return samples, result_json


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--benchmark", type=str, default="math")
    parser.add_argument("--dataset_id", type=str, required=True)
    parser.add_argument("--dataset_config", type=str, default=None)
    parser.add_argument("--dataset_split", type=str, default="train")
    parser.add_argument("--max_num_samples", type=int, default=None)
    parser.add_argument("--voting_n", type=int, nargs="+", required=True)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    data = {"n": [], "acc_naive": [], "acc_weighted": [], "acc_maj": []}

    def evaluate_for_n(n):
        """Evaluate the naive, weighted, and majority-vote prediction columns for a given n."""
        local_data = {"n": n, "acc_naive": None, "acc_weighted": None, "acc_maj": None}
        for agg in ["naive", "weighted", "maj"]:
            _, scores = evaluate(
                benchmark=args.benchmark,
                dataset_id=args.dataset_id,
                dataset_config=args.dataset_config,
                dataset_split=args.dataset_split,
                dataset_col=f"pred_{agg}@{n}",
                max_num_samples=args.max_num_samples,
            )
            local_data[f"acc_{agg}"] = scores["acc"]
        return local_data

    # Evaluate each value of n in parallel; results are appended in completion order.
    with ProcessPoolExecutor() as executor:
        futures = {executor.submit(evaluate_for_n, n): n for n in args.voting_n}
        with tqdm(total=len(futures), desc="Evaluating voting_n") as progress_bar:
            for future in as_completed(futures):
                try:
                    result = future.result()
                    data["n"].append(result["n"])
                    data["acc_naive"].append(result["acc_naive"])
                    data["acc_weighted"].append(result["acc_weighted"])
                    data["acc_maj"].append(result["acc_maj"])
                except Exception as e:
                    print(f"Error processing n={futures[future]}: {e}")
                progress_bar.update(1)

    # Save results back to the Hub as a separate "--evals" config.
    ds = Dataset.from_dict(data)
    url = ds.push_to_hub(args.dataset_id, config_name=f"{args.dataset_config}--evals")
    print(f"Results pushed to {url}")
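A minimal usage sketch, assuming the target dataset already contains aggregated prediction columns following the pred_naive@n / pred_weighted@n / pred_maj@n naming pattern the script reads; the dataset id, config, and voting_n values below are placeholders, not values from this repository:

# Hypothetical CLI invocation:
#   python evaluation/evaluate_hf.py \
#       --benchmark math \
#       --dataset_id your-org/your-generations \
#       --dataset_config default \
#       --dataset_split train \
#       --voting_n 1 4 16

# evaluate() can also be called directly for a single prediction column,
# assuming this file is importable under the (hypothetical) module name evaluate_hf.
from evaluate_hf import evaluate

_, result = evaluate(
    benchmark="math",
    dataset_id="your-org/your-generations",
    dataset_config="default",
    dataset_split="train",
    dataset_col="pred_maj@4",
)
print(result["acc"])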