human_eval/evaluate_functional_correctness.py (17 lines of code) (raw):

import sys

import fire

from human_eval.data import HUMAN_EVAL
from human_eval.evaluation import evaluate_functional_correctness


def entry_point(
    sample_file: str,
    k: str = "1,10,100",
    n_workers: int = 4,
    timeout: float = 3.0,
    problem_file: str = HUMAN_EVAL,
):
    """
    Evaluates the functional correctness of generated samples, and writes
    results to f"{sample_file}_results.jsonl.gz"

    Args:
        sample_file: Path to the JSONL file of generated samples to score.
        k: Comma-separated pass@k values to report, e.g. "1,10,100".
        n_workers: Number of parallel workers used to execute samples.
        timeout: Per-sample execution timeout in seconds.
        problem_file: Path to the problem definitions (defaults to HUMAN_EVAL).
    """
    # Parse into a separate local instead of rebinding `k`, which would
    # contradict its `str` annotation.
    ks = list(map(int, k.split(",")))
    results = evaluate_functional_correctness(
        sample_file, ks, n_workers, timeout, problem_file
    )
    print(results)


def main():
    fire.Fire(entry_point)


if __name__ == "__main__":
    # Guard the CLI entry: without this, merely importing the module would
    # invoke fire and call sys.exit(), killing the importing interpreter.
    sys.exit(main())