in src/open_r1/utils/evaluation.py [0:0]
def run_benchmark_jobs(training_args: Union["SFTConfig", "GRPOConfig"], model_args: "ModelConfig") -> None:
    benchmarks = training_args.benchmarks
    if len(benchmarks) == 1 and benchmarks[0] == "all":
        benchmarks = get_lighteval_tasks()
        # Evaluate on all supported benchmarks. Later we may want to include a `chat` option
        # that just evaluates on `ifeval` and `mt_bench` etc.

    for benchmark in benchmarks:
        print(f"Launching benchmark `{benchmark}`")
        if benchmark in get_lighteval_tasks():
            run_lighteval_job(benchmark, training_args, model_args)
        else:
            raise ValueError(f"Unknown benchmark {benchmark}")
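
For context, here is a minimal, self-contained sketch of the same "expand `all`, then validate and dispatch" pattern. The names `SUPPORTED_TASKS` and `dispatch_benchmarks` are hypothetical stand-ins introduced for illustration; in the real module the task list comes from `get_lighteval_tasks()` and the launch step is `run_lighteval_job(...)`, which reads the training and model configs.

    # Illustrative sketch only: SUPPORTED_TASKS and dispatch_benchmarks are
    # stand-ins, not part of open_r1.utils.evaluation.
    from typing import List

    SUPPORTED_TASKS = ["math_500", "aime24", "gpqa", "ifeval"]  # illustrative subset

    def dispatch_benchmarks(benchmarks: List[str]) -> None:
        # A single "all" entry expands to every supported task.
        if len(benchmarks) == 1 and benchmarks[0] == "all":
            benchmarks = SUPPORTED_TASKS
        for benchmark in benchmarks:
            # Unknown names fail fast rather than being silently skipped.
            if benchmark not in SUPPORTED_TASKS:
                raise ValueError(f"Unknown benchmark {benchmark}")
            print(f"Launching benchmark `{benchmark}`")  # the real code calls run_lighteval_job here

    dispatch_benchmarks(["all"])

Validating each entry against the supported-task list before launching mirrors the real function's behavior: a typo in `--benchmarks` raises immediately instead of quietly launching nothing.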