code/run_eval_prm_trl.py
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--config",
type=str,
default="all",
choices=["gsm8k", "math", "olympiadbench", "omnimath", "all"],
help="The configuration to run from the dataset, by default will use 'all'.",
)
parser.add_argument("--model_name", type=str, required=True, help="")
parser.add_argument(
"--output_dir",
type=str,
default="./outputs",
help="The path to save the results to.",
)
parser.add_argument(
"--sep",
type=str,
default="\n",
help="Separator of the model, ensure it corresponds to the same one used during training.",
)
parser.add_argument(
"--batch_size",
type=int,
default=32,
help=(
"The number of examples to run in a single batch. Each question has multiple steps, "
"and a batch can contain steps from different questions to speed up processing."
),
)
parser.add_argument(
"--max_elements",
type=int,
default=-1,
help="Number of elements to run. Helpful for testing, by default will run the full dataset.",
)
args = parser.parse_args()
# Determine the configs to evaluate
configs = CONFIGS if args.config == "all" else [args.config]
pipe: "Pipeline" | None = None
path = Path(args.output_dir).absolute() / args.model_name.replace("/", "__")
path.mkdir(exist_ok=True, parents=True)
aggregated_results = {}
for config in tqdm(configs, total=len(configs), desc="Configuration"):
config_file = path / f"{config}.jsonl"
if config_file.exists():
print(f"The results already exist for {config_file}")
results = read_results(config_file)
num_problems = len(results)
precision, recall = precision_recall(results)
f1 = f1_score(precision, recall)
aggregated_results[config] = {
"num_problems": num_problems,
"precision": precision,
"recall": recall,
"f1_score": f1
}
continue
# Only download the model and run it if the results are not already available.
if pipe is None:
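# The PRM is loaded as a token-classification pipeline; step-level scores come from the predictions at the separator tokens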
pipe = pipeline("token-classification", model=args.model_name, device="cuda")
print(f"Start configuration: {config}")
subset = load_dataset("Qwen/ProcessBench", split=config)
if args.max_elements > -1:
subset = subset.select(range(args.max_elements))
# Prepare examples
examples = [
Example(
problem=row["problem"],
steps=row["steps"],
label=row["label"],
sep=args.sep,
)
for row in subset
]
# Create batch processor and the data structure to store results
batch_processor = BatchProcessor(examples, batch_size=args.batch_size)
processed_data = {}
for batch_steps, batch_indices in tqdm(
batch_processor,
total=batch_processor.get_total_batches(),
desc="Processing batches...",
):
# Actual predictions
batched_outputs = pipe(batch_steps)
# Assign results back to original structure
process_results(batched_outputs, batch_indices, processed_data)
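# Assemble per-problem results by pairing the collected predictions with the reference labels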
results = obtain_results(examples, processed_data)
num_problems = len(results)
precision, recall = precision_recall(results)
f1 = f1_score(precision, recall)
aggregated_results[config] = {
"num_problems": num_problems,
"precision": precision,
"recall": recall,
"f1_score": f1
}
print(f"Writing results to {config_file}")
with config_file.open("w") as f:
for r in results:
f.write(json.dumps(r) + "\n")
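# Print a summary of the metrics across all evaluated configurations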
results_report(aggregated_results)