def main()

in code/run_eval_prm_trl.py

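For context, the function below assumes roughly the following imports at the top of the module; the standard-library and third-party ones are inferred from the body of main(), while the project-local helpers (CONFIGS, Example, BatchProcessor, read_results, precision_recall, f1_score, process_results, obtain_results, results_report) are assumed to be defined elsewhere in code/run_eval_prm_trl.py and are not reproduced here.

import argparse
import json
from pathlib import Path

from datasets import load_dataset
from tqdm import tqdm
from transformers import Pipeline, pipeline  # Pipeline is only referenced in a type annotation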

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        default="all",
        choices=["gsm8k", "math", "olympiadbench", "omnimath", "all"],
        help="The configuration to run from the dataset, by default will use 'all'.",
    )
    parser.add_argument("--model_name", type=str, required=True, help="")
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./outputs",
        help="The path to save the results to.",
    )
    parser.add_argument(
        "--sep",
        type=str,
        default="\n",
        help="Separator of the model, ensure it corresponds to the same one used during training.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help=(
            "The number of examples to run in a single batch. Each question has multiple steps, "
            "and a batch can contain multiple from different questions to speed up the process."
        ),
    )
    parser.add_argument(
        "--max_elements",
        type=int,
        default=-1,
        help="Number of elements to run. Helpful for testing, by default will run the full dataset.",
    )

    args = parser.parse_args()

    # Determine the configs to evaluate
    configs = CONFIGS if args.config == "all" else [args.config]
    pipe: "Pipeline" | None = None

    path = Path(args.output_dir).absolute() / args.model_name.replace("/", "__")

    path.mkdir(exist_ok=True, parents=True)
    aggregated_results = {}
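    # Evaluate each configuration in turn, reusing results already cached on disk instead of re-running the model.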
    for config in tqdm(configs, total=len(configs), desc="Configuration"):
        config_file = path / f"{config}.jsonl"
        if config_file.exists():
            print(f"The results already exist for {config_file}")
            results = read_results(config_file)
            num_problems = len(results)
            precision, recall = precision_recall(results)
            f1 = f1_score(precision, recall)
            aggregated_results[config] = {
                "num_problems": num_problems,
                "precision": precision,
                "recall": recall,
                "f1_score": f1
            }
            continue

        # Only download the model and run it if the results are not already available.
        if pipe is None:
            pipe = pipeline("token-classification", model=args.model_name, device="cuda")

        print(f"Start configuration: {config}")
        subset = load_dataset("Qwen/ProcessBench", split=config)
        if args.max_elements > -1:
            subset = subset.select(range(args.max_elements))

        # Prepare examples
        examples = [
            Example(
                problem=row["problem"],
                steps=row["steps"],
                label=row["label"],
                sep=args.sep,
            )
            for row in subset
        ]

        # Create batch processor and the data structure to store results
        batch_processor = BatchProcessor(examples, batch_size=args.batch_size)
        processed_data = {}

        for batch_steps, batch_indices in tqdm(
            batch_processor,
            total=batch_processor.get_total_batches(),
            desc="Processing batches...",
        ):
            # Actual predictions
            batched_outputs = pipe(batch_steps)
            # Assign results back to original structure
            process_results(batched_outputs, batch_indices, processed_data)

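        # Reassemble the per-step predictions into per-problem results before computing metrics.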
        results = obtain_results(examples, processed_data)
        num_problems = len(results)
        precision, recall = precision_recall(results)
        f1 = f1_score(precision, recall)

        aggregated_results[config] = {
            "num_problems": num_problems,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        }
        print(f"Writing results to {config_file}")
        with config_file.open("w") as f:
            for r in results:
                f.write(json.dumps(r) + "\n")

    results_report(aggregated_results)
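
The excerpt ends with main() itself; a script like this typically closes with the standard entry-point guard, sketched below. The command in the comment is a hypothetical invocation with a placeholder model name.

# Hypothetical invocation (placeholder model name):
#   python code/run_eval_prm_trl.py --model_name your-org/your-prm --config gsm8k --batch_size 32
if __name__ == "__main__":
    main()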