def main()

in generate/evaluate.py [0:0]


def main():
    """Command-line entry point: parse the options and launch an evaluation.

    Calls ``load_dotenv()`` first, then parses the command line, constructs
    an ``IOIEvaluator`` whose keyword arguments mirror the parsed options
    one-to-one, and drives its ``run_evaluation()`` coroutine to completion
    with ``asyncio.run``.
    """
    load_dotenv()  # Pick up environment variables from a .env file first

    import argparse

    cli = argparse.ArgumentParser(description="Evaluate LLMs on IOI problems")

    # (flag, add_argument keyword options) pairs. They are registered in this
    # exact order so the generated --help listing keeps its layout.
    option_table = [
        ("--org_id", dict(required=True, help="Organization ID")),
        ("--model_id", dict(required=True, help="Model ID")),
        ("--api_base", dict(help="API base URL for the model")),
        ("--subset", dict(default="test", help="IOI subset to generate solutions for (train or test)")),
        ("--num_generations", dict(type=int, default=50, help="Number of generations per problem")),
        ("--num_retries", dict(type=int, default=10, help="Number of retries for failed API calls")),
        ("--concurrency", dict(type=int, default=20, help="Number of concurrent generations")),
        ("--num_problems", dict(type=int, default=None, help="Number of problems to evaluate (None for all)")),
        ("--last_subtask", dict(action="store_true", help="Only evaluate the last subtask for each problem (usually the full problem)")),
        ("--dry_run", dict(action="store_true", help="Run without making actual LLM calls")),
        ("--override", dict(action="store_true", help="Override existing results and start fresh")),
        ("--model_postfix", dict(help="Postfix for the model name")),
        ("--revision", dict(help="Revision to use for the model")),
        ("--timeout", dict(type=int, default=600, help="Timeout for the LLM call")),
        ("--use_requests", dict(action="store_true", default=False, help="Use requests instead of litellm")),
        ("--max_tokens", dict(type=int, default=None, help="Max tokens")),
    ]
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    # Each option's dest (the flag without its leading dashes) is also the
    # name of the IOIEvaluator keyword it feeds, so the parsed namespace can
    # be expanded directly into the constructor call.
    evaluator = IOIEvaluator(**vars(options))
    asyncio.run(evaluator.run_evaluation())