in generate/evaluate.py [0:0]
def main():
    """Command-line entry point: parse the flags and run one IOI evaluation.

    Calls ``load_dotenv()`` first, then reads the command-line flags,
    constructs an ``IOIEvaluator`` from them and drives its
    ``run_evaluation()`` coroutine to completion with ``asyncio.run``.

    Only ``--org_id`` and ``--model_id`` are required; every other flag has
    a default (``None`` when none is given, ``False`` for the on/off
    switches).
    """
    load_dotenv()  # Pull settings from the .env file before anything else runs
    import argparse

    # One row per flag: (flag name, keyword arguments for add_argument).
    # Rows are registered in this order, which is also the order in --help.
    option_table = (
        ("--org_id", {"required": True, "help": "Organization ID"}),
        ("--model_id", {"required": True, "help": "Model ID"}),
        ("--api_base", {"help": "API base URL for the model"}),
        ("--subset", {"default": "test", "help": "IOI subset to generate solutions for (train or test)"}),
        ("--num_generations", {"type": int, "default": 50, "help": "Number of generations per problem"}),
        ("--num_retries", {"type": int, "default": 10, "help": "Number of retries for failed API calls"}),
        ("--concurrency", {"type": int, "default": 20, "help": "Number of concurrent generations"}),
        ("--num_problems", {"type": int, "default": None, "help": "Number of problems to evaluate (None for all)"}),
        ("--last_subtask", {"action": "store_true", "help": "Only evaluate the last subtask for each problem (usually the full problem)"}),
        ("--dry_run", {"action": "store_true", "help": "Run without making actual LLM calls"}),
        ("--override", {"action": "store_true", "help": "Override existing results and start fresh"}),
        ("--model_postfix", {"help": "Postfix for the model name"}),
        ("--revision", {"help": "Revision to use for the model"}),
        ("--timeout", {"type": int, "default": 600, "help": "Timeout for the LLM call"}),
        ("--use_requests", {"action": "store_true", "default": False, "help": "Use requests instead of litellm"}),
        ("--max_tokens", {"type": int, "default": None, "help": "Max tokens"}),
    )

    cli = argparse.ArgumentParser(description="Evaluate LLMs on IOI problems")
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    # Every flag's destination name doubles as the IOIEvaluator keyword it
    # feeds, so the parsed namespace is forwarded wholesale as keyword
    # arguments. A flag added to the table above is therefore passed to the
    # constructor as well.
    ioi_evaluator = IOIEvaluator(**vars(options))
    asyncio.run(ioi_evaluator.run_evaluation())