community_tasks/aimo

# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ruff: noqa: F405, F403, F401
"""
Task to evaluate LLMs on the training set of the Kaggle AIMO competition:
https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
"""

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def aimo_prompt(line, task_name: str = None):
    """Build the evaluation ``Doc`` for a single dataset row.

    Args:
        line: One dataset row; it is indexed with the keys ``"problem"`` and
            ``"answer"``.
        task_name: Name of the task, forwarded unchanged to the ``Doc``.

    Returns:
        A ``Doc`` whose query is the problem statement and whose only choice
        (hence gold index 0) is the answer converted to a string.
    """
    # The answer is stringified so the single gold choice is always text.
    gold_answer = str(line["answer"])
    return Doc(
        task_name=task_name,
        query=line["problem"],
        choices=[gold_answer],
        gold_index=0,
    )


# Task configuration: the "train" split is the only declared split and is used
# both for evaluation and as the (sequentially selected) few-shot pool.
task = LightevalTaskConfig(
    name="aimo_progress_prize_1",
    prompt_function=aimo_prompt,
    suite=["community"],
    hf_subset="",
    hf_repo="lighteval/aimo_progress_prize_1",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
    few_shots_select="sequential",
    metric=[Metrics.quasi_exact_match_math],
    generation_size=2048,
    stop_sequence=None,
)

# STORE YOUR EVALS
TASKS_TABLE = [task]

community_tasks/aimo_evals.py (25 lines of code) (raw):