community_tasks/_template.py

# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.

This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.

Author:
"""

import numpy as np

from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod


# DEFINE YOUR PROMPT FUNCTIONS
# Define as many as you need for your different tasks
def prompt_fn(line, task_name: str = None):
    """Defines how to go from a dataset line to a doc object.

    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more
    info about what this function should do in the README.
    """
    return Doc(
        task_name=task_name,
        query="",
        choices=[""],
        gold_index=0,
        instruction="",
    )
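
# As an illustration, here is how a prompt function could look for a hypothetical
# multiple-choice dataset whose rows carry "question", "choices" (a list of strings)
# and "answer" (the index of the correct choice) fields. The field names are
# assumptions made for this sketch only -- adapt them to your dataset's schema.
def mcq_prompt_fn_example(line, task_name: str = None):
    """Example only: maps one row of a hypothetical MCQ dataset to a Doc."""
    return Doc(
        task_name=task_name,
        query=f"Question: {line['question']}\nAnswer:",
        choices=[f" {choice}" for choice in line["choices"]],  # one string per candidate answer
        gold_index=int(line["answer"]),  # position of the correct answer in `choices`
        instruction="",
    )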

# EVAL WITH NO SUBSET ##
# This is how you create a simple task (like hellaswag), which has one single subset
# attached to it, and one possible evaluation.
task = LightevalTaskConfig(
    name="myothertask",
    prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
    suite=["community"],
    hf_repo="",
    hf_subset="default",
    hf_avail_splits=[],
    evaluation_splits=[],
    few_shots_split="",
    few_shots_select="",
    metrics=[],  # select your metric in Metrics
)


# CUSTOM METRIC IF NEEDED
# Defined before CustomSubsetTask below: the SUBSET_TASKS list comprehension
# instantiates the class, which references custom_metric, so defining the metric
# last would raise a NameError once SAMPLE_SUBSETS is filled in.
custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=SamplingMethod.GENERATIVE,  # or LOGPROBS, PERPLEXITY, etc.
    sample_level_fn=lambda x: x,  # how to compute the score for one sample
    corpus_level_fn=np.mean,  # how to aggregate sample scores over the corpus
)


# EVALS WITH SUBSET
# This is how you create a subset task (like MMLU), which has several subsets,
# each being its own evaluation task.

# fmt: off
SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
# fmt: on


class CustomSubsetTask(LightevalTaskConfig):
    def __init__(
        self,
        name,
        hf_subset,
    ):
        super().__init__(
            name=name,
            hf_subset=hf_subset,
            prompt_function=prompt_fn,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
            hf_repo="",
            metrics=[custom_metric],  # select your metric in Metrics or use your custom_metric
            hf_avail_splits=[],
            evaluation_splits=[],
            few_shots_split="",
            few_shots_select="",
            suite=["community"],
            generation_size=-1,
            stop_sequence=None,
        )


# STORE YOUR EVALS
SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
TASKS_TABLE = SUBSET_TASKS + [task]
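
# As a concrete illustration of the `lambda x: x` placeholder in custom_metric above,
# here is a sketch of an exact-match sample-level function. The argument names and the
# shape of the response object are assumptions: recent lighteval versions pass the
# formatted Doc and the ModelResponse to the sample-level function, but the exact
# contract has changed between releases, so check the implementations under
# src/lighteval/metrics/ for the signature your installed version expects.
def exact_match_example(doc, model_response, **kwargs) -> float:
    """Example only: 1.0 if the first generation matches the gold choice exactly."""
    prediction = model_response.text[0].strip()  # assumes `.text` holds the list of generations
    gold = doc.choices[doc.gold_index].strip()  # assumes an integer gold_index, as in this template
    return 1.0 if prediction == gold else 0.0
# To wire it in, you would set sample_level_fn=exact_match_example in custom_metric above.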