# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval

This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
"""

import random
import re
from typing import Any, Dict, List, Optional, Union

from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import Metric, Metrics
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc, SamplingMethod


# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
# fmt: on

# ArabicMMLU
# fmt: off
ARABIC_MMLU_SUBSETS = [
    "All", "Islamic Studies", "Islamic Studies (Middle School)", "Islamic Studies (Primary School)", "Islamic Studies (High School)",
    "Driving Test", "Natural Science (Middle School)", "Natural Science (Primary School)", "History (Middle School)",
    "History (Primary School)", "History (High School)", "General Knowledge", "General Knowledge (Middle School)",
    "General Knowledge (Primary School)", "Law (Professional)", "Physics (High School)", "Social Science (Middle School)",
    "Social Science (Primary School)", "Management (University)", "Arabic Language (Middle School)", "Arabic Language (Primary School)",
    "Arabic Language (High School)", "Political Science (University)", "Philosophy (High School)", "Accounting (University)",
    "Computer Science (Middle School)", "Computer Science (Primary School)", "Computer Science (High School)",
    "Computer Science (University)", "Geography (Middle School)", "Geography (Primary School)", "Geography (High School)",
    "Math (Primary School)", "Biology (High School)", "Economics (Middle School)", "Economics (High School)",
    "Economics (University)", "Arabic Language (General)", "Arabic Language (Grammar)", "Civics (Middle School)", "Civics (High School)"
]
# fmt: on


def arabic_mmlu_pfn(line, task_name: str = None):
    instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n"

    # Define the mapping from Latin to Arabic letters
    latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"}

    # Create a list of valid choices with corresponding Arabic keys
    choices = []
    valid_keys_latin = []
    valid_keys_arabic = []

    # Enumerate through the options and append the valid ones
    for idx, key in enumerate(["A", "B", "C", "D", "E"]):
        option = line.get(f"Option {idx + 1}")
        if option:  # Check if option is not null
            choices.append(option)
            valid_keys_latin.append(key)  # Append the Latin key (A, B, C, D, E)
            valid_keys_arabic.append(latin_to_arabic[key])  # Append the corresponding Arabic letter

    # Find the correct index for the answer key in the Arabic version
    answer_index = valid_keys_latin.index(line["Answer Key"])

    # Construct the query with Arabic letters
    query = f"{instruction}{line['Question']}\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)])
    query += "الإجابة:"

    return Doc(
        task_name=task_name,
        query=query,
        choices=valid_keys_arabic,  # Return only valid choices (Arabic keys)
        gold_index=answer_index,  # Correct index in the valid Arabic keys
        instruction=instruction,
    )
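# Illustrative example (hypothetical row, not taken from the dataset): for a line such as
#   {"Question": "...", "Option 1": "X", "Option 2": "Y", "Option 3": None, "Answer Key": "B"}
# arabic_mmlu_pfn skips the null option, keeps the Arabic keys ["أ", "ب"], and returns a Doc
# with choices=["أ", "ب"] and gold_index=1, so loglikelihood_acc_norm is scored over the
# Arabic letter keys rather than the full answer strings.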
class CustomArabicMMLUTask(LightevalTaskConfig):
    def __init__(
        self,
        name,
        hf_subset,
    ):
        super().__init__(
            name=name,
            hf_subset=hf_subset,
            prompt_function=arabic_mmlu_pfn,
            hf_repo="MBZUAI/ArabicMMLU",
            metrics=[Metrics.loglikelihood_acc_norm],
            hf_avail_splits=["test"],
            evaluation_splits=["test"],
            few_shots_split=["dev"],
            few_shots_select="sequential",
            suite=["community"],
            generation_size=-1,
            stop_sequence=None,
            trust_dataset=True,
            version=0,
        )


ARABIC_MMLU_TASKS = [
    CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS
]

# ARABIC MMLU HT ##
# fmt: off
ARABIC_MMLU_HT_SUBSETS = [
    "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry",
    "college_computer_science", "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics",
    "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology",
    "high_school_chemistry", "high_school_computer_science", "high_school_european_history", "high_school_geography",
    "high_school_government_and_politics", "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
    "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", "high_school_world_history",
    "human_aging", "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning", "management",
    "marketing", "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", "philosophy", "prehistory",
    "professional_accounting", "professional_law", "professional_medicine", "professional_psychology", "public_relations",
    "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
]
# fmt: on


def arabic_mmlu_ht_pfn(line, task_name: str = None):
    instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n"
    choices = line["choices"]
    answer_index = line["answer"]  # It is an int reflecting the index of correct answer in line["choices"]

    query = f"{instruction}{line['question']}\n"
{choice}\n" for idx, choice in enumerate(choices, start=1)]) query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints gold_index=answer_index, instruction=instruction, ) class CustomArabicMMLUHTTask(LightevalTaskConfig): def __init__( self, name, hf_subset, ): super().__init__( name=name, hf_subset=hf_subset, prompt_function=arabic_mmlu_ht_pfn, hf_repo="MBZUAI/human_translated_arabic_mmlu", metrics=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=None, few_shots_select=None, suite=["community"], generation_size=-1, stop_sequence=None, trust_dataset=True, version=0, ) ARABIC_MMLU_HT_TASKS = [ CustomArabicMMLUHTTask(name=f"arabic_mmlu_ht:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_HT_SUBSETS ] # ARABIC MMLU MT ## # fmt: off ARABIC_MMLU_MT_SUBSETS = [ "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", "high_school_european_history", "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics", "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging", "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting", "professional_law", "professional_medicine", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions" ] # fmt: on def arabic_mmlu_mt_pfn(line, task_name: str = None): instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters answer_index = LETTER_INDICES.index( line["answer"] ) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['question']}\n" query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=LETTER_INDICES_AR[:4], gold_index=answer_index, instruction=instruction, ) class CustomArabicMMLUMTTask(LightevalTaskConfig): def __init__( self, name, hf_subset, ): super().__init__( name=name, hf_subset=hf_subset, prompt_function=arabic_mmlu_mt_pfn, hf_repo="OALL/Arabic_MMLU", metrics=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "dev"], evaluation_splits=["test"], few_shots_split="dev", few_shots_select="sequential", suite=["community"], generation_size=-1, stop_sequence=None, trust_dataset=True, version=0, ) ARABIC_MMLU_MT_TASKS = [ CustomArabicMMLUMTTask(name=f"arabic_mmlu_mt:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_MT_SUBSETS ] # ACVA ## # fmt: off ACVA_SUBSETS = [ "Algeria", "Ancient_Egypt", "Arab_Empire", "Arabic_Architecture", "Arabic_Art", "Arabic_Astronomy", "Arabic_Calligraphy", "Arabic_Ceremony", "Arabic_Clothing", "Arabic_Culture", "Arabic_Food", "Arabic_Funeral", "Arabic_Geography", "Arabic_History", "Arabic_Language_Origin", "Arabic_Literature", "Arabic_Math", "Arabic_Medicine", "Arabic_Music", "Arabic_Ornament", "Arabic_Philosophy", "Arabic_Physics_and_Chemistry", "Arabic_Wedding", "Bahrain", "Comoros", "Egypt_modern", "InfluenceFromAncientEgypt", "InfluenceFromByzantium", "InfluenceFromChina", "InfluenceFromGreece", "InfluenceFromIslam", "InfluenceFromPersia", "InfluenceFromRome", "Iraq", "Islam_Education", "Islam_branches_and_schools", "Islamic_law_system", "Jordan", "Kuwait", "Lebanon", "Libya", "Mauritania", "Mesopotamia_civilization", "Morocco", "Oman", "Palestine", "Qatar", "Saudi_Arabia", "Somalia", "Sudan", "Syria", "Tunisia", "United_Arab_Emirates", "Yemen", "communication", "computer_and_phone", "daily_life", "entertainment" ] # fmt: on def acva_pfn(line, task_name: str = None): question = line["question"] answer = line["answer"] return Doc( task_name=task_name, query=f"السؤال: {question}\nالإجابة:", choices=["صح", "خطأ"], gold_index=["صح", "خطأ"].index(answer), ) class CustomACVATask(LightevalTaskConfig): def __init__( self, name, hf_subset, ): super().__init__( name=name, hf_subset=hf_subset, prompt_function=acva_pfn, hf_repo="OALL/ACVA", metrics=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", suite=["community"], generation_size=-1, stop_sequence=None, trust_dataset=True, version=0, ) ACVA_TASKS = [CustomACVATask(name=f"acva:{subset}", hf_subset=subset) for subset in ACVA_SUBSETS] # AraTrust ## # fmt: off ARATRUST_SUBSETS = [ "Trustfulness", "MentalHealth", "PhysicalHealth", "Offensive", "Ethics", "Privacy", "Unfairness", "Illegal", ] # fmt: on def aratrust_pfn(line, task_name: str = None): instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n" choices = [line["A"], line["B"], line["C"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters answer_index = LETTER_INDICES_AR.index( line["Answer"] ) # line["answer"] is the correct answer. That's why we need to index it ! 
query = f"{instruction}{line['Question']}\n" query += "".join([f"{choice}\n" for choice in choices]) query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=LETTER_INDICES_AR[:3], gold_index=answer_index, instruction=instruction, ) class CustomAraTrustTask(LightevalTaskConfig): def __init__( self, name, hf_subset, ): super().__init__( name=name, hf_subset=hf_subset, prompt_function=aratrust_pfn, hf_repo="asas-ai/AraTrust-categorized", metrics=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["train"], evaluation_splits=["train"], few_shots_split=None, few_shots_select=None, suite=["community"], generation_size=-1, stop_sequence=None, trust_dataset=True, version=0, ) ARATRUST_TASKS = [CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS] def arabic_exams_pfn(line, task_name: str = None): topic = line["subject"] question = line["question"] choices = [line["A"], line["B"], line["C"], line["D"]] choices_formatted = [f" {LETTER_INDICES_AR[i]}) {choice}\n" for i, choice in enumerate(choices)] answer = line["answer"] answer_index = LETTER_INDICES.index(answer) instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" query = f"{instruction}السؤال: {question}\n" query += "\n".join(choices_formatted) query += "\nالإجابة:" return Doc( task_name=task_name, query=query, choices=LETTER_INDICES_AR[:4], gold_index=answer_index, instruction=instruction, ) # ARABIC EXAMS ## arabic_exams_task = LightevalTaskConfig( name="arabic_exams", prompt_function=arabic_exams_pfn, suite=["community"], hf_repo="OALL/Arabic_EXAMS", hf_subset="default", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # ALGHAFA NATIVE ## # fmt: off ALGHAFA_SUBSETS = [ "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", "multiple_choice_sentiment_task" ] # fmt: on def alghafa_pfn(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) allowed_keys = [f"sol{i}" for i in range(1, 6)] extracted_choices = [line[key] for key in allowed_keys if key in line] choices = [str(i) for i in range(len(extracted_choices))] instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" query = f"{instruction}السؤال: {question}\n" for index, choice in enumerate(extracted_choices): query += f"{index}) {choice}\n" query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=choices, gold_index=answer_index, instruction=instruction, ) class CustomAlGhafaNativeTask(LightevalTaskConfig): def __init__( self, name, hf_subset, ): super().__init__( name=name, hf_subset=hf_subset, prompt_function=alghafa_pfn, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metrics=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", suite=["community"], generation_size=-1, stop_sequence=None, trust_dataset=True, version=0, ) ALGHAFA_TASKS = [CustomAlGhafaNativeTask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] # ALGHAFA TRANSLATED ## # race_ar race_ar_task = 
LightevalTaskConfig( name="race_ar", prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="piqa_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( name="arc_challenge_okapi_ar", prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_challenge_okapi_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="mmlu_okapi_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # boolq_ar def boolq_arabic_pfn(line, task_name: str = None): question = line["question"] passage = line["passage"] instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا" query = f"""{instruction} المقطع : {passage} السؤال: {question} الإجابة: """ return Doc( task_name=task_name, query=query, choices=["نعم", "لا"], gold_index=0 if line["answer"] else 1, instruction=instruction, ) boolq_ar_task = LightevalTaskConfig( name="boolq_ar", prompt_function=boolq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="boolq_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # copa_ext_ar def copa_arabic_pfn(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] question_map = {"cause": "لأن", "effect": "لذلك"} question = question_map[line["question"]] answer = line["label"] query = "{}، {} :\n0) {}\n1) 
{}\nالإجابة:".format(premise, question, choices[0], choices[1]) return Doc( task_name=task_name, query=query, choices=choices, gold_index=answer, instruction="", ) copa_ext_ar_task = LightevalTaskConfig( name="copa_ext_ar", prompt_function=copa_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="copa_ext_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # hellaswag_okapi_ar def hellaswag_arabic_pfn(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets endings = [ re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) ] # endings is a string representation of a list answer_index = line["label"] instruction = "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية" query = f"""{instruction} السياق: {ctx} الاقتراحات: """ for i, ending in enumerate(endings): query += f"{i}) {ending}\n" query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=endings, gold_index=answer_index, instruction=instruction, ) hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", prompt_function=hellaswag_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="hellaswag_okapi_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # toxigen_ar def toxigen_arabic_pfn(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".' 
query = f"""{instruction} العبارة: '{text}' الإجابة: """ return Doc( task_name=task_name, query=query, choices=["لا", "نعم"], gold_index=label, instruction=instruction, ) toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", prompt_function=toxigen_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="toxigen_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # sciq_ar def sciq_arabic_pfn(line, task_name: str = None): support = line["support"] question = line["question"] correct_answer = line["correct_answer"] choices = [line["distractor1"], line["distractor2"], line["distractor3"], correct_answer] # Shuffle the choices random.shuffle(choices) answer_index = choices.index(correct_answer) instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات" query = f"""{instruction} السياق: {support} السؤال: {question} الإجابات المحتملة: """ for i, choice in enumerate(choices): query += f"{i}) {choice}\n" query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=choices, gold_index=answer_index, instruction=instruction, ) sciq_ar_task = LightevalTaskConfig( name="sciq_ar", prompt_function=sciq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="sciq_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", few_shots_select="sequential", metrics=[Metrics.loglikelihood_acc_norm], trust_dataset=True, version=0, ) # madinah_qa # fmt: off MADINAH_QA_SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"] # fmt: on def madinah_qa_pfn(line, task_name: str = None): instruction = "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الأجوبة:\n\n" # Define the mapping from Latin to Arabic letters latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} # Create a list of valid choices with corresponding Arabic keys choices = [] valid_keys_latin = [] valid_keys_arabic = [] # Enumerate through the options and append the valid ones for idx, key in enumerate(["A", "B", "C", "D", "E"]): option = line.get(f"Option {idx + 1}") if option: # Check if option is not null choices.append(option) valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter # Find the correct index for the answer key in the Arabic version answer_index = valid_keys_latin.index(line["Answer Key"]) query = f"{instruction}\nالسياق:\n{line['Context']}\nالسؤال:\n{line['Question']}\n" query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) query += "الإجابة:" return Doc( task_name=task_name, query=query, choices=valid_keys_arabic, gold_index=answer_index, # Correct index in the valid keys instruction=instruction, ) class CustomMadinahQATask(LightevalTaskConfig): def __init__( self, name, hf_subset, ): super().__init__( name=name, hf_subset=hf_subset, prompt_function=madinah_qa_pfn, hf_repo="MBZUAI/MadinahQA", metrics=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test"], evaluation_splits=["test"], few_shots_split=["dev"], few_shots_select="sequential", suite=["community"], generation_size=-1, stop_sequence=None, trust_dataset=True, version=0, ) MADINAH_QA_TASKS = [ CustomMadinahQATask(name=f"madinah_qa:{subset}", hf_subset=subset) for subset in MADINAH_QA_SUBSETS ] class JudgeMetricWrapper(Metric): """Wrapper class for LLM-based judge metric implementation.""" def __init__(self, judge: JudgeLM): """ Initializes the judge metric wrapper. Args: judge (JudgeLM): The LLM judge instance to use for evaluation. """ self.judge = judge self.metric_name = "llm_as_judge" self.category = SamplingMethod.GENERATIVE self.corpus_level_fn = self.aggregate_scores self.sample_level_fn = self._sample_level_fn self.higher_is_better = True # Fixed tuple syntax def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]: """ Computes evaluation scores using the judge's evaluate_answer method. Args: responses (list[str]): The predicted answers formatted_docs (list[Doc]): Documents containing questions and gold answers Returns: dict[str, float]: Dictionary containing evaluation scores """ results = [] for i, doc in enumerate(formatted_docs): question = doc.query gold = doc.choices[doc.gold_index] if doc.gold_index is not None else None answer = responses[i][0].result[0] score, _, _ = self.judge.evaluate_answer(question=question, answer=answer, options=None, gold=gold) results.append({self.metric_name: score}) return results def aggregate_scores(self, scores: list[dict]) -> float: return sum(scores) / len(scores) if scores else 0.0 def _sample_level_fn(self): return None def parse_candidates(candidates: Union[List[str], str]) -> List[str]: """ Parses and validates candidate answers from either list or string format. Args: candidates: Either a list of candidate answers or a newline-separated string Returns: List[str]: List of validated candidate answers Raises: ValueError: If candidates cannot be parsed or are empty """ try: if isinstance(candidates, list): parsed_candidates = [str(c).strip() for c in candidates if c] else: parsed_candidates = [c.strip() for c in str(candidates).split("\n") if c.strip()] if not parsed_candidates: raise ValueError("No valid candidates found after parsing") return parsed_candidates except Exception as e: raise ValueError(f"Failed to parse candidates: {str(e)}") def qa_prompt_arabic(line: Dict[str, Any], task_name: str = None) -> Doc: """ Formats the prompt for Arabic question answering with candidates. 
def qa_prompt_arabic(line: Dict[str, Any], task_name: str = None) -> Doc:
    """
    Formats the prompt for Arabic question answering with candidates.

    Args:
        line: Dictionary containing question and candidate information
        task_name: Optional name for the task

    Returns:
        Doc: Formatted document for evaluation

    Raises:
        ValueError: If required fields are missing or invalid
    """
    try:
        # Validates and extracts the question
        if not isinstance(line.get("question"), str):
            raise ValueError("Question must be a string")
        question = line["question"]

        # Processes candidate answers
        candidates = parse_candidates(line["candidates"])

        # Validates gold answer
        if "gold_answer" not in line:
            raise ValueError("Gold answer is required")
        gold_answer = str(line["gold_answer"])

        # Constructs the prompt
        instruction = "بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي"
        query = f"{instruction}\n\nالسؤال:\n{question}\n\nالسياقات المقترحة:\n{', '.join(candidates)}\n"

        return Doc(
            task_name=task_name or "alrage",
            query=query,
            instruction=instruction,
            choices=[gold_answer],  # Gold answer is used as the only valid choice
            gold_index=0,  # Index of the correct answer in choices
        )

    except Exception as e:
        raise ValueError(f"Failed to create QA prompt: {str(e)}")


def judge_template(question: str, answer: str, gold: str, options: Optional[List[str]] = None) -> List[Dict[str, str]]:
    """
    Template for the Arabic judge prompt.

    System prompt translation:
    You are a neutral expert evaluator. Your tasks are:
    1. Evaluate the answer's accuracy compared to the correct answer
    2. Verify that the answer is supported by the provided context
    3. Evaluate the quality and comprehensiveness of the answer
    Rate the answer on a scale from 0 to 10.

    Args:
        question: The question being evaluated
        answer: The provided answer
        gold: The correct answer
        options: Optional list of answer choices

    Returns:
        List[Dict[str, str]]: Formatted messages for the judge
    """
    messages = [
        {
            "role": "system",
            "content": """أنت مقيّم محايد خبير باللغة العربية. يجب عليك:
1. تقييم دقة الإجابة مقارنة بالإجابة الصحيحة
2. التحقق من أن الإجابة مدعومة بالسياق المقدم
3. تقييم جودة وشمولية الإجابة

مهم جداً: يجب أن يكون ردك رقماً فقط من 0 إلى 10. لا تضف أي نص أو تفسير.""",
        },
        {
            "role": "user",
            "content": f"""السؤال: {question}

الإجابة المقدمة: {answer}

الإجابة الصحيحة: {gold}

أعط تقييماً من 0 إلى 10:
0-2: إجابة خاطئة تماماً
3-4: إجابة جزئية مع أخطاء
5-6: إجابة متوسطة
7-8: إجابة جيدة
9-10: إجابة ممتازة

اكتب رقماً فقط من 0 إلى 10 بدون أي نص إضافي:""",
        },
    ]
    return messages


def process_judge_response(response) -> float:
    """Process the judge's response to extract the score"""
    # If response is a list, extract the content from the user role
    if isinstance(response, list):
        response_content = " ".join(item["content"] for item in response if item["role"] == "user")
    else:
        response_content = response  # If it's not a list, use it directly

    try:
        # Extract the score from the response content
        score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
        return min(max(score / 10.0, 0.0), 1.0)
    except (StopIteration, ValueError):
        return 0.0
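# Illustrative behaviour (hypothetical judge outputs): process_judge_response("7") returns 0.7,
# process_judge_response("التقييم: 9.5") returns 0.95, and a reply containing no numeric token
# falls through to 0.0; extracted scores are clipped to the [0.0, 1.0] range.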
judge = JudgeLM(
    model="Qwen/Qwen2.5-72B-Instruct",
    templates=judge_template,
    process_judge_response=process_judge_response,
    judge_backend="vllm",
)

wrapped_judge = JudgeMetricWrapper(judge)

# Task configuration
alrage_qa_task = LightevalTaskConfig(
    name="alrage_qa",
    prompt_function=qa_prompt_arabic,
    suite=["community"],
    hf_repo="OALL/ALRAGE",
    hf_subset=None,
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    metrics=[wrapped_judge],
    trust_dataset=True,
    generation_size=200,
    stop_sequence=[],
    version=0,
)

TASKS_TABLE = (
    ARABIC_MMLU_TASKS
    + ARABIC_MMLU_HT_TASKS
    + ARABIC_MMLU_MT_TASKS
    + ACVA_TASKS
    + ALGHAFA_TASKS
    + ARATRUST_TASKS
    + MADINAH_QA_TASKS
    + [arabic_exams_task]
    + [race_ar_task]
    + [piqa_ar_task]
    + [arc_easy_ar_task]
    + [arc_challenge_okapi_ar_task]
    + [mmlu_okapi_ar_task]
    + [openbook_qa_ext_ar_task]
    + [boolq_ar_task]
    + [copa_ext_ar_task]
    + [hellaswag_okapi_ar_task]
    + [toxigen_ar_task]
    + [sciq_ar_task]
    + [alrage_qa_task]
)
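
# Minimal sanity-check sketch (not part of the original file): running this module directly
# prints the number of registered tasks and verifies that task names are unique.
if __name__ == "__main__":
    task_names = [task.name for task in TASKS_TABLE]
    assert len(task_names) == len(set(task_names)), "Duplicate task names in TASKS_TABLE"
    print(f"{len(task_names)} community Arabic tasks registered.")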