# lm_eval/tasks/afrimmlu/utils.py
import ast

from sklearn.metrics import f1_score


def doc_to_choice(doc):
    # The dataset stores the answer options as a stringified Python list;
    # parse it with literal_eval rather than eval so no arbitrary code runs.
    choices = ast.literal_eval(doc["choices"])
    return choices


def doc_to_text(doc):
    output = """You are a highly knowledgeable and intelligent artificial intelligence
    model that answers multiple-choice questions about '{subject}'

    Question: '''{question}'''

    Choices:
    A: '''{choice1}'''
    B: '''{choice2}'''
    C: '''{choice3}'''
    D: '''{choice4}'''

    Answer: """
    choices = ast.literal_eval(doc["choices"])
    text = output.format(
        subject=doc["subject"],
        question=doc["question"],
        choice1=choices[0],
        choice2=choices[1],
        choice3=choices[2],
        choice4=choices[3],
    )
    return text


def weighted_f1_score(items):
    # items is a list of (gold, prediction) pairs accumulated per document;
    # unzip it into parallel label sequences before scoring.
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
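
# --- Usage sketch (illustration only, not part of the task file) ---
# A minimal, hypothetical example of how these helpers behave. The `doc`
# dict below mimics an AfriMMLU record, where "choices" arrives as a
# stringified list; in practice the evaluation harness drives these calls.
if __name__ == "__main__":
    doc = {
        "subject": "elementary_mathematics",
        "question": "What is 2 + 2?",
        "choices": "['3', '4', '5', '6']",
    }
    print(doc_to_choice(doc))  # ['3', '4', '5', '6']
    print(doc_to_text(doc))    # the formatted multiple-choice prompt

    # weighted_f1_score consumes (gold, prediction) pairs:
    items = [(0, 0), (1, 2), (2, 2), (3, 3)]
    print(weighted_f1_score(items))  # weighted F1 over the four examples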