mlebench/competitions/lmsys-chatbot-arena/grade.py

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

from mlebench.competitions.utils import one_hot_dfs_to_log_loss_inputs
from mlebench.grade_helpers import InvalidSubmissionError


def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """
    The submission and answers are already one-hotted
    """
    classes = ["winner_model_a", "winner_model_b", "winner_tie"]
    required_columns = ["id"] + classes

    # Check if submission has the required columns
    missing_columns = [col for col in required_columns if col not in submission.columns]
    if missing_columns:
        raise InvalidSubmissionError(
            f"Submission DataFrame is missing required columns: {missing_columns}"
        )

    # Check if answers has the required columns
    assert set(required_columns).issubset(
        answers.columns
    ), f"Answers DataFrame is missing required columns: {set(required_columns) - set(answers.columns)}"

    # Check if submission has the correct number of rows
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission DataFrame must have {len(answers)} rows, but has {len(submission)} rows."
        )

    # Check if all values in submission are between 0 and 1
    if (
        not ((submission[classes] >= 0) & (submission[classes] <= 1)).all().all()
    ):  # first all() reduces over rows (per column), second all() reduces across columns
        raise InvalidSubmissionError("All values in submission DataFrame must be between 0 and 1.")

    # Check if each row in submission sums to 1
    if not submission[classes].sum(axis=1).round(6).eq(1).all():
        raise InvalidSubmissionError("Each row in submission DataFrame must sum to 1.")

    # Use only the required columns for further processing
    submission = submission[required_columns]
    answers = answers[required_columns]

    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    if (submission["id"].values != answers["id"].values).any():
        raise InvalidSubmissionError("Submission and answer IDs do not match after sorting.")

    log_loss_inputs = one_hot_dfs_to_log_loss_inputs(
        submission, answers, id_column="id", apply_softmax=False
    )

    return log_loss_inputs


def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    log_loss_inputs = prepare_for_metric(submission, answers)
    return log_loss(**log_loss_inputs)
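
Below is a minimal usage sketch, not part of grade.py itself. It assumes the mlebench repository is available at the path shown above; the module is loaded by file path because the hyphenated directory name cannot be imported with a plain import statement. The toy DataFrames are purely illustrative, with real answers coming from the competition's held-out labels.

import importlib.util

import pandas as pd

# Load grade.py by file path; the competition directory name contains hyphens,
# so it cannot be imported with a regular `import` statement.
spec = importlib.util.spec_from_file_location(
    "lmsys_grade", "mlebench/competitions/lmsys-chatbot-arena/grade.py"
)
lmsys_grade = importlib.util.module_from_spec(spec)
spec.loader.exec_module(lmsys_grade)

# Toy one-hot ground truth for three battles.
answers = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "winner_model_a": [1, 0, 0],
        "winner_model_b": [0, 1, 0],
        "winner_tie": [0, 0, 1],
    }
)

# Toy submission: per-row probabilities in [0, 1] that sum to 1.
submission = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "winner_model_a": [0.7, 0.2, 0.1],
        "winner_model_b": [0.2, 0.6, 0.2],
        "winner_tie": [0.1, 0.2, 0.7],
    }
)

score = lmsys_grade.grade(submission, answers)  # multi-class log loss, lower is better
print(f"log loss: {score:.4f}")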