mlebench/competitions/hotel-id-2021-fgvc8/grade.py

import numpy as np
import pandas as pd

from mlebench.grade_helpers import InvalidSubmissionError
from mlebench.metrics import mean_average_precision_at_k


def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate the submission against the answers and convert both into the
    `actual`/`predicted` lists expected by `mean_average_precision_at_k`."""
    id_col = "image"
    target_col = "hotel_id"

    if id_col not in submission.columns:
        raise InvalidSubmissionError(f"Submission is missing `{id_col}` column")
    if target_col not in submission.columns:
        raise InvalidSubmissionError(f"Submission is missing `{target_col}` column")
    assert id_col in answers.columns, f"Answers is missing `{id_col}` column"
    assert target_col in answers.columns, f"Answers is missing `{target_col}` column"
    if not set(submission[id_col]) == set(answers[id_col]):
        raise InvalidSubmissionError("Submission and answers have different ids")
    if not len(submission) == len(answers):
        raise InvalidSubmissionError("Submission and answers have different lengths")

    # sort to ensure we're grading the right rows
    submission = submission.sort_values(id_col).reset_index(drop=True)
    answers = answers.sort_values(id_col).reset_index(drop=True)

    # split `hotel_id` column into lists of strings (ranked predictions for the
    # submission, a set of correct ids for the answers)
    submission[target_col] = submission[target_col].astype(str).str.split(" ")
    answers[target_col] = answers[target_col].astype(str).str.split(" ").apply(set)

    actual = answers[target_col].tolist()
    predicted = submission[target_col].tolist()

    return {"actual": actual, "predicted": predicted}


def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with Mean Average Precision at 5 (MAP@5)."""
    map_inputs = prepare_for_metric(submission, answers)
    return mean_average_precision_at_k(**map_inputs, k=5)
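
For reference, a minimal usage sketch under stated assumptions: the image ids and hotel ids below are hypothetical, `grade` refers to the function defined above, and the 0.75 result assumes the usual MAP@k definition in which each image with a single correct hotel contributes 1/rank of its first correct prediction.

import pandas as pd

# `grade` is assumed to be imported from, or defined alongside, grade.py above.

answers = pd.DataFrame(
    {
        "image": ["a.jpg", "b.jpg"],      # hypothetical test image ids
        "hotel_id": ["10001", "20002"],   # one ground-truth hotel id per image
    }
)
submission = pd.DataFrame(
    {
        "image": ["a.jpg", "b.jpg"],
        # up to five space-separated hotel_id predictions, most confident first
        "hotel_id": [
            "10001 30003 40004 50005 60006",
            "30003 20002 40004 50005 60006",
        ],
    }
)

score = grade(submission, answers)
# Correct hotel at rank 1 for a.jpg and rank 2 for b.jpg, so under the
# standard MAP@5 definition: (1/1 + 1/2) / 2 = 0.75.
print(score)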