mlebench/competitions/multi-modal-gesture-recognition/grade.py (35 lines of code) (raw):
from typing import List
import numpy as np
import pandas as pd
from Levenshtein import distance
from mlebench.grade_helpers import InvalidSubmissionError
def final_edit_distance(y_true: List[List[int]], y_pred: List[List[int]]) -> float:
    """
    Compute the competition score: summed Levenshtein distance per true gesture.

    > The overall score we compute is the sum of the Levenshtein distances for all the lines of the
    result file compared to the corresponding lines in the truth value file, divided by the total
    number of gestures in the truth value file. This score is analogous to an error rate.
    However, it can exceed one.
    (https://www.kaggle.com/competitions/multi-modal-gesture-recognition/overview)

    Args:
        y_true: Ground-truth gesture sequences, one sequence per row.
        y_pred: Predicted gesture sequences, aligned row-for-row with `y_true`.

    Returns:
        The sum of the per-row Levenshtein distances divided by the total number of gestures
        in `y_true`.

    Raises:
        ValueError: If `y_true` and `y_pred` have different lengths, or if `y_true` contains no
            gestures at all (the score would require a division by zero).
    """
    # zip() stops at the shorter input, which would silently drop rows from the score.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    total_num_gestures = sum(len(x) for x in y_true)
    if total_num_gestures == 0:
        raise ValueError("y_true contains no gestures, so the score is undefined")

    sum_of_distances = sum(distance(a, b) for a, b in zip(y_true, y_pred))
    return sum_of_distances / total_num_gestures
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission against the answers and parse both into gesture sequences.

    Both frames are sorted by "Id" so their rows line up, then every "Sequence" cell is converted
    to a string and split on whitespace into a list of integers. The input frames themselves
    are not modified.

    Args:
        submission: Predicted sequences; must contain "Id" and "Sequence" columns.
        answers: Ground-truth sequences; must contain "Id" and "Sequence" columns.

    Returns:
        A tuple `(answers_sequences, submission_sequences)`, each a list of integer lists
        ordered by "Id".

    Raises:
        InvalidSubmissionError: If the submission has a different number of rows than the
            answers, lacks an "Id" or "Sequence" column, has ids that differ from the answers,
            or contains a sequence that is not a space-separated list of integers.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers must have the same length")
    if "Id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'Id' column")
    if "Sequence" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'Sequence' column")
    assert "Id" in answers.columns, "Answers must have 'Id' column"
    assert "Sequence" in answers.columns, "Answers must have 'Sequence' column"

    submission = submission.sort_values("Id")
    answers = answers.sort_values("Id")

    # Depending on the NumPy version, `!=` between arrays of incomparable dtypes (e.g. integer
    # ids vs. string ids) can return a bare Python bool, on which `.any()` raises
    # AttributeError. np.array_equal always yields a bool, so mismatched ids are reported as
    # an invalid submission instead of crashing the grader.
    if not np.array_equal(submission["Id"].values, answers["Id"].values):
        raise InvalidSubmissionError("Submission and answers must have the same ids")

    # Read as strings, convert to list of numbers. The answers are parsed outside the `try`,
    # so a malformed answer propagates as a plain ValueError rather than being blamed on the
    # submission.
    answers_sequences = [list(map(int, x.split())) for x in answers["Sequence"].astype(str)]
    try:
        submission_sequences = [
            list(map(int, x.split())) for x in submission["Sequence"].astype(str)
        ]
    except ValueError as e:
        raise InvalidSubmissionError(
            f"Submission sequences must be integers separated by spaces. Failed to convert Sequence to list of integers: {e}"
        ) from e
    return answers_sequences, submission_sequences
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Score a submission against the answers.

    The two frames are first validated and parsed by `prepare_for_metric`; the resulting
    ground-truth and predicted sequences are then passed to `final_edit_distance`, whose
    result is returned.
    """
    sequences = prepare_for_metric(submission, answers)
    truth, predicted = sequences
    return final_edit_distance(truth, predicted)