mlebench/competitions/text-normalization-challenge-russian-language/grade.py (34 lines of code) (raw):
import pandas as pd
from sklearn.metrics import accuracy_score
from mlebench.grade_helpers import InvalidSubmissionError
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with accuracy on the `after` column.

    Rows of ``submission`` and ``answers`` are paired through their `id`
    column by ``prepare_for_metric``, which also validates the submission.

    Args:
        submission: Predictions; must provide `id` and `after` columns.
        answers: Ground truth; must provide `id` and `after` columns.

    Returns:
        The value computed by ``sklearn.metrics.accuracy_score`` over the
        aligned `after` values.

    Raises:
        InvalidSubmissionError: Propagated from ``prepare_for_metric`` when
            the submission is malformed.
    """
    prepared = prepare_for_metric(
        submission=submission,
        answers=answers,
        target_column="after",
        id_column="id",
    )
    y_true, y_pred = prepared["y_true"], prepared["y_pred"]
    return accuracy_score(y_true=y_true, y_pred=y_pred)
def prepare_for_metric(
    submission: pd.DataFrame,
    answers: pd.DataFrame,
    target_column: str,
    id_column: str,
) -> dict:
    """Validate a submission against the answers and align both by id.

    Both frames are sorted by ``id_column`` (on copies; the inputs are not
    modified), their ids are compared position by position, and the
    ``target_column`` values are converted to strings.

    Args:
        submission: Predictions to validate.
        answers: Ground truth to compare against.
        target_column: Name of the column holding the values to score.
        id_column: Name of the column used to pair rows.

    Returns:
        A dict with keys ``"y_true"`` and ``"y_pred"``, each a NumPy array of
        the stringified target values ordered by id.

    Raises:
        AssertionError: If ``answers`` lacks ``target_column`` or ``id_column``.
        InvalidSubmissionError: If ``submission`` differs in length from
            ``answers``, lacks a required column, or its ids do not match.
    """
    required_columns = (target_column, id_column)

    # Answers checks: a failure here is a problem with the grader's own data.
    for column in required_columns:
        assert column in answers.columns, f"Answers must have a `{column}` column"

    # Submission checks: length first, so the element-wise id comparison
    # further down always sees arrays of equal size.
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission must have the same length as the answers.")
    for column in required_columns:
        if column not in submission.columns:
            raise InvalidSubmissionError(f"Submission must have a `{column}` column")

    # Sort both frames on id so that rows line up positionally.
    ordered_submission = submission.sort_values(by=id_column)
    ordered_answers = answers.sort_values(by=id_column)

    ids_differ = ordered_submission[id_column].values != ordered_answers[id_column].values
    if ids_differ.any():
        raise InvalidSubmissionError(f"Submission and Answers `{id_column}`'s do not match")

    # Compare everything as text so mixed dtypes cannot cause spurious mismatches.
    predicted = ordered_submission[target_column].astype(str).to_numpy()
    expected = ordered_answers[target_column].astype(str).to_numpy()

    return {"y_true": expected, "y_pred": predicted}