mlebench/competitions/bms-molecular-translation/grade.py (25 lines of code) (raw):

import numpy as np import pandas as pd from Levenshtein import distance from mlebench.grade_helpers import InvalidSubmissionError def edit_distance_array(y_true: pd.Series, y_pred: pd.Series) -> float: return np.mean([distance(a, b) for a, b in zip(y_true, y_pred)]) def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame): if len(submission) != len(answers): raise InvalidSubmissionError("Submission and answers must have the same length") if "image_id" not in submission.columns: raise InvalidSubmissionError("Submission must have an 'image_id' column") if "InChI" not in submission.columns: raise InvalidSubmissionError("Submission must have a 'InChI' column") assert "image_id" in answers.columns, "Answers must have 'image_id' column" assert "InChI" in answers.columns, "Answers must have 'InChI' column" submission = submission.sort_values("image_id") answers = answers.sort_values("image_id") if (submission["image_id"].values != answers["image_id"].values).any(): raise InvalidSubmissionError("Submission and answers must have the same ids") # Convert to strings submission["InChI"] = submission["InChI"].astype(str) answers["InChI"] = answers["InChI"].astype(str) return answers["InChI"], submission["InChI"] def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float: y_true, y_pred = prepare_for_metric(submission, answers) return edit_distance_array(y_true, y_pred)