mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py

""" Adapted from: https://www.kaggle.com/code/metric/kullback-leibler-divergence Linked from: https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification """ from typing import Optional import numpy as np import pandas as pd import pandas.api.types from . import kaggle_metric_utilities class ParticipantVisibleError(Exception): pass def kl_divergence( solution: pd.DataFrame, submission: pd.DataFrame, epsilon: float, micro_average: bool, sample_weights: Optional[pd.Series], ): # Overwrite solution for convenience for col in solution.columns: # Prevent issue with populating int columns with floats if not pandas.api.types.is_float_dtype(solution[col]): solution[col] = solution[col].astype(float) # Clip both the min and max following Kaggle conventions for related metrics like log loss # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min # prevents users from playing games with the 20th decimal place of predictions. submission[col] = np.clip(submission[col], epsilon, 1 - epsilon) y_nonzero_indices = solution[col] != 0 solution[col] = solution[col].astype(float) solution.loc[y_nonzero_indices, col] = solution.loc[y_nonzero_indices, col] * np.log( solution.loc[y_nonzero_indices, col] / submission.loc[y_nonzero_indices, col] ) # Set the loss equal to zero where y_true equals zero following the scipy convention: # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr solution.loc[~y_nonzero_indices, col] = 0 if micro_average: return np.average(solution.sum(axis=1), weights=sample_weights) else: return np.average(solution.mean()) def score( solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, epsilon: float = 10**-15, micro_average: bool = True, sample_weights_column_name: Optional[str] = None, ) -> float: """The Kullback–Leibler divergence. The KL divergence is technically undefined/infinite where the target equals zero. This implementation always assigns those cases a score of zero; effectively removing them from consideration. The predictions in each row must add to one so any probability assigned to a case where y == 0 reduces another prediction where y > 0, so crucially there is an important indirect effect. https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence solution: pd.DataFrame submission: pd.DataFrame epsilon: KL divergence is undefined for p=0 or p=1. If epsilon is not null, solution and submission probabilities are clipped to max(eps, min(1 - eps, p). row_id_column_name: str micro_average: bool. Row-wise average if True, column-wise average if False. Examples -------- >>> import pandas as pd >>> row_id_column_name = "id" >>> score(pd.DataFrame({'id': range(4), 'ham': [0, 1, 1, 0], 'spam': [1, 0, 0, 1]}), pd.DataFrame({'id': range(4), 'ham': [.1, .9, .8, .35], 'spam': [.9, .1, .2, .65]}), row_id_column_name=row_id_column_name) 0.216161... >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]}) >>> submission = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]}) >>> score(solution, submission, 'id') 0.0 >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]}) >>> submission = pd.DataFrame({'id': range(3), 'ham': [0.2, 0.3, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.7, 0.2, 0]}) >>> score(solution, submission, 'id') 0.160531... 
""" del solution[row_id_column_name] del submission[row_id_column_name] sample_weights = None if sample_weights_column_name: if sample_weights_column_name not in solution.columns: raise ParticipantVisibleError( f"{sample_weights_column_name} not found in solution columns" ) sample_weights = solution.pop(sample_weights_column_name) if sample_weights_column_name and not micro_average: raise ParticipantVisibleError("Sample weights are only valid if `micro_average` is `True`") for col in solution.columns: if col not in submission.columns: raise ParticipantVisibleError(f"Missing submission column {col}") kaggle_metric_utilities.verify_valid_probabilities(solution, "solution") kaggle_metric_utilities.verify_valid_probabilities(submission, "submission") return kaggle_metric_utilities.safe_call_score( kl_divergence, solution, submission, epsilon=epsilon, micro_average=micro_average, sample_weights=sample_weights, )