# mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py
"""
Adapted from: https://www.kaggle.com/code/metric/kullback-leibler-divergence
Linked from: https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification
"""
from typing import Optional

import numpy as np
import pandas as pd
import pandas.api.types

from . import kaggle_metric_utilities


class ParticipantVisibleError(Exception):
    pass


def kl_divergence(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    epsilon: float,
    micro_average: bool,
    sample_weights: Optional[pd.Series],
):
    # Overwrite solution for convenience
    for col in solution.columns:
        # Prevent issue with populating int columns with floats
        if not pandas.api.types.is_float_dtype(solution[col]):
            solution[col] = solution[col].astype(float)

        # Clip both the min and max following Kaggle conventions for related metrics like log loss.
        # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min
        # prevents users from playing games with the 20th decimal place of predictions.
        submission[col] = np.clip(submission[col], epsilon, 1 - epsilon)

        y_nonzero_indices = solution[col] != 0
        solution[col] = solution[col].astype(float)
        solution.loc[y_nonzero_indices, col] = solution.loc[y_nonzero_indices, col] * np.log(
            solution.loc[y_nonzero_indices, col] / submission.loc[y_nonzero_indices, col]
        )
        # Set the loss equal to zero where y_true equals zero following the scipy convention:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr
        solution.loc[~y_nonzero_indices, col] = 0

    if micro_average:
        return np.average(solution.sum(axis=1), weights=sample_weights)
    else:
        return np.average(solution.mean())
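
# Worked example (illustration only, not from the original source): for a single
# row with target [0, 1] and clipped prediction [0.1, 0.9], the loop above zeroes
# the target-0 cell and maps the target-1 cell to 1 * log(1 / 0.9) ~= 0.10536, the
# same value scipy.special.rel_entr([0, 1], [0.1, 0.9]).sum() would return.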


def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str,
    epsilon: float = 10**-15,
    micro_average: bool = True,
    sample_weights_column_name: Optional[str] = None,
) -> float:
"""The Kullback–Leibler divergence.
The KL divergence is technically undefined/infinite where the target equals zero.
This implementation always assigns those cases a score of zero; effectively removing them from consideration.
The predictions in each row must add to one so any probability assigned to a case where y == 0 reduces
another prediction where y > 0, so crucially there is an important indirect effect.
https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
solution: pd.DataFrame
submission: pd.DataFrame
epsilon: KL divergence is undefined for p=0 or p=1. If epsilon is not null, solution and submission probabilities are clipped to max(eps, min(1 - eps, p).
row_id_column_name: str
micro_average: bool. Row-wise average if True, column-wise average if False.
    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> score(pd.DataFrame({'id': range(4), 'ham': [0, 1, 1, 0], 'spam': [1, 0, 0, 1]}), pd.DataFrame({'id': range(4), 'ham': [.1, .9, .8, .35], 'spam': [.9, .1, .2, .65]}), row_id_column_name=row_id_column_name)
    0.216161...
    >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> submission = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> score(solution, submission, 'id')
    0.0
    >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> submission = pd.DataFrame({'id': range(3), 'ham': [0.2, 0.3, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.7, 0.2, 0]})
    >>> score(solution, submission, 'id')
    0.160531...
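
    A sketch of sample weighting (an added illustration; 'weight' is a hypothetical
    column name): weights are read from the solution and removed before scoring.
    Both rows below score ln(2), so the weighted average is also ln(2).

    >>> solution = pd.DataFrame({'id': range(2), 'weight': [1, 2], 'ham': [0, 1], 'spam': [1, 0]})
    >>> submission = pd.DataFrame({'id': range(2), 'ham': [0.5, 0.5], 'spam': [0.5, 0.5]})
    >>> score(solution, submission, 'id', sample_weights_column_name='weight')
    0.693147...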
"""
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    sample_weights = None
    if sample_weights_column_name:
        if sample_weights_column_name not in solution.columns:
            raise ParticipantVisibleError(
                f"{sample_weights_column_name} not found in solution columns"
            )
        sample_weights = solution.pop(sample_weights_column_name)

    if sample_weights_column_name and not micro_average:
        raise ParticipantVisibleError("Sample weights are only valid if `micro_average` is `True`")

    for col in solution.columns:
        if col not in submission.columns:
            raise ParticipantVisibleError(f"Missing submission column {col}")

    kaggle_metric_utilities.verify_valid_probabilities(solution, "solution")
    kaggle_metric_utilities.verify_valid_probabilities(submission, "submission")

    return kaggle_metric_utilities.safe_call_score(
        kl_divergence,
        solution,
        submission,
        epsilon=epsilon,
        micro_average=micro_average,
        sample_weights=sample_weights,
    )
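

if __name__ == "__main__":
    # Minimal, optional hook for exercising the doctests above (an addition for
    # local checking, not part of the original Kaggle metric). The expected
    # outputs use `...`, so the ELLIPSIS flag is required; the relative import
    # also means this file must be run as a module within its package.
    import doctest

    doctest.testmod(optionflags=doctest.ELLIPSIS)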