# in src/sagemaker_sklearn_extension/preprocessing/encoders.py
import numpy as np


def _woe(self, x, count_y_0, mask_y_0, beta):
    """Return the categories for a feature vector `x` as well as the corresponding
    weight of evidence value for each of those categories.

    Parameters
    ----------
    x : vector, shape (n_samples,)
        Feature vector to encode.
    count_y_0 : int
        Number of observations with the first target category.
    mask_y_0 : vector, shape (n_samples,)
        Mask of observations with the first target category.
    beta : float
        Value to use for Laplace smoothing (0 if `laplace` is False).

    Returns
    -------
    codex : dict
        Mapping from each unique category in `x` to its index in `woe`.
    woe : ndarray, shape (n_categories,)
        Weight of evidence value for each category in `codex`.
    """
    cat_x = np.unique(x)
    # complementary mask and count for the second target category
    mask_y_1 = np.logical_not(mask_y_0)
    count_y_1 = len(mask_y_0) - count_y_0
    # Computation of the Weight of Evidence for a category c in cat_x, with
    # regularization α:
    #
    #   woe_c = log( { #(y==0 | x==c) + α } / { #(y==1 | x==c) + α } *
    #                { #(y==1) + β } / { #(y==0) + β } )
    #
    # where β = 2α if laplace == True, 0 otherwise. The second factor can be
    # computed once; call it `r10`, so that
    #
    #   woe_c = log( r10 * ratio(c) )
    #
    # where
    #
    #   ratio(c) = { #(y==0 | x==c) + α } / { #(y==1 | x==c) + α }
    #
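    # As an illustrative check (toy numbers, an assumption for this comment
    # rather than a case from the library): with x = ["a", "a", "b", "b", "b"],
    # y = [0, 1, 0, 0, 1], α = 0.5 and β = 0 (laplace == False), we have
    # count_y_0 = 3 and count_y_1 = 2, so r10 = 2/3. For c == "b",
    # ratio("b") = (2 + 0.5) / (1 + 0.5) = 5/3, hence
    # woe_b = log(2/3 * 5/3) = log(10/9) ≈ 0.105.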
    def ratio(c):
        x_c = x == c
        # number of observations with y == 0 (resp. y == 1) in category c
        y_0_c = np.count_nonzero(np.logical_and(mask_y_0, x_c))
        y_1_c = np.count_nonzero(np.logical_and(mask_y_1, x_c))
        # regularize both counts with alpha so that an empty cell neither
        # zeroes the numerator nor divides by zero
        return (y_0_c + self.alpha) / (y_1_c + self.alpha)
    # compute the WoE values, with Laplace smoothing when beta > 0
    r10 = (count_y_1 + beta) / (count_y_0 + beta)
    woe = np.log(r10 * np.array([ratio(c) for c in cat_x]))
    # encoder mapping each unique value of x to its index in `woe`
    codex = {c: i for (i, c) in enumerate(cat_x)}
    return codex, woe
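

# A minimal usage sketch (hedged: the `WOEEncoder` name, its constructor
# arguments, and calling the private `_woe` helper directly are assumptions
# for illustration; in the library the helper would normally run inside the
# encoder's `fit`):
#
#     x = np.array(["a", "a", "b", "b", "b"])
#     y = np.array([0, 1, 0, 0, 1])
#     mask_y_0 = y == 0
#     encoder = WOEEncoder(alpha=0.5, laplace=False)
#     codex, woe = encoder._woe(x, count_y_0=3, mask_y_0=mask_y_0, beta=0.0)
#     woe[codex["b"]]  # log(2/3 * 5/3) = log(10/9) ≈ 0.105, as worked out above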