in src/sagemaker_sklearn_extension/preprocessing/encoders.py [0:0]
def fit(self, X, y):
    """Fit Weight of Evidence encoder to `X` and `y`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data to encode.

    y : array-like, shape (n_samples,)
        The binary target vector.

    Returns
    -------
    self : WOEEncoder
        The fitted encoder.
    """
    # Validate hyper-parameters.
    # NOTE(review): `assert` is stripped under `python -O`; kept as-is to
    # preserve the existing AssertionError contract with WOEAsserts messages.
    if self.binning:
        assert self.binning in ("uniform", "quantile", "kmeans"), WOEAsserts.BINNING
        # n_bins is only meaningful when binning is enabled.
        assert self.n_bins >= 2, WOEAsserts.NBINS
    assert self.alpha >= 0, WOEAsserts.ALPHA
    # Validate data (converts to 2-D numeric array, checks consistent lengths).
    X, y = check_X_y(X, y)
    # Keep track of number of features encoded so transform() can verify input width.
    self._dim = X.shape[1]
    # Recover the target categories and check there's only two.
    cat_y = np.unique(y)
    # It should be == 2 but relax to <= 2 for a single-sample test by check_estimator.
    assert len(cat_y) <= 2, WOEAsserts.BINARY
    # Additive (Laplace) smoothing term applied to both class counts; zero when
    # alpha == 0, i.e. no smoothing.
    beta = 2 * self.alpha * self.laplace
    # Mask of rows belonging to the first target class, and its count.
    # np.count_nonzero runs at C speed, unlike builtin sum() over a bool array.
    mask_y_0 = y == cat_y[0]
    count_y_0 = np.count_nonzero(mask_y_0)
    if self.binning:
        # Discretize continuous features to ordinal bin indices before WOE.
        self.binner_ = KBinsDiscretizer(n_bins=self.n_bins, strategy=self.binning, encode="ordinal")
        Xp = self.binner_.fit_transform(X)
    else:
        Xp = X
    # Compute the WOE statistics column by column.
    self.woe_pairs_ = [self._woe(col, count_y_0, mask_y_0, beta) for col in Xp.T]
    return self