# in src/sagemaker_sklearn_extension/preprocessing/encoders.py
import numpy as np


def _woe(self, x, count_y_0, mask_y_0, beta):
    """Return the categories for a feature vector `x` as well as the corresponding
    weight of evidence value for each of those categories.

    Parameters
    ----------
    x : vector, shape (n_samples,)
        Feature vector to encode.
    count_y_0 : int
        Number of observations with the first target category.
    mask_y_0 : vector, shape (n_samples,)
        Mask of observations with the first target category.
    beta : float
        Value to use for Laplace smoothing (0 if `laplace` is False).

    Returns
    -------
    codex : dict
        Mapping from each unique category in `x` to its index in `woe`.
    woe : ndarray, shape (n_categories,)
        Weight of evidence value for each category in `codex`.
    """
    cat_x = np.unique(x)
    # complementary mask and count for the second target category
    mask_y_1 = np.logical_not(mask_y_0)
    count_y_1 = len(mask_y_0) - count_y_0
    # Computation of the Weight of Evidence for a category c in cat_x, with
    # regularization α:
    #
    #   woe_c = log( { #(y==0 | x==c) + α } / { #(y==1 | x==c) + α } *
    #                { #(y==1) + β } / { #(y==0) + β } )
    #
    # where β = 2α if laplace == True, 0 otherwise. The second factor can be
    # computed once; call it `r10`, so that
    #
    #   woe_c = log( r10 * ratio(c) )
    #
    # where
    #
    #   ratio(c) = { #(y==0 | x==c) + α } / { #(y==1 | x==c) + α }
    #
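    # As an illustrative check (toy numbers, an assumption for this comment
    # rather than a case from the library): with x = ["a", "a", "b", "b", "b"],
    # y = [0, 1, 0, 0, 1], α = 0.5 and β = 0 (laplace == False), we have
    # count_y_0 = 3 and count_y_1 = 2, so r10 = 2/3. For c == "b",
    # ratio("b") = (2 + 0.5) / (1 + 0.5) = 5/3, hence
    # woe_b = log(2/3 * 5/3) = log(10/9) ≈ 0.105.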
    def ratio(c):
        x_c = x == c
        # number of observations with y == 0 (resp. y == 1) in category c
        y_0_c = np.count_nonzero(np.logical_and(mask_y_0, x_c))
        y_1_c = np.count_nonzero(np.logical_and(mask_y_1, x_c))
        # regularize both counts with alpha so that an empty cell neither
        # zeroes the numerator nor divides by zero
        return (y_0_c + self.alpha) / (y_1_c + self.alpha)
    # compute the WoE values, with Laplace smoothing when beta > 0
    r10 = (count_y_1 + beta) / (count_y_0 + beta)
    woe = np.log(r10 * np.array([ratio(c) for c in cat_x]))
    # encoder mapping each unique value of x to its index in `woe`
    codex = {c: i for (i, c) in enumerate(cat_x)}
    return codex, woe
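

# A minimal usage sketch (hedged: the `WOEEncoder` name, its constructor
# arguments, and calling the private `_woe` helper directly are assumptions
# for illustration; in the library the helper would normally run inside the
# encoder's `fit`):
#
#     x = np.array(["a", "a", "b", "b", "b"])
#     y = np.array([0, 1, 0, 0, 1])
#     mask_y_0 = y == 0
#     encoder = WOEEncoder(alpha=0.5, laplace=False)
#     codex, woe = encoder._woe(x, count_y_0=3, mask_y_0=mask_y_0, beta=0.0)
#     woe[codex["b"]]  # log(2/3 * 5/3) = log(10/9) ≈ 0.105, as worked out above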