def fit()

in src/sagemaker_sklearn_extension/preprocessing/encoders.py [0:0]
32 lines of code
10 McCabe index (conditional complexity)

    def fit(self, X, y=None):
        """Fit ThresholdOneHotEncoder to X.

        Runs the parent class ``fit`` and then overrides ``self.categories_`` for every feature
        under the following conditions:
         - include only values that appear at least ``threshold`` number of times
         - include at most the top ``self.max_categories`` most frequent categories to encode

        The effective threshold is derived from ``self.threshold``:
         - falsy (e.g. ``None`` or ``0``): ``max(10, n_samples / 1000)``
         - ``>= 1``: used as-is, as an absolute count
         - otherwise: treated as a fraction of the samples, ``ceil(self.threshold * n_samples)``

        If no value of a feature reaches the threshold, a single placeholder category (the largest
        observed value with ``___`` appended) is stored for that feature and a warning is issued.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature. This method reads ``X.shape``
            and indexes columns as ``X[:, j]``, so ``X`` must support both.
        y : optional
            Passed unchanged to the parent class ``fit``; not used otherwise by this method.

        Returns
        -------
        self : ThresholdOneHotEncoder

        Raises
        ------
        AssertionError
            If ``self.max_categories`` is not at least 1.
        """
        super().fit(X, y)
        # Raise explicitly rather than with an ``assert`` statement so the check is not stripped
        # under ``python -O``; AssertionError is kept so callers see the same exception type.
        if not self.max_categories >= 1:
            raise AssertionError("max_categories must be >= 1, got {!r}".format(self.max_categories))

        n_samples, n_features = X.shape

        if not self.threshold:
            threshold = max(10, n_samples / 1000)
        elif self.threshold >= 1:
            threshold = self.threshold
        else:
            threshold = ceil(self.threshold * n_samples)

        n_features_completely_under_threshold = 0

        for j in range(n_features):
            # get unique values and their counts; np.unique flattens its input and returns the
            # unique values already sorted, so no extra wrapping or sorting is needed
            items, counts = np.unique(np.asarray(X[:, j]), return_counts=True)

            # items that appear at least threshold times
            frequent_items = items[counts >= threshold]

            if frequent_items.size == 0:
                n_features_completely_under_threshold += 1
                # If no category reaches the threshold, then create an unknown category to prevent
                # self.transform() from raising an IndexError. `items` is sorted, so `items[-1]` is
                # the largest observed value.
                unknown_category = "{}___".format(items[-1])
                # It's important to keep the dtype of `self.categories_[j]` as 'U' here because our `unknown_category`
                # might end up being longer than any of the seen categories, and that changes the behavior of
                # the `self._transform` method.
                self.categories_[j] = np.asarray([unknown_category], dtype="U")
            elif frequent_items.size > self.max_categories:
                # Keep the `max_categories` most frequent values, ordered by descending count. The
                # stable sort leaves ties in their original (sorted-by-value) order. More values than
                # `max_categories` reach the threshold here, so every value kept also reaches it.
                most_frequent = np.argsort(-counts, kind="stable")[: self.max_categories]
                self.categories_[j] = items[most_frequent].astype("O")
            else:
                self.categories_[j] = frequent_items.astype("O")

        if n_features_completely_under_threshold > 0:
            # Report the threshold that was actually applied: `self.threshold` may be None or a
            # fraction, which would make the message misleading.
            times = "time" if threshold == 1 else "times"
            warnings.warn(
                "{} out of {} features do not have any categories appearing at least threshold={} {}.".format(
                    n_features_completely_under_threshold, n_features, threshold, times
                )
            )

        return self