def fit()

in src/sagemaker_sklearn_extension/preprocessing/encoders.py [0:0]
29 lines of code
9 McCabe index (conditional complexity)

    def fit(self, X, y=None):
        """Fit the RobustOrdinalEncoder to X.

        After the base ``_fit`` call, the categories of every feature are recomputed
        so that only values occurring at least ``threshold`` times are kept, and at
        most ``max_categories`` of the most frequent values are retained. Features
        for which no value reaches the threshold get a single placeholder category
        and their index is recorded in ``feature_idxs_no_categories_``.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature, assuming the input parameter categories equals 'auto'
        y : object, optional
            Not used by this method.

        Returns
        -------
        self

        Raises
        ------
        AssertionError
            If ``max_categories`` is not greater than or equal to 1.

        Warns
        -----
        UserWarning
            For every feature in which no value reaches the frequency threshold.

        """
        self._fit(X, handle_unknown="ignore")

        # Explicit raise instead of a bare `assert` so the validation is not stripped
        # when Python runs with -O. AssertionError is kept so callers observe the same
        # exception type as before.
        if not self.max_categories >= 1:
            raise AssertionError("max_categories must be >= 1, got {!r}".format(self.max_categories))

        self.feature_idxs_no_categories_ = []

        # NumPy integer scalars (e.g. np.int64) are not instances of `int`; accept them
        # as well, otherwise an integer cap would be silently ignored when threshold == 1.
        has_integer_cap = isinstance(self.max_categories, (int, np.integer))
        if not (has_integer_cap or self.threshold != 1):
            # Nothing to filter: keep the categories computed by self._fit().
            return self

        X_columns, n_samples, n_features = self._check_X(X)

        # Resolve the minimum number of occurrences a value needs to become a category.
        if self.threshold == "auto":
            threshold = max(10, n_samples / 1000)
        elif self.threshold >= 1:
            # Values >= 1 are used directly as an occurrence count.
            threshold = self.threshold
        else:
            # Values below 1 are scaled by the number of samples.
            threshold = ceil(self.threshold * n_samples)

        for i in range(n_features):
            column = X_columns[i]
            dtype = column.dtype
            # Counting is done on the string representation of the values; the kept
            # categories are cast back to the column dtype at the end.
            items, counts = np.unique(column.astype(str), return_counts=True)
            categories_to_encode = items[counts >= threshold].astype("O")
            if categories_to_encode.size == 0:
                # The comparison above is inclusive (>=) although the message says "more than";
                # the text is left unchanged.
                warnings.warn(
                    "feature at index {} does not have any categories appearing more than {} {}".format(
                        i, threshold, "time" if threshold == 1 else "times"
                    )
                )
                # If no category is above the threshold, create an unknown category to prevent
                # self._transform() from raising an IndexError
                # NOTE(review): for a numeric column dtype the final cast cannot convert
                # "unknown" and would raise ValueError - confirm whether numeric columns
                # can reach this branch.
                categories_to_encode = np.array(["unknown"])
                self.feature_idxs_no_categories_.append(i)
            if len(categories_to_encode) > self.max_categories:
                # Keep only the max_categories most frequent values. np.argsort's default
                # sort is not stable, so the choice among equally frequent values is not
                # guaranteed.
                most_freq_idxs = np.argsort(counts)[len(counts) - self.max_categories :]
                categories_to_encode = items[most_freq_idxs]
            self.categories_[i] = np.sort(categories_to_encode.astype(dtype))

        return self