in src/sagemaker_sklearn_extension/preprocessing/encoders.py [0:0]
def fit(self, X, y=None):
    """Fit the RobustOrdinalEncoder to X.

    Delegates to ``self._fit`` and then, when an integer ``max_categories`` is set or
    ``threshold`` differs from 1, replaces each entry of ``self.categories_`` with the
    sorted categories that occur at least ``threshold`` times, capped at the
    ``max_categories`` most frequent ones.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to determine the categories of each feature, assuming the input parameter categories equals 'auto'
    y : ignored
        Not used by this method.

    Returns
    -------
    self

    Raises
    ------
    AssertionError
        If ``max_categories`` is not greater than or equal to 1.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``; performed
    # before ``_fit`` so an invalid setting never leaves a partially fitted estimator.
    # AssertionError is kept so callers observe the same exception type as before.
    if not self.max_categories >= 1:
        raise AssertionError("max_categories must be at least 1, got {!r}".format(self.max_categories))

    self._fit(X, handle_unknown="ignore")
    self.feature_idxs_no_categories_ = []

    # NumPy integer scalars are not ``int`` instances, so both are accepted as a cap.
    has_integer_cap = isinstance(self.max_categories, (int, np.integer))
    if not has_integer_cap and self.threshold == 1:
        # No frequency filtering requested: keep whatever ``_fit`` stored.
        return self

    X_columns, n_samples, n_features = self._check_X(X)

    # Resolve the minimum number of occurrences a category needs to be kept.
    if self.threshold == "auto":
        threshold = max(10, n_samples / 1000)
    elif self.threshold >= 1:
        threshold = self.threshold
    else:
        # Values below 1 are a fraction of the number of samples.
        threshold = ceil(self.threshold * n_samples)

    for i in range(n_features):
        column = X_columns[i]
        dtype = column.dtype
        # Count on the string form of the values; cast back to ``dtype`` at the end.
        items, counts = np.unique(column.astype(str), return_counts=True)
        categories_to_encode = items[counts >= threshold].astype("O")
        if categories_to_encode.size == 0:
            warnings.warn(
                "feature at index {} does not have any categories appearing more than {} {}".format(
                    i, threshold, "time" if threshold == 1 else "times"
                )
            )
            # If no category is above the threshold, create an unknown category to prevent
            # self._transform() from raising an IndexError
            # NOTE(review): for a numeric ``dtype`` the cast of this placeholder below
            # raises ValueError - confirm how such columns are expected to be handled.
            categories_to_encode = np.array(["unknown"])
            self.feature_idxs_no_categories_.append(i)
        if len(categories_to_encode) > self.max_categories:
            # More categories passed the threshold than allowed: keep the most frequent.
            most_freq_idxs = np.argsort(counts)[len(counts) - self.max_categories :]
            categories_to_encode = items[most_freq_idxs]
        self.categories_[i] = np.sort(categories_to_encode.astype(dtype))
    return self