in src/sagemaker_sklearn_extension/preprocessing/encoders.py [0:0]
def fit(self, X, y=None):
    """Fit ThresholdOneHotEncoder to X.

    Overrides ``self.categories_`` under the following conditions:
        - include values that appear at least ``threshold`` number of times
        - include the top ``self.max_categories`` number of categories to encode

    The count threshold that is actually applied is derived from ``self.threshold``:
        - falsy (e.g. ``None`` or ``0``): ``max(10, n_samples / 1000)``
        - ``>= 1``: used as-is, as an absolute number of occurrences
        - otherwise: treated as a fraction of the data, ``ceil(self.threshold * n_samples)``

    A feature in which no value reaches the threshold receives a single placeholder
    category (the last value in sorted order, suffixed with ``"___"``). After all
    features are processed, one warning reports how many such features were found.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to determine the categories of each feature. This method reads
        ``X.shape`` and indexes ``X[:, j]``, so ``X`` must support both
        (e.g. a 2-D numpy array).
    y : optional
        Not used here other than being forwarded to the parent class's ``fit``.

    Returns
    -------
    self : ThresholdOneHotEncoder
    """
    # Let the parent encoder run first; the loop below then overwrites the
    # per-feature entries of `self.categories_`.
    super().fit(X, y)
    assert self.max_categories >= 1, "max_categories must be at least 1"

    n_samples, n_features = X.shape

    # Resolve the configured threshold into an absolute occurrence count.
    if not self.threshold:
        threshold = max(10, n_samples / 1000)
    elif self.threshold >= 1:
        threshold = self.threshold
    else:
        threshold = ceil(self.threshold * n_samples)

    n_features_completely_under_threshold = 0

    for j in range(n_features):
        # get unique values and their counts (np.unique returns the values sorted)
        items, counts = np.unique([X[:, j]], return_counts=True)

        # add items that appear at least `threshold` times
        self.categories_[j] = items[counts >= threshold].astype("O")

        if self.categories_[j].size == 0:
            n_features_completely_under_threshold += 1
            # If no category is above the threshold, then create an unknown category to prevent
            # self.transform() from raising an IndexError.
            # `items` is already sorted, so `items[-1]` is the last value in sort order.
            unknown_category = "{}___".format(items[-1])
            # It's important to keep the dtype of `self.categories_[j]` as 'U' here because our `unknown_category`
            # might end up being longer than any of the seen categories, and that changes the behavior of
            # the `self._transform` method.
            self.categories_[j] = np.asarray([unknown_category], dtype="U")
        elif len(self.categories_[j]) > self.max_categories:
            # Too many categories passed the threshold: keep only the `max_categories` most frequent ones.
            # `sorted` is stable, so values with equal counts keep their sorted-by-value order.
            items_and_counts = dict(zip(items, counts))
            self.categories_[j] = np.asarray(
                sorted(items_and_counts, key=items_and_counts.get, reverse=True)[: self.max_categories], dtype="O"
            )

    if n_features_completely_under_threshold > 0:
        # Report the count threshold that was actually applied. `self.threshold` may be
        # None or a fraction, which would make the message read "threshold=None times".
        times = "time" if threshold == 1 else "times"
        warnings.warn(
            "{} out of {} features do not have any categories appearing more than threshold={} {}.".format(
                n_features_completely_under_threshold, n_features, threshold, times
            )
        )

    return self