in src/sagemaker_sklearn_extension/preprocessing/base.py [0:0]
def fit(self, X, y=None):
    """Learn per-column quantile cutoffs and select the columns to transform.

    A column is selected when its lower quantile lies below
    ``mean - threshold_std * std`` or its upper quantile lies above
    ``mean + threshold_std * std``, with all statistics taken column-wise.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data array to fit on. Must be numeric, non-sparse, and two-dimensional.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : BaseExtremeValueTransformer
        The fitted instance, with ``n_input_features_``, ``quantiles_`` and
        ``cols_to_transform_`` set.

    Raises
    ------
    ValueError
        If ``self.quantile`` does not satisfy ``0 <= quantile <= 100``.
    """
    quantile = self.quantile
    # Keep the chained comparison: it also rejects NaN, which a
    # "< 0 or > 100" rewrite would silently accept.
    if not 0 <= quantile <= 100:
        message = "Parameter `quantile` {} is invalid. `quantile` must be an integer between 0 and 100".format(
            quantile
        )
        raise ValueError(message)

    X = check_array(X)
    self.n_input_features_ = X.shape[1]

    # Row 0 holds the lower cutoff per column, row 1 the upper cutoff.
    self.quantiles_ = np.percentile(X, [100 - quantile, quantile], axis=0)
    lower_cutoffs, upper_cutoffs = self.quantiles_[0], self.quantiles_[1]

    # Band of `threshold_std` standard deviations around each column mean.
    centers = np.mean(X, axis=0)
    half_widths = self.threshold_std * np.std(X, axis=0)
    band_low = centers - half_widths
    band_high = centers + half_widths

    # A column qualifies when either cutoff falls outside its band.
    outside_band = (lower_cutoffs < band_low) | (upper_cutoffs > band_high)
    self.cols_to_transform_ = np.flatnonzero(outside_band).tolist()
    return self