in tabular/src/autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py [0:0]
def transform(self, X):
    """Transform X using one-hot encoding.

    Encodes each feature into indicator columns, collapsing all infrequent
    categories of a feature (if any were identified at fit time) into a
    single shared column, and optionally dropping one column per feature
    according to ``self.drop``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to encode.

    Returns
    -------
    X_out : sparse matrix if sparse=True else a 2-d array
        Transformed input.
    """
    # np.array(...).tolist() round-trip converts all elements in X to the
    # same type (i.e. cannot mix floats, ints, and str).
    X = np.array(X).tolist()
    check_is_fitted(self, 'categories_')
    # validation of X happens in _check_X called by _transform
    # X_int: integer category codes per cell; X_mask: False where the cell
    # should produce no output column (e.g. unknown category was ignored).
    X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
    n_samples, n_features = X_int.shape

    # n_columns indicates, for each feature, how many columns are used in
    # X_trans. By default this corresponds to the number of categories, but
    # will differ if we drop some of them, or if there are infrequent
    # categories (all mapped to the same column)
    n_columns = [len(cats) for cats in self.categories_]
    for feature_idx in range(n_features):
        n_infrequent = self.infrequent_indices_[feature_idx].size
        if n_infrequent > 0:
            # Replace the n_infrequent individual columns with one shared
            # column: still add 1 for the infrequent column.
            n_columns[feature_idx] += 1 - n_infrequent
        if self.drop is not None:
            # if drop is not None we always drop one column in general,
            # except when drop is 'infrequent' and there is no infrequent
            # category.
            n_columns[feature_idx] -= 1
            if (isinstance(self.drop, str) and self.drop == 'infrequent'
                    and n_infrequent == 0):
                n_columns[feature_idx] += 1  # revert decrement from above

    if self.drop is not None:
        # NOTE(review): self.drop_idx_ is a fitted attribute set outside this
        # view — presumably one pre-drop column index per feature; verify
        # against fit().
        to_drop = self.drop_idx_.copy()
        if isinstance(self.drop, str):
            if self.drop == 'infrequent':
                for feature_idx in range(n_features):
                    if self.infrequent_indices_[feature_idx].size > 0:
                        # drop the infrequent column (i.e. the last one)
                        to_drop[feature_idx] = n_columns[feature_idx]
                    else:
                        # no infrequent category, use special marker -1
                        # so that no dropping happens for this feature
                        to_drop[feature_idx] = -1
        else:
            # self.drop is an array of categories. we need to remap the
            # dropped indexes if some of the categories are infrequent.
            # see _transform() for details about the mapping.
            for feature_idx in range(n_features):
                if self.infrequent_indices_[feature_idx].size > 0:
                    mapping = self._infrequent_mappings[feature_idx]
                    to_drop[feature_idx] = mapping[to_drop[feature_idx]]

        # We remove all the dropped categories from mask, and decrement
        # all categories that occur after them to avoid an empty column.
        to_drop = to_drop.reshape(1, -1)
        # Keep a cell unless its code equals the dropped column for that
        # feature; the -1 sentinel disables dropping for a feature.
        keep_cells = (X_int != to_drop) | (to_drop == -1)
        X_mask &= keep_cells
        X_int[(X_int > to_drop) & (to_drop != -1)] -= 1

    # Build the CSR triplet directly: each surviving cell contributes one 1.0
    # at (row, per-feature column offset + category code).
    mask = X_mask.ravel()
    n_values = np.array([0] + n_columns)
    feature_indices = np.cumsum(n_values)  # column offset of each feature
    indices = (X_int + feature_indices[:-1]).ravel()[mask]
    # indptr[i] = number of nonzeros before row i (per CSR convention).
    indptr = X_mask.sum(axis=1).cumsum()
    indptr = np.insert(indptr, 0, 0)
    data = np.ones(n_samples * n_features)[mask]

    out = sparse.csr_matrix((data, indices, indptr),
                            shape=(n_samples, feature_indices[-1]),
                            dtype=self.dtype)
    if not self.sparse:
        # Caller asked for a dense result (sparse=False on the encoder).
        return out.toarray()
    else:
        return out