in tabular/src/autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py [0:0]
def transform(self, X):
    """Transform X using one-hot encoding.

    Encodes each feature into indicator columns, collapsing all infrequent
    categories of a feature (if any were identified at fit time) into a
    single shared column, and optionally dropping one column per feature
    according to ``self.drop``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to encode.

    Returns
    -------
    X_out : sparse matrix if sparse=True else a 2-d array
        Transformed input.
    """
    # np.array(...).tolist() round-trip converts all elements in X to the
    # same type (i.e. cannot mix floats, ints, and str).
    X = np.array(X).tolist()
    check_is_fitted(self, 'categories_')
    # validation of X happens in _check_X called by _transform
    # X_int: integer category codes per cell; X_mask: False where the cell
    # should produce no output column (e.g. unknown category was ignored).
    X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
    n_samples, n_features = X_int.shape

    # n_columns indicates, for each feature, how many columns are used in
    # X_trans. By default this corresponds to the number of categories, but
    # will differ if we drop some of them, or if there are infrequent
    # categories (all mapped to the same column)
    n_columns = [len(cats) for cats in self.categories_]
    for feature_idx in range(n_features):
        n_infrequent = self.infrequent_indices_[feature_idx].size
        if n_infrequent > 0:
            # Replace the n_infrequent individual columns with one shared
            # column: still add 1 for the infrequent column.
            n_columns[feature_idx] += 1 - n_infrequent
        if self.drop is not None:
            # if drop is not None we always drop one column in general,
            # except when drop is 'infrequent' and there is no infrequent
            # category.
            n_columns[feature_idx] -= 1
            if (isinstance(self.drop, str) and self.drop == 'infrequent'
                    and n_infrequent == 0):
                n_columns[feature_idx] += 1  # revert decrement from above

    if self.drop is not None:
        # NOTE(review): self.drop_idx_ is a fitted attribute set outside this
        # view — presumably one pre-drop column index per feature; verify
        # against fit().
        to_drop = self.drop_idx_.copy()
        if isinstance(self.drop, str):
            if self.drop == 'infrequent':
                for feature_idx in range(n_features):
                    if self.infrequent_indices_[feature_idx].size > 0:
                        # drop the infrequent column (i.e. the last one)
                        to_drop[feature_idx] = n_columns[feature_idx]
                    else:
                        # no infrequent category, use special marker -1
                        # so that no dropping happens for this feature
                        to_drop[feature_idx] = -1
        else:
            # self.drop is an array of categories. we need to remap the
            # dropped indexes if some of the categories are infrequent.
            # see _transform() for details about the mapping.
            for feature_idx in range(n_features):
                if self.infrequent_indices_[feature_idx].size > 0:
                    mapping = self._infrequent_mappings[feature_idx]
                    to_drop[feature_idx] = mapping[to_drop[feature_idx]]

        # We remove all the dropped categories from mask, and decrement
        # all categories that occur after them to avoid an empty column.
        to_drop = to_drop.reshape(1, -1)
        # Keep a cell unless its code equals the dropped column for that
        # feature; the -1 sentinel disables dropping for a feature.
        keep_cells = (X_int != to_drop) | (to_drop == -1)
        X_mask &= keep_cells
        X_int[(X_int > to_drop) & (to_drop != -1)] -= 1

    # Build the CSR triplet directly: each surviving cell contributes one 1.0
    # at (row, per-feature column offset + category code).
    mask = X_mask.ravel()
    n_values = np.array([0] + n_columns)
    feature_indices = np.cumsum(n_values)  # column offset of each feature
    indices = (X_int + feature_indices[:-1]).ravel()[mask]
    # indptr[i] = number of nonzeros before row i (per CSR convention).
    indptr = X_mask.sum(axis=1).cumsum()
    indptr = np.insert(indptr, 0, 0)
    data = np.ones(n_samples * n_features)[mask]

    out = sparse.csr_matrix((data, indices, indptr),
                            shape=(n_samples, feature_indices[-1]),
                            dtype=self.dtype)
    if not self.sparse:
        # Caller asked for a dense result (sparse=False on the encoder).
        return out.toarray()
    else:
        return out