in bayesmark/space.py [0:0]
def encode(X, labels, assume_sorted=False, dtype=bool, assume_valid=False):
    """Perform one hot encoding of categorical data in :class:`numpy:numpy.ndarray` variable `X` of any dimension.

    Parameters
    ----------
    X : :class:`numpy:numpy.ndarray` of shape (...)
        Categorical values of any standard type. Vectorized to work for any dimensional `X`.
    labels : :class:`numpy:numpy.ndarray` of shape (n,)
        Complete list of all possible labels. When `assume_sorted` is false the list is passed through
        :func:`numpy:numpy.unique`, which sorts, de-duplicates and flattens it. When `assume_sorted` is true it is
        used as given and must already be 1 dimensional, sorted and unique.
    assume_sorted : bool
        If true, assume labels is already sorted and unique. This saves the computational cost of calling
        :func:`numpy:numpy.unique`.
    dtype : type
        Desired data type of feature array. One-hot is most logically `bool`, but feature matrices are usually
        `float`.
    assume_valid : bool
        If true, assume all elements of `X` are in the list `labels`. This saves the computational cost of verifying
        `X` are in `labels`. If true and a non-label `X` occurs this routine may silently give a bogus result.

    Returns
    -------
    Y : :class:`numpy:numpy.ndarray` of shape (..., n)
        One-hot encoding of `X`. Extra dimension is appended at end for the one-hot vector. It has data type `dtype`.

    Raises
    ------
    AssertionError
        If `assume_valid` is false and some element of `X` is not in `labels`. Being an ``assert``, this check is
        skipped when Python runs with assertions disabled (``-O``).
    """
    X = np.asarray(X)
    labels = np.asarray(labels) if assume_sorted else np.unique(labels)
    # Presumably rejects empty or non-1D label arrays (see check_array); the
    # code below relies on there being at least one label.
    check_array(labels, "labels", pre=True, ndim=1, min_size=1)

    # Position of each element of X within the sorted labels. For a value
    # larger than every label, searchsorted returns len(labels), which is one
    # past the last valid index.
    idx = np.searchsorted(labels, X)

    # If x is not even in labels then this will fail. This is not ValueError
    # because the user explicitly asked for this using argument assume_valid.
    # The index is clamped for the lookup only, so that a value above the
    # largest label fails this check instead of raising IndexError.
    assert assume_valid or np.all(
        np.asarray(labels[np.minimum(idx, len(labels) - 1)]) == X
    ), "X contains values that are not in labels"

    # This is using some pro np indexing technique to vectorize across all
    # possible input dimensions for X in the same code: the coordinates of
    # every element of X are paired with that element's label index.
    Y = np.zeros(X.shape + (len(labels),), dtype=dtype)
    Y[unravel_index(X.shape) + (idx.ravel(),)] = True
    return Y