in datawig/imputer.py [0:0]
def __persist_class_prototypes(self, iter_train, train_df):
    """
    Save mean feature pattern as self.__class_patterns for each label_encoder, for each label,
    for each data encoder, given by the projection of the feature matrix
    (items by ngrams/categories) onto the softmax outputs (items by labels).

    self.__class_patterns is a list of tuples of the form
    (column_encoder, feature-label-correlation-matrix).

    :param iter_train: training data iterator; it is reset and fully consumed here
    :param train_df: training data frame, re-encoded to obtain the feature matrices
    """
    # Only a single output column is supported; fall back to the first label encoder.
    if len(self.label_encoders) > 1:
        logger.warning('Persisting class prototypes works only for a single output column. '
                       'Choosing ' + str(self.label_encoders[0].output_column) + '.')
    label_name = self.label_encoders[0].output_column

    iter_train.reset()
    # class probabilities for every item (items x labels)
    p = self.__predict_mxnet_iter(iter_train)[label_name]
    # center and whiten the class probabilities
    p_normalized = StandardScaler().fit_transform(p)

    # Keep only data encoders whose features are suitable for explanation.
    # Only TfIdf and Categorical encodings are supported.
    explainable_data_encoders = []
    for encoder in self.data_encoders:
        if isinstance(encoder, (TfIdfEncoder, CategoricalEncoder)):
            explainable_data_encoders.append(encoder)
        else:
            logger.warning("Data encoder type {} incompatible for explaining classes".format(type(encoder)))

    # Encoded representations of the training data: one matrix per encoded column,
    # transposed to (features x items).
    X = [enc.transform(train_df).transpose() for enc in explainable_data_encoders]

    # Whiten the feature matrices. Centering is not supported for sparse matrices.
    # Doesn't do anything for categorical data where the shape is (1, num_items).
    X_scaled = [StandardScaler(with_mean=False).fit_transform(feature_matrix) for feature_matrix in X]

    # Compute the correlation between features and labels.
    class_patterns = []
    for feature_matrix_scaled, encoder in zip(X_scaled, explainable_data_encoders):
        if isinstance(encoder, TfIdfEncoder):
            # Project features onto labels and sum across items.
            # The item dimension of the feature matrix is truncated to the number of
            # predictions, because the iterator pads the last batch so that the item
            # count is a multiple of the batch size (see .start_padding in iterators.py).
            class_patterns.append(
                (encoder, feature_matrix_scaled[:, :p_normalized.shape[0]].dot(p_normalized)))
        else:
            # CategoricalEncoder -- the only other type admitted by the filter above.
            # Compute the mean class output for every input category.
            class_patterns.append((encoder, np.array(
                [np.sum(p_normalized[np.where(feature_matrix_scaled[0, :] == category)[0], :], axis=0)
                 for category in encoder.idx_to_token.keys()])))

    self.__class_patterns = class_patterns