def __persist_class_prototypes()

in datawig/imputer.py [0:0]
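
Note: this is a private method of datawig's Imputer class. The snippet assumes that
numpy (np), sklearn's StandardScaler, and datawig's TfIdfEncoder and
CategoricalEncoder column encoders are imported in imputer.py, and that logger is
the module-level logger.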


    def __persist_class_prototypes(self, iter_train, train_df):
        """
        Save mean feature pattern as self.__class_patterns for each label_encoder, for each label, for each data encoder,
        given by the projection of the feature matrix (items by ngrams/categories)
        onto the softmax outputs (items by labels).
        self.__class_patterns is a list of tuples of the form (column_encoder, feature-label-correlation-matrix).
        """

        if len(self.label_encoders) > 1:
            logger.warning('Persisting class prototypes works only for a single output column. '
                           'Choosing {}.'.format(self.label_encoders[0].output_column))
        label_name = self.label_encoders[0].output_column

        iter_train.reset()
        p = self.__predict_mxnet_iter(iter_train)[label_name]  # class probabilities for every item (items x labels)

        # center and whiten the class probabilities so that the projections below
        # measure feature-label correlation
        p_normalized = StandardScaler().fit_transform(p)

        # Collect the data encoders whose features are suitable for explanation.
        # Only TfIdfEncoder and CategoricalEncoder are supported.
        explainable_data_encoders = []
        explainable_data_encoders_idx = []
        for encoder_idx, encoder in enumerate(self.data_encoders):
            if not isinstance(encoder, (TfIdfEncoder, CategoricalEncoder)):
                logger.warning("Data encoder type {} incompatible for explaining classes".format(type(encoder)))
            else:
                explainable_data_encoders.append(encoder)
                explainable_data_encoders_idx.append(encoder_idx)

        # encoded representations of the training data, transposed to
        # [features x items] for every explainable column
        X = [enc.transform(train_df).transpose() for enc in explainable_data_encoders]

        # Scale the feature matrices to unit variance; centering (with_mean) is not
        # supported for sparse matrices. This is a no-op for categorical data,
        # where the shape is (1, num_items).
        X_scaled = [StandardScaler(with_mean=False).fit_transform(feature_matrix) for feature_matrix in X]

        # compute correlation between features and labels
        class_patterns = []
        for feature_matrix_scaled, encoder in zip(X_scaled, explainable_data_encoders):
            if isinstance(encoder, TfIdfEncoder):
                # Project features onto labels and sum across items.
                # Limit the columns of the scaled feature matrix so that the item
                # dimensions match: the training iterator pads batches to a multiple
                # of the batch size (see .start_padding in iterators.py).
                class_patterns.append((encoder, feature_matrix_scaled[:, :p_normalized.shape[0]].dot(p_normalized)))
            elif isinstance(encoder, CategoricalEncoder):
                # for each categorical value, sum the whitened class probabilities
                # over all items that take that value
                class_patterns.append((encoder, np.array(
                        [np.sum(p_normalized[np.where(feature_matrix_scaled[0, :] == category)[0], :], axis=0)
                         for category in encoder.idx_to_token.keys()])))
            else:
                logger.warning("column encoder not supported for explain.")

        self.__class_patterns = class_patterns
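
For reference, here is a minimal standalone sketch of the same projection trick on
made-up data (all names below are illustrative, not part of datawig): whitening the
features and the class probabilities and taking their dot product yields a
feature-by-label correlation matrix.

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.random((100, 20))  # items x features, e.g. tf-idf weights
logits = rng.random((100, 3))
p = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # items x labels

# whiten the features (no centering, as required for sparse matrices) and the
# class probabilities
X_scaled = StandardScaler(with_mean=False).fit_transform(X)
p_normalized = StandardScaler().fit_transform(p)

# (features x items) . (items x labels) -> feature-label correlation matrix
patterns = X_scaled.transpose().dot(p_normalized)

# features most strongly associated with label 0, analogous to what an
# explain-style method would read off the stored class patterns
top_features_label_0 = np.argsort(patterns[:, 0])[::-1][:5]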