def learn()

in misc/CCSynth/CC/DataInsights/src/prose/datainsights/_assertion/_pca_assertions.py [0:0]


    def learn(self):
        """Learn invariants from the transformed input data.

        The input dataframe is passed through ``self._transform`` and then
        walked slice by slice (as produced by ``self._create_slices``). Rows
        containing NaN are dropped from each slice, and candidate invariant
        vectors are collected column-wise for as long as the slices are usable.
        Iteration stops at the first slice that is not flagged as a base slice,
        is empty after cleaning, or does not have more than twice as many rows
        as columns.

        Returns:
            A ``PcaAssertion`` built from the transformed dataframe, the
            collected invariant matrices (missing entries filled with 0), a
            zero-argument display callable, the transform function, the
            original input dataframe and its column names.
        """
        projected = self._transform(self._input_df)
        feature_index = projected.columns

        # Both accumulators start with no columns and are indexed by the
        # transformed column names; learned vectors are appended as columns.
        invs = pd.DataFrame(index=feature_index)
        invs_all = pd.DataFrame(index=feature_index)

        concat_options = dict(axis=1, ignore_index=True, copy=False, sort=False)

        # Per the original notes: the data is broken into overlapping chunks of
        # self._max_col_in_slice columns, and self._max_row_in_slice rows are
        # picked to learn the invariant while the rest are used for checking.
        for chunk, is_base in self._create_slices(projected):
            # Rows with NaN values cannot be used for learning.
            cleaned = chunk.dropna()

            # Require strictly more than twice as many rows as (numerical)
            # columns. The conditions are evaluated lazily, left to right.
            usable = (
                is_base
                and not cleaned.empty
                and cleaned.shape[0] > 2 * cleaned.shape[1]
            )
            if not usable:
                # The first unusable slice ends the learning pass entirely.
                break

            if self._cross_validate:
                self._compute_best_num_of_assertions_to_learn(cleaned)

            selected_vecs, every_vec = self._find_candidate_invariants(cleaned)
            invs = pd.concat((invs, selected_vecs), **concat_options)
            invs_all = pd.concat((invs_all, every_vec), **concat_options)

        # Entries left undefined by the column-wise concatenation become 0.
        invs.fillna(0, inplace=True)
        invs_all.fillna(0, inplace=True)

        def _render_invariant(col):
            # One invariant is shown as "coef*name + coef*name + ...", keeping
            # only the terms whose absolute coefficient exceeds EPS.
            terms = []
            for row in invs.index:
                coefficient = invs.loc[row, col]
                if np.fabs(coefficient) > EPS:
                    terms.append(
                        "{0}*{1}".format(
                            np.round(coefficient, COEFFICIENT_PRECISION), row
                        )
                    )
            return " + ".join(terms)

        def _render_all_invariants():
            return [_render_invariant(col) for col in invs.columns]

        def _render_trivial():
            # Nothing was learned, so the assertion is displayed as "true".
            return ["true"]

        # The display callable is chosen now, based on whether any invariant
        # column was collected.
        if invs.shape[1] > 0:
            advanced_inv_display = _render_all_invariants
        else:
            advanced_inv_display = _render_trivial

        return PcaAssertion(
            transformed_df=projected,
            display_func=advanced_inv_display,
            inv_matrix=invs,
            inv_matrix_all=invs_all,
            transform_func=self._transform,
            df=self._input_df,
            features=list(self._input_df.columns),
        )