in misc/CCSynth/CC/DataInsights/src/prose/datainsights/_assertion/_pca_assertions.py [0:0]
def learn(self):
    """Learn invariants from the transformed input data.

    The input dataframe is passed through ``self._transform`` and then
    processed slice by slice, as produced by ``self._create_slices``. For
    every slice that is flagged as a base slice, is non-empty after dropping
    rows containing NaN, and has more than twice as many rows as columns,
    candidate invariants are computed with
    ``self._find_candidate_invariants`` and appended column-wise to two
    accumulators. The first slice that fails any of those conditions ends
    the iteration.

    Returns:
        A ``PcaAssertion`` built from the transformed dataframe, both
        accumulated invariant matrices (missing entries filled with 0), a
        zero-argument display function, the transform function, the
        original input dataframe and its column names.
    """
    transformed_df = self._transform(self._input_df)
    feature_index = transformed_df.columns

    # Both accumulators start with one row per transformed column and no
    # columns; every learned slice contributes additional columns.
    invariants = pd.DataFrame(index=feature_index)
    invariants_all = pd.DataFrame(index=feature_index)

    def append_columns(accumulated, new_vectors):
        # Column-wise concatenation; columns are renumbered on every call.
        return pd.concat(
            (accumulated, new_vectors),
            axis=1,
            ignore_index=True,
            copy=False,
            sort=False,
        )

    # break the data into overlapping chunks of self._max_col_in_slice columns
    # pick self._max_row_in_slice rows from the dataframe to learn the invariant, and check the rest...
    for df_slice, is_base in self._create_slices(transformed_df):
        # remove rows with nan values
        clean_df_slice = df_slice.dropna()

        # Stop at the first slice that is not usable for learning: not a base
        # slice, empty after cleaning, or without more than twice as many
        # rows as (numerical) columns.
        if (
            not is_base
            or clean_df_slice.empty
            or clean_df_slice.shape[0] <= 2 * clean_df_slice.shape[1]
        ):
            break

        if self._cross_validate:
            self._compute_best_num_of_assertions_to_learn(clean_df_slice)

        slice_invs, slice_invs_all = self._find_candidate_invariants(
            clean_df_slice
        )
        invariants = append_columns(invariants, slice_invs)
        invariants_all = append_columns(invariants_all, slice_invs_all)

    # Entries left missing by the concatenation are treated as zero.
    invariants.fillna(0, inplace=True)
    invariants_all.fillna(0, inplace=True)

    def describe_invariant(col):
        # Render one invariant column as "coef*name + coef*name + ...",
        # leaving out coefficients whose magnitude does not exceed EPS.
        terms = []
        for row in invariants.index:
            coefficient = invariants.loc[row, col]
            if np.fabs(coefficient) > EPS:
                terms.append(
                    "{0}*{1}".format(
                        np.round(coefficient, COEFFICIENT_PRECISION), row
                    )
                )
        return " + ".join(terms)

    if invariants.shape[1] > 0:

        def advanced_inv_display():
            return [describe_invariant(col) for col in invariants.columns]

    else:

        def advanced_inv_display():
            # No invariant was learned.
            return ["true"]

    return PcaAssertion(
        transformed_df=transformed_df,
        display_func=advanced_inv_display,
        inv_matrix=invariants,
        inv_matrix_all=invariants_all,
        transform_func=self._transform,
        df=self._input_df,
        features=list(self._input_df.columns),
    )