in misc/CCSynth/CC/DataInsights/src/prose/datainsights/_assertion/_decision_tree_assertions.py [0:0]
def get_best_column_from_partitions(self):
    """Choose the categorical column to split on, or None if no split helps.

    A PCA assertion is first learnt on the whole of ``self.df`` (the "root").
    Then, for every column in ``self.cat_columns`` that has more than one
    distinct value, ``self.assertion_learner_thread(col, value)`` is run in
    its own thread for each distinct value. Those workers are expected to
    fill ``self.max_inv_count``, ``self.all_std_devs`` and
    ``self.child_count``, which this method resets before starting them.

    The column with the lowest average std-dev per assertion is selected,
    provided it beats the root's average; the choice is discarded when the
    improvement over the root is not significant (see the inline comment).

    Returns:
        The selected column, or ``None`` when no split is worthwhile.

    Note:
        Exceptions raised inside a worker thread are not re-raised here;
        ``threading.Thread`` reports them through ``threading.excepthook``.
    """
    root_assertion = PcaAssertion.learn(
        df=self.df,
        max_col_in_slice=self.max_col_in_slice,
        slice_col_overlap=self.slice_col_overlap,
        max_row_in_slice=self.max_row_in_slice,
        use_const_term=self.use_const_term,
        standardize_pca=self.standardize_pca,
        max_self_violation=self.max_self_violation,
        cross_validate=self.cross_validate,
        n_fold=self.n_fold,
        num_invs_to_return=None,
    )

    # Reset the state that the worker threads accumulate into.
    self.max_inv_count = 0
    self.all_std_devs = dict()
    self.child_count = dict()
    # NOTE(review): assumes assertion_learner_thread guards its updates to
    # the attributes above with self.lock -- confirm against that method.
    self.lock = threading.Lock()

    assertion_finding_thread = []
    for col in self.cat_columns:
        # Compute the distinct values once; a single-valued column cannot
        # partition the data, so it is skipped.
        unique_values = self.df[col].unique()
        if len(unique_values) > 1:
            for value in unique_values:
                # Pass the callable and its arguments separately. Calling
                # the method here would run it synchronously and hand its
                # return value to Thread as the ``group`` argument.
                assertion_finding_thread.append(
                    threading.Thread(
                        target=self.assertion_learner_thread,
                        args=(col, value),
                    )
                )
    for t in assertion_finding_thread:
        t.start()
    for t in assertion_finding_thread:
        t.join()

    if self.max_inv_count == 0:
        # No worker reported any invariant, so there is nothing to compare.
        best_col = None
    else:
        # "self.max_inv_count" is the maximum number of possible invariants in any child across any column split.
        # Considering the top "self.max_inv_count" invariants at the root and at all the children,
        # we pick the split that
        # (1) minimizes the avg std_dev over all partitions (the lower the avg std_devs are, the better)
        # (2) the "improvement" of (new) avg std_dev, compared to the root's (old) avg std_dev is "significant".
        # We consider an improvement significant if
        # new_avg_std <= self.assertion_improvement_factor * avg_std_per_assertion_at_root
        avg_std_per_assertion_at_root = sum(
            root_assertion.std_dev_all[: self.max_inv_count]
        ) / float(self.max_inv_count)
        best_col = None
        best_avg_std_per_assertion = avg_std_per_assertion_at_root
        for col in self.all_std_devs:
            # all_std_devs[col] is indexed as a 2-D array; presumably one
            # row per child partition -- TODO confirm against the worker.
            avg_std_per_assertion_at_this_split = np.sum(
                self.all_std_devs[col][:, : self.max_inv_count]
            ) / float(self.child_count[col] * self.max_inv_count)
            if avg_std_per_assertion_at_this_split < best_avg_std_per_assertion:
                best_col, best_avg_std_per_assertion = (
                    col,
                    avg_std_per_assertion_at_this_split,
                )
        if (
            best_avg_std_per_assertion
            > self.assertion_improvement_factor * avg_std_per_assertion_at_root
            and root_assertion.get_inv_count() > 0
        ):
            # We are not seeing enough improvement in assertion quality with any splitting,
            # where we already learnt some assertion at the root.
            # So, rather not split
            best_col = None
    return best_col