def get_best_column_from_partitions()

in misc/CCSynth/CC/DataInsights/src/prose/datainsights/_assertion/_decision_tree_assertions.py [0:0]


    def get_best_column_from_partitions(self):
        """Return the categorical column that is the best candidate to split on.

        A PCA assertion is first learnt on the whole data frame (the "root").
        Then, for every column in ``self.cat_columns`` that has more than one
        distinct value, one worker thread per distinct value runs
        ``self.assertion_learner_thread(col, value)``.

        Before the workers start, this method resets ``self.max_inv_count``,
        ``self.all_std_devs`` and ``self.child_count`` and creates a fresh
        ``self.lock``; after all workers are joined it reads those attributes
        back to score each candidate split.
        NOTE(review): the worker is defined elsewhere; it is presumably what
        fills these attributes and is expected to guard its writes with
        ``self.lock`` -- confirm, since the workers now really run concurrently.
        NOTE(review): an exception raised inside a worker is not re-raised
        here (``threading.Thread`` does not propagate it to ``join``).

        Returns:
            The column whose split gives the lowest average std-dev per
            assertion, provided that average is lower than the root's and the
            improvement is significant; ``None`` otherwise (including when
            ``self.max_inv_count`` is still 0 after the workers finish).
        """
        root_assertion = PcaAssertion.learn(
            df=self.df,
            max_col_in_slice=self.max_col_in_slice,
            slice_col_overlap=self.slice_col_overlap,
            max_row_in_slice=self.max_row_in_slice,
            use_const_term=self.use_const_term,
            standardize_pca=self.standardize_pca,
            max_self_violation=self.max_self_violation,
            cross_validate=self.cross_validate,
            n_fold=self.n_fold,
            num_invs_to_return=None,
        )

        # Fresh shared state for this round of workers.
        self.max_inv_count = 0
        self.all_std_devs = {}
        self.child_count = {}
        self.lock = threading.Lock()

        workers = []
        for col in self.cat_columns:
            # Compute the distinct values once per column.
            unique_values = self.df[col].unique()
            if len(unique_values) > 1:
                for value in unique_values:
                    # Pass the callable and its arguments separately: calling
                    # the learner here would run it synchronously in this
                    # thread and hand its return value to Thread as `group`.
                    workers.append(
                        threading.Thread(
                            target=self.assertion_learner_thread,
                            args=(col, value),
                        )
                    )

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()

        if self.max_inv_count == 0:
            # No child partition produced any invariant: nothing to compare.
            return None

        # "self.max_inv_count" is the maximum number of invariants found in any
        # child across any column split.
        # Considering the top "self.max_inv_count" invariants at the root and at
        # all the children, we pick the split that
        #   (1) minimizes the avg std_dev over all partitions (lower is better)
        #   (2) improves "significantly" on the root's avg std_dev, i.e.
        #       new_avg_std <= self.assertion_improvement_factor
        #                      * avg_std_per_assertion_at_root
        avg_std_per_assertion_at_root = sum(
            root_assertion.std_dev_all[: self.max_inv_count]
        ) / float(self.max_inv_count)

        best_col = None
        best_avg_std_per_assertion = avg_std_per_assertion_at_root
        for col in self.all_std_devs:
            avg_std_per_assertion_at_this_split = np.sum(
                self.all_std_devs[col][:, : self.max_inv_count]
            ) / float(self.child_count[col] * self.max_inv_count)
            if avg_std_per_assertion_at_this_split < best_avg_std_per_assertion:
                best_col = col
                best_avg_std_per_assertion = avg_std_per_assertion_at_this_split

        if (
            best_avg_std_per_assertion
            > self.assertion_improvement_factor * avg_std_per_assertion_at_root
            and root_assertion.get_inv_count() > 0
        ):
            # We are not seeing enough improvement in assertion quality with
            # any split, and the root already has some assertions of its own,
            # so prefer not to split.
            best_col = None

        return best_col