def filter()

in rules/predicate.py [0:0]


def filter(predicates: Set[Predicate], df: pd.DataFrame, pos: pd.DataFrame,
           predicate_relevance_threshold: float = 0.01,
           minimum_relative_coverage: float = 0) -> Set[Predicate]:
    """
    Select the predicates whose matching rows are enriched in positives.

    This is a pre-processing step to identify a subset of interesting
    predicates to consider in the seed set. A predicate is kept when the
    share of positives among the rows it matches exceeds the share of
    positives in the whole population by more than
    ``predicate_relevance_threshold``.

    NOTE: the function name shadows the builtin ``filter`` in this module;
    it is kept unchanged for the sake of existing callers.

    Parameters
    ----------
    predicates : Set[Predicate]
        Candidate predicates to evaluate. Each one must provide
        ``num_positive(frame)``, which is presumably the number of rows of
        ``frame`` that satisfy the predicate (verify against ``Predicate``).

    df : pd.DataFrame
        Tabular data for the whole population; its row count is the
        population size.

    pos : pd.DataFrame
        Rows treated as positive examples; its row count is used as the
        number of positives in the population (presumably the subset of
        ``df`` for which the target trait holds).

    predicate_relevance_threshold : float, default 0.01
        Minimum amount by which a predicate's positive ratio must exceed
        the population's positive ratio. The comparison is strict.

    minimum_relative_coverage : float, default 0
        Minimum share of ``df`` rows, given in percent, that a predicate
        must match to be considered at all.

    Returns
    -------
    filtered : Set[Predicate]
        The subset of ``predicates`` that passed both checks. Empty when
        ``df`` has no rows.
    """
    filtered = set()
    num_total = df.shape[0]
    if num_total == 0:
        # With no rows there is no baseline ratio to compare against (it
        # would be a division by zero) and no predicate can match anything.
        return filtered

    num_pos = pos.shape[0]
    ratio_whole = num_pos / num_total
    # minimum_relative_coverage is given in %; convert once, outside the loop.
    min_coverage_fraction = minimum_relative_coverage / 100

    for predicate in predicates:
        num_pass = predicate.num_positive(df)
        if num_pass == 0:
            # Nothing matched: the ratio below would be undefined.
            continue
        if num_pass / num_total < min_coverage_fraction:
            continue

        num_tp = predicate.num_positive(pos)
        ratio_current_predicate = num_tp / num_pass

        # Keep only predicates that beat the population baseline by more
        # than the requested margin.
        if ratio_current_predicate - ratio_whole > predicate_relevance_threshold:
            filtered.add(predicate)
    return filtered