in rules/predicate.py [0:0]
def filter(
    predicates: "Set[Predicate]",
    df: pd.DataFrame,
    pos: pd.DataFrame,
    predicate_relevance_threshold=0.01,
    minimum_relative_coverage=0,
):
    """
    Select the predicates worth considering for the seed set.

    This is a pre-processing step. A predicate is kept when the rows it
    matches show the trait of interest noticeably more often than the
    population as a whole, i.e. when

        matches_in_pos / matches_in_df - len(pos) / len(df)

    is strictly greater than ``predicate_relevance_threshold``.

    Parameters
    ----------
    predicates : Set[Predicate]
        Candidate predicates. Each one must provide ``num_positive(frame)``,
        which is used here as the number of rows of ``frame`` that the
        predicate matches.
    df : pd.DataFrame
        The whole population as tabular data.
    pos : pd.DataFrame
        The rows exhibiting the trait (e.g. mispredictions). Its row count is
        used as the number of positives in the population.
        NOTE(review): presumably a row subset of ``df`` - confirm with callers.
    predicate_relevance_threshold : float, optional
        Minimum amount by which a predicate's positive ratio must exceed the
        population's positive ratio (strict comparison).
    minimum_relative_coverage : float, optional
        Minimum share of the rows of ``df`` a predicate must match, given in
        percent. Predicates matching a smaller share are dropped.

    Returns
    -------
    filtered : Set[Predicate]
        The subset of ``predicates`` that passed both checks. Empty when
        ``df`` has no rows.
    """
    filtered = set()
    num_total = df.shape[0]
    if num_total == 0:
        # The population ratio is undefined without rows; bail out instead of
        # dividing by zero below.
        return filtered

    ratio_whole = pos.shape[0] / num_total
    # minimum_relative_coverage is given in %; convert once, outside the loop.
    min_coverage = minimum_relative_coverage / 100

    for predicate in predicates:
        num_pass = predicate.num_positive(df)
        if num_pass == 0:
            # Matches nothing: skip, which also protects the division below.
            continue
        if num_pass / num_total < min_coverage:
            continue
        num_tp = predicate.num_positive(pos)
        ratio_current_predicate = num_tp / num_pass
        if ratio_current_predicate - ratio_whole > predicate_relevance_threshold:
            filtered.add(predicate)
    return filtered