in causalml/feature_selection/filters.py [0:0]
def _filter_D_one_feature(self, data, feature_name, y_name,
                          n_bins=10, method='KL', control_group='control',
                          experiment_group_column='treatment_group_key'):
    """
    Calculate the chosen bin-based divergence measure for one feature.

    The feature is discretized into quantile bins with ``pd.qcut``. The
    evaluation function selected by ``method`` is applied to the node
    summary of every bin and to the node summary of the full data set. The
    score is the size-weighted average of the per-bin values minus the value
    obtained on the full data set.

    Parameters
    ----------
    data (pd.Dataframe): DataFrame containing outcome, features, and experiment group
    feature_name (string): feature name, as one column in the data DataFrame
    y_name (string): name of the outcome variable
    n_bins (int, optional, default = 10): number of quantile bins the feature is cut into
    method (string, optional, default = 'KL'): taking one of the following values {'KL', 'ED', 'Chi'}
        The bin-based uplift filter method to be used to score the feature,
        dispatched to self._evaluate_KL, self._evaluate_ED and
        self._evaluate_Chi respectively.
    control_group (string, optional, default = 'control'): name for control group, value in the experiment group column
    experiment_group_column (string, optional, default = 'treatment_group_key'): the experiment column name in the DataFrame, which contains the treatment and control assignment label

    Returns
    ----------
    (pd.DataFrame): a one-row data frame with the columns 'feature',
        'method', 'score', 'p_value' (always None) and 'misc' (a string
        reporting the number of bins used)

    Raises
    ----------
    ValueError: if method is not one of 'KL', 'ED', 'Chi'. pd.qcut is called
        with duplicates='raise', so it also raises ValueError when the
        quantile bin edges of the feature are not unique.
    """
    # [TODO] Application to categorical features
    evaluator_names = {
        'KL': '_evaluate_KL',
        'ED': '_evaluate_ED',
        'Chi': '_evaluate_Chi',
    }
    # Validate before touching the data: previously an unsupported method
    # left the evaluation function unbound and failed later with an
    # unrelated-looking UnboundLocalError.
    if method not in evaluator_names:
        raise ValueError(
            "method must be one of {}, got {!r}".format(
                sorted(evaluator_names), method
            )
        )
    evaluationFunction = getattr(self, evaluator_names[method])

    totalSize = len(data.index)
    # labels=False makes qcut return integer bin indices instead of intervals.
    x_bin = pd.qcut(data[feature_name].values, n_bins, labels=False,
                    duplicates='raise')
    # Highest bin index + 1 is the number of bins actually produced.
    n_bins_found = x_bin.max() + 1

    d_children = 0
    for i_bin in range(n_bins_found):
        # _GetNodeSummary returns a sequence; element 1 is the per-group
        # summary dict consumed by the evaluation functions.
        nodeSummary = self._GetNodeSummary(
            data=data.loc[x_bin == i_bin],
            experiment_group_column=experiment_group_column, y_name=y_name
        )[1]
        nodeScore = evaluationFunction(nodeSummary,
                                       control_group=control_group)
        # Element 1 of every summary entry is summed to get the bin size.
        nodeSize = sum(x[1] for x in nodeSummary.values())
        # Weight each bin's score by its share of the rows.
        d_children += nodeScore * nodeSize / totalSize

    parentNodeSummary = self._GetNodeSummary(
        data=data, experiment_group_column=experiment_group_column, y_name=y_name
    )[1]
    d_parent = evaluationFunction(parentNodeSummary,
                                  control_group=control_group)

    # Gain of the binned split over the unsplit data.
    d_res = d_children - d_parent

    D_result = pd.DataFrame({
        'feature': feature_name,
        'method': method,
        'score': d_res,
        'p_value': None,
        'misc': 'number_of_bins: {}'.format(min(n_bins, n_bins_found)),
    }, index=[0]).reset_index(drop=True)

    return D_result