in causalml/metrics/visualize.py [0:0]
def get_cumlift(df, outcome_col='y', treatment_col='w', treatment_effect_col='tau',
                random_seed=42):
    """Get average uplifts of model estimates in cumulative population.

    If the true treatment effect is provided (e.g. in synthetic data), it's calculated
    as the mean of the true treatment effect in each of cumulative population.
    Otherwise, it's calculated as the difference between the mean outcomes of the
    treatment and control groups in each of cumulative population.

    For details, see Section 4.1 of Gutierrez and G{\'e}rardy (2016), `Causal Inference
    and Uplift Modeling: A review of the literature`.

    For the former, `treatment_effect_col` should be provided. For the latter, both
    `outcome_col` and `treatment_col` should be provided.

    Every column of `df` other than `outcome_col`, `treatment_col` and
    `treatment_effect_col` is treated as a model estimate. Ten uniformly random score
    columns are generated as a baseline; their curves are averaged into a single column
    named by the module-level `RANDOM_COL`.

    Args:
        df (pandas.DataFrame): a data frame with model estimates and actual data as columns
        outcome_col (str, optional): the column name for the actual outcome
        treatment_col (str, optional): the column name for the treatment indicator (0 or 1)
        treatment_effect_col (str, optional): the column name for the true treatment effect
        random_seed (int, optional): random seed for the random baseline scores

    Returns:
        (pandas.DataFrame): average uplifts of model estimates in cumulative population.
            The index runs from 0 (an all-zero anchor row) to the number of rows in `df`;
            there is one column per model estimate plus the `RANDOM_COL` baseline.

    Raises:
        AssertionError: if `df` has neither `treatment_effect_col` nor both of
            `outcome_col` and `treatment_col`.
    """
    has_outcome = (outcome_col in df.columns) and (treatment_col in df.columns)
    has_effect = treatment_effect_col in df.columns
    if not (has_outcome or has_effect):
        # Raised explicitly instead of via `assert` so the check is not stripped under
        # `python -O`; the exception type is the one the assert used to produce.
        raise AssertionError(
            'df must contain either `treatment_effect_col` or both `outcome_col` '
            'and `treatment_col`'
        )

    df = df.copy()

    # A private RandomState yields the same draws as np.random.seed() followed by
    # np.random.rand(), without reseeding the process-wide generator.
    rng = np.random.RandomState(random_seed)
    random_cols = ['__random_{}__'.format(i) for i in range(10)]
    for random_col in random_cols:
        df[random_col] = rng.rand(df.shape[0])

    model_names = [x for x in df.columns
                   if x not in (outcome_col, treatment_col, treatment_effect_col)]

    lift = []
    for col in model_names:
        # Rank from the same untouched frame every time so that one model's curve never
        # depends on the ordering left behind by the previously processed column.
        ranked = df.sort_values(col, ascending=False).reset_index(drop=True)
        ranked.index = ranked.index + 1  # 1-based index == cumulative population size

        if has_effect:
            # When treatment_effect_col is given, use it to calculate the average
            # treatment effects of cumulative population.
            lift.append(ranked[treatment_effect_col].cumsum() / ranked.index)
        else:
            # When treatment_effect_col is not given, use outcome_col and treatment_col
            # to calculate the average treatment effects of cumulative population.
            # Intermediates stay local so they cannot overwrite a model column.
            treated = ranked[treatment_col]
            n_treated = treated.cumsum()
            n_control = ranked.index.values - n_treated
            y_treated = (ranked[outcome_col] * treated).cumsum()
            y_control = (ranked[outcome_col] * (1 - treated)).cumsum()
            # A group with no members yet gives a non-finite ratio here; NaN values are
            # filled by the interpolation below.
            lift.append(y_treated / n_treated - y_control / n_control)

    lift = pd.concat(lift, join='inner', axis=1)
    lift.columns = model_names

    # Anchor every curve at zero for an empty population, then fill gaps linearly.
    lift.loc[0] = np.zeros((lift.shape[1], ))
    lift = lift.sort_index().interpolate()

    # Collapse the random baselines into a single averaged curve.
    lift[RANDOM_COL] = lift[random_cols].mean(axis=1)
    lift.drop(random_cols, axis=1, inplace=True)

    return lift