in econml/solutions/causal_analysis/_causal_analysis.py
def _process_feature(name, feat_ind, verbose, categorical_inds, categories, heterogeneity_inds, min_counts, y, X,
nuisance_models, h_model, random_state, model_y, cv, mc_iters):
try:
if verbose > 0:
print(f"CausalAnalysis: Feature {name}")
discrete_treatment = feat_ind in categorical_inds
if discrete_treatment:
cats = categories[categorical_inds.index(feat_ind)]
else:
cats = 'auto' # just leave the setting at the default otherwise
        # the transformation logic here is somewhat tricky; we always need to encode the categorical columns,
        # whether they end up in X or in W.  However, for the continuous columns, we want to scale them all
        # when running the first stage models, but don't want to scale the X columns when running the final model,
        # since then our coefficients would have odd units and our tree splits would be expressed in those scaled units.
#
# we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)
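        #
        # A hypothetical illustration of the split (column names made up): with columns
        # [age (cont), gender (cat), income (cont)] and income as the treatment, WX one-hot
        # encodes gender and scales age (income is dropped); if age is also a heterogeneity
        # column, X keeps age unscaled for the final model, while the frozen scaler below
        # re-applies the first-stage scaling inside the nuisance pipelines.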
hinds = heterogeneity_inds[feat_ind]
WX_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
[ind for ind in categorical_inds
if ind != feat_ind]),
('drop', 'drop', feat_ind)],
remainder=StandardScaler())
W_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
[ind for ind in categorical_inds
if ind != feat_ind and ind not in hinds]),
('drop', 'drop', hinds),
('drop_feat', 'drop', feat_ind)],
remainder=StandardScaler())
X_cont_inds = [ind for ind in hinds
if ind != feat_ind and ind not in categorical_inds]
# Use _ColumnTransformer instead of ColumnTransformer so we can get feature names
X_transformer = _ColumnTransformer([ind for ind in categorical_inds
if ind != feat_ind and ind in hinds],
X_cont_inds)
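        # X_transformer keeps only the heterogeneity columns: the categorical ones get one-hot
        # encoded while the continuous ones (X_cont_inds, placed first in its output; the scaler
        # below relies on this ordering) pass through unscaled, so the final model sees them in
        # their original units.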
        # WX contains the encoded and scaled versions of every column except the treatment itself,
        # i.e. all of the candidate inputs for the first stage models
WX = WX_transformer.fit_transform(X)
# can't use X[:, feat_ind] when X is a DataFrame
T = _safe_indexing(X, feat_ind, axis=1)
# TODO: we can't currently handle unseen values of the feature column when getting the effect;
# we might want to modify OrthoLearner (and other discrete treatment classes)
# so that the user can opt-in to allowing unseen treatment values
# (and return NaN or something in that case)
W = W_transformer.fit_transform(X)
X_xf = X_transformer.fit_transform(X)
# HACK: this is slightly ugly because we rely on the fact that DML passes [X;W] to the first stage models
# and so we can just peel the first columns off of that combined array for rescaling in the pipeline
        # TODO: consider adding an API to DML that allows for better understanding of how the nuisance inputs are
# built, such as model_y_feature_names, model_t_feature_names, model_y_transformer, etc., so that this
# becomes a valid approach to handling this
X_scaler = ColumnTransformer([('scale', StandardScaler(),
list(range(len(X_cont_inds))))],
remainder='passthrough').fit(np.hstack([X_xf, W])).named_transformers_['scale']
X_scaler_fixed = ColumnTransformer([('scale', _freeze(X_scaler),
list(range(len(X_cont_inds))))],
remainder='passthrough')
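        # Sketch of how the frozen scaler gets applied, relying on the [X;W] layout noted above:
        # DML hands np.hstack([X, W]) to each first stage model, so X_scaler_fixed rescales the
        # leading len(X_cont_inds) columns (the continuous heterogeneity features) with the
        # already-fit scaler and passes everything else through; re-fitting the frozen scaler
        # inside the pipeline should be a no-op (that's the point of _freeze).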
if W.shape[1] == 0:
# array checking routines don't accept 0-width arrays
W = None
if X_xf.shape[1] == 0:
X_xf = None
if verbose > 0:
print("CausalAnalysis: performing model selection on T model")
# perform model selection
model_t = (_first_stage_clf(WX, T, automl=nuisance_models == 'automl',
min_count=min_counts.get(feat_ind, None),
random_state=random_state, verbose=verbose)
if discrete_treatment else _first_stage_reg(WX, T, automl=nuisance_models == 'automl',
random_state=random_state,
verbose=verbose))
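        # Both helpers are assumed to perform cross-validated selection over several candidate
        # models when automl is enabled; min_count presumably reflects the size of the rarest
        # treatment category, so that the classifier's CV folds remain stratifiable.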
pipelined_model_t = Pipeline([('scale', X_scaler_fixed),
('model', model_t)])
pipelined_model_y = Pipeline([('scale', X_scaler_fixed),
('model', model_y)])
if X_xf is None and h_model == 'forest':
warnings.warn(f"Using a linear model instead of a forest model for feature '{name}' "
"because forests don't support models with no heterogeneity indices")
h_model = 'linear'
if h_model == 'linear':
est = LinearDML(model_y=pipelined_model_y,
model_t=pipelined_model_t,
discrete_treatment=discrete_treatment,
fit_cate_intercept=True,
linear_first_stages=False,
categories=cats,
random_state=random_state,
cv=cv,
mc_iters=mc_iters)
elif h_model == 'forest':
est = CausalForestDML(model_y=pipelined_model_y,
model_t=pipelined_model_t,
discrete_treatment=discrete_treatment,
n_estimators=4000,
min_var_leaf_on_val=True,
categories=cats,
random_state=random_state,
verbose=verbose,
cv=cv,
mc_iters=mc_iters)
if verbose > 0:
print("CausalAnalysis: tuning forest")
est.tune(y, T, X=X_xf, W=W)
if verbose > 0:
print("CausalAnalysis: training causal model")
est.fit(y, T, X=X_xf, W=W, cache_values=True)
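        # cache_values=True keeps the computed nuisances on the estimator, so that later
        # summaries can reuse them without re-running the first stage models.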
# Prefer ate__inference to const_marginal_ate_inference(X) because it is doubly-robust and not conservative
if h_model == 'forest' and discrete_treatment:
global_inference = est.ate__inference()
else:
# convert to NormalInferenceResults for consistency
inf = est.const_marginal_ate_inference(X=X_xf)
global_inference = NormalInferenceResults(d_t=inf.d_t, d_y=inf.d_y,
pred=inf.mean_point,
pred_stderr=inf.stderr_mean,
mean_pred_stderr=None,
inf_type='ate')
# Set the dictionary values shared between local and global summaries
if discrete_treatment:
cats = est.transformer.categories_[0]
baseline = cats[est.transformer.drop_idx_[0]]
cats = cats[np.setdiff1d(np.arange(len(cats)),
est.transformer.drop_idx_[0])]
d_t = len(cats)
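            # Worked example: categories ['a', 'b', 'c'] with drop_idx_ [0] give baseline 'a',
            # remaining levels ['b', 'c'], and d_t = 2 treatment columns.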
insights = {
_CausalInsightsConstants.TypeKey: ['cat'] * d_t,
_CausalInsightsConstants.RawFeatureNameKey: [name] * d_t,
_CausalInsightsConstants.CategoricalColumnKey: cats.tolist(),
_CausalInsightsConstants.EngineeredNameKey: [
f"{name} (base={baseline}): {c}" for c in cats]
}
treatment_value = 1
else:
d_t = 1
cats = ["num"]
baseline = None
insights = {
_CausalInsightsConstants.TypeKey: ["num"],
_CausalInsightsConstants.RawFeatureNameKey: [name],
_CausalInsightsConstants.CategoricalColumnKey: [name],
_CausalInsightsConstants.EngineeredNameKey: [name]
}
# calculate a "typical" treatment value, using the mean of the absolute value of non-zero treatments
treatment_value = np.mean(np.abs(T[T != 0]))
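            # e.g. T = [0, 2, -4, 0] yields treatment_value = mean(|2|, |-4|) = 3.0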
result = _result(feature_index=feat_ind,
feature_name=name,
feature_baseline=baseline,
feature_levels=cats,
hinds=hinds,
X_transformer=X_transformer,
W_transformer=W_transformer,
estimator=est,
global_inference=global_inference,
treatment_value=treatment_value)
return insights, result
except Exception as e:
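        # Return the exception instead of raising, presumably so a parallel caller can collect
        # per-feature failures and report them once every feature has been processed.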
return e