in econml/solutions/causal_analysis/_causal_analysis.py
def _process_feature(name, feat_ind, verbose, categorical_inds, categories, heterogeneity_inds, min_counts, y, X,
nuisance_models, h_model, random_state, model_y, cv, mc_iters):
try:
if verbose > 0:
print(f"CausalAnalysis: Feature {name}")
discrete_treatment = feat_ind in categorical_inds
if discrete_treatment:
cats = categories[categorical_inds.index(feat_ind)]
else:
cats = 'auto' # just leave the setting at the default otherwise
        # the transformation logic here is somewhat tricky; we always need to encode the categorical columns,
        # whether they end up in X or in W.  However, for the continuous columns, we want to scale them all
        # when running the first stage models, but don't want to scale the X columns when running the final model,
        # since then our coefficients would have odd units and our tree splits would be expressed in those scaled units.
#
# we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)
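        #
        # A hypothetical illustration of the split (column names made up): with columns
        # [age (cont), gender (cat), income (cont)] and income as the treatment, WX one-hot
        # encodes gender and scales age (income is dropped); if age is also a heterogeneity
        # column, X keeps age unscaled for the final model, while the frozen scaler below
        # re-applies the first-stage scaling inside the nuisance pipelines.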
hinds = heterogeneity_inds[feat_ind]
WX_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
[ind for ind in categorical_inds
if ind != feat_ind]),
('drop', 'drop', feat_ind)],
remainder=StandardScaler())
W_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
[ind for ind in categorical_inds
if ind != feat_ind and ind not in hinds]),
('drop', 'drop', hinds),
('drop_feat', 'drop', feat_ind)],
remainder=StandardScaler())
X_cont_inds = [ind for ind in hinds
if ind != feat_ind and ind not in categorical_inds]
# Use _ColumnTransformer instead of ColumnTransformer so we can get feature names
X_transformer = _ColumnTransformer([ind for ind in categorical_inds
if ind != feat_ind and ind in hinds],
X_cont_inds)
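        # X_transformer keeps only the heterogeneity columns: the categorical ones get one-hot
        # encoded while the continuous ones (X_cont_inds, placed first in its output; the scaler
        # below relies on this ordering) pass through unscaled, so the final model sees them in
        # their original units.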
        # WX contains the encoded and scaled versions of every column except the treatment itself,
        # i.e. all of the candidate inputs for the first stage models
WX = WX_transformer.fit_transform(X)
# can't use X[:, feat_ind] when X is a DataFrame
T = _safe_indexing(X, feat_ind, axis=1)
# TODO: we can't currently handle unseen values of the feature column when getting the effect;
# we might want to modify OrthoLearner (and other discrete treatment classes)
# so that the user can opt-in to allowing unseen treatment values
# (and return NaN or something in that case)
W = W_transformer.fit_transform(X)
X_xf = X_transformer.fit_transform(X)
# HACK: this is slightly ugly because we rely on the fact that DML passes [X;W] to the first stage models
# and so we can just peel the first columns off of that combined array for rescaling in the pipeline
        # TODO: consider adding an API to DML that allows for better understanding of how the nuisance inputs are
# built, such as model_y_feature_names, model_t_feature_names, model_y_transformer, etc., so that this
# becomes a valid approach to handling this
X_scaler = ColumnTransformer([('scale', StandardScaler(),
list(range(len(X_cont_inds))))],
remainder='passthrough').fit(np.hstack([X_xf, W])).named_transformers_['scale']
X_scaler_fixed = ColumnTransformer([('scale', _freeze(X_scaler),
list(range(len(X_cont_inds))))],
remainder='passthrough')
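        # Sketch of how the frozen scaler gets applied, relying on the [X;W] layout noted above:
        # DML hands np.hstack([X, W]) to each first stage model, so X_scaler_fixed rescales the
        # leading len(X_cont_inds) columns (the continuous heterogeneity features) with the
        # already-fit scaler and passes everything else through; re-fitting the frozen scaler
        # inside the pipeline should be a no-op (that's the point of _freeze).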
if W.shape[1] == 0:
# array checking routines don't accept 0-width arrays
W = None
if X_xf.shape[1] == 0:
X_xf = None
if verbose > 0:
print("CausalAnalysis: performing model selection on T model")
# perform model selection
model_t = (_first_stage_clf(WX, T, automl=nuisance_models == 'automl',
min_count=min_counts.get(feat_ind, None),
random_state=random_state, verbose=verbose)
if discrete_treatment else _first_stage_reg(WX, T, automl=nuisance_models == 'automl',
random_state=random_state,
verbose=verbose))
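        # Both helpers are assumed to perform cross-validated selection over several candidate
        # models when automl is enabled; min_count presumably reflects the size of the rarest
        # treatment category, so that the classifier's CV folds remain stratifiable.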
pipelined_model_t = Pipeline([('scale', X_scaler_fixed),
('model', model_t)])
pipelined_model_y = Pipeline([('scale', X_scaler_fixed),
('model', model_y)])
if X_xf is None and h_model == 'forest':
warnings.warn(f"Using a linear model instead of a forest model for feature '{name}' "
"because forests don't support models with no heterogeneity indices")
h_model = 'linear'
if h_model == 'linear':
est = LinearDML(model_y=pipelined_model_y,
model_t=pipelined_model_t,
discrete_treatment=discrete_treatment,
fit_cate_intercept=True,
linear_first_stages=False,
categories=cats,
random_state=random_state,
cv=cv,
mc_iters=mc_iters)
elif h_model == 'forest':
est = CausalForestDML(model_y=pipelined_model_y,
model_t=pipelined_model_t,
discrete_treatment=discrete_treatment,
n_estimators=4000,
min_var_leaf_on_val=True,
categories=cats,
random_state=random_state,
verbose=verbose,
cv=cv,
mc_iters=mc_iters)
if verbose > 0:
print("CausalAnalysis: tuning forest")
est.tune(y, T, X=X_xf, W=W)
if verbose > 0:
print("CausalAnalysis: training causal model")
est.fit(y, T, X=X_xf, W=W, cache_values=True)
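        # cache_values=True keeps the computed nuisances on the estimator, so that later
        # summaries can reuse them without re-running the first stage models.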
# Prefer ate__inference to const_marginal_ate_inference(X) because it is doubly-robust and not conservative
if h_model == 'forest' and discrete_treatment:
global_inference = est.ate__inference()
else:
# convert to NormalInferenceResults for consistency
inf = est.const_marginal_ate_inference(X=X_xf)
global_inference = NormalInferenceResults(d_t=inf.d_t, d_y=inf.d_y,
pred=inf.mean_point,
pred_stderr=inf.stderr_mean,
mean_pred_stderr=None,
inf_type='ate')
# Set the dictionary values shared between local and global summaries
if discrete_treatment:
cats = est.transformer.categories_[0]
baseline = cats[est.transformer.drop_idx_[0]]
cats = cats[np.setdiff1d(np.arange(len(cats)),
est.transformer.drop_idx_[0])]
d_t = len(cats)
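            # Worked example: categories ['a', 'b', 'c'] with drop_idx_ [0] give baseline 'a',
            # remaining levels ['b', 'c'], and d_t = 2 treatment columns.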
insights = {
_CausalInsightsConstants.TypeKey: ['cat'] * d_t,
_CausalInsightsConstants.RawFeatureNameKey: [name] * d_t,
_CausalInsightsConstants.CategoricalColumnKey: cats.tolist(),
_CausalInsightsConstants.EngineeredNameKey: [
f"{name} (base={baseline}): {c}" for c in cats]
}
treatment_value = 1
else:
d_t = 1
cats = ["num"]
baseline = None
insights = {
_CausalInsightsConstants.TypeKey: ["num"],
_CausalInsightsConstants.RawFeatureNameKey: [name],
_CausalInsightsConstants.CategoricalColumnKey: [name],
_CausalInsightsConstants.EngineeredNameKey: [name]
}
# calculate a "typical" treatment value, using the mean of the absolute value of non-zero treatments
treatment_value = np.mean(np.abs(T[T != 0]))
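            # e.g. T = [0, 2, -4, 0] yields treatment_value = mean(|2|, |-4|) = 3.0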
result = _result(feature_index=feat_ind,
feature_name=name,
feature_baseline=baseline,
feature_levels=cats,
hinds=hinds,
X_transformer=X_transformer,
W_transformer=W_transformer,
estimator=est,
global_inference=global_inference,
treatment_value=treatment_value)
return insights, result
except Exception as e:
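        # Return the exception instead of raising, presumably so a parallel caller can collect
        # per-feature failures and report them once every feature has been processed.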
return e