def _crossfit()

in econml/_ortho_learner.py [0:0]
46 lines of code
26 McCabe index (conditional complexity)

def _crossfit(model, folds, *args, **kwargs):
    """
    General crossfit based calculation of nuisance parameters.

    Parameters
    ----------
    model : object
        An object that supports fit and predict. Fit must accept all the args
        and the keyword arguments kwargs. Similarly predict must all accept
        all the args as arguments and kwards as keyword arguments. The fit
        function estimates a model of the nuisance function, based on the input
        data to fit. Predict evaluates the fitted nuisance function on the input
        data to predict.
    folds : list of tuples or None
        The crossfitting fold structure. Every entry in the list is a tuple whose
        first element are the training indices of the args and kwargs data and
        the second entry are the test indices. If the union of the test indices
        is not the full set of all indices, then the remaining nuisance parameters
        for the missing indices have value NaN.  If folds is None, then cross fitting
        is not performed; all indices are used for both model fitting and prediction
    args : a sequence of (numpy matrices or None)
        Each matrix is a data variable whose first index corresponds to a sample
    kwargs : a sequence of key-value args, with values being (numpy matrices or None)
        Each keyword argument is of the form Var=x, with x a numpy array. Each
        of these arrays are data variables. The model fit and predict will be
        called with signature: `model.fit(*args, **kwargs)` and
        `model.predict(*args, **kwargs)`. Key-value arguments that have value
        None, are ommitted from the two calls. So all the args and the non None
        kwargs variables must be part of the models signature.

    Returns
    -------
    nuisances : tuple of numpy matrices
        Each entry in the tuple is a nuisance parameter matrix. Each row i-th in the
        matrix corresponds to the value of the nuisance parameter for the i-th input
        sample.
    model_list : list of objects of same type as input model
        The cloned and fitted models for each fold. Can be used for inspection of the
        variability of the fitted models across folds.
    fitted_inds : np array1d
        The indices of the arrays for which the nuisance value was calculated. This
        corresponds to the union of the indices of the test part of each fold in
        the input fold list.
    scores : tuple of list of float or None
        The out-of-sample model scores for each nuisance model

    Examples
    --------

    .. testcode::

        import numpy as np
        from sklearn.model_selection import KFold
        from sklearn.linear_model import Lasso
        from econml._ortho_learner import _crossfit
        class Wrapper:
            def __init__(self, model):
                self._model = model
            def fit(self, X, y, W=None):
                self._model.fit(X, y)
                return self
            def predict(self, X, y, W=None):
                return self._model.predict(X)
        np.random.seed(123)
        X = np.random.normal(size=(5000, 3))
        y = X[:, 0] + np.random.normal(size=(5000,))
        folds = list(KFold(2).split(X, y))
        model = Lasso(alpha=0.01)
        nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), folds, X, y, W=y, Z=None)

    >>> nuisance
    (array([-1.105728... , -1.537566..., -2.451827... , ...,  1.106287...,
       -1.829662..., -1.782273...]),)
    >>> model_list
    [<Wrapper object at 0x...>, <Wrapper object at 0x...>]
    >>> fitted_inds
    array([   0,    1,    2, ..., 4997, 4998, 4999])

    """
    model_list = []
    fitted_inds = []
    calculate_scores = hasattr(model, 'score')

    # remove None arguments
    kwargs = filter_none_kwargs(**kwargs)

    if folds is None:  # skip crossfitting
        model_list.append(clone(model, safe=False))
        model_list[0].fit(*args, **kwargs)
        nuisances = model_list[0].predict(*args, **kwargs)
        scores = model_list[0].score(*args, **kwargs) if calculate_scores else None

        if not isinstance(nuisances, tuple):
            nuisances = (nuisances,)
        if not isinstance(scores, tuple):
            scores = (scores,)

        # scores entries should be lists of scores, so make each entry a singleton list
        scores = tuple([s] for s in scores)

        first_arr = args[0] if args else kwargs.items()[0][1]
        return nuisances, model_list, np.arange(first_arr.shape[0]), scores

    for idx, (train_idxs, test_idxs) in enumerate(folds):
        model_list.append(clone(model, safe=False))
        if len(np.intersect1d(train_idxs, test_idxs)) > 0:
            raise AttributeError("Invalid crossfitting fold structure." +
                                 "Train and test indices of each fold must be disjoint.")
        if len(np.intersect1d(fitted_inds, test_idxs)) > 0:
            raise AttributeError("Invalid crossfitting fold structure. The same index appears in two test folds.")
        fitted_inds = np.concatenate((fitted_inds, test_idxs))

        args_train = tuple(var[train_idxs] if var is not None else None for var in args)
        args_test = tuple(var[test_idxs] if var is not None else None for var in args)

        kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()}
        kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()}

        model_list[idx].fit(*args_train, **kwargs_train)

        nuisance_temp = model_list[idx].predict(*args_test, **kwargs_test)

        if not isinstance(nuisance_temp, tuple):
            nuisance_temp = (nuisance_temp,)

        if idx == 0:
            nuisances = tuple([np.full((args[0].shape[0],) + nuis.shape[1:], np.nan) for nuis in nuisance_temp])

        for it, nuis in enumerate(nuisance_temp):
            nuisances[it][test_idxs] = nuis

        if calculate_scores:
            score_temp = model_list[idx].score(*args_test, **kwargs_test)

            if not isinstance(score_temp, tuple):
                score_temp = (score_temp,)

            if idx == 0:
                scores = tuple([] for _ in score_temp)

            for it, score in enumerate(score_temp):
                scores[it].append(score)

    return nuisances, model_list, np.sort(fitted_inds.astype(int)), (scores if calculate_scores else None)