in econml/_ortho_learner.py [0:0]
def _crossfit(model, folds, *args, **kwargs):
    """
    General crossfit based calculation of nuisance parameters.

    Parameters
    ----------
    model : object
        An object that supports fit and predict. Fit must accept all the args
        and the keyword arguments kwargs. Similarly predict must accept
        all the args as arguments and kwargs as keyword arguments. The fit
        function estimates a model of the nuisance function, based on the input
        data to fit. Predict evaluates the fitted nuisance function on the input
        data to predict. If the model also exposes a ``score`` method, it is
        called with the same arguments as predict.
    folds : list of tuples or None
        The crossfitting fold structure. Every entry in the list is a tuple whose
        first element are the training indices of the args and kwargs data and
        the second entry are the test indices. If the union of the test indices
        is not the full set of all indices, then the remaining nuisance parameters
        for the missing indices have value NaN. If folds is None, then cross fitting
        is not performed; all indices are used for both model fitting and prediction
    args : a sequence of (numpy matrices or None)
        Each matrix is a data variable whose first index corresponds to a sample
    kwargs : a sequence of key-value args, with values being (numpy matrices or None)
        Each keyword argument is of the form Var=x, with x a numpy array. Each
        of these arrays are data variables. The model fit and predict will be
        called with signature: `model.fit(*args, **kwargs)` and
        `model.predict(*args, **kwargs)`. Key-value arguments that have value
        None, are omitted from the two calls. So all the args and the non None
        kwargs variables must be part of the models signature.

    Returns
    -------
    nuisances : tuple of numpy matrices
        Each entry in the tuple is a nuisance parameter matrix. Each row i-th in the
        matrix corresponds to the value of the nuisance parameter for the i-th input
        sample.
    model_list : list of objects of same type as input model
        The cloned and fitted models for each fold. Can be used for inspection of the
        variability of the fitted models across folds.
    fitted_inds : np array1d
        The indices of the arrays for which the nuisance value was calculated. This
        corresponds to the union of the indices of the test part of each fold in
        the input fold list.
    scores : tuple of list of float or None
        The out-of-sample model scores for each nuisance model. With folds, this is
        None when the model has no ``score`` method. Without folds (``folds=None``),
        a tuple of singleton lists is always returned, and it is ``([None],)`` when
        the model has no ``score`` method.

    Raises
    ------
    ValueError
        If every entry of args and kwargs is None, so the number of samples
        cannot be determined.
    AttributeError
        If the fold structure is invalid: a fold whose train and test indices
        overlap, an index that appears in two test folds, or an empty fold list.

    Examples
    --------

    .. testcode::

        import numpy as np
        from sklearn.model_selection import KFold
        from sklearn.linear_model import Lasso
        from econml._ortho_learner import _crossfit
        class Wrapper:
            def __init__(self, model):
                self._model = model
            def fit(self, X, y, W=None):
                self._model.fit(X, y)
                return self
            def predict(self, X, y, W=None):
                return self._model.predict(X)
        np.random.seed(123)
        X = np.random.normal(size=(5000, 3))
        y = X[:, 0] + np.random.normal(size=(5000,))
        folds = list(KFold(2).split(X, y))
        model = Lasso(alpha=0.01)
        nuisance, model_list, fitted_inds, scores = _crossfit(Wrapper(model), folds, X, y, W=y, Z=None)

    >>> nuisance
    (array([-1.105728... , -1.537566..., -2.451827... , ...,  1.106287...,
           -1.829662..., -1.782273...]),)
    >>> model_list
    [<Wrapper object at 0x...>, <Wrapper object at 0x...>]
    >>> fitted_inds
    array([   0,    1,    2, ..., 4997, 4998, 4999])

    """
    model_list = []
    fitted_inds = []
    calculate_scores = hasattr(model, 'score')

    # remove None arguments
    kwargs = filter_none_kwargs(**kwargs)

    # The sample count is read from the first data array that is actually present.
    # Positional entries may be None and data may be passed by keyword only, so
    # neither `args[0]` nor indexing into `kwargs.items()` (a non-subscriptable
    # view in Python 3) is a safe way to find it.
    first_arr = next((var for var in args + tuple(kwargs.values()) if var is not None), None)
    if first_arr is None:
        raise ValueError("At least one non-None data array must be passed to determine the number of samples.")
    n_samples = first_arr.shape[0]

    if folds is None:  # skip crossfitting
        model_list.append(clone(model, safe=False))
        model_list[0].fit(*args, **kwargs)
        nuisances = model_list[0].predict(*args, **kwargs)
        scores = model_list[0].score(*args, **kwargs) if calculate_scores else None

        if not isinstance(nuisances, tuple):
            nuisances = (nuisances,)
        if not isinstance(scores, tuple):
            scores = (scores,)

        # scores entries should be lists of scores, so make each entry a singleton list
        scores = tuple([s] for s in scores)

        return nuisances, model_list, np.arange(n_samples), scores

    for idx, (train_idxs, test_idxs) in enumerate(folds):
        model_list.append(clone(model, safe=False))
        if len(np.intersect1d(train_idxs, test_idxs)) > 0:
            raise AttributeError("Invalid crossfitting fold structure. "
                                 "Train and test indices of each fold must be disjoint.")
        if len(np.intersect1d(fitted_inds, test_idxs)) > 0:
            raise AttributeError("Invalid crossfitting fold structure. The same index appears in two test folds.")
        fitted_inds = np.concatenate((fitted_inds, test_idxs))

        # Positional entries may be None and are passed through as None;
        # keyword entries with value None were already dropped above.
        args_train = tuple(var[train_idxs] if var is not None else None for var in args)
        args_test = tuple(var[test_idxs] if var is not None else None for var in args)

        kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()}
        kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()}

        model_list[idx].fit(*args_train, **kwargs_train)

        nuisance_temp = model_list[idx].predict(*args_test, **kwargs_test)

        if not isinstance(nuisance_temp, tuple):
            nuisance_temp = (nuisance_temp,)

        if idx == 0:
            # Allocate one NaN-filled output per nuisance, so that samples not
            # covered by any test fold keep the value NaN.
            nuisances = tuple([np.full((n_samples,) + nuis.shape[1:], np.nan) for nuis in nuisance_temp])

        for it, nuis in enumerate(nuisance_temp):
            nuisances[it][test_idxs] = nuis

        if calculate_scores:
            score_temp = model_list[idx].score(*args_test, **kwargs_test)

            if not isinstance(score_temp, tuple):
                score_temp = (score_temp,)

            if idx == 0:
                scores = tuple([] for _ in score_temp)

            for it, score in enumerate(score_temp):
                scores[it].append(score)

    if not model_list:
        # An empty fold list would otherwise fail below with an unbound `nuisances`;
        # report it with the same exception type as the other fold-structure checks.
        raise AttributeError("Invalid crossfitting fold structure. At least one fold is required.")

    # np.concatenate with the initial empty list yields a float array, hence the int cast.
    return nuisances, model_list, np.sort(fitted_inds.astype(int)), (scores if calculate_scores else None)