in econml/policy/_forest/_forest.py [0:0]
def fit(self, X, y, *, sample_weight=None, **kwargs):
"""
Build a forest of trees from the training set (X, y) and any other auxiliary variables.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples. Internally, its dtype will be converted
to ``dtype=np.float64``.
y : array-like of shape (n_samples,) or (n_samples, n_treatments)
The outcome values for each sample and for each treatment.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted. Splits
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node.
**kwargs : dictionary of array-like items of shape (n_samples, d_var)
Auxiliary random variables
Returns
-------
self : object
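
        Examples
        --------
        A minimal sketch with synthetic data (illustrative only; assumes this
        estimator is :class:`econml.policy.PolicyForest`)::

            import numpy as np
            from econml.policy import PolicyForest

            X = np.random.normal(size=(500, 4))
            y = np.random.normal(size=(500, 2))  # one reward column per treatment
            forest = PolicyForest(n_estimators=100).fit(X, y)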
"""
X, y = check_X_y(X, y, multi_output=True)
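        # Validate sample_weight (if given) and coerce it to a float64 array of shape (n_samples,).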
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)
# Remap output
n_samples, self.n_features_ = X.shape
y = np.atleast_1d(y)
if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # which y[:, np.newaxis] would not
            y = np.reshape(y, (-1, 1))
self.n_outputs_ = y.shape[1]
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)
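        # Ensure X has the dtype expected by the underlying Cython tree code.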
if getattr(X, "dtype", None) != DTYPE:
X = X.astype(DTYPE)
# Get subsample sample size
n_samples_subsample = _get_n_samples_subsample(
n_samples=n_samples,
max_samples=self.max_samples
)
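        # (Assumed helper semantics: an integer ``max_samples`` is used as the subsample
        # size directly, while a float is interpreted as a fraction of ``n_samples``.)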
# Check parameters
self._validate_estimator()
random_state = check_random_state(self.random_state)
# We re-initialize the subsample_random_seed_ only if we are not in warm_start mode or
# if this is the first `fit` call of the warm start mode.
if (not self.warm_start) or (not hasattr(self, 'subsample_random_seed_')):
self.subsample_random_seed_ = random_state.randint(MAX_INT)
else:
random_state.randint(MAX_INT) # just advance random_state
subsample_random_state = check_random_state(self.subsample_random_seed_)
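        # subsample_random_state now deterministically reproduces the per-tree subsample
        # draws, which lets warm starts replay earlier draws before taking new ones.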
if not self.warm_start or not hasattr(self, "estimators_"):
# Free allocated memory, if any
self.estimators_ = []
self.slices_ = []
            # The lists below are needed to replicate the randomness of subsampling when warm_start=True.
self.n_samples_ = []
self.n_samples_subsample_ = []
n_more_estimators = self.n_estimators - len(self.estimators_)
if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger than or equal to '
                             'len(estimators_)=%d when warm_start==True'
                             % (self.n_estimators, len(self.estimators_)))
elif n_more_estimators == 0:
warn("Warm-start fitting without increasing n_estimators does not "
"fit new trees.", UserWarning)
else:
if self.warm_start and len(self.estimators_) > 0:
# We draw from the random state to get the random state we
# would have got if we hadn't used a warm_start.
random_state.randint(MAX_INT, size=len(self.estimators_))
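            # Construct the new, as-yet-unfitted trees, each seeded from random_state.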
trees = [self._make_estimator(append=False,
random_state=random_state).init()
for i in range(n_more_estimators)]
if self.warm_start:
                # Advance subsample_random_state. Assumes each prior fit call had the same number of
                # samples at fit time. If not, this would not exactly replicate a single batch
                # execution, but it would still advance the randomness enough that the tree
                # subsamples will differ.
for _, n_, ns_ in zip(range(len(self.estimators_)), self.n_samples_, self.n_samples_subsample_):
subsample_random_state.choice(n_, ns_, replace=False)
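            # Draw an independent subsample of indices (without replacement) for each new tree.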
s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False)
for _ in range(n_more_estimators)]
        # Parallel loop: we use the threading backend because the Cython code
        # for fitting the trees releases the Python GIL internally, which
        # makes threading more efficient than multiprocessing in this case.
        # (The backend is fixed here, so outer parallel_backend contexts are
        # not consulted.)
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')(
delayed(t.fit)(X[s], y[s],
sample_weight=sample_weight[s] if sample_weight is not None else None,
check_input=False)
for t, s in zip(trees, s_inds))
# Collect newly grown trees
self.estimators_.extend(trees)
self.n_samples_.extend([n_samples] * len(trees))
self.n_samples_subsample_.extend([n_samples_subsample] * len(trees))
return self
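
    # Illustrative warm-start sketch (not part of the library's own examples; reuses
    # X, y from the docstring example above and assumes this estimator is
    # ``PolicyForest`` with ``warm_start`` enabled):
    #
    #     forest = PolicyForest(n_estimators=100, warm_start=True).fit(X, y)
    #     forest.n_estimators = 200
    #     forest.fit(X, y)  # constructs and fits only the 100 additional trees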