def fit()

in econml/grf/_base_grf.py [0:0]
127 lines of code
40 McCabe index (conditional complexity)

    def fit(self, X, T, y, *, sample_weight=None, **kwargs):
        """
        Build a forest of trees from the training set (X, T, y) and any other auxiliary variables.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples. Internally, its dtype will be converted
            to ``dtype=np.float64``.
        T : array-like of shape (n_samples, n_treatments)
            The treatment vector for each sample
        y : array-like of shape (n_samples,) or (n_samples, n_outcomes)
            The outcome values for each sample.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node.
        **kwargs : dictionary of array-like items of shape (n_samples, d_var)
            Auxiliary random variables that go into the moment function (e.g. instrument, censoring etc)
            Any of these variables will be passed on as is to the `get_pointJ` and
            `get_alpha` method of the children classes.

        Returns
        -------
        self : object
        """
        # TODO: support freq_weight and sample_var
        y, T, X, _ = check_inputs(y, T, X, W=None, multi_output_T=True, multi_output_Y=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

        # Remap output
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_y_ = y.shape[1]

        T = np.atleast_1d(T)
        if T.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            T = np.reshape(T, (-1, 1))

        alpha, pointJ = self._get_alpha_and_pointJ(X, T, y, **kwargs)
        self.n_outputs_, self.n_relevant_outputs_ = self._get_n_outputs_decomposition(X, T, y, **kwargs)
        yaug = np.hstack([y, alpha, pointJ])

        if getattr(yaug, "dtype", None) != DOUBLE or not yaug.flags.contiguous:
            yaug = np.ascontiguousarray(yaug, dtype=DOUBLE)

        if getattr(X, "dtype", None) != DTYPE:
            X = X.astype(DTYPE)

        # Get subsample sample size
        n_samples_subsample = _get_n_samples_subsample(
            n_samples=n_samples,
            max_samples=self.max_samples
        )

        # Converting `min_var_fraction_leaf` to an absolute `min_var_leaf` that the GRFTree can handle
        if self.min_var_fraction_leaf is None:
            self.min_var_leaf = None
        elif (not isinstance(self.min_var_fraction_leaf, numbers.Real)) or (not (0 < self.min_var_fraction_leaf <= 1)):
            msg = "`min_var_fraction_leaf` must be in range (0, 1) but got value {}"
            raise ValueError(msg.format(self.min_var_fraction_leaf))
        else:
            # We calculate the min eigenvalue proxy that each criterion is considering
            # on the overall mean jacobian, to determine the absolute level of `min_var_leaf`
            jac = np.mean(pointJ, axis=0).reshape((self.n_outputs_, self.n_outputs_))
            min_var = np.min(np.abs(np.diag(jac)))
            if self.criterion == 'mse':
                for i in range(self.n_outputs_):
                    for j in range(self.n_outputs_):
                        if j != i:
                            det = np.sqrt(np.abs(jac[i, i] * jac[j, j] - jac[i, j] * jac[j, i]))
                            if det < min_var:
                                min_var = det
            self.min_var_leaf = min_var * self.min_var_fraction_leaf

        # Check parameters
        self._validate_estimator()

        random_state = check_random_state(self.random_state)
        # We re-initialize the subsample_random_seed_ only if we are not in warm_start mode or
        # if this is the first `fit` call of the warm start mode.
        if (not self.warm_start) or (not hasattr(self, 'subsample_random_seed_')):
            self.subsample_random_seed_ = random_state.randint(MAX_INT)
        else:
            random_state.randint(MAX_INT)  # just advance random_state
        subsample_random_state = check_random_state(self.subsample_random_seed_)

        if (self.warm_start and hasattr(self, 'inference_') and (self.inference != self.inference_)):
            raise ValueError("Parameter inference cannot be altered in between `fit` "
                             "calls when `warm_start=True`.")
        self.inference_ = self.inference
        self.warm_start_ = self.warm_start

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []
            self.slices_ = []
            # the below are needed to replicate randomness of subsampling when warm_start=True
            self.slices_n_samples_ = []
            self.slices_n_samples_subsample_ = []
            self.n_samples_ = []
            self.n_samples_subsample_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True'
                             % (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
        else:
            if self.inference:
                if not isinstance(self.subforest_size, numbers.Integral):
                    raise ValueError("Parameter `subforest_size` must be "
                                     "an integer but got value {}.".format(self.subforest_size))
                if self.subforest_size < 2:
                    raise ValueError("Parameter `subforest_size` must be at least 2 if `inference=True`, "
                                     "but got value {}".format(self.subforest_size))
                if not (n_more_estimators % self.subforest_size == 0):
                    raise ValueError("The number of estimators to be constructed must be divisible "
                                     "the `subforest_size` parameter. Asked to build `n_estimators={}` "
                                     "with `subforest_size={}`.".format(n_more_estimators, self.subforest_size))
                if n_samples_subsample > n_samples // 2:
                    if isinstance(self.max_samples, numbers.Integral):
                        raise ValueError("Parameter `max_samples` must be in [1, n_samples // 2], "
                                         "if `inference=True`. "
                                         "Got values n_samples={}, max_samples={}".format(n_samples, self.max_samples))
                    else:
                        raise ValueError("Parameter `max_samples` must be in (0, .5], if `inference=True`. "
                                         "Got value {}".format(self.max_samples))

            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = [self._make_estimator(append=False,
                                          random_state=random_state).init()
                     for i in range(n_more_estimators)]

            if self.inference:
                if self.warm_start:
                    # Advancing subsample_random_state. Assumes each prior fit call has the same number of
                    # samples at fit time. If not then this would not exactly replicate a single batch execution,
                    # but would still advance randomness enough so that tree subsamples will be different.
                    for sl, n_, ns_ in zip(self.slices_, self.slices_n_samples_, self.slices_n_samples_subsample_):
                        subsample_random_state.choice(n_, n_ // 2, replace=False)
                        for _ in range(len(sl)):
                            subsample_random_state.choice(n_ // 2, ns_, replace=False)

                # Generating indices a priori before parallelism ended up being orders of magnitude
                # faster than how sklearn does it. The reason is that random samplers do not release the
                # gil it seems.
                n_groups = n_more_estimators // self.subforest_size
                new_slices = np.array_split(np.arange(len(self.estimators_),
                                                      len(self.estimators_) + n_more_estimators),
                                            n_groups)
                s_inds = []
                for sl in new_slices:
                    half_sample_inds = subsample_random_state.choice(n_samples, n_samples // 2, replace=False)
                    s_inds.extend([half_sample_inds[subsample_random_state.choice(n_samples // 2,
                                                                                  n_samples_subsample,
                                                                                  replace=False)]
                                   for _ in range(len(sl))])
            else:
                if self.warm_start:
                    # Advancing subsample_random_state. Assumes each prior fit call has the same number of
                    # samples at fit time. If not then this would not exactly replicate a single batch execution,
                    # but would still advance randomness enough so that tree subsamples will be different.
                    for _, n_, ns_ in zip(range(len(self.estimators_)), self.n_samples_, self.n_samples_subsample_):
                        subsample_random_state.choice(n_, ns_, replace=False)
                new_slices = []
                s_inds = [subsample_random_state.choice(n_samples, n_samples_subsample, replace=False)
                          for _ in range(n_more_estimators)]

            # Parallel loop: we prefer the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL
            # making threading more efficient than multiprocessing in
            # that case. However, for joblib 0.12+ we respect any
            # parallel_backend contexts set at a higher level,
            # since correctness does not rely on using threads.
            trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')(
                delayed(t.fit)(X[s], yaug[s], self.n_y_, self.n_outputs_, self.n_relevant_outputs_,
                               sample_weight=sample_weight[s] if sample_weight is not None else None,
                               check_input=False)
                for t, s in zip(trees, s_inds))

            # Collect newly grown trees
            self.estimators_.extend(trees)
            self.n_samples_.extend([n_samples] * len(trees))
            self.n_samples_subsample_.extend([n_samples_subsample] * len(trees))
            self.slices_.extend(list(new_slices))
            self.slices_n_samples_.extend([n_samples] * len(new_slices))
            self.slices_n_samples_subsample_.extend([n_samples_subsample] * len(new_slices))

        return self