in causalml/inference/tree/uplift.pyx [0:0]
def divideSet_len(X, treatment_idx, y, column, value, missing_go_to_left=1):
    '''Split a tree node, reporting only the child sizes for X.

    Lightweight counterpart of divideSet(): the covariate matrix is not
    partitioned; only the number of rows that would land in each child
    is returned, together with the partitioned treatment indices and
    outcomes. Intended for scanning candidate splits cheaply; once the
    best split is known, X itself can be divided separately.

    A row goes to the left node when its entry in ``column`` is
    ``>= value`` (if ``value`` is a number) or ``== value`` (otherwise,
    e.g. for strings). All remaining rows go to the right node.

    Args
    ----
    X : ndarray, shape = [num_samples, num_features]
        An ndarray of the covariates used to train the uplift model.
    treatment_idx : array-like, shape = [num_samples]
        An array containing the treatment group index for each unit.
    y : array-like, shape = [num_samples]
        An array containing the outcome of interest for each unit.
    column : int
        The column used to split the data.
    value : float or int
        The value in the column for splitting the data.
    missing_go_to_left : bool or int, default 1
        Only consulted when the split column has a numeric dtype. If
        truthy, rows whose entry is NaN are sent to the left node;
        otherwise they are sent to the right node.

    Returns
    -------
    (len_X_l, len_X_r, treatment_l, treatment_r, y_l, y_r) : tuple
        Row counts of the left and right nodes, followed by the
        treatments and the outcomes of the left and right nodes.
    '''
    feature = X[:, column]

    # Numbers are split by threshold; anything else (e.g. strings) by equality.
    if isinstance(value, numbers.Number):
        goes_left = feature >= value
    else:
        goes_left = feature == value

    # Missing values are only routed explicitly for numeric dtypes;
    # np.isnan is not applied to other column types.
    if np.issubdtype(feature.dtype, np.number):
        is_missing = np.isnan(feature)
        goes_left = (
            (goes_left | is_missing)
            if missing_go_to_left
            else (goes_left & ~is_missing)
        )

    goes_right = ~goes_left
    n_left = np.sum(goes_left)
    n_right = len(X) - n_left

    return (
        n_left,
        n_right,
        treatment_idx[goes_left],
        treatment_idx[goes_right],
        y[goes_left],
        y[goes_right],
    )