in causalml/inference/tree/uplift.pyx [0:0]
def group_counts_by_divide(
        col_vals, threshold_val, is_split_by_gt,
        np.ndarray[TR_TYPE_t, ndim=1] treatment_idx,
        np.ndarray[Y_TYPE_t, ndim=1] y,
        np.ndarray[N_TYPE_t, ndim=1] out_arr):
    '''
    Count sample size by experiment group for the left branch,
    after splitting col_vals by threshold_val.

    If is_split_by_gt, the left branch is (col_vals >= threshold_val),
    otherwise the left branch is (col_vals == threshold_val).

    This combines the split and the per-group counting into a single
    pass, so that no filtered copies of treatment_idx / y are created.

    Args
    ----
    col_vals : array-like, shape = [num_samples]
        An array containing one column of x values. The comparison
        against threshold_val must produce a 1-d boolean numpy array
        (it is assigned to a typed uint8 buffer), so in practice this
        should be a numpy array.
    threshold_val : compatible value with col_vals
        A value for splitting col_vals.
    is_split_by_gt : bool
        Whether to split by (col_vals >= threshold_val).
        If False, will split by (col_vals == threshold_val).
    treatment_idx : array-like, shape = [num_samples]
        An array containing the treatment group index for each unit.
        Its dtype must match TR_TYPE_t (expected to be numpy.int8).
        Values are NOT validated: each must lie in [0, n_class).
    y : array-like, shape = [num_samples]
        An array containing the outcome of interest for each unit.
        Its dtype must match Y_TYPE_t (expected to be numpy.int8).
        Values are assumed to be 0 or 1.
    out_arr : array-like, shape = [2 * n_class]
        An array to store the output counts; its dtype must match
        N_TYPE_t (expected to be numpy.int32). It is overwritten
        in place.

    Returns
    -------
    len_X_l : the number of samples in the left branch.
        Also modify the out_arr to hold the negative and positive
        outcome sample sizes for each of the control and treatment groups.
            out_arr[2*i] is N(Y = 0, T = i) for i = 0, ..., n_class - 1
            out_arr[2*i+1] is N(Y = 1, T = i) for i = 0, ..., n_class - 1
        If out_arr has odd length, its last element is only zeroed.
    '''
    # Py_ssize_t is the natural type for array sizes and indices.
    cdef Py_ssize_t out_arr_len = out_arr.shape[0]
    # Explicit floor division: '/' on C integers becomes true division
    # (a double) under language_level=3, which is not what is wanted here.
    cdef Py_ssize_t n_class = out_arr_len // 2
    cdef Py_ssize_t num_samples = treatment_idx.shape[0]
    cdef Py_ssize_t i = 0
    cdef int tv = 0
    cdef N_TYPE_t len_X_l = 0
    cdef np.ndarray[np.uint8_t, ndim=1, cast=True] filt

    # First clear the output so stale counts from a previous call
    # cannot leak into this one.
    for i in range(out_arr_len):
        out_arr[i] = 0

    # Build the boolean mask that selects the left branch.
    if is_split_by_gt:
        filt = col_vals >= threshold_val
    else:
        filt = col_vals == threshold_val

    # Single pass over the samples: for every unit in the left branch,
    # accumulate N(T = i) at index 2*i and N(Y = 1, T = i) at index 2*i+1.
    # The entry at 2*i is converted to N(Y = 0, T = i) afterwards.
    for i in range(num_samples):
        if filt[i] > 0:
            len_X_l += 1
            tv = treatment_idx[i]
            # No range check on tv: the caller guarantees it is a valid
            # group index for out_arr.
            out_arr[2*tv] += 1
            # y is assumed to be 0 or 1, so adding it counts positives.
            out_arr[2*tv + 1] += y[i]

    # N(Y = 0, T = i) = N(T = i) - N(Y = 1, T = i)
    for i in range(n_class):
        out_arr[2*i] -= out_arr[2*i + 1]

    # out_arr has been modified in place; return the left-branch size.
    return len_X_l