def group_counts_by_divide()

in causalml/inference/tree/uplift.pyx [0:0]


def group_counts_by_divide(
        col_vals, threshold_val, is_split_by_gt,
        np.ndarray[TR_TYPE_t, ndim=1] treatment_idx,
        np.ndarray[Y_TYPE_t, ndim=1] y,
        np.ndarray[N_TYPE_t, ndim=1] out_arr):
    '''
    Count sample size by experiment group for the left branch,
    after splitting col_vals by threshold_val.
    If is_split_by_gt, the left branch is (col_vals >= threshold_val),
    otherwise the left branch is (col_vals == threshold_val).

    This aims to combine the previous divideSet_len and
    group_uniqueCounts_to_arr into one function, so as to reduce the
    number of intermediate objects.

    Args
    ----
    col_vals : array-like, shape = [num_samples]
        An array containing one column of x values. It must support
        element-wise comparison against threshold_val and yield a
        1-d boolean/uint8 array.
    threshold_val : compatible value with col_vals
        A value for splitting col_vals.
        If is_split_by_gt, the left branch is (col_vals >= threshold_val),
        otherwise the left branch is (col_vals == threshold_val).
    is_split_by_gt : bool
        Whether to split by (col_vals >= threshold_val).
        If False, will split by (col_vals == threshold_val).
    treatment_idx : array-like, shape = [num_samples]
        An array containing the treatment group index for each unit.
        Should be of type numpy.int8 (the dtype must match TR_TYPE_t).
        Its length defines num_samples; every index t is assumed to
        satisfy 0 <= t < n_class and is not validated here.
    y : array-like, shape = [num_samples]
        An array containing the outcome of interest for each unit.
        Should be of type numpy.int8 (the dtype must match Y_TYPE_t).
        Values are assumed to be 0 or 1.
    out_arr : array-like, shape = [2 * n_class]
        An array to store the output counts, should have type numpy.int32
        (the dtype must match N_TYPE_t). Any previous content is
        overwritten.

    Returns
    -------
    len_X_l: the number of samples in the left branch.
    Also modify the out_arr to hold the negative and positive
    outcome sample sizes for each of the control and treatment groups.
        out_arr[2*i] is N(Y = 0, T = i) for i = 0, ..., n_class - 1
        out_arr[2*i+1] is N(Y = 1, T = i) for i = 0, ..., n_class - 1
    '''
    cdef int out_arr_len = out_arr.shape[0]
    # Floor division: keeps the halving in integer arithmetic no matter
    # which division semantics (language_level / cdivision) the module uses.
    cdef int n_class = out_arr_len // 2
    # num_samples is taken from treatment_idx; y and the split mask are
    # assumed to have (at least) the same length.
    cdef int num_samples = treatment_idx.shape[0]
    cdef int tv = 0
    cdef int i = 0
    cdef N_TYPE_t len_X_l = 0
    cdef np.ndarray[np.uint8_t, ndim=1, cast=True] filt

    # first clear the output, since the counts below are accumulated in place
    for i in range(out_arr_len):
        out_arr[i] = 0

    # split: filt marks the samples that go to the left branch
    if is_split_by_gt:
        filt = col_vals >= threshold_val
    else:
        filt = col_vals == threshold_val

    # Loop through treatment_idx and y, summing the counts where filt is
    # True; these are the counts for the left branch.
    # len_X_l is counted in the same pass.
    #
    # First accumulate N(T = i) at index 2*i and N(Y = 1, T = i) at index
    # 2*i + 1, and adjust the even entries afterwards.
    for i in range(num_samples):
        if filt[i]:
            len_X_l += 1
            tv = treatment_idx[i]
            # assume treatment index is in range [0, n_class)
            out_arr[2*tv] += 1
            # assume y is either 0 or 1, so this sum is N(Y = 1, T = tv)
            out_arr[2*tv + 1] += y[i]

    # adjust the entry at index 2*i to be
    # N(Y = 0, T = i) = N(T = i) - N(Y = 1, T = i)
    for i in range(n_class):
        out_arr[2*i] -= out_arr[2*i + 1]

    # done, out_arr has been modified in place
    return len_X_l