def tree_node_summary_to_arr()

in causalml/inference/tree/uplift.pyx [0:0]


    def tree_node_summary_to_arr(np.ndarray[TR_TYPE_t, ndim=1] treatment_idx,
                                 np.ndarray[Y_TYPE_t, ndim=1] y,
                                 np.ndarray[P_TYPE_t, ndim=1] out_summary_p,
                                 np.ndarray[N_TYPE_t, ndim=1] out_summary_n,
                                 np.ndarray[N_TYPE_t, ndim=1] buf_count_arr,
                                 np.ndarray[P_TYPE_t, ndim=1] parentNodeSummary_p,
                                 int has_parent_summary,
                                 min_samples_treatment=10, n_reg=100
                                 ):
        '''
        Tree node summary statistics.
        Modified from tree_node_summary, to use different format for the summary.
        Instead of [[P(Y=1|T=0), N(T=0)], [P(Y=1|T=1), N(T=1)], ...],
        use two arrays [N(T=i)...] and [P(Y=1|T=i)...].

        Args
        ----
        treatment_idx : array-like, shape = [num_samples]
            An array containing the treatment group index for each unit.
            Has type numpy.int8.
        y : array-like, shape = [num_samples]
            An array containing the outcome of interest for each unit.
            Has type numpy.int8.
        out_summary_p : array of shape [n_class]
            Has type numpy.double.
            To be filled with the positive probabilities of each of the control
            and treatment groups of the current node.
        out_summary_n : array of shape [n_class]
            Has type numpy.int32.
            To be filled with the counts of each of the control
            and treatment groups of the current node.
        buf_count_arr : array of shape [2*n_class]
            Has type numpy.int32.
            To be used as temporary buffer for group_uniqueCounts_to_arr.
            n_class is derived from its length as len(buf_count_arr) // 2.
        parentNodeSummary_p : array of shape [n_class]
            The positive probabilities of each of the control and treatment groups
            in the parent node.
        has_parent_summary : bool as int
            If True (non-zero), then parentNodeSummary_p is a valid parent node summary probabilities.
            If False (0), assume no parent node summary and parentNodeSummary_p is not touched.
        min_samples_treatment: int, optional (default=10)
            The minimum number of samples required of the experiment group to be split at a leaf node.
            Only used when has_parent_summary is non-zero: a group whose count is
            strictly greater than this value gets a probability regularized towards
            the parent's; any other group simply inherits the parent's probability.
        n_reg :  int, optional (default=100)
            The regularization parameter defined in Rzepakowski et al. 2012,
            the weight (in terms of sample size) of the parent node influence
            on the child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
            Only used when has_parent_summary is non-zero.

        Returns
        -------
        No return values, but will modify out_summary_p and out_summary_n
        (and buf_count_arr, which is overwritten as scratch space).
        '''
        # buf_count_arr: [N(Y=0, T=0), N(Y=1, T=0), N(Y=0, T=1), N(Y=1, T=1), ...]
        group_uniqueCounts_to_arr(treatment_idx, y, buf_count_arr)

        cdef int i = 0
        # Explicit floor division: the number of groups is an integer no matter
        # which division semantics the module is compiled with.
        cdef int n_class = buf_count_arr.shape[0] // 2
        cdef int n = 0
        cdef int n_pos = 0
        cdef P_TYPE_t p = 0.0
        # Coerce the Python-level keyword arguments to C values once, outside the loops.
        cdef int n_min_sams = min_samples_treatment
        cdef P_TYPE_t n_reg_p = n_reg

        # out_summary_p: [P(Y=1|T=i)...]
        # out_summary_n: [N(T=i) ... ]
        if has_parent_summary == 0:
            # No parent information: plain empirical rate per group.
            for i in range(n_class):
                n_pos = buf_count_arr[2*i + 1]  # N(Y=1, T=i)
                n = buf_count_arr[2*i] + n_pos  # N(Y=0, T=i) + N(Y=1, T=i) == N(T=i)
                # An empty group has no defined rate; report 0 instead of dividing by zero.
                p = (n_pos / <double> n) if n > 0 else 0.
                out_summary_n[i] = n
                out_summary_p[i] = p
        else:
            for i in range(n_class):
                n_pos = buf_count_arr[2*i + 1]  # N(Y=1, T=i)
                n = buf_count_arr[2*i] + n_pos  # N(T=i)
                if n > n_min_sams:
                    # Shrink the empirical rate towards the parent's rate, as if the
                    # parent contributed n_reg extra samples at its own rate.
                    p = (n_pos + parentNodeSummary_p[i] * n_reg_p) / (<double> n + n_reg_p)
                else:
                    # Too few samples to trust the node's own estimate: inherit the parent's.
                    p = parentNodeSummary_p[i]
                out_summary_n[i] = n
                out_summary_p[i] = p