`match()` — method excerpt from `causalml/match.py`


    def match(self, data, treatment_col, score_cols):
        """Find matches from the control group by matching on specified columns
        (propensity preferred).

        Args:
            data (pandas.DataFrame): total input data
            treatment_col (str): the column name for the treatment
            score_cols (list): list of column names for matching (propensity
                column should be included)

        Returns:
            (pandas.DataFrame): The subset of data consisting of matched
                treatment and control group data.
        """
        assert isinstance(score_cols, list), "score_cols must be a list"
        treatment = data.loc[data[treatment_col] == 1, score_cols]
        control = data.loc[data[treatment_col] == 0, score_cols]

        # Picks whether to use treatment or control for matching direction
        match_from = treatment if self.treatment_to_control else control
        match_to = control if self.treatment_to_control else treatment
        sdcal = self.caliper * np.std(data[score_cols].values)

        if self.replace:
            scaler = StandardScaler()
            scaler.fit(data[score_cols])
            match_from_scaled = pd.DataFrame(
                scaler.transform(match_from), index=match_from.index
            )
            match_to_scaled = pd.DataFrame(
                scaler.transform(match_to), index=match_to.index
            )

            # SD is the same as caliper because we use a StandardScaler above
            sdcal = self.caliper

            matching_model = NearestNeighbors(
                n_neighbors=self.ratio, n_jobs=self.n_jobs
            )
            matching_model.fit(match_to_scaled)
            distances, indices = matching_model.kneighbors(match_from_scaled)
            # distances and indices are (n_obs, self.ratio) matrices.
            # To index easily, reshape distances, indices and treatment into
            # the (n_obs * self.ratio, 1) matrices and data frame.
            distances = distances.T.flatten()
            indices = indices.T.flatten()
            match_from_scaled = pd.concat([match_from_scaled] * self.ratio, axis=0)

            cond = (distances / np.sqrt(len(score_cols))) < sdcal
            # Deduplicate the indices of the treatment group
            from_idx_matched = np.unique(match_from_scaled.loc[cond].index)
            # XXX: Should we deduplicate the indices of the control group too?
            to_idx_matched = np.array(match_to_scaled.iloc[indices[cond]].index)
        else:
            assert len(score_cols) == 1, (
                "Matching on multiple columns is only supported using the "
                "replacement method (if matching on multiple columns, set "
                "replace=True)."
            )
            # unpack score_cols for the single-variable matching case
            score_col = score_cols[0]

            if self.shuffle:
                from_indices = self.random_state.permutation(match_from.index)
            else:
                from_indices = match_from.index

            from_idx_matched = []
            to_idx_matched = []
            # Copy so the bookkeeping column below does not write into (a view
            # of) the caller's `data` (avoids SettingWithCopy issues).
            match_to = match_to.copy()
            match_to["unmatched"] = True

            for from_idx in from_indices:
                dist = np.abs(
                    match_to.loc[match_to.unmatched, score_col]
                    - match_from.loc[from_idx, score_col]
                )
                # No unmatched controls left: nothing further can be matched.
                if dist.empty:
                    break
                # Gets up to self.ratio lowest dists. np.argpartition requires
                # kth < len(dist), so cap the candidate count and take the
                # whole remaining pool when it is no larger than self.ratio.
                n_cand = min(self.ratio, len(dist))
                if n_cand < len(dist):
                    to_np_idx_list = np.argpartition(dist.values, n_cand)[:n_cand]
                else:
                    to_np_idx_list = np.arange(len(dist))
                to_idx_list = dist.index[to_np_idx_list]
                matched_any = False
                for to_idx in to_idx_list:
                    if dist.loc[to_idx] <= sdcal:
                        matched_any = True
                        to_idx_matched.append(to_idx)
                        match_to.loc[to_idx, "unmatched"] = False
                # Record the treated unit if ANY candidate fell within the
                # caliper: np.argpartition output is unordered, so checking
                # only position 0 (as `i == 0` did) could append a control
                # index without its treated counterpart.
                if matched_any:
                    from_idx_matched.append(from_idx)

        return data.loc[
            np.concatenate([np.array(from_idx_matched), np.array(to_idx_matched)])
        ]