in causalml/match.py [0:0]
def match(self, data, treatment_col, score_cols):
"""Find matches from the control group by matching on specified columns
(propensity preferred).
Args:
data (pandas.DataFrame): total input data
treatment_col (str): the column name for the treatment
score_cols (list): list of column names for matching (propensity
column should be included)
Returns:
(pandas.DataFrame): The subset of data consisting of matched
treatment and control group data.
"""
assert isinstance(score_cols, list), "score_cols must be a list"
treatment = data.loc[data[treatment_col] == 1, score_cols]
control = data.loc[data[treatment_col] == 0, score_cols]
    # Matching direction: match from treatment into control when
    # self.treatment_to_control is True, otherwise from control into treatment.
match_from = treatment if self.treatment_to_control else control
match_to = control if self.treatment_to_control else treatment
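    # Caliper in raw score units (self.caliper is expressed in standard
    # deviations of the matching scores); candidate pairs farther apart
    # than this threshold are rejected.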
sdcal = self.caliper * np.std(data[score_cols].values)
if self.replace:
scaler = StandardScaler()
scaler.fit(data[score_cols])
match_from_scaled = pd.DataFrame(
scaler.transform(match_from), index=match_from.index
)
match_to_scaled = pd.DataFrame(
scaler.transform(match_to), index=match_to.index
)
        # After StandardScaler, the scores have unit standard deviation, so the
        # caliper can be applied directly in the scaled space.
        sdcal = self.caliper
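        # Fit k-NN on the match-to group and query the self.ratio nearest
        # neighbours of every match-from unit in standardized score space.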
matching_model = NearestNeighbors(
n_neighbors=self.ratio, n_jobs=self.n_jobs
)
matching_model.fit(match_to_scaled)
distances, indices = matching_model.kneighbors(match_from_scaled)
        # distances and indices are (n_obs, self.ratio) matrices. Flatten them
        # so that the k-th neighbours of all units form one contiguous block,
        # and stack match_from_scaled self.ratio times so the rows line up.
distances = distances.T.flatten()
indices = indices.T.flatten()
match_from_scaled = pd.concat([match_from_scaled] * self.ratio, axis=0)
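        # kneighbors returns Euclidean distances over the standardized columns;
        # dividing by sqrt(n_cols) gives a per-column RMS distance that is
        # directly comparable to the caliper.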
cond = (distances / np.sqrt(len(score_cols))) < sdcal
        # Deduplicate the indices of the match-from group (each unit appears
        # once per neighbour block after the stacking above).
from_idx_matched = np.unique(match_from_scaled.loc[cond].index)
        # XXX: Should we deduplicate the indices of the match-to group too?
to_idx_matched = np.array(match_to_scaled.iloc[indices[cond]].index)
else:
        assert len(score_cols) == 1, (
            "Matching on multiple columns is only supported with replacement "
            "(set replace=True to match on multiple columns)."
        )
# unpack score_cols for the single-variable matching case
score_col = score_cols[0]
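        # Greedy matching without replacement depends on the order in which
        # units are matched, so optionally shuffle the match-from indices.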
if self.shuffle:
from_indices = self.random_state.permutation(match_from.index)
else:
from_indices = match_from.index
from_idx_matched = []
to_idx_matched = []
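        # Track which match-to units are still available: once matched, a unit
        # leaves the candidate pool (matching without replacement).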
match_to["unmatched"] = True
for from_idx in from_indices:
dist = np.abs(
match_to.loc[match_to.unmatched, score_col]
- match_from.loc[from_idx, score_col]
)
            # Take the self.ratio smallest distances among the still-unmatched
            # candidates (assumes at least self.ratio + 1 candidates remain).
to_np_idx_list = np.argpartition(dist, self.ratio)[: self.ratio]
to_idx_list = dist.index[to_np_idx_list]
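            # Accept a candidate only if it lies within the caliper; the
            # from-unit itself is recorded only when its first candidate
            # (i == 0) is accepted, and accepted to-units leave the pool.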
for i, to_idx in enumerate(to_idx_list):
if dist[to_idx] <= sdcal:
if i == 0:
from_idx_matched.append(from_idx)
to_idx_matched.append(to_idx)
match_to.loc[to_idx, "unmatched"] = False
return data.loc[
np.concatenate([np.array(from_idx_matched), np.array(to_idx_matched)])
]
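
# --- Usage sketch (standalone; not part of match.py) ------------------------
# A minimal illustration of how match() is typically called. Assumptions: the
# class exposing this method is causalml's NearestNeighborMatch and its
# constructor accepts caliper, replace and ratio as in upstream causalml; the
# column names 'treatment' and 'pscore' and the synthetic data are invented
# here for illustration only.
import numpy as np
import pandas as pd

from causalml.match import NearestNeighborMatch

rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "treatment": rng.integers(0, 2, size=1_000),
        "pscore": rng.uniform(0.05, 0.95, size=1_000),  # e.g. fitted propensity scores
    }
)

# 1:1 matching with replacement on the propensity score, caliper of 0.2 SD.
matcher = NearestNeighborMatch(caliper=0.2, replace=True, ratio=1)
matched = matcher.match(data=df, treatment_col="treatment", score_cols=["pscore"])
print(matched["treatment"].value_counts())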