in causalml/dataset/semiSynthetic.py [0:0]
def generate(self, K: int = 10, n=None) -> List[List[pd.DataFrame]]:
    """Draw ``K`` bootstrap semi-synthetic datasets for every entry of ``self.dgps``.

    For each DGP, the two callables stored under ``"final_model"`` (index 0
    and index 1) are evaluated on a copy of ``self.X`` to obtain the outcome
    under control (``y0``) and under treatment (``y1``). When every value of
    ``self.y`` is 0 or 1, both are passed through ``logistic``. Rows are then
    resampled with replacement separately within the ``w == 0`` and
    ``w == 1`` groups, and an observed outcome ``y`` is simulated:

    * non-binary outcome: ``y_w`` plus a residual resampled (with
      replacement) from ``self.y - y_w``;
    * binary outcome: one Bernoulli draw per row with success
      probability ``y_w``.

    Args:
        K: Number of datasets to draw per DGP. Replicate ``k`` uses
            ``np.random.default_rng(seed=k)``, so the same ``k`` yields the
            same resampled row positions for every DGP.
        n: Target number of rows per dataset; defaults to ``len(self.X)``.
            Each group contributes ``floor(n * group_size / len(self.X))``
            rows, so the result can hold slightly fewer than ``n`` rows when
            the shares do not divide evenly.

    Returns:
        One list per DGP, each holding ``K`` DataFrames with the columns
        ``"y"``, ``"w"``, ``"tau_i"`` followed by the columns of ``self.X``.
    """
    total = len(self.X)
    if n is None:
        n = total

    # The outcome is treated as binary only if every value is exactly 0 or 1.
    binary_y = bool(np.all((self.y == 0) | (self.y == 1)))

    ctrl_idx = np.where(self.w == 0)[0]
    trt_idx = np.where(self.w == 1)[0]

    # Exact integer arithmetic: the float form int(n * (count / total)) can
    # land just below the true value (e.g. 100 * (29 / 100) ->
    # 28.999999999999996 -> 28) and silently drop a row.
    ctrl_n = int(n) * len(ctrl_idx) // total
    trt_n = int(n) * len(trt_idx) // total

    # Output column order is the same for every dataset.
    out_cols = ["y", "w", "tau_i"] + list(self.X)

    ans = []
    for q in range(len(self.dgps)):
        outcome_models = self.dgps[q]["final_model"]

        # The models see only the covariates; helper columns are added after.
        data_tau = self.X.copy()
        y0 = outcome_models[0](data_tau)
        y1 = outcome_models[1](data_tau)
        if binary_y:
            y0 = logistic(y0)
            y1 = logistic(y1)

        data_tau["w"] = self.w
        data_tau["tau_i"] = y1 - y0  # individual treatment effect
        data_tau["y_w"] = np.where(self.w == 1, y1, y0)  # outcome under assigned arm
        resid = self.y - data_tau["y_w"]

        datasets = []
        for k in range(K):
            rng = np.random.default_rng(seed=k)
            ctrl_idx_qk = rng.choice(ctrl_idx, size=ctrl_n, replace=True)
            trt_idx_qk = rng.choice(trt_idx, size=trt_n, replace=True)
            idx = np.concatenate([ctrl_idx_qk, trt_idx_qk])
            data_qk = data_tau.iloc[idx].copy()

            if binary_y:
                # One Bernoulli draw per row with success probability y_w.
                data_qk["y"] = data_qk["y_w"].apply(lambda p: rng.binomial(1, p))
            else:
                # Observed y: modelled outcome plus a bootstrapped residual.
                data_qk["y"] = data_qk["y_w"] + rng.choice(
                    resid, size=len(data_qk), replace=True
                )

            datasets.append(data_qk[out_cols])
        ans.append(datasets)
    return ans