in dowhy/datasets.py [0:0]
def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
                   num_effect_modifiers=0,
                   num_treatments=1,
                   num_frontdoor_variables=0,
                   treatment_is_binary=True,
                   treatment_is_category=False,
                   outcome_is_binary=False,
                   num_discrete_common_causes=0,
                   num_discrete_instruments=0,
                   num_discrete_effect_modifiers=0,
                   stddev_treatment_noise=1,
                   one_hot_encode=False):
    """Generate a synthetic dataset with a known (linear) causal effect.

    The data-generating process is:
        t  = W @ c1 + Z @ cz + noise              (treatment)
        FD = t @ cfd1 + W @ c1_frontdoor + noise  (optional frontdoor vars)
        y  = t @ beta (or FD @ cfd2) + W @ c2 + (X @ ce) * prod(t) + noise

    :param beta: scalar, or list/array of length ``num_treatments``, giving the
        coefficient of each treatment in the outcome equation.
    :param num_common_causes: number of confounders W (cause both t and y).
    :param num_samples: number of rows to generate.
    :param num_instruments: number of instrumental variables Z (cause t only).
    :param num_effect_modifiers: number of effect modifiers X (interact with t in y).
    :param num_treatments: number of treatment variables v0..v{k-1}.
    :param num_frontdoor_variables: number of frontdoor mediators FD between t and y.
    :param treatment_is_binary: stochastically binarize t (mutually exclusive
        with ``treatment_is_category``).
    :param treatment_is_category: stochastically convert t to a 3-level category.
    :param outcome_is_binary: stochastically binarize y.
    :param num_discrete_common_causes: how many of W are discretized (quartiles).
    :param num_discrete_instruments: how many of Z are discrete. NOTE(review):
        currently unused in the body — Z's discreteness alternates by index instead.
    :param num_discrete_effect_modifiers: how many of X are discretized (quartiles).
    :param stddev_treatment_noise: std-dev of the Gaussian noise added to t.
    :param one_hot_encode: one-hot encode the discretized W/X columns.
    :returns: dict with keys ``df`` (pandas DataFrame), the variable-name lists,
        ``dot_graph``/``gml_graph`` strings, and the true ``ate``.
    """
    assert not (treatment_is_binary and treatment_is_category)
    W, X, Z, FD, c1, c2, ce, cz, cfd1, cfd2 = [None]*10
    W_with_dummy, X_with_categorical = (None, None)
    # Making beta an array of length num_treatments.
    # BUGFIX: previously `beta = float(beta)` ran unconditionally, which raised
    # a TypeError for the list/ndarray inputs that the branch below explicitly
    # supports. Convert to float only in the scalar case.
    if type(beta) not in [list, np.ndarray]:
        beta = np.repeat(float(beta), num_treatments)
    else:
        # Coerce to a float ndarray so `abs(beta)` and `t @ beta` work uniformly.
        beta = np.asarray(beta, dtype=float)
    num_cont_common_causes = num_common_causes - num_discrete_common_causes
    num_cont_instruments = num_instruments - num_discrete_instruments
    num_cont_effect_modifiers = num_effect_modifiers - num_discrete_effect_modifiers
    if num_common_causes > 0:
        # Coefficient ranges scale with |beta| so confounding stays comparable
        # in magnitude to the true effect.
        range_c1 = 0.5 + max(abs(beta))*0.5
        range_c2 = 0.5 + max(abs(beta))*0.5
        means = np.random.uniform(-1, 1, num_common_causes)
        cov_mat = np.diag(np.ones(num_common_causes))
        W = np.random.multivariate_normal(means, cov_mat, num_samples)
        W_with_dummy = convert_to_categorical(W, num_common_causes, num_discrete_common_causes,
                                              quantiles=[0.25, 0.5, 0.75], one_hot_encode=one_hot_encode)
        # c1: W -> t coefficients; c2: W -> y coefficients.
        c1 = np.random.uniform(0, range_c1, (W_with_dummy.shape[1], num_treatments))
        c2 = np.random.uniform(0, range_c2, W_with_dummy.shape[1])
    if num_instruments > 0:
        range_cz = 1 + max(abs(beta))
        p = np.random.uniform(0, 1, num_instruments)
        Z = np.zeros((num_samples, num_instruments))
        # Alternate binary and continuous instruments by column index.
        for i in range(num_instruments):
            if (i % 2) == 0:
                Z[:, i] = np.random.binomial(n=1, p=p[i], size=num_samples)
            else:
                Z[:, i] = np.random.uniform(0, 1, size=num_samples)
        # TODO Ensure that we do not generate weak instruments
        # Keep instrument strength within +/-5% of range_cz so instruments stay strong.
        cz = np.random.uniform(range_cz - (range_cz * 0.05),
                               range_cz + (range_cz * 0.05), (num_instruments, num_treatments))
    if num_effect_modifiers > 0:
        range_ce = 0.5 + max(abs(beta))*0.5
        means = np.random.uniform(-1, 1, num_effect_modifiers)
        cov_mat = np.diag(np.ones(num_effect_modifiers))
        X = np.random.multivariate_normal(means, cov_mat, num_samples)
        X_with_categorical = convert_to_categorical(X, num_effect_modifiers,
                                                    num_discrete_effect_modifiers, quantiles=[0.25, 0.5, 0.75],
                                                    one_hot_encode=one_hot_encode)
        ce = np.random.uniform(0, range_ce, X_with_categorical.shape[1])
    # TODO - test all our methods with random noise added to covariates (instead of the stochastic treatment assignment)

    # Treatment assignment: Gaussian noise plus linear contributions from W and Z.
    t = np.random.normal(0, stddev_treatment_noise, (num_samples, num_treatments))
    if num_common_causes > 0:
        t += W_with_dummy @ c1  # + np.random.normal(0, 0.01)
    if num_instruments > 0:
        t += Z @ cz
    # Converting treatment to binary if required
    if treatment_is_binary:
        t = np.vectorize(stochastically_convert_to_binary)(t)
    elif treatment_is_category:
        t = np.vectorize(stochastically_convert_to_three_level_categorical)(t)
    # Generating frontdoor variables if asked for
    if num_frontdoor_variables > 0:
        range_cfd1 = max(abs(beta))*0.5
        range_cfd2 = max(abs(beta))*0.5
        cfd1 = np.random.uniform(0, range_cfd1, (num_treatments, num_frontdoor_variables))
        cfd2 = np.random.uniform(0, range_cfd2, num_frontdoor_variables)
        # Noise is kept separately so the ATE computation below can reuse the
        # same noise realization under the T=1 and T=0 interventions.
        FD_noise = np.random.normal(0, 1, (num_samples, num_frontdoor_variables))
        FD = FD_noise
        FD += t @ cfd1
        if num_common_causes > 0:
            # Weak direct W -> FD effect (1/10th of the W -> t range).
            range_c1_frontdoor = range_c1/10.0
            c1_frontdoor = np.random.uniform(0, range_c1_frontdoor,
                                             (W_with_dummy.shape[1], num_frontdoor_variables))
            FD += W_with_dummy @ c1_frontdoor

    def _compute_y(t, W, X, FD, beta, c2, ce, cfd2):
        """Outcome equation; with frontdoor variables, t affects y only through FD."""
        y = np.random.normal(0, 0.01, num_samples)
        if num_frontdoor_variables > 0:
            y += FD @ cfd2
        else:
            # NOTE: We are assuming a linear relationship *even when t is categorical* and integer coded.
            # For categorical t, this example dataset has the effect size for category 2 being exactly
            # double the effect for category 1
            # This could be changed at this stage by one-hot encoding t and using a custom beta that
            # sets a different effect for each category {0, 1, 2}
            y += t @ beta
        if num_common_causes > 0:
            y += W @ c2
        if num_effect_modifiers > 0:
            # Effect modification: interaction of X with the product of treatments.
            y += (X @ ce) * np.prod(t, axis=1)
        if outcome_is_binary:
            y = np.vectorize(stochastically_convert_to_binary)(y)
        return y

    y = _compute_y(t, W_with_dummy, X_with_categorical, FD, beta, c2, ce, cfd2)
    # Assemble columns in the order FD | X | Z | W | t | y (built inside-out).
    data = np.column_stack((t, y))
    if num_common_causes > 0:
        data = np.column_stack((W_with_dummy, data))
    if num_instruments > 0:
        data = np.column_stack((Z, data))
    if num_effect_modifiers > 0:
        data = np.column_stack((X_with_categorical, data))
    if num_frontdoor_variables > 0:
        data = np.column_stack((FD, data))
    # Computing ATE as the mean difference of potential outcomes under T=1 vs T=0.
    FD_T1, FD_T0 = None, None
    T1 = np.ones((num_samples, num_treatments))
    T0 = np.zeros((num_samples, num_treatments))
    if num_frontdoor_variables > 0:
        FD_T1 = FD_noise + (T1 @ cfd1)
        FD_T0 = FD_noise + (T0 @ cfd1)
    ate = np.mean(
        _compute_y(T1, W_with_dummy, X_with_categorical, FD_T1, beta, c2, ce, cfd2) -
        _compute_y(T0, W_with_dummy, X_with_categorical, FD_T0, beta, c2, ce, cfd2))
    treatments = [("v" + str(i)) for i in range(0, num_treatments)]
    outcome = "y"
    # constructing column names for one-hot encoded discrete features
    common_causes = construct_col_names("W", num_common_causes, num_discrete_common_causes,
                                        num_discrete_levels=4, one_hot_encode=one_hot_encode)
    instruments = [("Z" + str(i)) for i in range(0, num_instruments)]
    frontdoor_variables = [("FD" + str(i)) for i in range(0, num_frontdoor_variables)]
    effect_modifiers = construct_col_names("X", num_effect_modifiers,
                                           num_discrete_effect_modifiers,
                                           num_discrete_levels=4, one_hot_encode=one_hot_encode)
    col_names = frontdoor_variables + effect_modifiers + instruments + common_causes + treatments + [outcome]
    data = pd.DataFrame(data, columns=col_names)
    # Specifying the correct dtypes
    if treatment_is_binary:
        data = data.astype({tname: "bool" for tname in treatments}, copy=False)
    elif treatment_is_category:
        data = data.astype({tname: "category" for tname in treatments}, copy=False)
    if outcome_is_binary:
        data = data.astype({outcome: 'bool'}, copy=False)
    # Discrete columns go float -> int64 -> category so category levels are ints.
    if num_discrete_common_causes > 0 and not one_hot_encode:
        data = data.astype({wname: 'int64' for wname in common_causes[num_cont_common_causes:]}, copy=False)
        data = data.astype({wname: 'category' for wname in common_causes[num_cont_common_causes:]}, copy=False)
    if num_discrete_effect_modifiers > 0 and not one_hot_encode:
        data = data.astype({emodname: 'int64' for emodname in effect_modifiers[num_cont_effect_modifiers:]}, copy=False)
        data = data.astype({emodname: 'category' for emodname in effect_modifiers[num_cont_effect_modifiers:]}, copy=False)
    # Now specifying the corresponding graph strings
    dot_graph = create_dot_graph(treatments, outcome, common_causes, instruments, effect_modifiers, frontdoor_variables)
    # Now writing the gml graph
    gml_graph = create_gml_graph(treatments, outcome, common_causes, instruments, effect_modifiers, frontdoor_variables)
    ret_dict = {
        "df": data,
        "treatment_name": treatments,
        "outcome_name": outcome,
        "common_causes_names": common_causes,
        "instrument_names": instruments,
        "effect_modifier_names": effect_modifiers,
        "frontdoor_variables_names": frontdoor_variables,
        "dot_graph": dot_graph,
        "gml_graph": gml_graph,
        "ate": ate
    }
    return ret_dict