in src/generate_matrices.py [0:0]
def generate_synthetic_data(numdims, noise, numsamples=1000, num_group_types=1,
                            min_subgroups=2, max_subgroups=10, min_subgroup_size=20,
                            mean_range=0, variability=1, num_uniform_features=0, intercept_scale=2,
                            binary=False, drop_group_as_feature=False,
                            save_data=False, file_dir='', file_name='',
                            random_seed=0):
    """
    Generate feature matrix X and label vector y where, for each group type, the rows of X are
    partitioned into subgroups, each of which has its own linear model from which labels are
    sampled with Gaussian noise. With multiple group types, the per-type feature matrices and
    label vectors are averaged. For the binary case, real-valued labels are thresholded by sign.
    :param numdims : Dimensionality of synthetic data
    :param noise : Std. dev. of the Gaussian noise added to y
    :param numsamples : Number of instances/rows of X
    :param num_group_types : Number of categories (e.g. race, sex, etc.) such that each instance
                             belongs to one subgroup for each group type
    :param min_subgroups : Minimum number of subgroups for each group type (count selected
                           uniformly at random from [min_subgroups, max_subgroups])
    :param max_subgroups : Maximum number of subgroups for each group type
    :param min_subgroup_size : Minimum number of instances per subgroup. Enforced by
                               generate_random_intervals; can't exceed average subgroup size.
    :param mean_range : Mean for each feature dist. is selected uniformly at random from
                        [-mean_range, mean_range]
    :param variability : Std. dev. for normally distributed features; distance from mean to
                         endpoint for uniform features
    :param num_uniform_features : How many of `numdims` features are drawn uniformly (the rest
                                  are drawn from a normal distribution)
    :param intercept_scale : Coefficient on the randomly generated per-subgroup intercept
                             (drawn from a unit normal); 0.0 denotes no intercept
    :param binary : If True, labels are thresholded by sign into a boolean array
                    (True/False, which numpy treats as 1/0)
    :param drop_group_as_feature : If True, X omits the one-hot encoded group-membership columns
                                   and intercepts are forced to zero
    :param save_data : Whether the generated matrices should be saved to a file
    :param file_dir : Directory to save to if save_data is True
    :param file_name : File name in file_dir to save to if save_data is True
    :param random_seed : Seed for all numpy randomization (legacy np.random global state)
    :return: (X, y, grouplabel_list, group_sets, group_types, binary)
    :raises ValueError: if num_uniform_features exceeds numdims
    """
    # Seed the global numpy RNG so the whole generation process is reproducible.
    np.random.seed(random_seed)
    if num_uniform_features > numdims:
        # ValueError is a subclass of Exception, so existing `except Exception` callers still work.
        raise ValueError(f'Error! More uniform features ({num_uniform_features}) than total dimensions ({numdims})')
    Xs = []
    ys = []
    group_sets = []
    grouplabel_list = []
    for _ in range(num_group_types):
        # Number of subgroups for this group type, chosen uniformly in [min, max].
        n_subgroups = np.random.randint(min_subgroups, max_subgroups + 1)
        # Partition the numsamples rows into subgroups of random but lower-bounded size.
        groupsize = generate_random_intervals(numsamples, n_subgroups, min_subgroup_size)
        # Assign labels in order: subgroup 0 is the first groupsize[0] rows, subgroup 1 the
        # next groupsize[1] rows, etc.
        grouplabels = np.repeat(np.arange(len(groupsize)), groupsize)
        assert numsamples == np.size(grouplabels)
        # Generate the feature matrix for this group type.
        X = generate_feature_matrix(numsamples, numdims, n_subgroups, num_uniform_features,
                                    grouplabels, mean_range, variability)
        # Each subgroup gets its own linear model (weights and optional intercept).
        weights = np.random.randn(n_subgroups, numdims)
        intercepts = np.zeros(n_subgroups) if drop_group_as_feature else (np.random.randn(n_subgroups) * intercept_scale)
        y = np.zeros(numsamples)
        # Fill y from each subgroup's linear model plus Gaussian noise.
        for g in range(n_subgroups):
            # Take the flat index array (np.where returns a tuple); indexing with the raw tuple
            # plus `:` was a deprecated nested-tuple fancy index that errors on modern NumPy.
            idx = np.where(grouplabels == g)[0]
            # Same count of normal draws as before, so the random stream is unchanged.
            y[idx] = X[idx] @ weights[g] + noise * np.random.randn(idx.size) + intercepts[g]
        # Human-readable names for each subgroup of this group type.
        group_sets.append([f'Subgroup {x + 1}' for x in range(n_subgroups)])
        assert n_subgroups == len(groupsize)
        grouplabel_list.append(grouplabels)
        Xs.append(X)
        ys.append(y)
    # Average the per-type feature and label matrices across group types.
    X = functools.reduce(lambda x1, x2: np.add(x1, x2), Xs) / num_group_types
    y = functools.reduce(lambda y1, y2: np.add(y1, y2), ys) / num_group_types
    # Append one-hot encoded group memberships as extra feature columns.
    if not drop_group_as_feature:
        matrices_to_stack = [X]  # Matrices to be horizontally concatenated
        for i in range(num_group_types):
            lb = LabelBinarizer()
            # NOTE(review): with exactly 2 subgroups LabelBinarizer emits a single 0/1 column,
            # not two one-hot columns — confirm downstream code tolerates this.
            matrices_to_stack.append(lb.fit_transform(grouplabel_list[i]))
        X = np.column_stack(matrices_to_stack)
    # For a binary dataset, threshold by sign (result is a boolean array).
    if binary:
        y = (y > 0)
    grouplabel_list = np.array(grouplabel_list)
    if save_data:
        # NOTE(review): upload_dataset_to_s3, bucket_name and credentials_file are not defined
        # in this function — assumed to be module-level globals; confirm, else this raises
        # NameError when save_data is True.
        save_dataset(file_dir, file_name, X, y, grouplabel_list, group_sets, binary,
                     upload_dataset_to_s3, bucket_name, credentials_file)
    group_types = [f'Type {i+1}' for i in range(num_group_types)]
    return X, y, grouplabel_list, group_sets, group_types, binary