in src/generate_matrices.py [0:0]
def generate_feature_matrix(numsamples, numdims, numgroups, num_uniform_features, grouplabels, mean_range, variability):
    """
    Build a feature matrix in which every group has its own distribution for each feature.

    :param numsamples: Total number of samples (number of rows)
    :param numdims: Total dimensionality (number of columns)
    :param numgroups: Number of groups; group ids are taken to be 0 .. numgroups - 1
    :param num_uniform_features: How many of each group's feature distributions (the last columns)
        should be uniform rather than normal
    :param grouplabels: Group id of every sample. Any array-like is accepted; it is flattened, so a
        list, a 1-D array or a column/row vector all behave the same way
    :param mean_range: The mean of each distribution is selected uniformly at random from
        [-mean_range, mean_range]
    :param variability: Standard deviation for normal, or distance from center to upper/lower bound
        for uniform
    :return: X, a (numsamples, numdims) matrix of features where each group has a unique distribution
        for each feature. Rows whose label matches no group id in range(numgroups) stay zero.
    """
    # Vanilla dataset: every feature of every group is unit normal, so no per-group work is needed.
    if mean_range == 0 and variability == 1 and num_uniform_features == 0:
        return np.random.randn(numsamples, numdims)

    # Start from zeros; the rows of each group are overwritten below.
    X = np.zeros((numsamples, numdims))

    # Normalise the labels once. Comparing a plain list with `==` yields a single False rather than an
    # element-wise mask, which would silently leave X all zeros; flattening also makes column/row
    # vectors index the rows of X correctly.
    labels = np.asarray(grouplabels).ravel()

    # Columns at or beyond this index are the uniform ones (the last num_uniform_features columns).
    first_uniform = numdims - num_uniform_features

    for g in range(numgroups):
        # One (is_uniform, mean, variability) tuple per column, where variability is the std. dev. for
        # normal or the distance from endpoint to center for uniform. Means are drawn one column at a
        # time, in column order, so the sequence of random draws is the same as before.
        distribution_attributes = [
            (i >= first_uniform, np.random.uniform(-mean_range, mean_range), variability)
            for i in range(numdims)
        ]
        # Integer row indices of the members of the current group.
        rows = np.flatnonzero(labels == g)
        # generate_group_features is defined elsewhere; it is expected to return one row of numdims
        # features per group member (TODO confirm its exact output shape).
        X[rows, :] = generate_group_features(distribution_attributes, rows.size)
    return X