def generate_synthetic_data()

in src/generate_matrices.py [0:0]


import functools

import numpy as np
from sklearn.preprocessing import LabelBinarizer

# generate_random_intervals, generate_feature_matrix, and save_dataset are helper
# functions defined alongside this function in generate_matrices.py.
def generate_synthetic_data(numdims, noise, numsamples=1000, num_group_types=1,
                            min_subgroups=2, max_subgroups=10, min_subgroup_size=20,
                            mean_range=0, variability=1, num_uniform_features=0, intercept_scale=2,
                            binary=False, drop_group_as_feature=False,
                            save_data=False, file_dir='', file_name='',
                            random_seed=0):
    """
    Generates two matrices X, y of features and labels. For each group type, X is divided into a randomly chosen
    number of subgroups, each of which has a shared linear function from which labels are sampled with noise.

    For the binary case, the real-valued labels are converted to 0 or 1 according to their sign (negative or positive)

    :param numsamples : Number of instances/rows of X
    :param numdims : Dimensionality of synthetic data
    :param noise : Gaussian noise in Y
    :param num_group_types: Number of group categories (e.g. race, sex, etc.) such that each instance belongs to one
                            subgroup of each group type
    :param min_subgroups : Minimum number of subgroups for each group type
    :param max_subgroups : Maximum number of subgroups for each group type (the number of subgroups per type is
                           selected uniformly at random between min_subgroups and max_subgroups)
    :param min_subgroup_size : Minimum number of instances for each subgroup. Sizes are generated by a randomized
                               algorithm that repeats until the minimum size is satisfied for all subgroups. Can't
                               exceed the average subgroup size.
    :param intercept_scale : Scale on the randomly generated intercept for each subgroup. Intercepts are drawn from
                             a standard normal; an intercept_scale of 0.0 yields no intercept.
    :param mean_range : Mean for each feature dist. is selected uniformly at random from [-mean_range, mean_range]
    :param variability: Denotes std. dev. for normally distributed features and distance from mean to endpoint for
                        uniform features
    :param num_uniform_features: How many of `numdims` features should be drawn uniformly from the distribution
                                 defined by the mean and `variability`. Remaining features drawn from normal dist.
    :param binary : Whether labels should be converted to binary (0/1) for classification, using the sign (+/-) of
                    the numeric label
    :param drop_group_as_feature : Whether X should drop the columns corresponding to the one-hot encoded group labels
    :param random_seed : Random seed for all numpy randomization
    :param save_data : Denotes whether or not generated matrices should be saved to a file
    :param file_dir : Directory to save to if save_data is True
    :param file_name : File name in file_dir to save to if save_data is True
    """

    # Set the random seed
    np.random.seed(random_seed)

    if num_uniform_features > numdims:
        raise ValueError(f'More uniform features ({num_uniform_features}) than total dimensions ({numdims})')

    Xs = []
    ys = []
    group_sets = []
    grouplabel_list = []

    for i in range(num_group_types):
        n_subgroups = np.random.randint(min_subgroups, max_subgroups + 1)  # Num. subgroups for this group type
        # For each group type, we partition the samples into subgroups of random but lower-bounded size
        # Generate a numpy array holding the size of each subgroup
        groupsize = generate_random_intervals(numsamples, n_subgroups, min_subgroup_size)

        # Fill out the labels in order:
        # group 0 occupies the first groupsize[0] rows of the matrix, group 1 the next groupsize[1] rows, etc.
        grouplabels = np.repeat(np.arange(n_subgroups), groupsize)
        # Sanity check: the subgroup sizes must account for every sample
        assert numsamples == np.size(grouplabels)

        # Generate feature matrix X
        X = generate_feature_matrix(numsamples, numdims, n_subgroups, num_uniform_features,
                                    grouplabels, mean_range, variability)

        # Generate y; each subgroup has a different linear model
        weights = np.random.randn(n_subgroups, numdims)
        intercepts = np.zeros(n_subgroups) if drop_group_as_feature else (np.random.randn(n_subgroups) * intercept_scale)
        y = np.zeros(numsamples)

        # Create y according to X with noise
        for g in range(n_subgroups):
            w = weights[g]
            idx = np.where(grouplabels == g)[0]  # row indices of subgroup g
            X_g = X[idx, :]
            y[idx] = np.matmul(X_g, w) + noise * np.random.randn(idx.size) + intercepts[g]

        # Give human-readable "labels" to each subgroup in the synthetic data
        group_sets.append([f'Subgroup {x + 1}' for x in range(n_subgroups)])
        assert n_subgroups == len(groupsize)

        grouplabel_list.append(grouplabels)
        Xs.append(X)
        ys.append(y)

    # End of the loop over group types

    # Average the feature and label matrices over all group types (elementwise sum divided by count)
    X = functools.reduce(np.add, Xs) / num_group_types
    y = functools.reduce(np.add, ys) / num_group_types

    # Add all the group membership variables to the feature matrix with one-hot categorical encoding
    if not drop_group_as_feature:
        matrices_to_stack = [X]  # Will store all the matrices to be horizontally concatenated to increase columns
        for i in range(num_group_types):
            lb = LabelBinarizer()
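            # Note: when a group type has exactly two subgroups, LabelBinarizer emits a
            # single 0/1 column rather than two separate indicator columns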
            matrices_to_stack.append(lb.fit_transform(grouplabel_list[i]))
        # Add the new columns to X
        X = np.column_stack(matrices_to_stack)

    # If we want a binary dataset, threshold the y labels at zero and cast booleans to 0/1
    if binary:
        y = (y > 0).astype(int)

    grouplabel_list = np.array(grouplabel_list)

    # Save the generated matrices as numpy objects
    if save_data:
        # upload_dataset_to_s3, bucket_name, and credentials_file are assumed to be
        # module-level configuration defined elsewhere in generate_matrices.py
        save_dataset(file_dir, file_name, X, y, grouplabel_list, group_sets, binary,
                     upload_dataset_to_s3, bucket_name, credentials_file)

    group_types = [f'Type {i+1}' for i in range(num_group_types)]

    return X, y, grouplabel_list, group_sets, group_types, binary
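
A minimal usage sketch (the argument values below are illustrative, not repository defaults):

# Hypothetical example call; the X shape assumes drop_group_as_feature=False,
# so one-hot group columns are appended to the numdims base features.
X, y, grouplabel_list, group_sets, group_types, binary = generate_synthetic_data(
    numdims=5, noise=0.1, numsamples=500,
    num_group_types=2, min_subgroups=2, max_subgroups=4,
    min_subgroup_size=20, random_seed=42)
print(X.shape)  # (500, 5 + total number of one-hot group columns)
print(y.shape)  # (500,)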
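
The docstring describes generate_random_intervals as a randomized routine that retries until every subgroup meets
the minimum size; it is defined elsewhere in the module. A minimal sketch consistent with that description (not the
repository's actual implementation) could be:

def generate_random_intervals_sketch(numsamples, n_subgroups, min_size):
    # Hypothetical stand-in for generate_random_intervals: draw n_subgroups - 1
    # distinct random cut points and retry until every resulting interval has at
    # least min_size samples. The returned sizes always sum to numsamples.
    while True:
        cuts = np.sort(np.random.choice(numsamples, n_subgroups - 1, replace=False))
        sizes = np.diff(np.concatenate(([0], cuts, [numsamples])))
        if np.all(sizes >= min_size):
            return sizes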