in src/setup_matrices.py [0:0]
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# NOTE: extract_group_labels and save_dataset are project helpers assumed to be defined
# elsewhere in this package; their imports are not shown in this excerpt.


def setup_matrices(path, label, groups, usable_features=None, drop_group_as_feature=False,
                   groups_to_drop=[], categorical_columns=[],
                   verbose=False, save_data=False, file_dir='', file_name=''):
"""
:param path - path to csv file whose rows are sample points and whose columns are features
:param label - string denoting the name of the column whose values are the target prediction labels
:param groups - list of strings denoting feature used to differentiate groups
:param usable_features - list of column names to be used in prediction
:param categorical_columns - list of column names (or indices) that indicate which numeric columns (if any)
should be treated as categorical data (e.g. rating systems of 1-4 such as
university=1, college=2, highschool=3, other=4). String-type columns treated as
categorical by default
:param drop_group_as_feature - whether or not the groups label should be dropped and not used as predictive feature
:param groups_to_drop - list of subgroups to drop, each denoted by the string 'group_type@subgroup_value'
:param save_data - whether or not the resulting matrices should be saved to a file
:param file_dir - directory in which to save the matrices if save_data is true
:param file_name - name of file (without an extension) within file_dir to save the matrices to
:param verbose - verbose output
"""
if isinstance(groups, str):
groups = [groups]
    # Detect the CSV delimiter by peeking at the first two lines of the file
delim = ','
with open(path) as f:
for _ in range(2):
if ';' in f.readline():
delim = ';'
df = pd.read_csv(path, sep=delim)
    if len(groups_to_drop) > 0:
        for specific_group in groups_to_drop:
            group_category, subgroup_name = specific_group.split('@')
            df.drop(df[df[group_category] == subgroup_name].index, inplace=True)
        # Reset index numbering without adding the old index back as a column
        df.reset_index(drop=True, inplace=True)
y = np.array(df[label]) # define the label matrix
if usable_features is None or usable_features == [] or usable_features == ['']:
usable_features = list(df.columns)
# remove label (and groups if specified) from usable features, if present
if drop_group_as_feature:
usable_features = [feat for feat in usable_features if feat != label and feat not in groups]
else:
usable_features = [feat for feat in usable_features if feat != label]
# Create a matrix for all relevant features
X_in = np.array(df[usable_features])
    # Collect the column of group labels for each group type
    group_dfs = []
    for group in groups:
        group_dfs.append(df[group])
column_list = [] # list of numpy arrays denoting one or more columns to be concatenated at the end
    # Loop through the columns; when a column is categorical (string-typed or listed in
    # categorical_columns), replace it with a one-hot binary encoded version that spans
    # multiple columns
    for i in range(X_in.shape[1]):  # iterate over each column of the numpy matrix
        # Determine whether the current column should be treated as categorical
        # NOTE: categorical detection only inspects the first row, so a column whose first
        # value is missing may be passed through unencoded
        if X_in[0, i] is not None and (isinstance(X_in[0, i], str) or usable_features[i] in categorical_columns):
try:
lb = LabelBinarizer()
curr_col = lb.fit_transform(X_in[:, i].reshape(-1, 1))
column_list.append(curr_col)
            except ValueError:
                try:
                    lb = LabelBinarizer()
                    curr_col = lb.fit_transform(X_in[:, i].reshape(-1, 1).astype('int'))
                    column_list.append(curr_col)
                except Exception as e:
                    # The column is skipped (left out of X) if it still cannot be encoded
                    warnings.warn(f'{type(e).__name__}: ({e}) in LabelBinarizer transforming '
                                  f'categorical columns to one-hot vectors. Converting to int did not help.')
            except Exception as e:
                warnings.warn(f'{type(e).__name__}: ({e}) in LabelBinarizer transforming '
                              f'categorical columns to one-hot vectors')
else:
column_list.append(X_in[:, i])
# Concatenate all the re-encoded feature columns together
X = np.column_stack(column_list)
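    # NOTE: after one-hot expansion X can be wider than usable_features; each categorical column
    # contributes one column per distinct value (or a single 0/1 column when it has two values)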
    # Get the set of unique values within each group type (e.g. [white, black, asian, hispanic, other])
    # along with the per-sample group labels
    group_types = groups  # the argument "groups" is our list of group types
group_sets = []
numgroups = []
grouplabels = []
for i in range(len(groups)):
group_set, n_groups, labels = extract_group_labels(group_dfs[i])
group_sets.append(group_set)
numgroups.append(n_groups)
grouplabels.append(labels)
grouplabels = np.array(grouplabels)
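    # grouplabels has one row per group type, in the same order as `groups`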
# Determine if the dataset is binary by looking at the labels (y)
    is_binary = (len(np.unique(y)) == 2)
if is_binary:
lb = LabelBinarizer()
y = lb.fit_transform(y).flatten()
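        # With exactly two classes, LabelBinarizer returns an (n, 1) column of 0/1 values,
        # so the flatten() above yields a 1-D binary label vector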
if verbose:
print('Here are the results from setting up your dataset from a csv:')
print(f'X: {X}')
print(f'y: {y}')
print(f'grouplabels: {grouplabels}')
print(f'numgroups: {numgroups}')
print(f'group_sets: {group_sets}')
print(f'is_binary: {is_binary}')
if save_data:
save_dataset(file_dir, file_name, X, y, grouplabels, group_sets, group_types, is_binary)
return X, y, grouplabels, group_sets, group_types, is_binary
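

# Minimal usage sketch (not part of the original module). The CSV path, label, group, and
# categorical column names below are hypothetical; substitute the columns of your own dataset.
if __name__ == '__main__':
    X, y, grouplabels, group_sets, group_types, is_binary = setup_matrices(
        path='data/students.csv',             # hypothetical CSV: one row per sample
        label='graduated',                    # column holding the prediction target
        groups=['gender', 'ethnicity'],       # columns that define the group types
        usable_features=None,                 # None -> use every column except the label
        drop_group_as_feature=True,           # exclude the group columns from X
        groups_to_drop=['ethnicity@other'],   # drop all samples whose ethnicity is 'other'
        categorical_columns=['school_type'],  # numeric column to treat as categorical
        verbose=True,
    )
    print(f'X: {X.shape}, y: {y.shape}, is_binary: {is_binary}')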