in src/setup_matrices.py [0:0]
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# NOTE: extract_group_labels and save_dataset are project helpers assumed to be defined
# elsewhere in this package; their imports are not shown in this excerpt.


def setup_matrices(path, label, groups, usable_features=None, drop_group_as_feature=False,
                   groups_to_drop=[], categorical_columns=[],
                   verbose=False, save_data=False, file_dir='', file_name=''):
"""
:param path - path to csv file whose rows are sample points and whose columns are features
:param label - string denoting the name of the column whose values are the target prediction labels
:param groups - list of strings denoting feature used to differentiate groups
:param usable_features - list of column names to be used in prediction
:param categorical_columns - list of column names (or indices) that indicate which numeric columns (if any)
should be treated as categorical data (e.g. rating systems of 1-4 such as
university=1, college=2, highschool=3, other=4). String-type columns treated as
categorical by default
:param drop_group_as_feature - whether or not the groups label should be dropped and not used as predictive feature
:param groups_to_drop - list of subgroups to drop, each denoted by the string 'group_type@subgroup_value'
:param save_data - whether or not the resulting matrices should be saved to a file
:param file_dir - directory in which to save the matrices if save_data is true
:param file_name - name of file (without an extension) within file_dir to save the matrices to
:param verbose - verbose output
"""
if isinstance(groups, str):
groups = [groups]
    # Detect the CSV delimiter by peeking at the first two lines of the file
delim = ','
with open(path) as f:
for _ in range(2):
if ';' in f.readline():
delim = ';'
df = pd.read_csv(path, sep=delim)
    if len(groups_to_drop) > 0:
        for specific_group in groups_to_drop:
            group_category, subgroup_name = specific_group.split('@')
            df.drop(df[df[group_category] == subgroup_name].index, inplace=True)
        # Reset index numbering without adding the old index back as a column
        df.reset_index(drop=True, inplace=True)
y = np.array(df[label]) # define the label matrix
if usable_features is None or usable_features == [] or usable_features == ['']:
usable_features = list(df.columns)
# remove label (and groups if specified) from usable features, if present
if drop_group_as_feature:
usable_features = [feat for feat in usable_features if feat != label and feat not in groups]
else:
usable_features = [feat for feat in usable_features if feat != label]
# Create a matrix for all relevant features
X_in = np.array(df[usable_features])
    # Collect the column of group labels for each group type
    group_dfs = []
    for group in groups:
        group_dfs.append(df[group])
column_list = [] # list of numpy arrays denoting one or more columns to be concatenated at the end
    # Loop through the columns; when a column is categorical (string-typed or listed in
    # categorical_columns), replace it with a one-hot binary encoded version that spans
    # multiple columns
    for i in range(X_in.shape[1]):  # iterate over each column of the numpy matrix
        # Determine whether the current column should be treated as categorical
        # NOTE: categorical detection only inspects the first row, so a column whose first
        # value is missing may be passed through unencoded
        if X_in[0, i] is not None and (isinstance(X_in[0, i], str) or usable_features[i] in categorical_columns):
try:
lb = LabelBinarizer()
curr_col = lb.fit_transform(X_in[:, i].reshape(-1, 1))
column_list.append(curr_col)
            except ValueError:
                try:
                    lb = LabelBinarizer()
                    curr_col = lb.fit_transform(X_in[:, i].reshape(-1, 1).astype('int'))
                    column_list.append(curr_col)
                except Exception as e:
                    # The column is skipped (left out of X) if it still cannot be encoded
                    warnings.warn(f'{type(e).__name__}: ({e}) in LabelBinarizer transforming '
                                  f'categorical columns to one-hot vectors. Converting to int did not help.')
            except Exception as e:
                warnings.warn(f'{type(e).__name__}: ({e}) in LabelBinarizer transforming '
                              f'categorical columns to one-hot vectors')
else:
column_list.append(X_in[:, i])
# Concatenate all the re-encoded feature columns together
X = np.column_stack(column_list)
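    # NOTE: after one-hot expansion X can be wider than usable_features; each categorical column
    # contributes one column per distinct value (or a single 0/1 column when it has two values)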
    # Get the set of unique values within each group type (e.g. [white, black, asian, hispanic, other])
    # along with the per-sample group labels
    group_types = groups  # the argument "groups" is our list of group types
group_sets = []
numgroups = []
grouplabels = []
for i in range(len(groups)):
group_set, n_groups, labels = extract_group_labels(group_dfs[i])
group_sets.append(group_set)
numgroups.append(n_groups)
grouplabels.append(labels)
grouplabels = np.array(grouplabels)
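    # grouplabels has one row per group type, in the same order as `groups`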
# Determine if the dataset is binary by looking at the labels (y)
    is_binary = (len(np.unique(y)) == 2)
if is_binary:
lb = LabelBinarizer()
y = lb.fit_transform(y).flatten()
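        # With exactly two classes, LabelBinarizer returns an (n, 1) column of 0/1 values,
        # so the flatten() above yields a 1-D binary label vector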
if verbose:
print('Here are the results from setting up your dataset from a csv:')
print(f'X: {X}')
print(f'y: {y}')
print(f'grouplabels: {grouplabels}')
print(f'numgroups: {numgroups}')
print(f'group_sets: {group_sets}')
print(f'is_binary: {is_binary}')
if save_data:
save_dataset(file_dir, file_name, X, y, grouplabels, group_sets, group_types, is_binary)
return X, y, grouplabels, group_sets, group_types, is_binary
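

# Minimal usage sketch (not part of the original module). The CSV path, label, group, and
# categorical column names below are hypothetical; substitute the columns of your own dataset.
if __name__ == '__main__':
    X, y, grouplabels, group_sets, group_types, is_binary = setup_matrices(
        path='data/students.csv',             # hypothetical CSV: one row per sample
        label='graduated',                    # column holding the prediction target
        groups=['gender', 'ethnicity'],       # columns that define the group types
        usable_features=None,                 # None -> use every column except the label
        drop_group_as_feature=True,           # exclude the group columns from X
        groups_to_drop=['ethnicity@other'],   # drop all samples whose ethnicity is 'other'
        categorical_columns=['school_type'],  # numeric column to treat as categorical
        verbose=True,
    )
    print(f'X: {X.shape}, y: {y.shape}, is_binary: {is_binary}')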