in src/train_test_split.py [0:0]
def create_validation_split(X, y, grouplabels, test_size, random_seed=45):
"""
:param X: Features matrix
:param y: label matrix (column vector)
:param grouplabels: numpy array denoting a groups index for each sample point
:param test_size: proportion of each groups (and thus overall data) to be witheld for validation
:param random_seed: random state for sklearns train/test split
:return: X_train, X_test, y_train, y_test, grouplabels_train, grouplabels_test
To create the validation data, we need an even split from each of the groups. We will create an
individual matrix of features and labels (X and y) for each of the groups individually and perform a train/test
split on them. After we have the train/test component of each groups's data, we can simply concatenate
(vertically stack) the feature matrices for each groups and the label matrices for each groups giving us a balanced
train/test split across the entire dataset. We also recompute groups labels array to match reconcatened split.
"""
num_group_types = grouplabels.shape[0]
# Default, single groups type case
if num_group_types == 1:
grouplabels = grouplabels[0]
numgroups = np.size(np.unique(grouplabels))
# Each of these 'pieces' is the training or testing portion of a specific groups to be combined later
X_train_pieces = []
y_train_pieces = []
X_test_pieces = []
y_test_pieces = []
grouplabels_train = []
grouplabels_test = []
# Create an array to store the index arrays for each groups so we do not have to contiunally recompute
index = [np.array([]) for _ in range(numgroups)]
for g in range(0, numgroups):
index[g] = np.where(grouplabels == g)
for g in range(numgroups):
# Perform the train test split of the desired size on this particular groups
X_train_curr, X_test_curr, y_train_curr, y_test_curr = \
train_test_split(X[index[g]], y[index[g]], test_size=test_size, random_state=random_seed)
# Append the matrix portions for this groups onto the appropriate python lists
X_train_pieces.append(X_train_curr)
X_test_pieces.append(X_test_curr)
y_train_pieces.append(y_train_curr)
y_test_pieces.append(y_test_curr)
# Assert that we have the same number of rows for X and y
assert X_train_curr.shape[0] == y_train_curr.shape[0]
assert X_test_curr.shape[0] == y_test_curr.shape[0]
# Add the appropriate grouplabels in preparation for the matrices that will be constructed (groups in order)
grouplabels_train.extend([g] * X_train_curr.shape[0]) # python short-hand to add many `g`s to the list
grouplabels_test.extend([g] * X_test_curr.shape[0]) # python short-hand to add many `g`s to the list
# Once we have all the pieces off the 4 matrices, all we have left to do is vertically stack them
X_train = np.concatenate(X_train_pieces, axis=0)
X_test = np.concatenate(X_test_pieces, axis=0)
y_train = np.concatenate(y_train_pieces, axis=0)
y_test = np.concatenate(y_test_pieces, axis=0)
# Assert that we still have the same number of features
assert X_train.shape[1] == X.shape[1]
assert X_test.shape[1] == X.shape[1]
grouplabels_train = np.expand_dims(np.array(grouplabels_train), axis=0)
grouplabels_test = np.expand_dims(np.array(grouplabels_test), axis=0)
return X_train, X_test, y_train, y_test, grouplabels_train, grouplabels_test
# Multi groups split
else:
# Return a random split over the training data
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=test_size, random_state=random_seed)
grouplabels_train_T, grouplabels_test_T = \
train_test_split(grouplabels.T, test_size=test_size, random_state=random_seed)
# print(grouplabels_train_T.T)
# print(grouplabels_train_T.T.shape)
# print(grouplabels_test_T.T)
# print(grouplabels_test_T.T.shape)
# Ensure that taking the transpose worked out fine
assert grouplabels_train_T.T.shape[0] == grouplabels_test_T.T.shape[0] == grouplabels.shape[0]
assert grouplabels_train_T.T.shape[1] + grouplabels_test_T.T.shape[1] == grouplabels.shape[1]
return X_train, X_test, y_train, y_test, grouplabels_train_T.T, grouplabels_test_T.T