def create_validation

def create_validation_split()

in src/train_test_split.py [0:0]
42 lines of code
11 McCabe index (conditional complexity)

def create_validation_split(X, y, grouplabels, test_size, random_seed=45):
    """
    :param X: Features matrix
    :param y: label matrix (column vector)
    :param grouplabels: numpy array denoting a groups index for each sample point
    :param test_size: proportion of each groups (and thus overall data) to be witheld for validation
    :param random_seed: random state for sklearns train/test split
    :return: X_train, X_test, y_train, y_test, grouplabels_train, grouplabels_test

    To create the validation data, we need an even split from each of the groups. We will create an
    individual matrix of features and labels (X and y) for each of the groups individually and perform a train/test
    split on them. After we have the train/test component of each groups's data, we can simply concatenate
    (vertically stack) the feature matrices for each groups and the label matrices for each groups giving us a balanced
    train/test split across the entire dataset. We also recompute groups labels array to match reconcatened split.
    """

    num_group_types = grouplabels.shape[0]

    # Default, single groups type case
    if num_group_types == 1:
        grouplabels = grouplabels[0]
        numgroups = np.size(np.unique(grouplabels))
        # Each of these 'pieces' is the training or testing portion of a specific groups to be combined later
        X_train_pieces = []
        y_train_pieces = []
        X_test_pieces = []
        y_test_pieces = []
        grouplabels_train = []
        grouplabels_test = []

        # Create an array to store the index arrays for each groups so we do not have to contiunally recompute
        index = [np.array([]) for _ in range(numgroups)]
        for g in range(0, numgroups):
            index[g] = np.where(grouplabels == g)

        for g in range(numgroups):
            # Perform the train test split of the desired size on this particular groups
            X_train_curr, X_test_curr, y_train_curr, y_test_curr = \
                train_test_split(X[index[g]], y[index[g]], test_size=test_size, random_state=random_seed)
            # Append the matrix portions for this groups onto the appropriate python lists
            X_train_pieces.append(X_train_curr)
            X_test_pieces.append(X_test_curr)
            y_train_pieces.append(y_train_curr)
            y_test_pieces.append(y_test_curr)

            # Assert that we have the same number of rows for X and y
            assert X_train_curr.shape[0] == y_train_curr.shape[0]
            assert X_test_curr.shape[0] == y_test_curr.shape[0]
            # Add the appropriate grouplabels in preparation for the matrices that will be constructed (groups in order)
            grouplabels_train.extend([g] * X_train_curr.shape[0])  # python short-hand to add many `g`s to the list
            grouplabels_test.extend([g] * X_test_curr.shape[0])  # python short-hand to add many `g`s to the list

        # Once we have all the pieces off the 4 matrices, all we have left to do is vertically stack them
        X_train = np.concatenate(X_train_pieces, axis=0)
        X_test = np.concatenate(X_test_pieces, axis=0)
        y_train = np.concatenate(y_train_pieces, axis=0)
        y_test = np.concatenate(y_test_pieces, axis=0)

        # Assert that we still have the same number of features
        assert X_train.shape[1] == X.shape[1]
        assert X_test.shape[1] == X.shape[1]

        grouplabels_train = np.expand_dims(np.array(grouplabels_train), axis=0)
        grouplabels_test = np.expand_dims(np.array(grouplabels_test), axis=0)

        return X_train, X_test, y_train, y_test, grouplabels_train, grouplabels_test

    # Multi groups split
    else:
        # Return a random split over the training data
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, random_state=random_seed)
        grouplabels_train_T, grouplabels_test_T = \
            train_test_split(grouplabels.T, test_size=test_size, random_state=random_seed)

        # print(grouplabels_train_T.T)
        # print(grouplabels_train_T.T.shape)
        # print(grouplabels_test_T.T)
        # print(grouplabels_test_T.T.shape)

        # Ensure that taking the transpose worked out fine
        assert grouplabels_train_T.T.shape[0] == grouplabels_test_T.T.shape[0] == grouplabels.shape[0]
        assert grouplabels_train_T.T.shape[1] + grouplabels_test_T.T.shape[1] == grouplabels.shape[1]

        return X_train, X_test, y_train, y_test, grouplabels_train_T.T, grouplabels_test_T.T