def transformCriteoAdData()

in torchbenchmark/models/dlrm/data_utils.py [0:0]
58 lines of code
5 McCabe index (conditional complexity)

def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file):
    """Split (optionally), shuffle (optionally) and dtype-convert Criteo data.

    Dense features are transformed with ``log(x + 1)`` and cast to float32,
    categorical features are cast to int64 and labels to float32.  All
    returned values are NumPy arrays (no torch tensors are created here).

    Args:
        X_cat (ndarray): preprocessed categorical (sparse) features, indexed
            by sample along the first axis.
        X_int (ndarray): dense features, indexed by sample along the first
            axis.
        y (ndarray): labels, one per sample.
        days (int): number of leading entries of ``total_per_file`` that are
            accumulated into running offsets.  Expected to equal
            ``len(total_per_file)``; a larger value raises ``IndexError``.
        data_split (str): ``'train'`` splits the data into
            training/validation/test sets; any other value returns the whole
            dataset unsplit.
        randomize (str): randomization scheme.
            ``"none"``: no randomization.
            ``"day"``: shuffle samples within each training day (only has an
            effect when ``data_split == 'train'``).
            ``"total"``: shuffle the training samples across days when
            splitting, or the whole dataset otherwise.
        total_per_file (sequence of int): number of samples in each per-day
            file, in order.

    Returns:
        tuple: always nine elements.

        If ``data_split == 'train'``::

            (X_cat_train, X_int_train, y_train,
             X_cat_val,   X_int_val,   y_val,
             X_cat_test,  X_int_test,  y_test)

        All days but the last form the training set.  The last day is cut in
        two with ``np.array_split``: the first half is the test set and the
        second half is the validation set.

        Otherwise ``(X_cat, X_int, y, [], [], [], [], [], [])``.
    """
    # define initial set of indices
    indices = np.arange(len(y))

    # Turn per-file sample counts into cumulative offsets: after the loop,
    # offset_per_file[k] is the index of the first sample of file k.
    offset_per_file = np.array([0] + list(total_per_file))
    for i in range(days):
        offset_per_file[i + 1] += offset_per_file[i]

    if data_split == 'train':
        # One index array per day; the interior offsets are the cut points.
        indices = np.array_split(indices, offset_per_file[1:-1])

        # randomize train data (per day); the last day is held out, so it is
        # left in its original order
        if randomize == "day":
            for i in range(len(indices) - 1):
                indices[i] = np.random.permutation(indices[i])
            print("Randomized indices per day ...")

        train_indices = np.concatenate(indices[:-1])
        # First half of the held-out day -> test, second half -> validation.
        test_indices, val_indices = np.array_split(indices[-1], 2)

        print("Defined training and testing indices...")

        # randomize train data (across days)
        if randomize == "total":
            train_indices = np.random.permutation(train_indices)
            print("Randomized indices across days ...")

        # create training, validation, and test sets
        train = (X_cat[train_indices], X_int[train_indices], y[train_indices])
        val = (X_cat[val_indices], X_int[val_indices], y[val_indices])
        test = (X_cat[test_indices], X_int[test_indices], y[test_indices])

        print("Split data according to indices...")

        train = _convert_criteo_dtypes(*train)
        val = _convert_criteo_dtypes(*val)
        test = _convert_criteo_dtypes(*test)

        print("Converted to tensors...done!")

        return (*train, *val, *test)

    # No split requested: optionally shuffle the whole dataset.
    if randomize == "total":
        indices = np.random.permutation(indices)
        print("Randomized indices...")

    X_cat, X_int, y = _convert_criteo_dtypes(X_cat[indices], X_int[indices], y[indices])

    print("Converted to tensors...done!")

    return (X_cat, X_int, y, [], [], [], [], [], [])


def _convert_criteo_dtypes(X_cat, X_int, y):
    """Cast one (sparse, dense, label) triple to the output dtypes.

    ``np.int64`` is used explicitly instead of the ``np.long`` alias, which
    was removed in NumPy 1.24 and is platform-dependent in NumPy 2.x.
    """
    return (
        X_cat.astype(np.int64),
        np.log(X_int.astype(np.float32) + 1),
        y.astype(np.float32),
    )