in torchbenchmark/models/dlrm/data_utils.py [0:0]
def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file):
    """Cast Criteo data to model dtypes and optionally split it by day.

    Dense features are transformed with ``log(x + 1)`` and stored as float32,
    categorical features are cast to int64 and labels to float32. Despite the
    progress messages mentioning "tensors", every returned value is a NumPy
    array (or an empty list placeholder).

    Inputs:
        X_cat (ndarray): preprocessed categorical features, one row per sample
        X_int (ndarray): dense features, one row per sample
        y (ndarray): labels, one entry per sample
        days (int): number of leading entries of ``total_per_file`` that are
            accumulated into cumulative offsets
        data_split (str): ``'train'`` splits into training/validation/test
            sets; any other value returns the whole dataset unsplit
        randomize (str): randomization scheme
            "none": no randomization
            "day": shuffles samples within each training day
                (only applied when data_split == 'train')
            "total": shuffles all training samples (or, when not splitting,
                the whole dataset)
        total_per_file (sequence of int): number of samples in each per-day
            file, in the order the samples appear in X_cat / X_int / y

    Outputs:
        A 9-tuple
        (X_cat_train, X_int_train, y_train,
         X_cat_val, X_int_val, y_val,
         X_cat_test, X_int_test, y_test).
        When data_split == 'train', all days but the last form the training
        set; the last day is cut in two, the first part being the test set
        and the second part the validation set.
        Otherwise the first three entries hold the full (optionally
        shuffled) dataset and the remaining six entries are empty lists.

    Raises:
        ValueError: if data_split == 'train' but the offsets describe fewer
            than two days, so no day is left for training.
    """
    # define initial set of indices
    indices = np.arange(len(y))

    # Turn per-file sample counts into cumulative end offsets:
    # offset_per_file[k] is the index one past the last sample of day k - 1.
    offset_per_file = np.array([0] + list(total_per_file))
    for i in range(days):
        offset_per_file[i + 1] += offset_per_file[i]

    if data_split == 'train':
        # Cutting at the interior offsets yields one index chunk per day.
        indices = np.array_split(indices, offset_per_file[1:-1])
        if len(indices) < 2:
            # np.concatenate([]) below would fail with an opaque message.
            raise ValueError(
                "data_split='train' needs at least two days of data: "
                "the last day is held out for validation/testing"
            )

        # randomize train data (per day); the held-out last day keeps its order
        if randomize == "day":  # or randomize == "total":
            for i in range(len(indices) - 1):
                indices[i] = np.random.permutation(indices[i])
            print("Randomized indices per day ...")

        train_indices = np.concatenate(indices[:-1])
        # First part of the last day is the test set, second part validation.
        test_indices, val_indices = np.array_split(indices[-1], 2)
        print("Defined training and testing indices...")

        # randomize train data (across days)
        if randomize == "total":
            train_indices = np.random.permutation(train_indices)
            print("Randomized indices across days ...")

        # create training, validation, and test sets
        train = _criteo_subset(X_cat, X_int, y, train_indices)
        val = _criteo_subset(X_cat, X_int, y, val_indices)
        test = _criteo_subset(X_cat, X_int, y, test_indices)
        print("Split data according to indices...")
        print("Converted to tensors...done!")

        return train + val + test

    # No split requested: optionally shuffle the whole dataset.
    if randomize == "total":
        indices = np.random.permutation(indices)
        print("Randomized indices...")

    full = _criteo_subset(X_cat, X_int, y, indices)
    print("Converted to tensors...done!")

    # Empty placeholders keep the 9-element return shape of the split case.
    return full + ([], [], [], [], [], [])


def _criteo_subset(X_cat, X_int, y, idx):
    # Select rows `idx` and cast them to the dtypes used downstream:
    # int64 categorical ids, log(x + 1) float32 dense features, float32 labels.
    # np.int64 replaces the np.long alias, which NumPy 1.24 removed.
    return (
        X_cat[idx].astype(np.int64),
        np.log(X_int[idx].astype(np.float32) + 1),
        y[idx].astype(np.float32),
    )