in dlrm_data_caffe2.py [0:0]
def assemble_samples(X_cat, X_int, y, max_ind_range, print_message):
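    # Assembles mini-batches of dense features (X_int), sparse indices (X_cat)
    # and targets (y) in the layout consumed by the Caffe2 DLRM code.
    # Note: mini_batch_size, num_batches and n_emb are not parameters; they are
    # free variables captured from the enclosing scope, and np is NumPy imported
    # at the top of dlrm_data_caffe2.py.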
    if max_ind_range > 0:
        X_cat = X_cat % max_ind_range
    nsamples = len(y)
    data_size = nsamples
    # using floor is equivalent to dropping last mini-batch (drop_last = True)
    nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
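    # e.g. with data_size = 1000 and mini_batch_size = 128 (hypothetical values),
    # nbatches = floor(1000 / 128) = 7 and the trailing 104 samples are dropped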
    print(print_message)
    if num_batches != 0 and num_batches < nbatches:
        print(
            "Limiting to %d batches of the total %d batches"
            % (num_batches, nbatches)
        )
        nbatches = num_batches
    else:
        print("Total number of batches %d" % nbatches)
    # data main loop
    lX = []
    lS_lengths = []
    lS_indices = []
    lT = []
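    # lX collects the dense features and lT the targets, one entry per batch;
    # lS_lengths / lS_indices collect the per-embedding lookup lengths and
    # sparse indices for each batch (filled in below)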
    for j in range(0, nbatches):
        print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
        # number of data points in this batch
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense features
        idx_start = j * mini_batch_size
        lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32))
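        # lX[j] has shape (n, num_dense_features) as float32, e.g. (n, 13) for
        # the 13 integer features of the Criteo Kaggle dataset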
        # Targets - outputs
        lT.append(
            (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32)
        )
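        # lT[j] has shape (n, 1), with labels cast to int32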
        # sparse features (sparse indices)
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch_indices = []
            for _b in range(n):
                # each sample contributes a single sparse index for this
                # embedding; the corresponding lengths are stored separately
                lS_batch_indices += (
                    (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32)
                ).tolist()
            lS_emb_indices.append(lS_batch_indices)
        lS_indices.append(lS_emb_indices)
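        # lS_indices[j] is a list of n_emb lists, each holding the n sparse
        # indices (one per sample) used for that embedding in this batch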
        # for the Criteo Kaggle data every lookup length is 1,
        # because each sparse feature is a single categorical value
        lS_lengths.append(
            [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
        )
    print("\n")
    return nbatches, lX, lS_lengths, lS_indices, lT
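
A minimal sketch of how this helper might be invoked from the enclosing
data-loading code; the variable names below are hypothetical, not taken from
dlrm_data_caffe2.py:

    # assumes mini_batch_size, num_batches and n_emb are defined in the
    # enclosing scope, and that the arrays were loaded and split beforehand
    nbatches, lX, lS_lengths, lS_indices, lT = assemble_samples(
        X_cat_train,
        X_int_train,
        y_train,
        max_ind_range,
        "Assembling training mini-batches",
    )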