def assemble_samples()

in dlrm_data_caffe2.py


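        # Note: this is a nested helper; mini_batch_size, num_batches, n_emb,
        # and np (NumPy) are captured from the enclosing scope in
        # dlrm_data_caffe2.py rather than passed as arguments.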
        def assemble_samples(X_cat, X_int, y, max_ind_range, print_message):
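            # fold categorical indices into [0, max_ind_range) via modulo,
            # which caps the size of each embedding table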
            if max_ind_range > 0:
                X_cat = X_cat % max_ind_range

            nsamples = len(y)
            data_size = nsamples
            # using floor is equivalent to dropping last mini-batch (drop_last = True)
            nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
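            # e.g. with nsamples = 1000 and mini_batch_size = 128:
            # nbatches = 7, and the last 1000 - 7 * 128 = 104 samples are dropped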
            print(print_message)
            if num_batches != 0 and num_batches < nbatches:
                print(
                    "Limiting to %d batches of the total % d batches"
                    % (num_batches, nbatches)
                )
                nbatches = num_batches
            else:
                print("Total number of batches %d" % nbatches)

            # data main loop
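            # lX: dense inputs, lT: targets,
            # lS_lengths / lS_indices: per-embedding lookup lengths and indices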
            lX = []
            lS_lengths = []
            lS_indices = []
            lT = []
            for j in range(0, nbatches):
                print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
                # number of data points in this batch
                n = min(mini_batch_size, data_size - (j * mini_batch_size))
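                # since nbatches was floored above, n == mini_batch_size
                # on every iteration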
                # dense feature
                idx_start = j * mini_batch_size
                lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32))
                # Targets - outputs
                lT.append(
                    (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32)
                )
                # sparse feature (sparse indices)
                lS_emb_indices = []
                # for each embedding generate a list of n lookups,
                # where each lookup is composed of multiple sparse indices
                for emb_idx in range(n_emb):
                    lS_batch_indices = []
                    for _b in range(n):
                        # each sample contributes exactly one sparse index per
                        # embedding table (a single categorical value per feature)
                        lS_batch_indices += (
                            (X_cat[idx_start + _b][emb_idx].reshape(-1)).astype(np.int32)
                        ).tolist()
                    lS_emb_indices.append(lS_batch_indices)
                lS_indices.append(lS_emb_indices)
                # for Criteo Kaggle data every lookup length is 1, because each
                # sparse feature holds a single categorical value
                lS_lengths.append(
                    [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
                )
            print("\n")

            return nbatches, lX, lS_lengths, lS_indices, lT
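
A minimal usage sketch, not from the repo: in dlrm_data_caffe2.py this helper is nested inside a larger loader that supplies mini_batch_size, num_batches, and n_emb, so the values, toy shapes, and module-level visibility of assemble_samples below are all assumptions for illustration.

        import numpy as np

        mini_batch_size = 16  # normally captured from the enclosing loader
        num_batches = 0       # 0 means "keep all available batches"
        n_emb = 3             # number of embedding tables (sparse features)

        # hypothetical toy inputs: 100 samples, 3 categorical, 2 dense features
        X_cat = np.random.randint(0, 1000, size=(100, n_emb))
        X_int = np.random.randint(0, 100, size=(100, 2))
        y = np.random.randint(0, 2, size=100)

        nbatches, lX, lS_lengths, lS_indices, lT = assemble_samples(
            X_cat, X_int, y, max_ind_range=500, print_message="Assembling toy data"
        )
        # nbatches == 6 (floor(100 / 16)); lX[0].shape == (16, 2);
        # lS_indices[0] holds n_emb lists of 16 indices; all lengths are 1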