in source/embed.py [0:0]
def _make_batches(self, lines):
    # Tokenize every input line and record its length in tokens.
    tokens = [self._tokenize(line) for line in lines]
    lengths = np.array([t.numel() for t in tokens])
    # Visit sentences from longest to shortest so each batch groups
    # sentences of similar length and wastes little padding.
    indices = np.argsort(-lengths, kind=self.sort_kind)

    def batch(tokens, lengths, indices):
        # Left-pad every sentence to the length of the longest one
        # (tokens[0], since sentences arrive sorted by decreasing length).
        toks = tokens[0].new_full((len(tokens), tokens[0].shape[0]), self.pad_index)
        for i in range(len(tokens)):
            toks[i, -tokens[i].shape[0]:] = tokens[i]
        return (
            Batch(srcs=None, tokens=toks, lengths=torch.LongTensor(lengths)),
            indices,
        )

    batch_tokens, batch_lengths, batch_indices = [], [], []
    ntokens = nsentences = 0
    for i in indices:
        # Flush the current batch once adding the next sentence would
        # exceed the token budget or the sentence budget.
        if nsentences > 0 and (
            (self.max_tokens is not None and ntokens + lengths[i] > self.max_tokens)
            or (self.max_sentences is not None and nsentences == self.max_sentences)
        ):
            yield batch(batch_tokens, batch_lengths, batch_indices)
            ntokens = nsentences = 0
            batch_tokens, batch_lengths, batch_indices = [], [], []
        batch_tokens.append(tokens[i])
        batch_lengths.append(lengths[i])
        batch_indices.append(i)
        ntokens += tokens[i].shape[0]
        nsentences += 1
    # Yield the final, possibly under-full batch.
    if nsentences > 0:
        yield batch(batch_tokens, batch_lengths, batch_indices)
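
A minimal sketch of how the generator might be consumed; the per-batch encoder call (_process_batch here) is a hypothetical name, not part of this file's shown code. The key point is that each yielded batch comes with the original sentence indices, which let the caller undo the length-based sort:

def encode_sentences(self, lines):
    # Hypothetical caller: encode each batch, then restore original line order.
    results, order = [], []
    for batch, batch_indices in self._make_batches(lines):
        results.append(self._process_batch(batch))  # assumed to return a (batch, dim) array
        order.extend(batch_indices)
    # Rows were produced in sorted order; argsort(order) maps them back.
    return np.vstack(results)[np.argsort(order)]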