in models.py [0:0]
def prepare_samples(self, sentences, bsize, tokenize, verbose):
sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
[self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
n_w = np.sum([len(x) for x in sentences])
# filters words without w2v vectors
for i in range(len(sentences)):
s_f = [word for word in sentences[i] if word in self.word_vec]
if not s_f:
import warnings
warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
Replacing by "</s>"..' % (sentences[i], i))
s_f = [self.eos]
sentences[i] = s_f
lengths = np.array([len(s) for s in sentences])
n_wk = np.sum(lengths)
if verbose:
print('Nb words kept : %s/%s (%.1f%s)' % (
n_wk, n_w, 100.0 * n_wk / n_w, '%'))
# sort by decreasing length
lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
sentences = np.array(sentences)[idx_sort]
return sentences, lengths, idx_sort