in torchmoji/sentence_tokenizer.py
def tokenize_sentences(self, sentences, reset_stats=True, max_sentences=None):
""" Converts a given list of sentences into a numpy array according to
its vocabulary.
# Arguments:
sentences: List of sentences to be tokenized.
reset_stats: Whether the word generator's stats should be reset.
        max_sentences: Maximum number of sentences. Must be set if the
            number cannot be inferred from the input.
# Returns:
        Numpy array of the tokenized sentences with masking,
infos,
stats
# Raises:
ValueError: When maximum length is not set and cannot be inferred.
"""
if max_sentences is None and not hasattr(sentences, '__len__'):
        raise ValueError('Either you must provide an array with a length '
                         'attribute (e.g. a list) or specify the maximum '
                         'number of sentences yourself using `max_sentences`!')
n_sentences = (max_sentences if max_sentences is not None
else len(sentences))
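    # Pre-allocate the output array; positions not overwritten by tokens
    # keep the masking value as padding.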
    tokens = np.full((n_sentences, self.fixed_length), self.masking_value,
                     dtype='uint16')
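    # Optionally reset the word generator's coverage stats so they only
    # reflect this call.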
if reset_stats:
self.wordgen.reset_stats()
    # With a custom word generator, info can be extracted from each
    # sentence (e.g. labels)
infos = []
    # The word generator yields words as strings, which find_tokens then
    # maps to vocabulary indices
self.wordgen.stream = sentences
next_insert = 0
n_ignored_unknowns = 0
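    # Stream the sentences through the word generator; each iteration
    # yields a sentence's words plus any extra per-sentence info.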
for s_words, s_info in self.wordgen:
s_tokens = self.find_tokens(s_words)
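        # Optionally skip sentences made up entirely of special tokens,
        # i.e. containing no words found in the vocabulary.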
        if (self.ignore_sentences_with_only_custom and
                np.all([t < len(SPECIAL_TOKENS) for t in s_tokens])):
n_ignored_unknowns += 1
continue
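        # Truncate sentences longer than fixed_length; shorter ones keep
        # the masking value as padding.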
if len(s_tokens) > self.fixed_length:
s_tokens = s_tokens[:self.fixed_length]
        tokens[next_insert, :len(s_tokens)] = s_tokens
infos.append(s_info)
next_insert += 1
    # For standard word generators all sentences should be tokenized.
    # This is not necessarily the case for custom word generators, as they
    # may filter out sentences etc.
if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
assert len(sentences) == next_insert
else:
        # Trim the arrays to the number of sentences actually kept
tokens = tokens[:next_insert]
infos = infos[:next_insert]
return tokens, infos, self.wordgen.stats
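
A minimal usage sketch (assumptions: the vocabulary JSON shipped with torchMoji at VOCAB_PATH, and the SentenceTokenizer(vocabulary, fixed_length) constructor used in the repo's examples; adjust paths and names to your checkout):

import json

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.global_variables import VOCAB_PATH  # assumed path constant

# Load the word-to-index vocabulary the tokenizer maps against
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, fixed_length=30)
tokens, infos, stats = st.tokenize_sentences([u'I love mom',
                                              u'Testing is fun!'])
# tokens: (2, 30) uint16 array, short rows padded with the masking value
# infos:  per-sentence info collected by the word generator
# stats:  the word generator's coverage statistics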