in torchmoji/word_generator.py [0:0]
def extract_valid_sentence_words(self, line):
    """ Run one line through pre-filtering, tokenization and post-filtering.

        The line may be either a string or a list of strings, depending on
        how the stream is being parsed. Domain-specific processing and
        filtering happens both before tokenization
        (`data_preprocess_filtering`) and after it
        (`data_postprocess_filtering`); each of those hooks may also hand
        back custom information about the line, which is merged into a
        single dict.

        Returns a tuple `(valid, words, info)`:
          * rejected by the pre-filter: `(False, [], info)` and
            `stats['pretokenization_filtered']` is incremented;
          * tokenization yields no words: `(False, [], info)` and
            `stats['unicode_filtered']` is incremented;
          * otherwise the validity flag and words reported by the
            post-filter are returned as-is, and
            `stats['posttokenization_filtered']` is incremented when that
            flag is falsy.
    """
    info = {}

    def reject(counter_name):
        # Count the reason for dropping the line and return the empty result.
        self.stats[counter_name] += 1
        return False, [], info

    # Stage 1: filtering on the raw line.
    ok_before, cleaned_line, extra_before = self.data_preprocess_filtering(
        line, self.stats['total'])
    info.update(extra_before)
    if not ok_before:
        return reject('pretokenization_filtered')

    # Stage 2: tokenization; a line that produces no words is discarded.
    words = self.get_words(cleaned_line)
    if len(words) == 0:
        return reject('unicode_filtered')

    # Stage 3: filtering on the tokenized words.
    ok_after, kept_words, extra_after = self.data_postprocess_filtering(
        words, self.stats['total'])
    info.update(extra_after)
    if not ok_after:
        # Unlike the earlier stages, the post-filter's own outputs are still
        # returned to the caller; only the counter is bumped here.
        self.stats['posttokenization_filtered'] += 1
    return ok_after, kept_words, info