def extract_valid_sentence_words()

in torchmoji/word_generator.py [0:0]


    def extract_valid_sentence_words(self, line):
        """ Filter and tokenize one line, updating the rejection counters.

            The line is handed as-is to the pre-tokenization filter, so it
            may be either a string or a list of strings depending on how
            the stream is being parsed.

            Three stages run in order, each receiving what the previous
            one produced:
              1. self.data_preprocess_filtering(line, self.stats['total'])
              2. self.get_words(<pre-filtered line>)
              3. self.data_postprocess_filtering(words, self.stats['total'])

            Both filters return (valid, value, info). Their info dicts are
            merged into one dict that is returned to the caller, including
            when the line ends up rejected.

            Returns a tuple (valid, words, info):
              - pre-filter rejects: (False, [], info), and
                stats['pretokenization_filtered'] is incremented.
              - tokenizer yields no words: (False, [], info), and
                stats['unicode_filtered'] is incremented.
              - otherwise: the validity flag and words reported by the
                post-filter; stats['posttokenization_filtered'] is
                incremented when that flag is falsy.
        """

        collected = {}

        # Stage 1: domain-specific filtering on the raw line.
        line_ok, cleaned_line, line_info = self.data_preprocess_filtering(
            line, self.stats['total'])
        collected.update(line_info)
        if not line_ok:
            self.stats['pretokenization_filtered'] += 1
            return False, [], collected

        # Stage 2: tokenization. A line that produces no words at all is
        # counted under the 'unicode_filtered' statistic.
        tokens = self.get_words(cleaned_line)
        if len(tokens) == 0:
            self.stats['unicode_filtered'] += 1
            return False, [], collected

        # Stage 3: domain-specific filtering on the token list.
        tokens_ok, kept_tokens, token_info = self.data_postprocess_filtering(
            tokens, self.stats['total'])
        collected.update(token_info)
        if tokens_ok:
            return tokens_ok, kept_tokens, collected

        # Rejected after tokenization: the post-filter's own flag and words
        # are still passed through to the caller unchanged.
        self.stats['posttokenization_filtered'] += 1
        return tokens_ok, kept_tokens, collected