def tokenize_sentences()

in torchmoji/sentence_tokenizer.py


    def tokenize_sentences(self, sentences, reset_stats=True, max_sentences=None):
        """ Converts a given list of sentences into a numpy array according to
            its vocabulary.

        # Arguments:
            sentences: List of sentences to be tokenized.
            reset_stats: Whether the word generator's stats should be reset.
            max_sentences: Maximum number of sentences to tokenize. Must be
                set if the number cannot be inferred from the input (e.g. a
                generator).

        # Returns:
            tokens: Numpy array of the tokenized sentences, masked/padded
                to `fixed_length`.
            infos: List of additional information per sentence (e.g. labels
                from a custom word generator).
            stats: Statistics collected by the word generator.

        # Raises:
            ValueError: When `max_sentences` is not set and the number of
                sentences cannot be inferred from the input.
        """

        if max_sentences is None and not hasattr(sentences, '__len__'):
            raise ValueError('Either you must provide an array with a length '
                             'attribute (e.g. a list) or specify the maximum '
                             'number of sentences yourself using '
                             '`max_sentences`!')
        n_sentences = (max_sentences if max_sentences is not None
                       else len(sentences))

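        # Pre-fill the token matrix with the masking value so positions
        # beyond each sentence's length stay masked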
        if self.masking_value == 0:
            tokens = np.zeros((n_sentences, self.fixed_length), dtype='uint16')
        else:
            tokens = (np.ones((n_sentences, self.fixed_length), dtype='uint16')
                      * self.masking_value)

        if reset_stats:
            self.wordgen.reset_stats()

        # With a custom word generator, info can be extracted from each
        # sentence (e.g. labels)
        infos = []

        # Stream the sentences through the word generator, which yields
        # words as strings that are then mapped to the vocabulary
        self.wordgen.stream = sentences
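        # next_insert is the next free row in the token matrix;
        # n_ignored_unknowns counts sentences skipped for containing
        # only special/unknown tokens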
        next_insert = 0
        n_ignored_unknowns = 0
        for s_words, s_info in self.wordgen:
            s_tokens = self.find_tokens(s_words)

            if (self.ignore_sentences_with_only_custom and
                    all(t < len(SPECIAL_TOKENS) for t in s_tokens)):
                n_ignored_unknowns += 1
                continue
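            # Truncate sentences longer than the fixed length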
            if len(s_tokens) > self.fixed_length:
                s_tokens = s_tokens[:self.fixed_length]
            tokens[next_insert, :len(s_tokens)] = s_tokens
            infos.append(s_info)
            next_insert += 1

        # For standard word generators, all sentences should be tokenized.
        # This is not necessarily the case for custom word generators, as
        # they may filter out sentences etc.
        if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
            assert len(sentences) == next_insert
        else:
            # Trim to the number of sentences actually tokenized
            tokens = tokens[:next_insert]
            infos = infos[:next_insert]

        return tokens, infos, self.wordgen.stats
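
A minimal usage sketch (not part of the source): it assumes the vocabulary
JSON that ships with torchmoji at `VOCAB_PATH` and a fixed length of 30
tokens, mirroring the project's README examples.

    import json

    from torchmoji.sentence_tokenizer import SentenceTokenizer
    from torchmoji.global_variables import VOCAB_PATH

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, infos, stats = st.tokenize_sentences(["I love mom's cooking",
                                                  'I love you and now you are gone..'])
    # tokens is a (2, 30) uint16 array padded with the masking value;
    # infos and stats come from the underlying word generator.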