def extract_stats()

in src/datatrove/pipeline/stats/word_stats.py [0:0]


    def extract_stats(self, doc: Document) -> dict[str, int | float]:
        """Compute word-level statistics for a single document.

        Tokenizes ``doc.text`` with the language-specific word tokenizer and
        returns a flat dict of counts and ratios: word count, average word
        length, words per line, short/long word ratios (one entry per
        configured character threshold), type-token ratio, and the ratios of
        uppercase, capitalized, and stop words.

        Args:
            doc: the document whose ``text`` is analyzed.

        Returns:
            Mapping from stat name to its numeric value. All ratios are 0
            when the document tokenizes to zero words (or has zero lines),
            instead of raising ``ZeroDivisionError``.
        """
        word_tokenizer = load_word_tokenizer(self.language)

        words = word_tokenizer.word_tokenize(doc.text)
        lines = doc.text.splitlines()

        # Hoist counts: each is used in several denominators below.
        n_words = len(words)
        n_lines = len(lines)

        return {
            "n_words": n_words,
            "avg_word_length": sum(len(word) for word in words) / n_words if n_words else 0,
            "avg_words_per_line": n_words / n_lines if n_lines else 0,
            **{
                f"short_word_ratio_{chars}": get_short_word_ratio(words, chars)
                for chars in self.short_word_max_chars_threshold
            },
            **{
                f"long_word_ratio_{chars}": get_long_word_ratio(words, chars)
                for chars in self.long_word_max_chars_threshold
            },
            # Ratio of distinct tokens to total tokens (lexical diversity).
            "type_token_ratio": len(set(words)) / n_words if n_words else 0,
            "uppercase_word_ratio": sum(1 for word in words if word.isupper()) / n_words if n_words else 0,
            "capitalized_word_ratio": sum(1 for word in words if word.istitle()) / n_words if n_words else 0,
            "stop_word_ratio": sum(1 for word in words if word in self.stop_words) / n_words if n_words else 0,
        }