in src/datatrove/pipeline/stats/word_stats.py [0:0]
def extract_stats(self, doc: Document) -> dict[str, int | float]:
    """Compute word-level statistics for *doc*.

    Tokenizes ``doc.text`` with the tokenizer for ``self.language`` and
    returns counts and ratios: word count, average word length, words per
    line, short/long word ratios for each configured threshold,
    type-token ratio, uppercase/capitalized word ratios, and stop-word
    ratio.

    For an empty or whitespace-only document (no words / no lines) all
    ratio statistics are reported as 0 instead of raising
    ``ZeroDivisionError``.
    """
    word_tokenizer = load_word_tokenizer(self.language)
    words = word_tokenizer.word_tokenize(doc.text)
    lines = doc.text.splitlines()
    # Hoist the counts; guard denominators so empty docs yield 0-valued
    # ratios rather than crashing the pipeline on a ZeroDivisionError.
    n_words = len(words)
    n_lines = len(lines)
    words_denom = n_words or 1
    lines_denom = n_lines or 1
    return {
        "n_words": n_words,
        "avg_word_length": sum(len(word) for word in words) / words_denom,
        "avg_words_per_line": n_words / lines_denom,
        **{
            f"short_word_ratio_{chars}": get_short_word_ratio(words, chars)
            for chars in self.short_word_max_chars_threshold
        },
        **{
            f"long_word_ratio_{chars}": get_long_word_ratio(words, chars)
            for chars in self.long_word_max_chars_threshold
        },
        "type_token_ratio": len(set(words)) / words_denom,
        "uppercase_word_ratio": sum(1 for word in words if word.isupper()) / words_denom,
        "capitalized_word_ratio": sum(1 for word in words if word.istitle()) / words_denom,
        "stop_word_ratio": sum(1 for word in words if word in self.stop_words) / words_denom,
    }