in src/datatrove/pipeline/stats/paragraph_stats.py [0:0]
def extract_stats(self, doc: Document) -> dict[str, int | float]:
    """Compute paragraph-level statistics for a single document.

    Paragraphs are the chunks of ``doc.text`` separated by a blank line
    (two consecutive newline characters). Whitespace-only chunks are
    dropped before anything is counted, so every statistic below,
    including ``n_paragraphs``, is computed over non-blank paragraphs only.

    Args:
        doc: Document whose ``text`` attribute is analysed.

    Returns:
        A dict with the keys:

        * ``n_paragraphs``: number of non-blank paragraphs.
        * ``avg_paragraph_length``: mean paragraph length in characters.
        * ``short_paragraph_ratio_{chars}``: one entry per value in
          ``self.short_paragraph_max_chars_threshold``, computed by
          ``get_short_paragraph_ratio``.
        * ``long_paragraph_ratio_{chars}``: one entry per value in
          ``self.long_paragraph_max_chars_threshold``, computed by
          ``get_long_paragraph_ratio``.
        * ``paragraph_duplicates``: first value returned by
          ``find_duplicates`` divided by the paragraph count.
        * ``paragraph_char_duplicates``: second value returned by
          ``find_duplicates`` divided by the total character count.

        When the document contains no non-blank paragraph, the same keys
        are returned with every value set to zero.
    """
    # Blank paragraphs are removed up front, so they never contribute to the
    # count. NOTE(review): because of this, ``self.ignore_empty_paragraphs``
    # cannot influence the result -- a second filter with the same predicate
    # would be a no-op. Confirm whether blank paragraphs were meant to be
    # counted when that flag is disabled.
    paragraphs = [p for p in doc.text.split("\n\n") if p.strip()]
    n_paragraphs = len(paragraphs)

    short_thresholds = self.short_paragraph_max_chars_threshold
    long_thresholds = self.long_paragraph_max_chars_threshold

    if not paragraphs:
        # Empty or whitespace-only text: every ratio below would divide by
        # zero, so report neutral values instead of raising.
        return {
            "n_paragraphs": 0,
            "avg_paragraph_length": 0.0,
            **{f"short_paragraph_ratio_{chars}": 0.0 for chars in short_thresholds},
            **{f"long_paragraph_ratio_{chars}": 0.0 for chars in long_thresholds},
            "paragraph_duplicates": 0.0,
            "paragraph_char_duplicates": 0.0,
        }

    # Every remaining paragraph has at least one non-whitespace character,
    # so total_chars is guaranteed to be positive here.
    total_chars = sum(len(p) for p in paragraphs)
    paragraph_dups, paragraph_char_dups = find_duplicates(paragraphs)

    return {
        "n_paragraphs": n_paragraphs,
        "avg_paragraph_length": total_chars / n_paragraphs,
        **{
            f"short_paragraph_ratio_{chars}": get_short_paragraph_ratio(paragraphs, chars)
            for chars in short_thresholds
        },
        **{
            f"long_paragraph_ratio_{chars}": get_long_paragraph_ratio(paragraphs, chars)
            for chars in long_thresholds
        },
        "paragraph_duplicates": paragraph_dups / n_paragraphs,
        "paragraph_char_duplicates": paragraph_char_dups / total_chars,
    }