in src/datatrove/pipeline/stats/line_stats.py [0:0]
def extract_stats(self, doc: Document):
    """Compute line-level statistics for a single document.

    Lines are taken from ``doc.metadata["lines"]`` when that entry is present
    and non-empty; otherwise ``doc.text`` is split on ``"\\n"``.

    Args:
        doc: Document to analyse. Only ``doc.metadata`` and ``doc.text`` are read.

    Returns:
        A dict with the following keys:

        * ``n_lines``: number of lines *before* any empty-line filtering.
        * ``avg_line_length``: mean character length of the measured lines.
        * ``short_line_ratio_chars_{n}``: one entry per value in
          ``self.short_max_chars``.
        * ``long_line_ratio_chars_{n}``: one entry per value in
          ``self.long_max_chars``.
        * ``lines_ending_with_terminal_mark_ratio``: fraction of measured lines
          ending with ``END_PUNCTUATION``.
        * ``bullet_point_lines_ratio``: fraction of measured lines for which
          ``is_bullet_line`` is true.
        * ``line_duplicates``: first value returned by ``find_duplicates``
          divided by the number of measured lines.
        * ``line_char_duplicates``: second value returned by ``find_duplicates``
          divided by the total character count of the measured lines.

        When there are no lines left to measure (e.g. a whitespace-only
        document with ``self.ignore_empty_lines`` set), every ratio is ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    lines: list[str] = doc.metadata.get("lines") or doc.text.split("\n")
    # Don't ignore empty lines for count
    n_lines = len(lines)
    if self.ignore_empty_lines:
        lines = [line for line in lines if line.strip()]

    if not lines:
        # Nothing left to measure: every ratio below has len(lines) as its
        # denominator, so report zeros with the same key set rather than
        # dividing by zero.
        return {
            "n_lines": n_lines,
            "avg_line_length": 0.0,
            **{f"short_line_ratio_chars_{chars}": 0.0 for chars in self.short_max_chars},
            **{f"long_line_ratio_chars_{chars}": 0.0 for chars in self.long_max_chars},
            "lines_ending_with_terminal_mark_ratio": 0.0,
            "bullet_point_lines_ratio": 0.0,
            "line_duplicates": 0.0,
            "line_char_duplicates": 0.0,
        }

    measured = len(lines)
    total_chars = sum(len(line) for line in lines)
    line_dups, char_dups = find_duplicates(lines)
    return {
        "n_lines": n_lines,
        "avg_line_length": total_chars / measured,
        **{
            f"short_line_ratio_chars_{chars}": get_max_chars_per_line_ratio(lines, chars)
            for chars in self.short_max_chars
        },
        **{
            f"long_line_ratio_chars_{chars}": get_min_chars_per_line_ratio(lines, chars)
            for chars in self.long_max_chars
        },
        "lines_ending_with_terminal_mark_ratio": sum(1 for line in lines if line.endswith(END_PUNCTUATION))
        / measured,
        "bullet_point_lines_ratio": sum(1 for line in lines if is_bullet_line(line)) / measured,
        "line_duplicates": line_dups / measured,
        # All measured lines can be empty strings when empty lines are kept,
        # which makes the character total zero.
        "line_char_duplicates": char_dups / total_chars if total_chars else 0.0,
    }