in src/datatrove/pipeline/stats/doc_stats.py [0:0]
def extract_stats(self, doc: Document) -> dict[str, int | float]:
    """Compute character-level statistics for a single document.

    Every ratio is a count divided by the number of characters in
    ``doc.text``. For an empty text all ratios are ``0.0`` (instead of
    raising ``ZeroDivisionError``) and ``length`` is ``0``.

    Args:
        doc: Document whose ``text`` attribute is analysed.

    Returns:
        A dict with the following keys:

        - ``length``: number of characters in the text.
        - ``white_space_ratio``: share of characters where ``str.isspace()`` is true.
        - ``non_alpha_digit_ratio``: share of characters that are neither
          ``str.isalpha()`` nor ``str.isdigit()``.
        - ``digit_ratio``: share of characters where ``str.isdigit()`` is true.
        - ``uppercase_ratio``: share of characters where ``str.isupper()`` is true.
        - ``elipsis_ratio``: summed length of the items returned by
          ``self.elipsis_regex.findall`` over the text length.
        - ``punctuation_ratio``: summed length of the items returned by
          ``self.punc_regex.findall`` over the text length.
    """
    text = doc.text
    length = len(text)
    # Every count below is 0 for empty text, so dividing by 1 yields 0.0 for
    # all ratios and avoids a ZeroDivisionError on empty documents.
    denominator = length or 1

    # bool is an int subclass, so summing the predicate results counts matches
    # without building an intermediate list.
    whitespace_count = sum(c.isspace() for c in text)
    non_alpha_digit_count = sum(not (c.isalpha() or c.isdigit()) for c in text)
    digit_count = sum(c.isdigit() for c in text)
    uppercase_count = sum(c.isupper() for c in text)

    # NOTE(review): findall returns whole matches only when the pattern has no
    # capturing groups; the patterns are defined outside this block - confirm.
    elipsis_chars = sum(len(match) for match in self.elipsis_regex.findall(text))
    punctuation_chars = sum(len(match) for match in self.punc_regex.findall(text))

    return {
        "length": length,
        "white_space_ratio": whitespace_count / denominator,
        "non_alpha_digit_ratio": non_alpha_digit_count / denominator,
        "digit_ratio": digit_count / denominator,
        "uppercase_ratio": uppercase_count / denominator,
        "elipsis_ratio": elipsis_chars / denominator,
        "punctuation_ratio": punctuation_chars / denominator,
    }