in src/datatrove/pipeline/filters/fineweb_quality_filter.py [0:0]
def filter(self, doc) -> bool | tuple[bool, str]:
lines = doc.text.split("\n")
lines = [line for line in lines if line.strip() != ""]
if len(lines) == 0:
return False, "empty"
ratio = sum(1 for line in lines if line.endswith(self.stop_chars)) / len(lines)
if ratio < self.line_punct_thr and not (ratio == 0 and self.line_punct_exclude_zero):
return False, "line_punct_ratio"
ratio = sum(1 for line in lines if len(line) <= self.short_line_length) / len(lines)
if ratio > self.short_line_threshold:
return False, "short_line_ratio"
ratio = find_duplicates(lines)[1] / len(doc.text.replace("\n", ""))
if ratio > self.char_duplicates_ratio:
return False, "char_dup_ratio"
words = split_into_words(doc.text, self.language)
new_line = doc.text.count("\n")
if new_line / len(words) > self.new_line_ratio:
return False, "list_ratio"
return True