in src/datatrove/pipeline/filters/gopher_repetition_filter.py [0:0]
def filter(self, doc: Document) -> bool | tuple[bool, str]:
    """Run the repetition checks on ``doc.text``.

    The checks run in a fixed order and the first one that trips decides the
    result:

    1. paragraphs (``self.paragraph_exp`` split of the stripped text):
       ``dup_para_frac`` then ``dup_para_char_frac``;
    2. lines (``self._line_splitter`` split of the raw text):
       ``dup_line_frac`` then ``dup_line_char_frac``;
    3. for each ``(n, n_frac)`` in ``self.top_n_grams``: ``top_{n}_gram``;
    4. for each ``(n, n_frac)`` in ``self.dup_n_grams``:
       ``duplicated_{n}_n_grams``.

    A paragraph/line threshold that is falsy (``None`` or ``0``) disables the
    corresponding check. All character-based ratios are taken relative to
    ``len(doc.text)`` (the unstripped text).

    Args:
        doc: document whose ``text`` attribute is inspected.

    Returns:
        ``True`` when no check trips, otherwise ``(False, reason)`` where
        ``reason`` is the name of the check that tripped. A document with
        empty text is kept (``True``).
    """
    text = doc.text
    text_len = len(text)
    if not text_len:
        # Every character ratio below divides by len(text). An empty document
        # has nothing to repeat, so keep it rather than raise
        # ZeroDivisionError.
        return True

    # Paragraph-level checks; paragraphs come from the stripped text.
    paragraphs = self.paragraph_exp.split(text.strip())
    paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)
    if self.dup_para_frac and paragraphs_duplicates / len(paragraphs) > self.dup_para_frac:
        return False, "dup_para_frac"
    if self.dup_para_char_frac and char_duplicates / text_len > self.dup_para_char_frac:
        return False, "dup_para_char_frac"

    # Line-level checks; lines come from the raw (unstripped) text.
    lines = self._line_splitter.split(text)
    line_duplicates, char_duplicates = find_duplicates(lines)
    if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
        return False, "dup_line_frac"
    if self.dup_line_char_frac and char_duplicates / text_len > self.dup_line_char_frac:
        return False, "dup_line_char_frac"

    words = split_into_words(text, self.language)

    # Top n-gram checks: the value from find_top_duplicate is compared
    # against the text length.
    for n, n_frac in self.top_n_grams:
        n_grams = get_n_grams(words, n)
        if not n_grams:
            # No n-grams could be built for this n; nothing to check.
            continue
        top_char_length = find_top_duplicate(n_grams)
        if top_char_length / text_len > n_frac:
            return False, f"top_{n}_gram"

    # Duplicated n-gram checks: the value from find_all_duplicate is compared
    # against the text length.
    for n, n_frac in self.dup_n_grams:
        n_duplicates_char = find_all_duplicate(words, n)
        if n_duplicates_char / text_len > n_frac:
            return False, f"duplicated_{n}_n_grams"

    return True