def filter()

in src/datatrove/pipeline/filters/gopher_repetition_filter.py [0:0]


    def filter(self, doc: Document) -> bool | tuple[bool, str]:
        text = doc.text

        paragraphs = self.paragraph_exp.split(text.strip())
        paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)
        if self.dup_para_frac and paragraphs_duplicates / len(paragraphs) > self.dup_para_frac:
            return False, "dup_para_frac"
        if self.dup_para_char_frac and char_duplicates / len(text) > self.dup_para_char_frac:
            return False, "dup_para_char_frac"

        lines = self._line_splitter.split(text)
        line_duplicates, char_duplicates = find_duplicates(lines)
        if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
            return False, "dup_line_frac"
        if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
            return False, "dup_line_char_frac"

        words = split_into_words(text, self.language)

        for n, n_frac in self.top_n_grams:
            n_grams = get_n_grams(words, n)
            if not n_grams:
                continue
            top_char_length = find_top_duplicate(n_grams)
            if top_char_length / len(text) > n_frac:
                return False, f"top_{n}_gram"

        for n, n_frac in self.dup_n_grams:
            n_duplicates_char = find_all_duplicate(words, n)
            if n_duplicates_char / len(text) > n_frac:
                return False, f"duplicated_{n}_n_grams"

        return True