def remove_dup_sentences()

in src/datatrove/pipeline/dedup/sentence_dedup.py [0:0]


    def remove_dup_sentences(self, doc: Document, du_lines: np.ndarray) -> tuple[str, str]:
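        """Remove duplicated sentence (or line) spans from ``doc.text``.

        ``du_lines`` is expected to be a sorted array of sentence/line indices, each marking
        the start of a duplicated span of ``self.config.n_sentences`` sentences. Returns a
        tuple of (deduplicated text, original text annotated with ``>>>``/``<<<`` markers
        around the removed spans).
        """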
        sentence_spans = (
            list(self.tokenizer.span_tokenize(doc.text)) if self.config.split_sentences else doc.text.splitlines()
        )
        kept_sentences = []
        original_formatted = []
        last_s = 0
        du_line_idx = 0  # pointer for duplicate lines
        drop_until = 0  # used to keep track of last matched span's end
        removed_span = []
        for idx, s in enumerate(sentence_spans):
            line_text = doc.text[last_s : s[1]] if self.config.split_sentences else s
            # track / increment dup_line ref
            if du_line_idx < len(du_lines):
                if du_lines[du_line_idx] < idx:
                    raise ValueError("Error with duplicate line index")
                elif du_lines[du_line_idx] == idx:
                    drop_until = idx + self.config.n_sentences
                    du_line_idx += 1

            # if outside the range, we keep this line/sent
            if idx >= drop_until:
                if removed_span:
                    original_formatted.append("<<<")
                    if (
                        self.config.min_words_to_remove_span > 0
                        and len(self.tokenizer.word_tokenize("\n".join(removed_span)))
                        < self.config.min_words_to_remove_span
                    ):
                        kept_sentences.extend(removed_span)
                    removed_span.clear()
                kept_sentences.append(line_text)
            else:
                # accumulate every dropped sentence so the whole span can be re-added
                # later if it falls below min_words_to_remove_span
                if not removed_span:
                    original_formatted.append(">>>")
                removed_span.append(line_text)
            original_formatted.append(line_text)
            if self.config.split_sentences:
                last_s = s[1]  # use this to include whitespace that is not included in the sentence spans
        if removed_span:
            original_formatted.append("<<<")
            if (
                self.config.min_words_to_remove_span > 0
                and len(self.tokenizer.word_tokenize("\n".join(removed_span))) < self.config.min_words_to_remove_span
            ):
                kept_sentences.extend(removed_span)
        if len(kept_sentences) < len(sentence_spans):
            self.stat_update("removed_sentences", value=len(sentence_spans) - len(kept_sentences))
        self.stat_update("original_sentences", value=len(sentence_spans))
        merge_char = "" if self.config.split_sentences else "\n"
        return merge_char.join(kept_sentences).lstrip(), merge_char.join(original_formatted)
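
A minimal sketch of how this method could be exercised in isolation, assuming line-based splitting (``split_sentences=False``) so the tokenizer is only needed for ``word_tokenize``. The stub object, the whitespace tokenizer, and the exact ``Document``/``SentenceDedupFilter`` import paths are illustrative assumptions, not the library's documented setup.

# Illustrative harness (not part of the library): calls remove_dup_sentences with a
# SimpleNamespace standing in for the filter instance. Import paths are assumptions.
from types import SimpleNamespace

import numpy as np

from datatrove.data import Document
from datatrove.pipeline.dedup.sentence_dedup import SentenceDedupFilter


class _StubTokenizer:
    # only word_tokenize is needed when split_sentences is False
    def word_tokenize(self, text: str) -> list[str]:
        return text.split()


stub = SimpleNamespace(
    config=SimpleNamespace(split_sentences=False, n_sentences=1, min_words_to_remove_span=0),
    tokenizer=_StubTokenizer(),
    stat_update=lambda *args, **kwargs: None,  # swallow stats in this sketch
)

doc = Document(text="line a\nduplicated line\nline c", id="0")
du_lines = np.array([1])  # line index 1 starts a duplicated span of n_sentences=1

kept, annotated = SentenceDedupFilter.remove_dup_sentences(stub, doc, du_lines)
print(kept)       # "line a\nline c"
print(annotated)  # original text with >>> / <<< wrapped around the removed line

With these inputs the duplicated line is dropped from the first return value, while the second return value keeps the full original text with the removed span bracketed by ``>>>`` and ``<<<``.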