in src/datatrove/pipeline/dedup/sentence_dedup.py
def remove_dup_sentences(self, doc: Document, du_lines: np.ndarray) -> tuple[str, str]:
    sentence_spans = (
        list(self.tokenizer.span_tokenize(doc.text)) if self.config.split_sentences else doc.text.splitlines()
    )
    kept_sentences = []
    original_formatted = []
    last_s = 0
    du_line_idx = 0  # pointer for duplicate lines
    drop_until = 0  # used to keep track of last matched span's end
    removed_span = []
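    # two views are built in parallel: the deduplicated text (kept_sentences)
    # and a formatted copy of the original (original_formatted) in which removed
    # spans are wrapped in >>> ... <<< markers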
    for idx, s in enumerate(sentence_spans):
        line_text = doc.text[last_s : s[1]] if self.config.split_sentences else s
        # track / increment dup_line ref
        if du_line_idx < len(du_lines):
            if du_lines[du_line_idx] < idx:
                raise ValueError("Error with duplicate line index")
            elif du_lines[du_line_idx] == idx:
                drop_until = idx + self.config.n_sentences
                du_line_idx += 1
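        # e.g. (hypothetical values) with n_sentences=2 and du_lines=[3],
        # matching idx == 3 sets drop_until = 5, so sentences 3 and 4 are dropped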
        # if outside the range, we keep this line/sent
        if idx >= drop_until:
            if removed_span:
                original_formatted.append("<<<")
                # restore spans that are too short to be worth removing
                if (
                    self.config.min_words_to_remove_span > 0
                    and len(self.tokenizer.word_tokenize("\n".join(removed_span)))
                    < self.config.min_words_to_remove_span
                ):
                    kept_sentences.extend(removed_span)
                removed_span.clear()
            kept_sentences.append(line_text)
        else:
            # inside a duplicate range: mark the span start once, then collect
            # every dropped line so the min_words_to_remove_span check sees the
            # whole span rather than just its first sentence
            if not removed_span:
                original_formatted.append(">>>")
            removed_span.append(line_text)
        original_formatted.append(line_text)
        if self.config.split_sentences:
            last_s = s[1]  # use this to include whitespace that is not included in the sentence spans
    # flush a removed span that runs to the end of the document
    if removed_span:
        original_formatted.append("<<<")
        if (
            self.config.min_words_to_remove_span > 0
            and len(self.tokenizer.word_tokenize("\n".join(removed_span))) < self.config.min_words_to_remove_span
        ):
            kept_sentences.extend(removed_span)
    if len(kept_sentences) < len(sentence_spans):
        self.stat_update("removed_sentences", value=len(sentence_spans) - len(kept_sentences))
    self.stat_update("original_sentences", value=len(sentence_spans))
    merge_char = "" if self.config.split_sentences else "\n"
    return merge_char.join(kept_sentences).lstrip(), merge_char.join(original_formatted)
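
# --- Illustrative sketch, not part of the original file ---
# A minimal, standalone re-trace of the drop-window logic above with the
# datatrove plumbing stripped out, to show how `du_lines` and `n_sentences`
# interact. All names below are local to this sketch.
def _trace_removal(lines: list[str], du_lines: list[int], n_sentences: int) -> list[str]:
    kept: list[str] = []
    drop_until = 0  # end (exclusive) of the current duplicate window
    ptr = 0  # pointer into du_lines, assumed sorted as in the method above
    for idx, line in enumerate(lines):
        if ptr < len(du_lines) and du_lines[ptr] == idx:
            drop_until = idx + n_sentences  # drop this line and the next n_sentences - 1
            ptr += 1
        if idx >= drop_until:
            kept.append(line)
    return kept

# _trace_removal(["a", "b", "c", "d", "e"], du_lines=[1], n_sentences=2)
# -> ["a", "d", "e"]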