in src/datatrove/pipeline/filters/c4_filters.py [0:0]
def filter(self, doc: Document) -> bool | tuple[bool, str]:
    """Apply the line-level heuristics to ``doc`` and rewrite its text.

    The text is split into units: physical lines when ``self.split_paragraph``
    is set, otherwise sentences from ``split_into_sentences``. Each unit is
    stripped and then checked in this order:

    1. dropped if any word is longer than ``self.max_word_length``
       (``-1`` disables the check);
    2. citation markers are removed when ``self.remove_citations`` is set;
    3. dropped if it does not end with ``END_PUNCTUATION`` or ends with
       ``ELLIPSIS`` (when ``self.filter_no_terminal_punct`` is set);
    4. dropped if it has fewer than ``self.min_words_per_line`` words,
       counted *after* citation removal;
    5. the whole document is rejected if it contains "lorem ipsum"
       (``self.filter_lorem_ipsum``);
    6. dropped if it mentions "javascript" (``self.filter_javascript``);
    7. the whole document is rejected if it contains "{"
       (``self.filter_curly_bracket``);
    8. dropped if it contains any of ``POLICY_SUBSTRINGS``
       (``self.filter_policy``).

    Every examined unit bumps the ``line-total`` stat; dropped and kept units
    bump their own ``line-filter-*`` / ``line-kept`` stat through
    ``self.stat_update``.

    Args:
        doc: document to filter. On success ``doc.text`` is replaced in place
            by the surviving units, joined with a newline when
            ``self.split_paragraph`` is set and with a space otherwise.

    Returns:
        ``True`` when the document is kept, otherwise ``(False, reason)``
        where ``reason`` is ``"lorem_ipsum"``, ``"curly_bracket"`` or
        ``"too_few_sentences"``.
    """
    lines = doc.text.splitlines() if self.split_paragraph else split_into_sentences(doc.text, self.language)
    num_sentences = 0
    kept_lines = []

    for line in lines:
        line = line.strip()
        words = line.split()
        self.stat_update("line-total")

        # The too-long-word check deliberately looks at the raw line,
        # before citations are stripped.
        if self.max_word_length != -1 and any(len(word) > self.max_word_length for word in words):
            self.stat_update("line-filter-too_long_word")
            continue

        if self.remove_citations:
            line = CITATION_REGEX.sub("", line)
            # Re-tokenise: the removed markers must not count towards the
            # minimum-words check below, otherwise a line can be kept with
            # fewer real words than `min_words_per_line`.
            words = line.split()

        # A line must end with terminal punctuation, and an ellipsis does
        # not count as a proper ending.
        if self.filter_no_terminal_punct and (not line.endswith(END_PUNCTUATION) or line.endswith(ELLIPSIS)):
            self.stat_update("line-filter-no_terminal_punc")
            continue

        if len(words) < self.min_words_per_line:
            self.stat_update("line-filter-too_few_words")
            continue

        line_l = line.lower()

        # Placeholder text rejects the whole document, not just the line.
        if self.filter_lorem_ipsum and "lorem ipsum" in line_l:
            return False, "lorem_ipsum"

        if self.filter_javascript and "javascript" in line_l:
            self.stat_update("line-filter-javascript")
            continue

        # A curly bracket rejects the whole document (case-sensitive check
        # on the original-case line).
        if self.filter_curly_bracket and "{" in line:
            return False, "curly_bracket"

        if self.filter_policy and any(p in line_l for p in POLICY_SUBSTRINGS):
            self.stat_update("line-filter-policy")
            continue

        # Sentence counting is skipped entirely when the check is disabled.
        # In paragraph mode a kept line may hold several sentences; in
        # sentence mode every kept unit is exactly one.
        if self.min_num_sentences != -1:
            num_sentences += len(split_into_sentences(line, self.language)) if self.split_paragraph else 1

        kept_lines.append(line)
        self.stat_update("line-kept")

    # With the check disabled (-1) this comparison is never true, because
    # num_sentences stays at 0.
    if num_sentences < self.min_num_sentences:
        return False, "too_few_sentences"

    doc.text = ("\n" if self.split_paragraph else " ").join(kept_lines).strip()
    return True