in misc/precision_filtering/run_precision_filtering.py [0:0]
def wordlist_filter(self, doc):
    """Score ``doc`` by wordlist coverage and report whether it passes.

    The document text is run through ``simplify_text`` with
    ``self.norm_config``, split into words for ``self.language`` and
    de-duplicated. The fraction of those distinct words that are also in
    ``self.wordlist()`` is written to ``doc.metadata["wordlist_ratio"]``
    (``0`` when the document yields no words at all).

    Args:
        doc: Object exposing a ``text`` attribute and a ``metadata``
            mapping; the mapping is mutated in place.

    Returns:
        ``True`` when the stored ratio is greater than ``0.0000001``,
        ``False`` otherwise.
    """
    # Function-local import, kept as in the original
    # (presumably to defer loading datatrove until the filter runs).
    from datatrove.utils.text import simplify_text, split_into_words

    normalized = simplify_text(doc.text, self.norm_config)
    unique_tokens = set(split_into_words(normalized, self.language))

    # Looked up unconditionally, even for documents with no tokens.
    known_tokens = self.wordlist()
    hits = len(unique_tokens & known_tokens)

    if unique_tokens:
        ratio = hits / len(unique_tokens)
    else:
        # Avoid dividing by zero for documents that produce no words.
        ratio = 0

    doc.metadata["wordlist_ratio"] = ratio
    return ratio > 0.0000001