def wordlist_filter()

in misc/precision_filtering/run_precision_filtering.py [0:0]


    def wordlist_filter(self, doc):
        """Score a document by its overlap with the word list and filter on it.

        The document text is normalized with ``simplify_text`` (using
        ``self.norm_config``), split into words for ``self.language``, and
        de-duplicated. The fraction of those distinct words that also appear
        in ``self.wordlist()`` is stored under ``doc.metadata["wordlist_ratio"]``.

        Args:
            doc: Object exposing ``text`` and a mutable ``metadata`` mapping.

        Returns:
            ``True`` when the stored ratio is greater than ``0.0000001``,
            ``False`` otherwise (including when the text yields no words).
        """
        # Function-scope import, kept as in the original implementation.
        from datatrove.utils.text import simplify_text, split_into_words

        normalized = simplify_text(doc.text, self.norm_config)
        unique_words = set(split_into_words(normalized, self.language))

        # self.wordlist() must support set intersection with `unique_words`.
        hits = len(unique_words & self.wordlist())

        if unique_words:
            ratio = hits / len(unique_words)
        else:
            # No words at all: avoid dividing by zero and record a zero ratio.
            ratio = 0

        doc.metadata["wordlist_ratio"] = ratio
        return ratio > 0.0000001