def filter()

in src/datatrove/pipeline/filters/url_filter.py [0:0]


    def filter(self, document: Document) -> bool | tuple[bool, str]:
        self.download_data()
        url = document.metadata.get("url")

        assert url, "Document does not have url in its metadata"
        url_info = self.tldextractor(url)

        if url_info.registered_domain in self.block_listed_domains:
            return False, "domain"

        if url_info.fqdn in self.block_listed_domains:
            return False, "subdomain"

        if url in self.block_listed_url:
            return False, "url"

        url_words = set(normalizer.split(url))
        if any(word in url_words for word in self.banned_words):
            return False, "hard_blacklisted"

        nb_soft_words = sum([word in url_words for word in self.soft_banned_words])
        if nb_soft_words >= self.soft_word_threshold:
            return False, "soft_blacklisted"

        normalized_space = normalize(url)
        if self.banned_subwords and next(self.banned_subwords_automaton.iter(normalized_space), False):
            return False, "blacklisted_subword"

        return True