# src/datatrove/pipeline/filters/url_filter.py
def filter(self, document: Document) -> bool | tuple[bool, str]:
    """Decide whether ``document`` passes the URL blocklist checks.

    Checks are applied in order of increasing cost:
    registered domain and fully-qualified domain against the domain
    blocklist, the exact URL against the URL blocklist, whole URL words
    against the hard and soft banned-word lists, and finally banned
    substrings via the Aho-Corasick automaton.

    Args:
        document: document whose ``metadata["url"]`` is inspected.

    Returns:
        ``True`` if the URL is allowed, otherwise ``(False, reason)``
        where ``reason`` names the rule that rejected it (``"domain"``,
        ``"subdomain"``, ``"url"``, ``"hard_blacklisted"``,
        ``"soft_blacklisted"`` or ``"blacklisted_subword"``).

    Raises:
        AssertionError: if the document has no ``url`` in its metadata.
    """
    # Ensure blocklists are available; presumably a cheap no-op after the
    # first call — TODO confirm download_data caches.
    self.download_data()
    url = document.metadata.get("url")
    # NOTE(review): assert is stripped under `python -O`; raising ValueError
    # would be safer, but is kept as-is to preserve the exception type.
    assert url, "Document does not have url in its metadata"
    url_info = self.tldextractor(url)
    # Cheapest checks first: exact domain / subdomain / URL membership.
    if url_info.registered_domain in self.block_listed_domains:
        return False, "domain"
    if url_info.fqdn in self.block_listed_domains:
        return False, "subdomain"
    if url in self.block_listed_url:
        return False, "url"
    # Word-level checks on the tokenized URL.
    url_words = set(normalizer.split(url))
    if any(word in url_words for word in self.banned_words):
        return False, "hard_blacklisted"
    # Generator expression instead of an intermediate list: sum() consumes
    # the booleans lazily, counting soft-banned words present in the URL.
    nb_soft_words = sum(word in url_words for word in self.soft_banned_words)
    if nb_soft_words >= self.soft_word_threshold:
        return False, "soft_blacklisted"
    # Substring check: any automaton hit (a truthy (end_index, value)
    # tuple from iter()) rejects the URL; False is the no-match default.
    normalized_space = normalize(url)
    if self.banned_subwords and next(self.banned_subwords_automaton.iter(normalized_space), False):
        return False, "blacklisted_subword"
    return True