in misc/precision_filtering/run_precision_filtering.py [0:0]
def url_filter(self, document):
    """Keep a document whose URL signals the target language, via its
    domain extension, an embedded language code, or a whitelisted word."""
    url = document.metadata.get("url")
    assert url, "Document does not have url in its metadata"
    # strip the scheme so the substring checks below see only host + path
    url = url.removeprefix("https://").removeprefix("http://")
    url_info = self.tldextractor(url)
    # check domain extension
    if self.domain_extension and self.domain_extension in url_info.fqdn:
        document.metadata['url_match'] = self.domain_extension
        return True
    # check lang code (before space normalization)
    if self.lang_code.upper() in url or self.lang_code_pattern.search(url):
        document.metadata['url_match'] = self.lang_code.upper()
        return True
    # check whitelist words
    if not self.whitelist_words:
        return False
    # strip characters matched by the normalizer before whitelist matching
    normalized_space = self.normalizer.sub("", url).lower()
    # automaton.iter() yields (end_index, value) pairs; value indexes whitelist_words
    found = list(self.whitelist_automaton.iter(normalized_space))
    if found:
        document.metadata['url_match'] = [
            self.whitelist_words[value] for _, value in found
        ]
        return True
    return False
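

# --- Hedged sketch (not from the repository) ---------------------------------
# url_filter relies on attributes prepared elsewhere: tldextractor,
# lang_code_pattern, normalizer, whitelist_words, and whitelist_automaton.
# The wiring below is a minimal assumption of how they could be built, using
# the tldextract and pyahocorasick packages; the class name, regex patterns,
# and defaults here are illustrative, not the repository's actual setup.
import re

import ahocorasick  # assumed dependency: pip install pyahocorasick
import tldextract   # assumed dependency: pip install tldextract


class PrecisionUrlFilter:
    def __init__(self, lang_code, domain_extension=None, whitelist_words=None):
        self.lang_code = lang_code
        self.domain_extension = domain_extension
        self.whitelist_words = whitelist_words or []
        # callable returning an ExtractResult whose .fqdn url_filter checks
        self.tldextractor = tldextract.TLDExtract()
        # assumed pattern: the lang code delimited by URL separators, e.g. "/fr/" or ".fr."
        self.lang_code_pattern = re.compile(rf"[./_-]{re.escape(lang_code)}[./_-]")
        # assumed normalizer: drop every non-alphanumeric character
        self.normalizer = re.compile(r"[^a-zA-Z0-9]")
        # Aho-Corasick automaton over the whitelist; stored values index
        # whitelist_words, matching the (end_index, value) unpacking above
        self.whitelist_automaton = ahocorasick.Automaton()
        for idx, word in enumerate(self.whitelist_words):
            self.whitelist_automaton.add_word(word.lower(), idx)
        if self.whitelist_words:
            self.whitelist_automaton.make_automaton()


# example: a French-focused filter keeping .fr domains, "FR"/"/fr/" URLs,
# or URLs containing whitelisted words such as "france"
flt = PrecisionUrlFilter("fr", domain_extension=".fr", whitelist_words=["france", "paris"])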