# def url_filter()
#
# in misc/precision_filtering/run_precision_filtering.py [0:0]


    def url_filter(self, document):
        """Return True when the document's URL suggests the target language.

        Three signals are checked in order of decreasing precision:

        1. the configured domain extension occurs in the URL's FQDN,
        2. the language code occurs in the URL (uppercase substring or the
           precompiled ``lang_code_pattern`` regex), checked *before* space
           normalization,
        3. any whitelisted word is found by the Aho-Corasick automaton in the
           separator-stripped, lowercased URL.

        Args:
            document: object with a ``metadata`` dict that must contain a
                truthy ``"url"`` entry.

        Returns:
            bool: True if any signal matched, else False. On a domain-extension
            match, ``document.metadata['url_match']`` is set as a side effect.

        Raises:
            AssertionError: if the metadata has no truthy ``"url"`` entry.
        """
        url = document.metadata.get("url")
        # Validate BEFORE touching the value: previously a missing url raised
        # AttributeError on None before this assert could fire.
        assert url, "Document does not have url in its metadata"

        # Strip the scheme so it cannot produce spurious lang-code/word hits.
        # BUG FIX: the original used removesuffix(), which never removed the
        # scheme because "http://" / "https://" are prefixes, not suffixes.
        url = url.removeprefix("https://").removeprefix("http://")

        url_info = self.tldextractor(url)

        # 1) domain extension (e.g. ".fr") inside the fully-qualified domain
        if self.domain_extension and self.domain_extension in url_info.fqdn:
            document.metadata['url_match'] = self.domain_extension
            return True

        # 2) language code, pre space-normalization: verbatim uppercase code
        # or a hit from the precompiled pattern
        if self.lang_code.upper() in url or self.lang_code_pattern.search(url):
            return True

        # 3) whitelist words in the normalized URL (separators removed,
        # lowercased); skip the normalization work when there is no whitelist
        if not self.whitelist_words:
            return False
        normalized_space = self.normalizer.sub("", url).lower()
        found = list(self.whitelist_automaton.iter(normalized_space))
        return bool(found)