in misc/precision_filtering/run_precision_filtering.py [0:0]
def __init__(self, lang_code, language,
             whitelist_words: list[str] | None = None, exclusion_writer=None):
    """Filter-init for language-specific precision filtering.

    Args:
        lang_code: short language code; compiled into a word-boundary
            regex (matched only when not adjacent to alphanumerics).
        language: language identifier stored for downstream use.
        whitelist_words: words to match via an Aho-Corasick automaton.
            Entries starting with "." are treated as a domain extension
            (only the first such entry is used) rather than plain words.
            Defaults to no whitelist.
        exclusion_writer: forwarded to the parent filter, used for
            writing excluded documents.
    """
    super().__init__(exclusion_writer)
    from datatrove.utils.text import TextNormConfig

    self.language = language
    # Keep case, numbers and diacritics intact; only strip punctuation and
    # collapse whitespace so whitelist words match the normalized text.
    self.norm_config = TextNormConfig(
        lowercase=False,
        norm_numbers=False,
        norm_weekdays=False,
        norm_monthnames=False,
        remove_punctuation=True,
        norm_unicode_diacritics=False,
        norm_whitespace=True,
    )
    # url related filtering
    self.lang_code = lang_code
    # None sentinel instead of a mutable [] default: avoids sharing one
    # default list across instances; copy so we never alias caller state.
    whitelist_words = list(whitelist_words) if whitelist_words else []

    import ahocorasick
    from tldextract import TLDExtract

    self.tldextractor = TLDExtract()
    # Entries beginning with "." are domain extensions (e.g. ".fr"); only
    # the first one is honored, with "/" appended so it only matches a
    # complete extension segment inside a URL.
    self.domain_extension = None
    extensions = [w for w in whitelist_words if w.startswith(".")]
    if extensions:
        self.domain_extension = extensions[0] + "/"
    self.whitelist_words = [w for w in whitelist_words if not w.startswith(".")]

    # Aho-Corasick automaton for fast multi-pattern substring search over
    # the remaining (non-extension) whitelist words.
    self.whitelist_automaton = ahocorasick.Automaton(ahocorasick.STORE_INTS)
    for word in self.whitelist_words:
        self.whitelist_automaton.add_word(word, len(self.whitelist_automaton))
    self.whitelist_automaton.make_automaton()

    import re
    self.normalizer = re.compile(r"[^a-zA-Z0-9\/\.]+")  # we include / and . to separate url sections
    # Escape lang_code defensively (a no-op for normal alphanumeric codes)
    # and require it to stand alone: no alphanumeric on either side.
    self.lang_code_pattern = re.compile(rf'(?<![a-zA-Z0-9]){re.escape(self.lang_code)}(?![a-zA-Z0-9])')