def __init__()

in misc/precision_filtering/run_precision_filtering.py [0:0]


    def __init__(self, lang_code, language,
        whitelist_words: "list[str] | None" = None, exclusion_writer = None):
        """Set up text-normalization settings and the URL matching helpers.

        Args:
            lang_code: Code interpolated into ``self.lang_code_pattern``, which
                matches it only when it is not directly preceded or followed
                by an ASCII letter or digit.
            language: Stored unchanged on ``self.language``.
            whitelist_words: Whitelist entries. Entries starting with "." are
                treated as domain extensions (only the first one is kept, with
                a trailing "/" appended); every other entry is loaded into an
                Aho-Corasick automaton. ``None`` (the default) means no
                entries.
            exclusion_writer: Forwarded unchanged to the parent constructor.
        """
        super().__init__(exclusion_writer)

        # Imports are kept local to the constructor, as in the original code.
        import re

        import ahocorasick
        from datatrove.utils.text import TextNormConfig
        from tldextract import TLDExtract

        self.language = language
        # Only punctuation removal and whitespace normalization are enabled.
        self.norm_config = TextNormConfig(
            lowercase=False,
            norm_numbers=False,
            norm_weekdays=False,
            norm_monthnames=False,
            remove_punctuation=True,
            norm_unicode_diacritics=False,
            norm_whitespace=True,
        )

        # url related filtering
        self.lang_code = lang_code
        self.tldextractor = TLDExtract()

        # A None sentinel replaces the former mutable `[]` default so that no
        # list object is shared between instances.
        entries = [] if whitelist_words is None else whitelist_words

        # Split the whitelist in a single pass: ".xx" entries are domain
        # extensions, everything else is a plain word.
        extensions = []
        plain_words = []
        for entry in entries:
            if entry.startswith("."):
                extensions.append(entry)
            else:
                plain_words.append(entry)

        # Only the first extension is used; the trailing "/" is appended here.
        self.domain_extension = extensions[0] + "/" if extensions else None
        self.whitelist_words = plain_words

        # Each word is stored with the automaton's size at insertion time as
        # its integer value.
        self.whitelist_automaton = ahocorasick.Automaton(ahocorasick.STORE_INTS)
        for word in self.whitelist_words:
            self.whitelist_automaton.add_word(word, len(self.whitelist_automaton))
        self.whitelist_automaton.make_automaton()

        self.normalizer = re.compile(r"[^a-zA-Z0-9\/\.]+")  # we include / and . to separate url sections
        # NOTE(review): lang_code is interpolated without re.escape, so any
        # regex metacharacters in it are interpreted as pattern syntax —
        # confirm that callers only pass plain codes.
        self.lang_code_pattern = re.compile(rf'(?<![a-zA-Z0-9]){self.lang_code}(?![a-zA-Z0-9])')