in obelics/processors/web_document_filtering.py [0:0]
def __reduce__(self):
    """Describe how pickle should rebuild this object.

    Returns the two-item form of the reduce protocol: the object's class
    and a tuple of positional arguments. When the object is unpickled the
    class is called with exactly those arguments, so the order of the
    names listed below has to stay in sync with the positional parameter
    order of the class constructor (defined outside this method).
    """
    rebuild_cls = self.__class__

    # Attribute names, in the order their values are handed back to the
    # class on reconstruction.
    init_arg_names = (
        # Image-count check
        "cond_check_number_images",
        "number_images_min_cutoff",
        "number_images_max_cutoff",
        # Word-count check
        "cond_check_number_words_doc_level",
        "strip_characters",
        "number_words_doc_level_min_cutoff",
        "number_words_doc_level_max_cutoff",
        # Character-repetition check
        "cond_check_character_repetition_ratio_doc_level",
        "character_repetition_length_doc_level",
        "character_repetition_doc_level_max_cutoff",
        # Word-repetition check
        "cond_check_word_repetition_ratio_doc_level",
        "word_repetition_length_doc_level",
        "word_repetition_doc_level_max_cutoff",
        # Special-character check
        "cond_check_special_character_ratio_doc_level",
        "special_character_ratio_doc_level_max_cutoff",
        # Stopword check
        "cond_check_stopword_ratio_doc_level",
        "stopwords",
        "stopword_ratio_doc_level_min_cutoff",
        # Flagged-word check
        "cond_check_flagged_word_ratio_doc_level",
        "flagged_words",
        "flagged_word_ratio_doc_level_max_cutoff",
        # Punctuation check
        "cond_check_punctuation_ratio_doc_level",
        "punctuation",
        "punctuation_ratio_doc_level_min_cutoff",
        # Common-word check
        "cond_check_common_word_ratio_doc_level",
        "path_common_words",
        "common_word_ratio_doc_level_min_cutoff",
        # Language-identification check
        "cond_check_lang_id_doc_level",
        "path_lang_id_model",
        "lang_id_doc_level_min_cutoff",
        # Perplexity check
        "cond_check_perplexity_score_doc_level",
        "non_printing_characters_re",
        "digits_re",
        "unicode_punctuation",
        "path_sentencepiece_model",
        "path_kenlm_model",
        "perplexity_score_doc_level_max_cutoff",
    )

    init_args = tuple(getattr(self, attr_name) for attr_name in init_arg_names)
    return (rebuild_cls, init_args)