in obelics/processors/web_document_filtering.py [0:0]
def __reduce__(self):
    """Return the ``(callable, args)`` pair describing how to rebuild this object.

    The first element is the instance's own class; the second is a tuple
    holding the current values of the attributes listed below, in exactly
    that order.

    NOTE(review): the order presumably mirrors the positional parameters of
    ``__init__`` (not visible from here) -- confirm against the constructor
    before adding, removing, or reordering entries.
    """
    # Attribute names are read one by one, top to bottom; the resulting
    # values keep this ordering in the returned argument tuple.
    field_names = (
        "cond_check_format",
        "valid_formats",
        "cond_check_size_image",
        "original_width_min_cutoff",
        "original_width_max_cutoff",
        "original_height_min_cutoff",
        "original_height_max_cutoff",
        "rendered_width_min_cutoff",
        "rendered_width_max_cutoff",
        "rendered_height_min_cutoff",
        "rendered_height_max_cutoff",
        "aspect_ratio_max_cutoff",
        "cond_remove_non_printing_characters",
        "non_printing_characters_re",
        "cond_standardize_whitespace",
        "cond_check_number_words_node_level",
        "strip_characters",
        "number_words_node_level_min_cutoff",
        "number_words_node_level_max_cutoff",
        "cond_check_character_repetition_ratio_node_level",
        "character_repetition_length_node_level",
        "character_repetition_node_level_max_cutoff",
        "cond_check_word_repetition_ratio_node_level",
        "word_repetition_length_node_level",
        "word_repetition_node_level_max_cutoff",
        "cond_check_special_character_ratio_node_level",
        "special_character_ratio_node_level_max_cutoff",
        "cond_check_stopword_ratio_node_level",
        "stopwords",
        "stopword_ratio_node_level_min_cutoff",
        "cond_check_flagged_word_ratio_node_level",
        "flagged_words",
        "flagged_word_ratio_node_level_max_cutoff",
        "cond_check_punctuation_ratio_node_level",
        "min_number_words_to_check_punctuation_ratio_node_level",
        "punctuation",
        "punctuation_ratio_node_level_min_cutoff",
        "cond_check_common_word_ratio_node_level",
        "path_common_words",
        "common_word_ratio_node_level_min_cutoff",
        "cond_check_lang_id_node_level",
        "path_lang_id_model",
        "lang_id_node_level_min_cutoff",
        "cond_check_perplexity_score_node_level",
        "digits_re",
        "unicode_punctuation",
        "path_sentencepiece_model",
        "path_kenlm_model",
        "perplexity_score_node_level_max_cutoff",
    )
    ctor_args = tuple(getattr(self, field) for field in field_names)
    return (self.__class__, ctor_args)