in obelics/processors/web_document_filtering.py [0:0]
def __init__(
self,
cond_check_number_images,
number_images_min_cutoff,
number_images_max_cutoff,
cond_check_number_words_doc_level,
strip_characters,
number_words_doc_level_min_cutoff,
number_words_doc_level_max_cutoff,
cond_check_character_repetition_ratio_doc_level,
character_repetition_length_doc_level,
character_repetition_doc_level_max_cutoff,
cond_check_word_repetition_ratio_doc_level,
word_repetition_length_doc_level,
word_repetition_doc_level_max_cutoff,
cond_check_special_character_ratio_doc_level,
special_character_ratio_doc_level_max_cutoff,
cond_check_stopword_ratio_doc_level,
stopwords,
stopword_ratio_doc_level_min_cutoff,
cond_check_flagged_word_ratio_doc_level,
flagged_words,
flagged_word_ratio_doc_level_max_cutoff,
cond_check_punctuation_ratio_doc_level,
punctuation,
punctuation_ratio_doc_level_min_cutoff,
cond_check_common_word_ratio_doc_level,
path_common_words,
common_word_ratio_doc_level_min_cutoff,
cond_check_lang_id_doc_level,
path_lang_id_model,
lang_id_doc_level_min_cutoff,
cond_check_perplexity_score_doc_level,
non_printing_characters_re,
digits_re,
unicode_punctuation,
path_sentencepiece_model,
path_kenlm_model,
perplexity_score_doc_level_max_cutoff,