in obelics/processors/web_document_filtering.py [0:0]
def __call__(self, web_document):
    """Tell whether a web document passes every enabled document-level check.

    The truthy entries of ``web_document["texts"]`` are joined with a blank
    line between them, and the truthy entries of ``web_document["images"]``
    are collected. Each check whose ``cond_check_*`` flag on ``self`` is
    truthy is then run, in a fixed order, through ``FilteringFunctions``;
    checks whose flag is falsy are skipped entirely.

    Args:
        web_document: Mapping exposing a ``"texts"`` and an ``"images"``
            entry. Falsy items in either one are ignored. The kept texts
            are passed to ``str.join``, so they are presumably strings --
            confirm against the caller.

    Returns:
        ``False`` as soon as one enabled check rejects the document,
        ``True`` when no enabled check rejects it.
    """
    text_parts = web_document["texts"]
    image_parts = web_document["images"]

    # Drop falsy placeholders before building the document-level inputs.
    doc_text = "\n\n".join(filter(None, text_parts))
    kept_images = list(filter(None, image_parts))

    # --- Image count -----------------------------------------------------
    if self.cond_check_number_images and not FilteringFunctions.check_number_images(
        number_images=len(kept_images),
        number_images_min_cutoff=self.number_images_min_cutoff,
        number_images_max_cutoff=self.number_images_max_cutoff,
    ):
        return False

    # --- Word count ------------------------------------------------------
    if self.cond_check_number_words_doc_level and not FilteringFunctions.check_number_words(
        text=doc_text,
        strip_characters=self.strip_characters,
        number_words_min_cutoff=self.number_words_doc_level_min_cutoff,
        number_words_max_cutoff=self.number_words_doc_level_max_cutoff,
    ):
        return False

    # --- Repetition ratios -----------------------------------------------
    if (
        self.cond_check_character_repetition_ratio_doc_level
        and not FilteringFunctions.check_character_repetition_ratio(
            text=doc_text,
            character_repetition_length=self.character_repetition_length_doc_level,
            character_repetition_max_cutoff=self.character_repetition_doc_level_max_cutoff,
        )
    ):
        return False

    if (
        self.cond_check_word_repetition_ratio_doc_level
        and not FilteringFunctions.check_word_repetition_ratio(
            text=doc_text,
            strip_characters=self.strip_characters,
            word_repetition_length=self.word_repetition_length_doc_level,
            word_repetition_max_cutoff=self.word_repetition_doc_level_max_cutoff,
        )
    ):
        return False

    # --- Character / word class ratios -----------------------------------
    if (
        self.cond_check_special_character_ratio_doc_level
        and not FilteringFunctions.check_special_character_ratio(
            text=doc_text,
            # The strip-character set is what gets passed as the
            # special-character set here.
            special_characters=self.strip_characters,
            special_character_ratio_max_cutoff=self.special_character_ratio_doc_level_max_cutoff,
        )
    ):
        return False

    if self.cond_check_stopword_ratio_doc_level and not FilteringFunctions.check_stopword_ratio(
        text=doc_text,
        strip_characters=self.strip_characters,
        stopwords=self.stopwords,
        stopword_ratio_min_cutoff=self.stopword_ratio_doc_level_min_cutoff,
    ):
        return False

    if (
        self.cond_check_flagged_word_ratio_doc_level
        and not FilteringFunctions.check_flagged_word_ratio(
            text=doc_text,
            strip_characters=self.strip_characters,
            flagged_words=self.flagged_words,
            flagged_word_ratio_max_cutoff=self.flagged_word_ratio_doc_level_max_cutoff,
        )
    ):
        return False

    if (
        self.cond_check_punctuation_ratio_doc_level
        and not FilteringFunctions.check_punctuation_ratio(
            text=doc_text,
            punctuation=self.punctuation,
            punctuation_ratio_min_cutoff=self.punctuation_ratio_doc_level_min_cutoff,
        )
    ):
        return False

    if (
        self.cond_check_common_word_ratio_doc_level
        and not FilteringFunctions.check_common_word_ratio(
            text=doc_text,
            strip_characters=self.strip_characters,
            common_words=self.common_words,
            common_word_ratio_min_cutoff=self.common_word_ratio_doc_level_min_cutoff,
        )
    ):
        return False

    # --- Model-based checks ----------------------------------------------
    if self.cond_check_lang_id_doc_level and not FilteringFunctions.check_lang_id(
        text=doc_text,
        lang_id_model=self.lang_id_model,
        target_lang_id="en",
        lang_id_min_cutoff=self.lang_id_doc_level_min_cutoff,
    ):
        return False

    if self.cond_check_perplexity_score_doc_level and not FilteringFunctions.check_perplexity_score(
        text=doc_text,
        non_printing_characters_re=self.non_printing_characters_re,
        digits_re=self.digits_re,
        unicode_punctuation=self.unicode_punctuation,
        sentencepiece_model=self.sentencepiece_model,
        kenlm_model=self.kenlm_model,
        perplexity_score_max_cutoff=self.perplexity_score_doc_level_max_cutoff,
    ):
        return False

    # No enabled check rejected the document.
    return True