def __call__()

in vision/m4/sourcing/data_collection/processors/web_document_filtering.py [0:0]


    def __call__(self, web_document):
        """Decide whether a web document passes every enabled doc-level filter.

        The non-empty entries of ``web_document["texts"]`` are joined with
        blank lines into one string, and the non-empty entries of
        ``web_document["images"]`` are counted. Each check whose
        ``self.cond_check_*`` flag is truthy is then run, in a fixed order,
        through the matching ``FilteringFunctions`` helper.

        Args:
            web_document: Mapping with a ``"texts"`` and an ``"images"``
                entry, both iterables whose falsy items are ignored.

        Returns:
            ``False`` as soon as one enabled check fails, ``True`` when all
            enabled checks pass (or when none is enabled).
        """
        text_parts = web_document["texts"]
        image_parts = web_document["images"]

        # Falsy placeholders (e.g. None or "") are dropped before aggregating.
        document_text = "\n\n".join(filter(None, text_parts))
        kept_images = list(filter(None, image_parts))

        # Each guard below only evaluates its check when the flag is set, and
        # the first failing check rejects the document.
        if self.cond_check_number_images and not FilteringFunctions.check_number_images(
            number_images=len(kept_images),
            number_images_min_cutoff=self.number_images_min_cutoff,
            number_images_max_cutoff=self.number_images_max_cutoff,
        ):
            return False

        if self.cond_check_number_words_doc_level and not FilteringFunctions.check_number_words(
            text=document_text,
            strip_characters=self.strip_characters,
            number_words_min_cutoff=self.number_words_doc_level_min_cutoff,
            number_words_max_cutoff=self.number_words_doc_level_max_cutoff,
        ):
            return False

        if (
            self.cond_check_character_repetition_ratio_doc_level
            and not FilteringFunctions.check_character_repetition_ratio(
                text=document_text,
                character_repetition_length=self.character_repetition_length_doc_level,
                character_repetition_max_cutoff=self.character_repetition_doc_level_max_cutoff,
            )
        ):
            return False

        if (
            self.cond_check_word_repetition_ratio_doc_level
            and not FilteringFunctions.check_word_repetition_ratio(
                text=document_text,
                strip_characters=self.strip_characters,
                word_repetition_length=self.word_repetition_length_doc_level,
                word_repetition_max_cutoff=self.word_repetition_doc_level_max_cutoff,
            )
        ):
            return False

        # NOTE(review): the special-character set passed here is
        # self.strip_characters -- confirm this reuse is intentional.
        if (
            self.cond_check_special_character_ratio_doc_level
            and not FilteringFunctions.check_special_character_ratio(
                text=document_text,
                special_characters=self.strip_characters,
                special_character_ratio_max_cutoff=self.special_character_ratio_doc_level_max_cutoff,
            )
        ):
            return False

        if self.cond_check_stopword_ratio_doc_level and not FilteringFunctions.check_stopword_ratio(
            text=document_text,
            strip_characters=self.strip_characters,
            stopwords=self.stopwords,
            stopword_ratio_min_cutoff=self.stopword_ratio_doc_level_min_cutoff,
        ):
            return False

        if (
            self.cond_check_flagged_word_ratio_doc_level
            and not FilteringFunctions.check_flagged_word_ratio(
                text=document_text,
                strip_characters=self.strip_characters,
                flagged_words=self.flagged_words,
                flagged_word_ratio_max_cutoff=self.flagged_word_ratio_doc_level_max_cutoff,
            )
        ):
            return False

        if (
            self.cond_check_punctuation_ratio_doc_level
            and not FilteringFunctions.check_punctuation_ratio(
                text=document_text,
                punctuation=self.punctuation,
                punctuation_ratio_min_cutoff=self.punctuation_ratio_doc_level_min_cutoff,
            )
        ):
            return False

        if (
            self.cond_check_common_word_ratio_doc_level
            and not FilteringFunctions.check_common_word_ratio(
                text=document_text,
                strip_characters=self.strip_characters,
                common_words=self.common_words,
                common_word_ratio_min_cutoff=self.common_word_ratio_doc_level_min_cutoff,
            )
        ):
            return False

        # The target language is fixed to English by the literal below.
        if self.cond_check_lang_id_doc_level and not FilteringFunctions.check_lang_id(
            text=document_text,
            lang_id_model=self.lang_id_model,
            target_lang_id="en",
            lang_id_min_cutoff=self.lang_id_doc_level_min_cutoff,
        ):
            return False

        if (
            self.cond_check_perplexity_score_doc_level
            and not FilteringFunctions.check_perplexity_score(
                text=document_text,
                non_printing_characters_re=self.non_printing_characters_re,
                digits_re=self.digits_re,
                unicode_punctuation=self.unicode_punctuation,
                sentencepiece_model=self.sentencepiece_model,
                kenlm_model=self.kenlm_model,
                perplexity_score_max_cutoff=self.perplexity_score_doc_level_max_cutoff,
            )
        ):
            return False

        return True