def __call__()

in vision/m4/sourcing/data_collection/processors/pair_filtering.py [0:0]


    def __call__(self, media_info):
        """Filter one image/text pair and report which text fields survive.

        The enabled stages run in this order: image checks, text
        normalization, text checks, then the text-image (CLIP score) check.
        Each stage is switched on by its ``self.cond_*`` flag.

        Args:
            media_info: Mapping describing the media item. Only the keys
                "formatted_filename", "alt_text" and "extracted_text" are
                treated as text fields here; anything else the individual
                checks need is read by the ``PairFiltering.check_*`` helpers.

        Returns:
            ``None`` when the pair is rejected: no text field is present, an
            enabled image check fails, or no text field passes an enabled
            text / text-image check. Otherwise the same ``media_info``
            object, with the new key "retained_text_keys_after_filtering"
            listing the surviving text keys.

        Note:
            ``media_info`` is modified in place. The normalization stages
            overwrite the text fields even if a later stage rejects the pair.
        """
        # Keep the candidates in a list (fixed canonical order) so the checks
        # run in a reproducible order and the reported keys are deterministic.
        available_text_keys = [
            text_key for text_key in ("formatted_filename", "alt_text", "extracted_text") if text_key in media_info
        ]
        # Make sure that we have access to at least one text
        if not available_text_keys:
            return None

        # Image filtering
        if self.cond_check_image_in_simplified_dom_tree:
            if not PairFiltering.check_image_in_simplified_dom_tree(media_info):
                return None

        if self.cond_check_format:
            if not PairFiltering.check_format(media_info, self.valid_formats):
                return None

        if self.cond_check_size_image:
            if not PairFiltering.check_size_image(
                media_info,
                self.original_width_min_cutoff,
                self.original_width_max_cutoff,
                self.original_height_min_cutoff,
                self.original_height_max_cutoff,
                self.rendered_width_min_cutoff,
                self.rendered_width_max_cutoff,
                self.rendered_height_min_cutoff,
                self.rendered_height_max_cutoff,
                self.aspect_ratio_max_cutoff,
            ):
                return None

        # Text normalization
        if self.cond_remove_non_printing_characters:
            for text_key in available_text_keys:
                media_info[text_key] = PairFiltering.remove_non_printing_characters(
                    text=media_info[text_key], non_printing_characters_re=NON_PRINTING_CHARACTERS_RE
                )

        if self.cond_standardize_whitespace:
            for text_key in available_text_keys:
                media_info[text_key] = PairFiltering.standardize_whitespace(text=media_info[text_key])

        # Text filtering
        # Each stage builds a NEW list of survivors rather than removing keys
        # from the collection being iterated: mutating a set during iteration
        # raises "RuntimeError: Set changed size during iteration".
        if self.cond_check_number_words:
            available_text_keys = [
                text_key
                for text_key in available_text_keys
                if PairFiltering.check_number_words(
                    media_info, text_key, self.number_words_min_cutoff, self.number_words_max_cutoff
                )
            ]
            if not available_text_keys:
                return None

        if self.cond_check_special_character_ratio:
            available_text_keys = [
                text_key
                for text_key in available_text_keys
                if PairFiltering.check_special_character_ratio(
                    media_info, text_key, self.special_character_ratio_max_cutoff
                )
            ]
            if not available_text_keys:
                return None

        if self.cond_check_stopword_ratio:
            available_text_keys = [
                text_key
                for text_key in available_text_keys
                if PairFiltering.check_stopword_ratio(media_info, text_key, self.stopword_ratio_min_cutoff)
            ]
            if not available_text_keys:
                return None

        if self.cond_check_repetition_ratio:
            available_text_keys = [
                text_key
                for text_key in available_text_keys
                if PairFiltering.check_repetition_ratio(media_info, text_key, self.repetition_ratio_max_cutoff)
            ]
            if not available_text_keys:
                return None

        # Text-image filtering
        if self.cond_check_clip_score:
            available_text_keys = [
                text_key
                for text_key in available_text_keys
                if PairFiltering.check_clip_score(media_info, text_key, self.clip_score_min_cutoff)
            ]
            if not available_text_keys:
                return None

        # Copy so callers never share the internal working list.
        media_info["retained_text_keys_after_filtering"] = list(available_text_keys)
        return media_info