def filtering()

in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]


    def filtering(self):
        """Split ``self.extracted_pairs`` into retained and discarded pairs.

        A pair is retained when it contains ``self.text_key`` and passes every
        ``PairFiltering`` check whose ``should_remove_*`` switch is truthy on
        this instance; any other pair is discarded. The outcome is stored in
        ``self.retained_pairs`` and ``self.discarded_pairs``, both keeping the
        iteration order of ``self.extracted_pairs``. Nothing is returned.
        """

        def passes_all_filters(media_info):
            # Guard clauses: every condition short-circuits on its switch, so
            # a check (and the cutoff attributes it needs) is only evaluated
            # when the corresponding switch is on.
            if self.text_key not in media_info:
                return False

            if self.should_remove_images_not_in_simplified_dom_trees and not (
                PairFiltering.check_image_in_simplified_dom_tree(media_info)
            ):
                return False

            if self.should_remove_images_not_in_valid_formats and not (
                PairFiltering.check_format(media_info, self.valid_formats)
            ):
                return False

            if self.should_remove_images_not_in_valid_sizes and not (
                PairFiltering.check_size_image(
                    media_info,
                    self.original_width_min_cutoff,
                    self.original_width_max_cutoff,
                    self.original_height_min_cutoff,
                    self.original_height_max_cutoff,
                    self.rendered_width_min_cutoff,
                    self.rendered_width_max_cutoff,
                    self.rendered_height_min_cutoff,
                    self.rendered_height_max_cutoff,
                    self.aspect_ratio_max_cutoff,
                )
            ):
                return False

            if self.should_remove_texts_not_in_valid_number_words and not (
                PairFiltering.check_number_words(
                    media_info,
                    self.text_key,
                    self.number_words_min_cutoff,
                    self.number_words_max_cutoff,
                )
            ):
                return False

            if self.should_remove_texts_with_too_high_special_character_ratio and not (
                PairFiltering.check_special_character_ratio(
                    media_info, self.text_key, self.special_character_ratio_max_cutoff
                )
            ):
                return False

            if self.should_remove_texts_with_too_high_repetition_ratio and not (
                PairFiltering.check_repetition_ratio(
                    media_info, self.text_key, self.repetition_ratio_max_cutoff
                )
            ):
                return False

            if self.should_remove_pairs_with_too_low_clip_score and not (
                PairFiltering.check_clip_score(
                    media_info, self.text_key, self.clip_score_min_cutoff
                )
            ):
                return False

            return True

        # Accumulate locally and publish at the end, so the instance attributes
        # are only written once every pair has been evaluated.
        kept, dropped = [], []
        for candidate in self.extracted_pairs:
            if passes_all_filters(candidate):
                kept.append(candidate)
            else:
                dropped.append(candidate)

        self.retained_pairs = kept
        self.discarded_pairs = dropped