def analysis_discarded_pairs()

in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]


    def analysis_discarded_pairs(self):
        """Render a Streamlit summary of why pairs were discarded.

        Writes a header with the number of discarded pairs relative to the
        number of extracted pairs, then one markdown line for the pairs that
        do not contain ``self.text_key``, followed by one markdown line per
        enabled filter (``self.should_remove_*`` flags).

        Each filter is re-evaluated independently over *all* discarded pairs,
        so a pair failing several checks is counted once under each of them
        and the reported percentages are not expected to sum to 100%.

        Returns:
            None. The output is produced through ``st.header`` / ``st.markdown``.
        """
        num_extracted_tot = len(self.extracted_pairs)
        num_discarded_tot = len(self.discarded_pairs)

        def percentage(part, whole):
            # Guard against an empty denominator: without it, an empty
            # extraction would raise ZeroDivisionError before anything is
            # displayed.
            return round(part / whole * 100, 1) if whole else 0.0

        perc_discarded_tot = percentage(num_discarded_tot, num_extracted_tot)
        st.header(
            f"Analysis of discarded pairs: {num_discarded_tot}/{num_extracted_tot} ({perc_discarded_tot}%)"
        )

        if not self.discarded_pairs:
            st.markdown("No pair discarded")
            return

        # Pairs dropped because the selected text field is absent.
        # NOTE(review): assumes each pair supports `in` on its keys (dict-like) — confirm.
        num_missing_text = sum(1 for pair in self.discarded_pairs if self.text_key not in pair)
        perc_missing_text = percentage(num_missing_text, num_discarded_tot)
        st.markdown(
            "Discarded because of the *chosen type of text not being in pairs*:"
            f" **{num_missing_text}/{num_discarded_tot} ({perc_missing_text}%)**"
        )

        def display_discarded_by_filter(should_use_filter, func_filter, msg_filter):
            """Report how many discarded pairs fail ``func_filter``, if the filter is enabled."""
            if not should_use_filter:
                return
            # `func_filter` returns a truthy value for pairs that pass the check,
            # so the pairs counted here are the ones it rejects.
            num_rejected = sum(1 for pair in self.discarded_pairs if not func_filter(pair))
            perc_rejected = percentage(num_rejected, num_discarded_tot)
            st.markdown(
                f"Discarded by the filter on *{msg_filter}*:"
                f" **{num_rejected}/{num_discarded_tot} ({perc_rejected}%)**"
            )

        display_discarded_by_filter(
            self.should_remove_images_not_in_simplified_dom_trees,
            PairFiltering.check_image_in_simplified_dom_tree,
            "not being in simplified DOM trees",
        )

        display_discarded_by_filter(
            self.should_remove_images_not_in_valid_formats,
            lambda pair: PairFiltering.check_format(pair, self.valid_formats),
            "not being in valid formats",
        )

        display_discarded_by_filter(
            self.should_remove_images_not_in_valid_sizes,
            lambda pair: PairFiltering.check_size_image(
                pair,
                self.original_width_min_cutoff,
                self.original_width_max_cutoff,
                self.original_height_min_cutoff,
                self.original_height_max_cutoff,
                self.rendered_width_min_cutoff,
                self.rendered_width_max_cutoff,
                self.rendered_height_min_cutoff,
                self.rendered_height_max_cutoff,
                self.aspect_ratio_max_cutoff,
            ),
            "not being in valid image sizes",
        )

        display_discarded_by_filter(
            self.should_remove_texts_not_in_valid_number_words,
            lambda pair: PairFiltering.check_number_words(
                pair, self.text_key, self.number_words_min_cutoff, self.number_words_max_cutoff
            ),
            "not having a valid number of words",
        )

        display_discarded_by_filter(
            self.should_remove_texts_with_too_high_special_character_ratio,
            lambda pair: PairFiltering.check_special_character_ratio(
                pair, self.text_key, self.special_character_ratio_max_cutoff
            ),
            "having a too high special character ratio",
        )

        display_discarded_by_filter(
            self.should_remove_texts_with_too_high_repetition_ratio,
            lambda pair: PairFiltering.check_repetition_ratio(
                pair, self.text_key, self.repetition_ratio_max_cutoff
            ),
            "having a too high repetition ratio",
        )

        display_discarded_by_filter(
            self.should_remove_pairs_with_too_low_clip_score,
            lambda pair: PairFiltering.check_clip_score(pair, self.text_key, self.clip_score_min_cutoff),
            "having a too low CLIP score",
        )