def get_statistics_on_extracted_pairs()

in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]


    def get_statistics_on_extracted_pairs(self):
        """Render statistics comparing the retained and the discarded pairs.

        Sections are written through the `st` API in this order: number of
        pairs, image sizes (widths, heights, pixel counts), text lengths, and
        CLIP scores. The CLIP section is only rendered when
        `self.should_compute_clip_scores` is truthy.

        Reads `self.extracted_pairs`, `self.retained_pairs`,
        `self.discarded_pairs`, `self.text_key` and
        `self.should_compute_clip_scores`. Returns nothing.
        """
        st.header("Statistics for retained and discarded pairs")

        def percentage(part, total):
            # An empty extraction would otherwise raise ZeroDivisionError;
            # report 0.0% so the counts can still be displayed.
            if not total:
                return 0.0
            return round(part / total * 100, 1)

        def plot_retained_vs_discarded(retained_values, discarded_values, bin_size, title):
            # Every chart on this page compares the same two populations.
            Visualization.plot_distributions(
                [retained_values, discarded_values],
                ["Retained pairs", "Discarded pairs"],
                bin_size=bin_size,
                title=title,
            )

        def number_pairs():
            st.subheader("Number of pairs")
            num_extracted = len(self.extracted_pairs)
            num_retained = len(self.retained_pairs)
            num_discarded = len(self.discarded_pairs)
            st.markdown(
                "*Retained pairs*: "
                f"**{num_retained}/{num_extracted} "
                f"({percentage(num_retained, num_extracted)}%)**"
            )
            st.markdown(
                "*Discarded pairs*: "
                f"**{num_discarded}/{num_extracted} "
                f"({percentage(num_discarded, num_extracted)}%)**"
            )

        def image_sizes(
            max_width=1_000,
            bin_size_width=10,
            max_height=1_000,
            bin_size_height=10,
            max_num_pixels=1_000_000,
            bin_size_num_pixels=1_000,
        ):
            st.subheader("Image sizes")
            # One panel per column: (measure, truncation cap, bin size, title).
            panels = (
                (
                    lambda pair: pair["original_width"],
                    max_width,
                    bin_size_width,
                    "Distribution of original widths of images",
                ),
                (
                    lambda pair: pair["original_height"],
                    max_height,
                    bin_size_height,
                    "Distribution of original heights of images",
                ),
                (
                    lambda pair: pair["original_width"] * pair["original_height"],
                    max_num_pixels,
                    bin_size_num_pixels,
                    "Distribution of numbers of pixels of images",
                ),
            )
            for column, (measure, max_val, bin_size, title) in zip(st.columns(3), panels):
                with column:
                    retained_values = Visualization.truncate(
                        [measure(pair) for pair in self.retained_pairs], max_val=max_val
                    )
                    discarded_values = Visualization.truncate(
                        [measure(pair) for pair in self.discarded_pairs], max_val=max_val
                    )
                    plot_retained_vs_discarded(retained_values, discarded_values, bin_size, title)

        def text_lengths():
            st.subheader("Text lengths")
            text_key = self.text_key

            def word_counts(pairs):
                # Pairs without the text key are skipped; words are delimited by
                # single spaces, so consecutive spaces yield empty "words".
                return [len(pair[text_key].split(" ")) for pair in pairs if text_key in pair]

            plot_retained_vs_discarded(
                word_counts(self.retained_pairs),
                word_counts(self.discarded_pairs),
                bin_size=1,
                title=f"Distribution of numbers of words in the {text_key.replace('_', ' ')}",
            )

        def clip_scores():
            if not self.should_compute_clip_scores:
                return
            st.subheader("CLIP scores")
            score_key = f"clip_score_image_{self.text_key}"

            def scores(pairs):
                # Only pairs that actually carry a score for this text key are kept.
                return [pair[score_key] for pair in pairs if score_key in pair]

            plot_retained_vs_discarded(
                scores(self.retained_pairs),
                scores(self.discarded_pairs),
                bin_size=0.02,
                title=f"Distribution of CLIP scores for the {self.text_key.replace('_', ' ')}",
            )

        number_pairs()
        image_sizes()
        text_lengths()
        clip_scores()