def statistics_without_filtering()

in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]


    def statistics_without_filtering(self):
        """Render the "Statistics (without filtering)" section of the Streamlit page.

        Three groups of histograms are drawn, in this order, from
        ``self.extracted_pairs`` (an iterable of dict-like pairs):

        1. Image sizes: original width, original height and number of pixels
           (width * height), one plot per column of a three-column layout.
        2. Text lengths: number of space-separated tokens of each text field
           present in a pair.
        3. CLIP scores: only when ``self.should_compute_clip_scores`` is truthy.

        Every pair must provide ``"original_width"`` and ``"original_height"``;
        the text fields and their CLIP scores are optional and are skipped for
        the pairs that lack them.
        """
        st.header("Statistics (without filtering)")

        # Text fields of a pair and their display names, shared by the text
        # length and CLIP score plots so that both always stay aligned.
        text_keys = ["formatted_filename", "alt_text", "extracted_text"]
        text_labels = ["Formatted filename", "Alt text", "Extracted text"]

        def image_sizes(
            max_width=1_000,
            bin_size_width=10,
            max_height=1_000,
            bin_size_height=10,
            max_num_pixels=1_000_000,
            bin_size_num_pixels=1_000,
        ):
            """Plot the width, height and pixel-count distributions side by side.

            The ``max_*`` arguments are forwarded to ``Visualization.truncate``
            as ``max_val`` and the ``bin_size_*`` arguments to
            ``Visualization.plot_distributions`` as ``bin_size``.
            """
            st.subheader("Image sizes")
            col1, col2, col3 = st.columns(3)

            # One entry per column: (measure of a pair, max_val, bin size, title).
            panels = [
                (
                    lambda pair: pair["original_width"],
                    max_width,
                    bin_size_width,
                    "Distribution of original widths of images",
                ),
                (
                    lambda pair: pair["original_height"],
                    max_height,
                    bin_size_height,
                    "Distribution of original heights of images",
                ),
                (
                    lambda pair: pair["original_width"] * pair["original_height"],
                    max_num_pixels,
                    bin_size_num_pixels,
                    "Distribution of numbers of pixels of images",
                ),
            ]

            for column, (measure, max_val, bin_size, title) in zip((col1, col2, col3), panels):
                with column:
                    # NOTE(review): presumably caps the values at max_val so that
                    # outliers do not stretch the histogram -- confirm against
                    # Visualization.truncate.
                    values = Visualization.truncate(
                        [measure(pair) for pair in self.extracted_pairs], max_val=max_val
                    )
                    # The three plots describe the same population, so they all
                    # carry the same legend label.
                    Visualization.plot_distributions(
                        [values],
                        ["All images"],
                        bin_size=bin_size,
                        title=title,
                    )

        def text_lengths():
            """Plot the distribution of the number of words of each text field."""
            st.subheader("Text lengths")
            # NOTE: words are counted by splitting on a single space, so an
            # empty text counts as one word and consecutive spaces produce
            # empty tokens that are counted too.
            word_counts = [
                [len(pair[text_key].split(" ")) for pair in self.extracted_pairs if text_key in pair]
                for text_key in text_keys
            ]
            Visualization.plot_distributions(
                word_counts,
                text_labels,
                bin_size=1,
                title="Distribution of numbers of words",
            )

        def clip_scores():
            """Plot the CLIP score distributions, if CLIP scores are enabled."""
            if not self.should_compute_clip_scores:
                return

            st.subheader("CLIP scores")
            scores = [
                [
                    pair[f"clip_score_image_{text_key}"]
                    for pair in self.extracted_pairs
                    if f"clip_score_image_{text_key}" in pair
                ]
                for text_key in text_keys
            ]
            Visualization.plot_distributions(
                scores,
                text_labels,
                bin_size=0.02,
                title="Distribution of CLIP scores",
            )

        image_sizes()
        text_lengths()
        clip_scores()