in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]
def statistics_without_filtering(self):
    """Render the "Statistics (without filtering)" section.

    Draws, in this order:

    1. image-size distributions (width, height, number of pixels),
    2. word-count distributions for each text field,
    3. CLIP-score distributions, only when
       ``self.should_compute_clip_scores`` is truthy.

    Every statistic is computed over ``self.extracted_pairs``; nothing is
    returned, the output is whatever the ``st`` / ``Visualization`` calls
    draw.
    """
    st.header("Statistics (without filtering)")

    # Text fields inspected by both the word-count and the CLIP-score plots.
    # Kept as tuples so each plotting call receives its own fresh list.
    text_keys = ("formatted_filename", "alt_text", "extracted_text")
    text_labels = ("Formatted filename", "Alt text", "Extracted text")

    def image_sizes(
        max_width=1_000,
        bin_size_width=10,
        max_height=1_000,
        bin_size_height=10,
        max_num_pixels=1_000_000,
        bin_size_num_pixels=1_000,
    ):
        """Plot width, height and pixel-count distributions side by side.

        Each ``max_*`` value is forwarded to ``Visualization.truncate`` as
        ``max_val`` and each ``bin_size_*`` value is forwarded to
        ``Visualization.plot_distributions`` as ``bin_size``.
        """
        st.subheader("Image sizes")
        col_width, col_height, col_pixels = st.columns(3)

        # One entry per column: (column, per-pair measure, max_val, bin_size, title).
        panels = (
            (
                col_width,
                lambda pair: pair["original_width"],
                max_width,
                bin_size_width,
                "Distribution of original widths of images",
            ),
            (
                col_height,
                lambda pair: pair["original_height"],
                max_height,
                bin_size_height,
                "Distribution of original heights of images",
            ),
            (
                col_pixels,
                lambda pair: pair["original_width"] * pair["original_height"],
                max_num_pixels,
                bin_size_num_pixels,
                "Distribution of numbers of pixels of images",
            ),
        )

        for column, measure, max_val, bin_size, title in panels:
            with column:
                values = Visualization.truncate(
                    [measure(pair) for pair in self.extracted_pairs], max_val=max_val
                )
                # All three plots describe the same population, so they share
                # one legend label (the heights plot used to say "All pairs").
                Visualization.plot_distributions(
                    [values],
                    ["All images"],
                    bin_size=bin_size,
                    title=title,
                )

    def text_lengths():
        """Plot the distribution of word counts for each text field.

        A pair contributes to a field's distribution only if it has that key.
        The count is the number of single-space-separated tokens, so an empty
        string counts as one word and consecutive spaces add empty tokens.
        """
        st.subheader("Text lengths")
        word_counts = [
            [len(pair[text_key].split(" ")) for pair in self.extracted_pairs if text_key in pair]
            for text_key in text_keys
        ]
        Visualization.plot_distributions(
            word_counts,
            list(text_labels),
            bin_size=1,
            title="Distribution of numbers of words",
        )

    def clip_scores():
        """Plot the CLIP-score distribution for each text field.

        Draws nothing when ``self.should_compute_clip_scores`` is falsy.
        Pairs lacking a ``clip_score_image_<text_key>`` entry are skipped for
        that field.
        """
        if not self.should_compute_clip_scores:
            return

        st.subheader("CLIP scores")
        score_keys = [f"clip_score_image_{text_key}" for text_key in text_keys]
        scores = [
            [pair[score_key] for pair in self.extracted_pairs if score_key in pair]
            for score_key in score_keys
        ]
        Visualization.plot_distributions(
            scores,
            list(text_labels),
            bin_size=0.02,
            title="Distribution of CLIP scores",
        )

    image_sizes()
    text_lengths()
    clip_scores()