in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]
def get_statistics_on_extracted_pairs(self):
    """Render summary statistics comparing retained and discarded pairs.

    Writes four sections to the Streamlit page, in this order:

    1. Number of pairs: retained and discarded counts, each shown as a
       fraction and percentage of ``self.extracted_pairs``.
    2. Image sizes: three side-by-side distributions (original width,
       original height, width * height) for both groups.
    3. Text lengths: distribution of the number of space-separated tokens
       in ``pair[self.text_key]``, for pairs that contain that key.
    4. CLIP scores: distribution of
       ``pair[f"clip_score_image_{self.text_key}"]`` for pairs that contain
       that key. Rendered only when ``self.should_compute_clip_scores`` is
       truthy.

    Reads ``self.extracted_pairs``, ``self.retained_pairs``,
    ``self.discarded_pairs``, ``self.text_key`` and
    ``self.should_compute_clip_scores``. Each pair is indexed with the
    string keys ``"original_width"`` and ``"original_height"``.

    Returns:
        None. All output goes through ``st`` and ``Visualization``.
    """
    st.header("Statistics for retained and discarded pairs")

    # Legend labels shared by every plot; a fresh list is handed to each
    # plotting call so the callee never sees a shared mutable object.
    group_labels = ("Retained pairs", "Discarded pairs")

    def percentage(count, total):
        """Return ``count / total`` as a percentage rounded to one decimal."""
        # With nothing extracted the ratio is undefined; report 0.0 instead of
        # raising ZeroDivisionError and aborting the whole statistics page.
        if total == 0:
            return 0.0
        return round(count / total * 100, 1)

    def plot_groups(retained_values, discarded_values, bin_size, title):
        """Plot the retained and discarded distributions on one figure."""
        Visualization.plot_distributions(
            [retained_values, discarded_values],
            list(group_labels),
            bin_size=bin_size,
            title=title,
        )

    def number_pairs():
        """Show how many extracted pairs were retained and how many discarded."""
        st.subheader("Number of pairs")
        num_extracted = len(self.extracted_pairs)
        groups = (
            ("Retained", self.retained_pairs),
            ("Discarded", self.discarded_pairs),
        )
        for group_name, pairs in groups:
            num_pairs = len(pairs)
            st.markdown(
                f"*{group_name} pairs*: "
                f"**{num_pairs}/{num_extracted} "
                f"({percentage(num_pairs, num_extracted)}%)**"
            )

    def image_sizes(
        max_width=1_000,
        bin_size_width=10,
        max_height=1_000,
        bin_size_height=10,
        max_num_pixels=1_000_000,
        bin_size_num_pixels=1_000,
    ):
        """Plot width, height and pixel-count distributions in three columns.

        Each ``max_*`` value is forwarded to ``Visualization.truncate`` as
        ``max_val`` (presumably capping outliers so the histogram stays
        readable -- confirm against ``truncate``); each ``bin_size_*`` value
        is forwarded to ``Visualization.plot_distributions`` as ``bin_size``.
        """
        st.subheader("Image sizes")
        # One entry per column: (value extractor, truncation bound, bin size, title).
        panels = (
            (
                lambda pair: pair["original_width"],
                max_width,
                bin_size_width,
                "Distribution of original widths of images",
            ),
            (
                lambda pair: pair["original_height"],
                max_height,
                bin_size_height,
                "Distribution of original heights of images",
            ),
            (
                lambda pair: pair["original_width"] * pair["original_height"],
                max_num_pixels,
                bin_size_num_pixels,
                "Distribution of numbers of pixels of images",
            ),
        )
        for column, (measure, max_val, bin_size, title) in zip(st.columns(3), panels):
            with column:
                retained_values = Visualization.truncate(
                    [measure(pair) for pair in self.retained_pairs], max_val=max_val
                )
                discarded_values = Visualization.truncate(
                    [measure(pair) for pair in self.discarded_pairs], max_val=max_val
                )
                plot_groups(retained_values, discarded_values, bin_size=bin_size, title=title)

    def text_lengths():
        """Plot the distribution of word counts of the text field."""
        st.subheader("Text lengths")

        def word_counts(pairs):
            # split(" ") treats every single space as a separator, so an empty
            # text counts as one word and consecutive spaces add empty words.
            # Kept as-is so the plotted numbers stay comparable with earlier runs.
            return [len(pair[self.text_key].split(" ")) for pair in pairs if self.text_key in pair]

        plot_groups(
            word_counts(self.retained_pairs),
            word_counts(self.discarded_pairs),
            bin_size=1,
            title=f"Distribution of numbers of words in the {self.text_key.replace('_', ' ')}",
        )

    def clip_scores():
        """Plot the distribution of CLIP scores, when they are enabled."""
        if not self.should_compute_clip_scores:
            return
        st.subheader("CLIP scores")
        score_key = f"clip_score_image_{self.text_key}"

        def scores(pairs):
            # Pairs without a score for this text key are skipped.
            return [pair[score_key] for pair in pairs if score_key in pair]

        plot_groups(
            scores(self.retained_pairs),
            scores(self.discarded_pairs),
            bin_size=0.02,
            title=f"Distribution of CLIP scores for the {self.text_key.replace('_', ' ')}",
        )

    number_pairs()
    image_sizes()
    text_lengths()
    clip_scores()