in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]
def analysis_discarded_pairs(self):
    """Render a summary of how many pairs were discarded, and by which filter.

    Writes through `st` (presumably Streamlit -- confirm against the file's
    imports):

    * a header with the number of discarded pairs over the number of
      extracted pairs, and the corresponding percentage;
    * either the message "No pair discarded", or one markdown line for the
      pairs that do not contain `self.text_key`, followed by one markdown
      line per *enabled* filter (`self.should_remove_*` flags) giving the
      number and share of discarded pairs that fail that filter.

    Each filter is evaluated independently over all discarded pairs, so a
    pair failing several filters is counted once per filter and the
    reported percentages may add up to more than 100%.

    Returns:
        None. The method only emits UI elements.
    """
    num_extracted_tot = len(self.extracted_pairs)
    num_discarded_tot = len(self.discarded_pairs)

    def percentage(part, whole):
        # An empty denominator used to raise ZeroDivisionError before the
        # header could even be rendered; report 0.0% instead.
        return round(part / whole * 100, 1) if whole else 0.0

    perc_discarded_tot = percentage(num_discarded_tot, num_extracted_tot)
    st.header(
        f"Analysis of discarded pairs: {num_discarded_tot}/{num_extracted_tot} ({perc_discarded_tot}%)"
    )

    if not self.discarded_pairs:
        st.markdown("No pair discarded")
        return

    # Pairs lacking the selected text field altogether.
    num_missing_text = sum(1 for pair in self.discarded_pairs if self.text_key not in pair)
    perc_missing_text = percentage(num_missing_text, num_discarded_tot)
    st.markdown(
        "Discarded because of the *chosen type of text not being in pairs*:"
        f" **{num_missing_text}/{num_discarded_tot} ({perc_missing_text}%)**"
    )

    def display_discarded_by_filter(should_use_filter, func_filter, msg_filter):
        # `func_filter(pair)` is truthy when the pair PASSES the filter, so
        # the pairs counted here are those for which it is falsy.
        if not should_use_filter:
            return
        num_discarded_filter = sum(1 for pair in self.discarded_pairs if not func_filter(pair))
        perc_discarded_filter = percentage(num_discarded_filter, num_discarded_tot)
        st.markdown(
            f"Discarded by the filter on *{msg_filter}*:"
            f" **{num_discarded_filter}/{num_discarded_tot} ({perc_discarded_filter}%)**"
        )

    # (enabled flag, predicate, description) for every filter, in display
    # order. Cutoff attributes are only read inside the lambdas, i.e. only
    # when the corresponding filter is enabled and actually evaluated.
    filters = (
        (
            self.should_remove_images_not_in_simplified_dom_trees,
            PairFiltering.check_image_in_simplified_dom_tree,
            "not being in simplified DOM trees",
        ),
        (
            self.should_remove_images_not_in_valid_formats,
            lambda pair: PairFiltering.check_format(pair, self.valid_formats),
            "not being in valid formats",
        ),
        (
            self.should_remove_images_not_in_valid_sizes,
            lambda pair: PairFiltering.check_size_image(
                pair,
                self.original_width_min_cutoff,
                self.original_width_max_cutoff,
                self.original_height_min_cutoff,
                self.original_height_max_cutoff,
                self.rendered_width_min_cutoff,
                self.rendered_width_max_cutoff,
                self.rendered_height_min_cutoff,
                self.rendered_height_max_cutoff,
                self.aspect_ratio_max_cutoff,
            ),
            "not being in valid image sizes",
        ),
        (
            self.should_remove_texts_not_in_valid_number_words,
            lambda pair: PairFiltering.check_number_words(
                pair, self.text_key, self.number_words_min_cutoff, self.number_words_max_cutoff
            ),
            "not having a valid number of words",
        ),
        (
            self.should_remove_texts_with_too_high_special_character_ratio,
            lambda pair: PairFiltering.check_special_character_ratio(
                pair, self.text_key, self.special_character_ratio_max_cutoff
            ),
            "having a too high special character ratio",
        ),
        (
            self.should_remove_texts_with_too_high_repetition_ratio,
            lambda pair: PairFiltering.check_repetition_ratio(
                pair, self.text_key, self.repetition_ratio_max_cutoff
            ),
            "having a too high repetition ratio",
        ),
        (
            self.should_remove_pairs_with_too_low_clip_score,
            lambda pair: PairFiltering.check_clip_score(pair, self.text_key, self.clip_score_min_cutoff),
            "having a too low CLIP score",
        ),
    )
    for should_use_filter, func_filter, msg_filter in filters:
        display_discarded_by_filter(should_use_filter, func_filter, msg_filter)