in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]
def filtering(self):
    """Split ``self.extracted_pairs`` into retained and discarded pairs.

    Each pair is run through a fixed sequence of checks. A pair that does
    not contain ``self.text_key`` is always discarded. Every other check is
    optional: it only runs when its ``self.should_remove_*`` flag is truthy,
    and it is delegated to the matching ``PairFiltering.check_*`` helper
    with the cutoffs stored on ``self``. The first failing check discards
    the pair; later checks are not evaluated for it.

    Results are stored on the instance (nothing is returned):
        ``self.retained_pairs``  -- pairs that passed every enabled check.
        ``self.discarded_pairs`` -- pairs that failed at least one check.
    Both lists keep the order of ``self.extracted_pairs``.
    """

    def passes_enabled_checks(candidate):
        # NOTE(review): the original author annotated the pair as
        # "media_info"; the exact schema is not visible here.
        text_key = self.text_key

        # Unconditional check: a pair without its text entry is unusable.
        if text_key not in candidate:
            return False

        # Image-side checks. `flag and not check(...)` short-circuits, so a
        # helper (and its cutoffs) is only touched when its flag is on.
        if self.should_remove_images_not_in_simplified_dom_trees and not (
            PairFiltering.check_image_in_simplified_dom_tree(candidate)
        ):
            return False

        if self.should_remove_images_not_in_valid_formats and not (
            PairFiltering.check_format(candidate, self.valid_formats)
        ):
            return False

        if self.should_remove_images_not_in_valid_sizes and not (
            PairFiltering.check_size_image(
                candidate,
                self.original_width_min_cutoff,
                self.original_width_max_cutoff,
                self.original_height_min_cutoff,
                self.original_height_max_cutoff,
                self.rendered_width_min_cutoff,
                self.rendered_width_max_cutoff,
                self.rendered_height_min_cutoff,
                self.rendered_height_max_cutoff,
                self.aspect_ratio_max_cutoff,
            )
        ):
            return False

        # Text-side checks.
        if self.should_remove_texts_not_in_valid_number_words and not (
            PairFiltering.check_number_words(
                candidate,
                text_key,
                self.number_words_min_cutoff,
                self.number_words_max_cutoff,
            )
        ):
            return False

        if self.should_remove_texts_with_too_high_special_character_ratio and not (
            PairFiltering.check_special_character_ratio(
                candidate, text_key, self.special_character_ratio_max_cutoff
            )
        ):
            return False

        if self.should_remove_texts_with_too_high_repetition_ratio and not (
            PairFiltering.check_repetition_ratio(
                candidate, text_key, self.repetition_ratio_max_cutoff
            )
        ):
            return False

        # Joint image/text check, evaluated last.
        if self.should_remove_pairs_with_too_low_clip_score and not (
            PairFiltering.check_clip_score(
                candidate, text_key, self.clip_score_min_cutoff
            )
        ):
            return False

        return True

    # Single pass: every pair is evaluated exactly once, in input order.
    kept = []
    dropped = []
    for candidate in self.extracted_pairs:
        if passes_enabled_checks(candidate):
            kept.append(candidate)
        else:
            dropped.append(candidate)

    # Publish results only after the whole input has been processed.
    self.retained_pairs = kept
    self.discarded_pairs = dropped