in vision/m4/sourcing/data_collection/processors/pair_filtering.py [0:0]
def __call__(self, media_info):
    """Filter one image/text record and record which of its texts survive.

    Steps run in a fixed order, each one only when its ``self.cond_*`` flag is
    truthy: image checks, text normalization, per-text checks, and finally the
    CLIP score check. The checks themselves are delegated to ``PairFiltering``
    helpers defined elsewhere.

    Args:
        media_info: Dict-like record. The candidate text keys
            ``"formatted_filename"``, ``"alt_text"`` and ``"extracted_text"``
            are looked up in it; the whole record is also handed to the
            ``PairFiltering`` helpers. It is modified in place.

    Returns:
        ``None`` when the record has none of the candidate text keys, when an
        enabled image check fails, or when no text key passes every enabled
        text check. Otherwise the same ``media_info`` object, with normalized
        texts written back and the surviving keys stored as a list under
        ``"retained_text_keys_after_filtering"``.
    """
    # Make sure that we have access to at least one text.
    # A list in fixed candidate order keeps the final result deterministic.
    available_text_keys = [
        text_key for text_key in ("formatted_filename", "alt_text", "extracted_text") if text_key in media_info
    ]
    if not available_text_keys:
        return None

    # Image filtering
    if self.cond_check_image_in_simplified_dom_tree:
        if not PairFiltering.check_image_in_simplified_dom_tree(media_info):
            return None
    if self.cond_check_format:
        if not PairFiltering.check_format(media_info, self.valid_formats):
            return None
    if self.cond_check_size_image:
        if not PairFiltering.check_size_image(
            media_info,
            self.original_width_min_cutoff,
            self.original_width_max_cutoff,
            self.original_height_min_cutoff,
            self.original_height_max_cutoff,
            self.rendered_width_min_cutoff,
            self.rendered_width_max_cutoff,
            self.rendered_height_min_cutoff,
            self.rendered_height_max_cutoff,
            self.aspect_ratio_max_cutoff,
        ):
            return None

    # Text normalization
    if self.cond_remove_non_printing_characters:
        for text_key in available_text_keys:
            media_info[text_key] = PairFiltering.remove_non_printing_characters(
                text=media_info[text_key], non_printing_characters_re=NON_PRINTING_CHARACTERS_RE
            )
    if self.cond_standardize_whitespace:
        for text_key in available_text_keys:
            media_info[text_key] = PairFiltering.standardize_whitespace(text=media_info[text_key])

    # Text filtering
    # Each stage builds a new list of survivors: removing entries from the
    # collection being iterated raises RuntimeError for sets.
    if self.cond_check_number_words:
        available_text_keys = [
            text_key
            for text_key in available_text_keys
            if PairFiltering.check_number_words(
                media_info, text_key, self.number_words_min_cutoff, self.number_words_max_cutoff
            )
        ]
        if not available_text_keys:
            return None
    if self.cond_check_special_character_ratio:
        available_text_keys = [
            text_key
            for text_key in available_text_keys
            if PairFiltering.check_special_character_ratio(
                media_info, text_key, self.special_character_ratio_max_cutoff
            )
        ]
        if not available_text_keys:
            return None
    if self.cond_check_stopword_ratio:
        available_text_keys = [
            text_key
            for text_key in available_text_keys
            if PairFiltering.check_stopword_ratio(media_info, text_key, self.stopword_ratio_min_cutoff)
        ]
        if not available_text_keys:
            return None
    if self.cond_check_repetition_ratio:
        available_text_keys = [
            text_key
            for text_key in available_text_keys
            if PairFiltering.check_repetition_ratio(media_info, text_key, self.repetition_ratio_max_cutoff)
        ]
        if not available_text_keys:
            return None

    # Text-image filtering
    if self.cond_check_clip_score:
        available_text_keys = [
            text_key
            for text_key in available_text_keys
            if PairFiltering.check_clip_score(media_info, text_key, self.clip_score_min_cutoff)
        ]
        if not available_text_keys:
            return None

    media_info["retained_text_keys_after_filtering"] = list(available_text_keys)
    return media_info