in obelics/processors/web_document_filtering.py [0:0]
def __call__(self, web_document):
    """Filter the nodes of a web document in place and return it.

    Each position in the parallel ``texts`` / ``images`` / ``metadata`` lists
    is one node. Image nodes are kept or dropped based on format and size
    checks; text nodes are cleaned, split into paragraphs, and each paragraph
    is run through the enabled node-level quality checks. Nodes that fail (or
    are empty) are removed from all three lists before returning.
    """
    texts = web_document["texts"]
    images = web_document["images"]
    meta_list = json.loads(web_document["metadata"])
    drop_ids = set()

    def _resolve_sizes(image, node_meta):
        # Record the true pixel dimensions, then convert percent-based
        # rendered sizes into absolute pixels (dropping values that fail to
        # parse), and normalize the image format name.
        node_meta["original_width"], node_meta["original_height"] = image.size
        for side in ["width", "height"]:
            rendered = node_meta.get(f"rendered_{side}")
            if isinstance(rendered, str) and "%" in rendered:
                try:
                    node_meta[f"rendered_{side}"] = round(
                        node_meta[f"original_{side}"] * int(rendered.replace("%", "")) / 100
                    )
                except Exception:
                    # Unparseable percentage: best-effort, discard the field.
                    del node_meta[f"rendered_{side}"]
        if image.format:
            node_meta["format"] = image.format.lower()

    def _keep_image(node_meta):
        # Apply the enabled image-level filters; False means drop the node.
        if self.cond_check_format and not FilteringFunctions.check_format(
            image_metadata=node_meta, valid_formats=self.valid_formats
        ):
            return False
        if self.cond_check_size_image and not FilteringFunctions.check_size_image(
            image_metadata=node_meta,
            original_width_min_cutoff=self.original_width_min_cutoff,
            original_width_max_cutoff=self.original_width_max_cutoff,
            original_height_min_cutoff=self.original_height_min_cutoff,
            original_height_max_cutoff=self.original_height_max_cutoff,
            rendered_width_min_cutoff=self.rendered_width_min_cutoff,
            rendered_width_max_cutoff=self.rendered_width_max_cutoff,
            rendered_height_min_cutoff=self.rendered_height_min_cutoff,
            rendered_height_max_cutoff=self.rendered_height_max_cutoff,
            aspect_ratio_max_cutoff=self.aspect_ratio_max_cutoff,
        ):
            return False
        return True

    def _keep_paragraph(par):
        # Run the enabled paragraph-level quality checks in order; the first
        # failing check rejects the paragraph. The document-end placeholder
        # token is always kept untouched.
        if par == "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED":
            return True
        if self.cond_check_number_words_node_level and not FilteringFunctions.check_number_words(
            text=par,
            strip_characters=self.strip_characters,
            number_words_min_cutoff=self.number_words_node_level_min_cutoff,
            number_words_max_cutoff=self.number_words_node_level_max_cutoff,
        ):
            return False
        if self.cond_check_character_repetition_ratio_node_level and not (
            FilteringFunctions.check_character_repetition_ratio(
                text=par,
                character_repetition_length=self.character_repetition_length_node_level,
                character_repetition_max_cutoff=self.character_repetition_node_level_max_cutoff,
            )
        ):
            return False
        if self.cond_check_word_repetition_ratio_node_level and not (
            FilteringFunctions.check_word_repetition_ratio(
                text=par,
                strip_characters=self.strip_characters,
                word_repetition_length=self.word_repetition_length_node_level,
                word_repetition_max_cutoff=self.word_repetition_node_level_max_cutoff,
            )
        ):
            return False
        if self.cond_check_special_character_ratio_node_level and not (
            FilteringFunctions.check_special_character_ratio(
                text=par,
                special_characters=self.strip_characters,
                special_character_ratio_max_cutoff=self.special_character_ratio_node_level_max_cutoff,
            )
        ):
            return False
        if self.cond_check_stopword_ratio_node_level and not FilteringFunctions.check_stopword_ratio(
            text=par,
            strip_characters=self.strip_characters,
            stopwords=self.stopwords,
            stopword_ratio_min_cutoff=self.stopword_ratio_node_level_min_cutoff,
        ):
            return False
        if self.cond_check_flagged_word_ratio_node_level and not (
            FilteringFunctions.check_flagged_word_ratio(
                text=par,
                strip_characters=self.strip_characters,
                flagged_words=self.flagged_words,
                flagged_word_ratio_max_cutoff=self.flagged_word_ratio_node_level_max_cutoff,
            )
        ):
            return False
        if self.cond_check_punctuation_ratio_node_level and not (
            FilteringFunctions.check_punctuation_ratio(
                text=par,
                punctuation=self.punctuation,
                punctuation_ratio_min_cutoff=self.punctuation_ratio_node_level_min_cutoff,
                min_nb_words=self.min_number_words_to_check_punctuation_ratio_node_level,
            )
        ):
            return False
        if self.cond_check_common_word_ratio_node_level and not (
            FilteringFunctions.check_common_word_ratio(
                text=par,
                strip_characters=self.strip_characters,
                common_words=self.common_words,
                common_word_ratio_min_cutoff=self.common_word_ratio_node_level_min_cutoff,
            )
        ):
            return False
        if self.cond_check_lang_id_node_level and not FilteringFunctions.check_lang_id(
            text=par,
            lang_id_model=self.lang_id_model,
            target_lang_id="en",
            lang_id_min_cutoff=self.lang_id_node_level_min_cutoff,
        ):
            return False
        if self.cond_check_perplexity_score_node_level and not (
            FilteringFunctions.check_perplexity_score(
                text=par,
                non_printing_characters_re=self.non_printing_characters_re,
                digits_re=self.digits_re,
                unicode_punctuation=self.unicode_punctuation,
                sentencepiece_model=self.sentencepiece_model,
                kenlm_model=self.kenlm_model,
                perplexity_score_max_cutoff=self.perplexity_score_node_level_max_cutoff,
            )
        ):
            return False
        return True

    for idx, (text, image, node_meta) in enumerate(zip(texts, images, meta_list)):
        if image is not None:
            _resolve_sizes(image, node_meta)
            if not _keep_image(node_meta):
                drop_ids.add(idx)
                continue
            meta_list[idx] = node_meta
        elif text is not None:
            # Optional text normalization before paragraph-level filtering.
            if self.cond_remove_non_printing_characters:
                text = FilteringFunctions.remove_non_printing_characters(
                    text=text, non_printing_characters_re=self.non_printing_characters_re
                )
            if self.cond_standardize_whitespace:
                text = FilteringFunctions.standardize_whitespace(text=text)
            surviving = [par for par in text.split("\n\n") if _keep_paragraph(par)]
            if surviving:
                texts[idx] = "\n\n".join(surviving)
            else:
                # No paragraph survived: drop the whole node.
                drop_ids.add(idx)
        else:
            # Neither text nor image: empty node, drop it.
            drop_ids.add(idx)

    web_document["texts"] = [el for i, el in enumerate(texts) if i not in drop_ids]
    web_document["images"] = [el for i, el in enumerate(images) if i not in drop_ids]
    web_document["metadata"] = json.dumps([el for i, el in enumerate(meta_list) if i not in drop_ids])
    return web_document