# Excerpt: __call__ method from
# vision/m4/sourcing/data_collection/processors/web_document_filtering.py

    def __call__(self, web_document):
        """Filter the nodes of a web document in place, node by node.

        A web document is a dict with three parallel sequences:
        ``texts`` (str or None per node), ``images`` (PIL-like image or None
        per node), and ``metadata`` (a JSON-encoded list, one entry per node).
        Each node is expected to be either a text node or an image node.

        Image nodes are annotated with their original size/format and dropped
        when they fail the configured format or size checks. Text nodes are
        cleaned, split into paragraphs on ``"\\n\\n"``, and each paragraph is
        dropped when it fails any enabled node-level quality check; a text
        node with no surviving paragraphs is dropped entirely. Nodes that are
        neither text nor image are always dropped.

        Args:
            web_document: dict with keys ``"texts"``, ``"images"`` and
                ``"metadata"`` (JSON string) as described above.

        Returns:
            The same ``web_document`` dict, with the three parallel sequences
            filtered consistently and ``"metadata"`` re-serialized to JSON.
        """
        texts = web_document["texts"]
        images = web_document["images"]
        metadata = json.loads(web_document["metadata"])

        # Node indices to drop from all three parallel sequences at the end.
        indices_to_remove = set()

        for ind, (text, image, meta) in enumerate(zip(texts, images, metadata)):
            if image is not None:
                meta["original_width"], meta["original_height"] = image.size
                # Resolve percentage-based rendered sizes (e.g. "50%") into
                # absolute pixel values relative to the original dimensions.
                for side in ["width", "height"]:
                    rendered_key = f"rendered_{side}"
                    if (
                        (rendered_key in meta)
                        and isinstance(meta[rendered_key], str)
                        and ("%" in meta[rendered_key])
                    ):
                        try:
                            # float() (not int()) so fractional percentages
                            # such as "12.5%" are resolved rather than dropped;
                            # integer percentages yield the same result as before.
                            percentage = float(meta[rendered_key].replace("%", ""))
                            meta[rendered_key] = round(meta[f"original_{side}"] * percentage / 100)
                        except ValueError:
                            # Unparseable percentage: drop the key so the
                            # size check below simply skips this dimension.
                            del meta[rendered_key]
                if image.format:
                    meta["format"] = image.format.lower()

                if self.cond_check_format:
                    if not FilteringFunctions.check_format(image_metadata=meta, valid_formats=self.valid_formats):
                        indices_to_remove.add(ind)
                        continue

                if self.cond_check_size_image:
                    if not FilteringFunctions.check_size_image(
                        image_metadata=meta,
                        original_width_min_cutoff=self.original_width_min_cutoff,
                        original_width_max_cutoff=self.original_width_max_cutoff,
                        original_height_min_cutoff=self.original_height_min_cutoff,
                        original_height_max_cutoff=self.original_height_max_cutoff,
                        rendered_width_min_cutoff=self.rendered_width_min_cutoff,
                        rendered_width_max_cutoff=self.rendered_width_max_cutoff,
                        rendered_height_min_cutoff=self.rendered_height_min_cutoff,
                        rendered_height_max_cutoff=self.rendered_height_max_cutoff,
                        aspect_ratio_max_cutoff=self.aspect_ratio_max_cutoff,
                    ):
                        indices_to_remove.add(ind)
                        continue

                # Defensive write-back; `meta` is the same object as
                # `metadata[ind]`, so this is a no-op kept for clarity.
                metadata[ind] = meta

            elif text is not None:
                # Text node: normalize the raw text first, then filter
                # paragraph by paragraph.
                if self.cond_remove_non_printing_characters:
                    text = FilteringFunctions.remove_non_printing_characters(
                        text=text, non_printing_characters_re=self.non_printing_characters_re
                    )

                if self.cond_standardize_whitespace:
                    text = FilteringFunctions.standardize_whitespace(text=text)

                paragraphs = text.split("\n\n")
                paragraphs_indices_to_remove = set()

                for ind_par, paragraph in enumerate(paragraphs):
                    # Sentinel paragraph is always kept untouched.
                    if paragraph == "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED":
                        continue

                    # Each enabled check below drops the paragraph on failure
                    # and short-circuits the remaining (more expensive) checks.
                    if self.cond_check_number_words_node_level:
                        if not FilteringFunctions.check_number_words(
                            text=paragraph,
                            strip_characters=self.strip_characters,
                            number_words_min_cutoff=self.number_words_node_level_min_cutoff,
                            number_words_max_cutoff=self.number_words_node_level_max_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_character_repetition_ratio_node_level:
                        if not FilteringFunctions.check_character_repetition_ratio(
                            text=paragraph,
                            character_repetition_length=self.character_repetition_length_node_level,
                            character_repetition_max_cutoff=self.character_repetition_node_level_max_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_word_repetition_ratio_node_level:
                        if not FilteringFunctions.check_word_repetition_ratio(
                            text=paragraph,
                            strip_characters=self.strip_characters,
                            word_repetition_length=self.word_repetition_length_node_level,
                            word_repetition_max_cutoff=self.word_repetition_node_level_max_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_special_character_ratio_node_level:
                        if not FilteringFunctions.check_special_character_ratio(
                            text=paragraph,
                            special_characters=self.strip_characters,
                            special_character_ratio_max_cutoff=self.special_character_ratio_node_level_max_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_stopword_ratio_node_level:
                        if not FilteringFunctions.check_stopword_ratio(
                            text=paragraph,
                            strip_characters=self.strip_characters,
                            stopwords=self.stopwords,
                            stopword_ratio_min_cutoff=self.stopword_ratio_node_level_min_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_flagged_word_ratio_node_level:
                        if not FilteringFunctions.check_flagged_word_ratio(
                            text=paragraph,
                            strip_characters=self.strip_characters,
                            flagged_words=self.flagged_words,
                            flagged_word_ratio_max_cutoff=self.flagged_word_ratio_node_level_max_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_punctuation_ratio_node_level:
                        if not FilteringFunctions.check_punctuation_ratio(
                            text=paragraph,
                            punctuation=self.punctuation,
                            punctuation_ratio_min_cutoff=self.punctuation_ratio_node_level_min_cutoff,
                            min_nb_words=self.min_number_words_to_check_punctuation_ratio_node_level,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_common_word_ratio_node_level:
                        if not FilteringFunctions.check_common_word_ratio(
                            text=paragraph,
                            strip_characters=self.strip_characters,
                            common_words=self.common_words,
                            common_word_ratio_min_cutoff=self.common_word_ratio_node_level_min_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_lang_id_node_level:
                        if not FilteringFunctions.check_lang_id(
                            text=paragraph,
                            lang_id_model=self.lang_id_model,
                            target_lang_id="en",
                            lang_id_min_cutoff=self.lang_id_node_level_min_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                    if self.cond_check_perplexity_score_node_level:
                        if not FilteringFunctions.check_perplexity_score(
                            text=paragraph,
                            non_printing_characters_re=self.non_printing_characters_re,
                            digits_re=self.digits_re,
                            unicode_punctuation=self.unicode_punctuation,
                            sentencepiece_model=self.sentencepiece_model,
                            kenlm_model=self.kenlm_model,
                            perplexity_score_max_cutoff=self.perplexity_score_node_level_max_cutoff,
                        ):
                            paragraphs_indices_to_remove.add(ind_par)
                            continue

                paragraphs = [
                    el for ind_par, el in enumerate(paragraphs) if ind_par not in paragraphs_indices_to_remove
                ]
                if not paragraphs:
                    # Every paragraph was filtered out: drop the whole node.
                    indices_to_remove.add(ind)
                else:
                    texts[ind] = "\n\n".join(paragraphs)

            else:
                # Neither text nor image: malformed node, drop it.
                indices_to_remove.add(ind)

        # Apply the removals consistently to all three parallel sequences.
        web_document["texts"] = [el for ind, el in enumerate(texts) if ind not in indices_to_remove]
        web_document["images"] = [el for ind, el in enumerate(images) if ind not in indices_to_remove]
        web_document["metadata"] = json.dumps([el for ind, el in enumerate(metadata) if ind not in indices_to_remove])

        return web_document