def filtering()

in vision/m4/sourcing/data_collection/visualization/web_document_and_filtering_visualization.py [0:0]
526 lines of code
3 McCabe index (conditional complexity)

    def filtering(self):
        """Render the "Filtering" page and run both filtering passes on demand.

        Each widget is seeded from ``self.filtering_params`` and its current
        value is stored on ``self`` under the same name as the parameter key.
        When the "Perform filtering" button is pressed, the node-level filter
        is mapped over ``self.full_dataset``, the document-level filter is
        applied to that result, and the retained and discarded documents are
        stored in ``st.session_state``.
        """

        def toggle(label, key):
            # Checkbox whose initial state is the stored filtering parameter.
            return st.checkbox(label, value=self.filtering_params[key])

        def count_input(label, key, lowest=0):
            # Number input with no upper bound, moving in steps of 1.
            return st.number_input(
                label, min_value=lowest, max_value=None, value=self.filtering_params[key], step=1
            )

        def ratio_input(label, key):
            # Number input bounded to [0.0, 1.0], moving in steps of 0.01.
            return st.number_input(
                label, min_value=0.0, max_value=1.0, value=self.filtering_params[key], step=0.01
            )

        def rule():
            # Visual separator between two groups of controls.
            st.write("-----")

        st.header("Filtering")

        # ------------------------------------------------------------------
        # Node-level controls
        # ------------------------------------------------------------------
        st.subheader("Filtering at node level")

        self.cond_check_format = toggle("Remove images not in valid formats", "cond_check_format")
        self.valid_formats = st.multiselect(
            "Valid formats",
            options=list(self.filtering_params["valid_formats"]),
            default=self.filtering_params["valid_formats"],
        )

        rule()

        self.cond_check_size_image = toggle("Remove images not in valid sizes", "cond_check_size_image")
        # One column per (bound, dimension) pair; each column holds the
        # "original" input above the "rendered" one.
        size_layout = (
            ("Minimum", "width", "min"),
            ("Maximum", "width", "max"),
            ("Minimum", "height", "min"),
            ("Maximum", "height", "max"),
        )
        for column, (bound_label, dimension, bound) in zip(st.columns(4), size_layout):
            with column:
                for image_kind in ("original", "rendered"):
                    size_key = f"{image_kind}_{dimension}_{bound}_cutoff"
                    size_label = f"{bound_label} {image_kind} {dimension}"
                    setattr(self, size_key, count_input(size_label, size_key, lowest=1))
        self.aspect_ratio_max_cutoff = st.number_input(
            "Maximum aspect ratio",
            min_value=1.0,
            max_value=None,
            value=float(self.filtering_params["aspect_ratio_max_cutoff"]),
            step=0.5,
        )

        rule()

        self.cond_check_number_words_node_level = toggle(
            "Remove paragraphs not having a valid number of words", "cond_check_number_words_node_level"
        )
        left, right = st.columns(2)
        with left:
            self.number_words_node_level_min_cutoff = count_input(
                "Minimum number of words (paragraph level)", "number_words_node_level_min_cutoff"
            )
        with right:
            self.number_words_node_level_max_cutoff = count_input(
                "Maximum number of words (paragraph level)", "number_words_node_level_max_cutoff"
            )

        rule()

        self.cond_check_character_repetition_ratio_node_level = toggle(
            "Remove paragraphs with a too high character repetition ratio",
            "cond_check_character_repetition_ratio_node_level",
        )
        left, right = st.columns(2)
        with left:
            self.character_repetition_length_node_level = count_input(
                "Character repetition length (node level)", "character_repetition_length_node_level"
            )
        with right:
            self.character_repetition_node_level_max_cutoff = ratio_input(
                "Maximum character repetition ratio (node level)", "character_repetition_node_level_max_cutoff"
            )

        rule()

        self.cond_check_word_repetition_ratio_node_level = toggle(
            "Remove paragraphs with a too high word repetition ratio",
            "cond_check_word_repetition_ratio_node_level",
        )
        left, right = st.columns(2)
        with left:
            self.word_repetition_length_node_level = count_input(
                "Word repetition length (node level)", "word_repetition_length_node_level"
            )
        with right:
            self.word_repetition_node_level_max_cutoff = ratio_input(
                "Maximum word repetition ratio (node level)", "word_repetition_node_level_max_cutoff"
            )

        rule()

        self.cond_check_special_character_ratio_node_level = toggle(
            "Remove paragraphs with a too high special character ratio",
            "cond_check_special_character_ratio_node_level",
        )
        self.special_character_ratio_node_level_max_cutoff = ratio_input(
            "Maximum special character ratio (paragraph level)", "special_character_ratio_node_level_max_cutoff"
        )

        rule()

        self.cond_check_stopword_ratio_node_level = toggle(
            "Remove paragraphs with a too low stop word ratio", "cond_check_stopword_ratio_node_level"
        )
        self.stopword_ratio_node_level_min_cutoff = ratio_input(
            "Minimum stop word ratio (paragraph level)", "stopword_ratio_node_level_min_cutoff"
        )

        rule()

        self.cond_check_flagged_word_ratio_node_level = toggle(
            "Remove paragraphs with a too high flagged word ratio", "cond_check_flagged_word_ratio_node_level"
        )
        self.flagged_word_ratio_node_level_max_cutoff = ratio_input(
            "Maximum flagged word ratio (node level)", "flagged_word_ratio_node_level_max_cutoff"
        )

        rule()

        self.cond_check_punctuation_ratio_node_level = toggle(
            "Remove paragraphs with a too low punctuation ratio", "cond_check_punctuation_ratio_node_level"
        )
        self.min_number_words_to_check_punctuation_ratio_node_level = count_input(
            "Minimum number of words to check punctuation ratio (node level)",
            "min_number_words_to_check_punctuation_ratio_node_level",
        )
        self.punctuation_ratio_node_level_min_cutoff = ratio_input(
            "Minimum punctuation ratio (node level)", "punctuation_ratio_node_level_min_cutoff"
        )

        rule()

        self.cond_check_common_word_ratio_node_level = toggle(
            "Remove paragraphs with a too low common word ratio", "cond_check_common_word_ratio_node_level"
        )
        self.common_word_ratio_node_level_min_cutoff = ratio_input(
            "Minimum common word ratio (node level)", "common_word_ratio_node_level_min_cutoff"
        )

        rule()

        self.cond_check_lang_id_node_level = toggle(
            "Remove paragraphs with a too low language identification confidence score",
            "cond_check_lang_id_node_level",
        )
        self.lang_id_node_level_min_cutoff = ratio_input(
            "Minimum language identification confidence score (node level)", "lang_id_node_level_min_cutoff"
        )

        rule()

        self.cond_check_perplexity_score_node_level = toggle(
            "Remove paragraphs with a too high perplexity score", "cond_check_perplexity_score_node_level"
        )
        self.perplexity_score_node_level_max_cutoff = count_input(
            "Maximum perplexity score (node level)", "perplexity_score_node_level_max_cutoff"
        )

        rule()

        # ------------------------------------------------------------------
        # Document-level controls
        # ------------------------------------------------------------------
        st.subheader("Filtering at document level")

        self.cond_check_number_images = toggle("Remove documents with too few images", "cond_check_number_images")
        left, right = st.columns(2)
        with left:
            self.number_images_min_cutoff = count_input("Minimum number of images", "number_images_min_cutoff")
        with right:
            self.number_images_max_cutoff = count_input("Maximum number of images", "number_images_max_cutoff")

        rule()

        self.cond_check_number_words_doc_level = toggle(
            "Remove documents not having a valid number of words", "cond_check_number_words_doc_level"
        )
        left, right = st.columns(2)
        with left:
            self.number_words_doc_level_min_cutoff = count_input(
                "Minimum number of words (doc level)", "number_words_doc_level_min_cutoff"
            )
        with right:
            self.number_words_doc_level_max_cutoff = count_input(
                "Maximum number of words (doc level)", "number_words_doc_level_max_cutoff"
            )

        rule()

        self.cond_check_character_repetition_ratio_doc_level = toggle(
            "Remove documents with a too high character repetition ratio",
            "cond_check_character_repetition_ratio_doc_level",
        )
        left, right = st.columns(2)
        with left:
            self.character_repetition_length_doc_level = count_input(
                "Character repetition length (doc level)", "character_repetition_length_doc_level"
            )
        with right:
            self.character_repetition_doc_level_max_cutoff = ratio_input(
                "Maximum character repetition ratio (doc level)", "character_repetition_doc_level_max_cutoff"
            )

        rule()

        self.cond_check_word_repetition_ratio_doc_level = toggle(
            "Remove documents with a too high word repetition ratio",
            "cond_check_word_repetition_ratio_doc_level",
        )
        left, right = st.columns(2)
        with left:
            self.word_repetition_length_doc_level = count_input(
                "Word repetition length (doc level)", "word_repetition_length_doc_level"
            )
        with right:
            self.word_repetition_doc_level_max_cutoff = ratio_input(
                "Maximum word repetition ratio (doc level)", "word_repetition_doc_level_max_cutoff"
            )

        rule()

        self.cond_check_special_character_ratio_doc_level = toggle(
            "Remove documents with a too high special character ratio",
            "cond_check_special_character_ratio_doc_level",
        )
        self.special_character_ratio_doc_level_max_cutoff = ratio_input(
            "Maximum special character ratio (doc level)", "special_character_ratio_doc_level_max_cutoff"
        )

        rule()

        self.cond_check_stopword_ratio_doc_level = toggle(
            "Remove documents with a too low stop word ratio", "cond_check_stopword_ratio_doc_level"
        )
        self.stopword_ratio_doc_level_min_cutoff = ratio_input(
            "Minimum stop word ratio (doc level)", "stopword_ratio_doc_level_min_cutoff"
        )

        rule()

        self.cond_check_flagged_word_ratio_doc_level = toggle(
            "Remove documents with a too high flagged word ratio", "cond_check_flagged_word_ratio_doc_level"
        )
        self.flagged_word_ratio_doc_level_max_cutoff = ratio_input(
            "Maximum flagged word ratio (doc level)", "flagged_word_ratio_doc_level_max_cutoff"
        )

        rule()

        self.cond_check_punctuation_ratio_doc_level = toggle(
            "Remove documents with a too low punctuation ratio", "cond_check_punctuation_ratio_doc_level"
        )
        self.punctuation_ratio_doc_level_min_cutoff = ratio_input(
            "Minimum punctuation ratio (doc level)", "punctuation_ratio_doc_level_min_cutoff"
        )

        rule()

        self.cond_check_common_word_ratio_doc_level = toggle(
            "Remove documents with a too low common word ratio", "cond_check_common_word_ratio_doc_level"
        )
        self.common_word_ratio_doc_level_min_cutoff = ratio_input(
            "Minimum common word ratio (doc level)", "common_word_ratio_doc_level_min_cutoff"
        )

        rule()

        self.cond_check_lang_id_doc_level = toggle(
            "Remove documents with a too low language identification confidence score",
            "cond_check_lang_id_doc_level",
        )
        self.lang_id_doc_level_min_cutoff = ratio_input(
            "Minimum language identification confidence score (doc level)", "lang_id_doc_level_min_cutoff"
        )

        rule()

        self.cond_check_perplexity_score_doc_level = toggle(
            "Remove documents with a too high perplexity score", "cond_check_perplexity_score_doc_level"
        )
        self.perplexity_score_doc_level_max_cutoff = count_input(
            "Maximum perplexity score (doc level)", "perplexity_score_doc_level_max_cutoff"
        )

        rule()

        # ------------------------------------------------------------------
        # Run the filters
        # ------------------------------------------------------------------
        st.subheader("Perform filtering")
        if not st.button("Perform filtering 💥"):
            # Nothing else to do until the user asks for a run.
            return

        # Options shared by every dataset pass below.
        dataset_options = {"load_from_cache_file": False, "writer_batch_size": 10000}

        with st.spinner("Wait for it... 🤞"):
            start_time = time()

            # NOTE(review): `path_common_words` is read as a bare name below,
            # unlike the other paths which come from `self` — presumably a
            # module-level variable; confirm it is always defined.
            node_level_settings = {
                "cond_check_format": self.cond_check_format,
                "valid_formats": self.valid_formats,
                "cond_check_size_image": self.cond_check_size_image,
                "original_width_min_cutoff": self.original_width_min_cutoff,
                "original_width_max_cutoff": self.original_width_max_cutoff,
                "original_height_min_cutoff": self.original_height_min_cutoff,
                "original_height_max_cutoff": self.original_height_max_cutoff,
                "rendered_width_min_cutoff": self.rendered_width_min_cutoff,
                "rendered_width_max_cutoff": self.rendered_width_max_cutoff,
                "rendered_height_min_cutoff": self.rendered_height_min_cutoff,
                "rendered_height_max_cutoff": self.rendered_height_max_cutoff,
                "aspect_ratio_max_cutoff": self.aspect_ratio_max_cutoff,
                "cond_remove_non_printing_characters": self.filtering_params["cond_remove_non_printing_characters"],
                "non_printing_characters_re": NON_PRINTING_CHARACTERS_RE,
                "cond_standardize_whitespace": self.filtering_params["cond_standardize_whitespace"],
                "cond_check_number_words_node_level": self.cond_check_number_words_node_level,
                "strip_characters": SPECIAL_CHARACTERS,
                "number_words_node_level_min_cutoff": self.number_words_node_level_min_cutoff,
                "number_words_node_level_max_cutoff": self.number_words_node_level_max_cutoff,
                "cond_check_character_repetition_ratio_node_level": (
                    self.cond_check_character_repetition_ratio_node_level
                ),
                "character_repetition_length_node_level": self.character_repetition_length_node_level,
                "character_repetition_node_level_max_cutoff": self.character_repetition_node_level_max_cutoff,
                "cond_check_word_repetition_ratio_node_level": self.cond_check_word_repetition_ratio_node_level,
                "word_repetition_length_node_level": self.word_repetition_length_node_level,
                "word_repetition_node_level_max_cutoff": self.word_repetition_node_level_max_cutoff,
                "cond_check_special_character_ratio_node_level": (
                    self.cond_check_special_character_ratio_node_level
                ),
                "special_character_ratio_node_level_max_cutoff": (
                    self.special_character_ratio_node_level_max_cutoff
                ),
                "cond_check_stopword_ratio_node_level": self.cond_check_stopword_ratio_node_level,
                "stopwords": STOPWORDS,
                "stopword_ratio_node_level_min_cutoff": self.stopword_ratio_node_level_min_cutoff,
                "cond_check_flagged_word_ratio_node_level": self.cond_check_flagged_word_ratio_node_level,
                "flagged_words": FLAGGED_WORDS,
                "flagged_word_ratio_node_level_max_cutoff": self.flagged_word_ratio_node_level_max_cutoff,
                "cond_check_punctuation_ratio_node_level": self.cond_check_punctuation_ratio_node_level,
                "min_number_words_to_check_punctuation_ratio_node_level": (
                    self.min_number_words_to_check_punctuation_ratio_node_level
                ),
                "punctuation": PUNCTUATION,
                "punctuation_ratio_node_level_min_cutoff": self.punctuation_ratio_node_level_min_cutoff,
                "cond_check_common_word_ratio_node_level": self.cond_check_common_word_ratio_node_level,
                "path_common_words": path_common_words,
                "common_word_ratio_node_level_min_cutoff": self.common_word_ratio_node_level_min_cutoff,
                "cond_check_lang_id_node_level": self.cond_check_lang_id_node_level,
                "path_lang_id_model": self.path_lang_id_model,
                "lang_id_node_level_min_cutoff": self.lang_id_node_level_min_cutoff,
                "cond_check_perplexity_score_node_level": self.cond_check_perplexity_score_node_level,
                "digits_re": DIGITS_RE,
                "unicode_punctuation": UNICODE_PUNCTUATION,
                "path_sentencepiece_model": self.path_sentencepiece_model,
                "path_kenlm_model": self.path_kenlm_model,
                "perplexity_score_node_level_max_cutoff": self.perplexity_score_node_level_max_cutoff,
            }
            node_level_filter = WebDocumentFilteringNodeLevel(**node_level_settings)

            # First pass: the node-level filter is mapped over every document.
            node_filtered_dataset = self.full_dataset.map(node_level_filter, **dataset_options)

            doc_level_settings = {
                "cond_check_number_images": self.cond_check_number_images,
                "number_images_min_cutoff": self.number_images_min_cutoff,
                "number_images_max_cutoff": self.number_images_max_cutoff,
                "cond_check_number_words_doc_level": self.cond_check_number_words_doc_level,
                "strip_characters": SPECIAL_CHARACTERS,
                "number_words_doc_level_min_cutoff": self.number_words_doc_level_min_cutoff,
                "number_words_doc_level_max_cutoff": self.number_words_doc_level_max_cutoff,
                "cond_check_character_repetition_ratio_doc_level": (
                    self.cond_check_character_repetition_ratio_doc_level
                ),
                "character_repetition_length_doc_level": self.character_repetition_length_doc_level,
                "character_repetition_doc_level_max_cutoff": self.character_repetition_doc_level_max_cutoff,
                "cond_check_word_repetition_ratio_doc_level": self.cond_check_word_repetition_ratio_doc_level,
                "word_repetition_length_doc_level": self.word_repetition_length_doc_level,
                "word_repetition_doc_level_max_cutoff": self.word_repetition_doc_level_max_cutoff,
                "cond_check_special_character_ratio_doc_level": self.cond_check_special_character_ratio_doc_level,
                "special_character_ratio_doc_level_max_cutoff": self.special_character_ratio_doc_level_max_cutoff,
                "cond_check_stopword_ratio_doc_level": self.cond_check_stopword_ratio_doc_level,
                "stopwords": STOPWORDS,
                "stopword_ratio_doc_level_min_cutoff": self.stopword_ratio_doc_level_min_cutoff,
                "cond_check_flagged_word_ratio_doc_level": self.cond_check_flagged_word_ratio_doc_level,
                "flagged_words": FLAGGED_WORDS,
                "flagged_word_ratio_doc_level_max_cutoff": self.flagged_word_ratio_doc_level_max_cutoff,
                "cond_check_punctuation_ratio_doc_level": self.cond_check_punctuation_ratio_doc_level,
                "punctuation": PUNCTUATION,
                "punctuation_ratio_doc_level_min_cutoff": self.punctuation_ratio_doc_level_min_cutoff,
                "cond_check_common_word_ratio_doc_level": self.cond_check_common_word_ratio_doc_level,
                "path_common_words": path_common_words,
                "common_word_ratio_doc_level_min_cutoff": self.common_word_ratio_doc_level_min_cutoff,
                "cond_check_lang_id_doc_level": self.cond_check_lang_id_doc_level,
                "path_lang_id_model": self.path_lang_id_model,
                "lang_id_doc_level_min_cutoff": self.lang_id_doc_level_min_cutoff,
                "cond_check_perplexity_score_doc_level": self.cond_check_perplexity_score_doc_level,
                "non_printing_characters_re": NON_PRINTING_CHARACTERS_RE,
                "digits_re": DIGITS_RE,
                "unicode_punctuation": UNICODE_PUNCTUATION,
                "path_sentencepiece_model": self.path_sentencepiece_model,
                "path_kenlm_model": self.path_kenlm_model,
                "perplexity_score_doc_level_max_cutoff": self.perplexity_score_doc_level_max_cutoff,
            }
            doc_level_filter = WebDocumentFilteringDocLevel(**doc_level_settings)

            # Second pass: keep only documents accepted by the doc-level filter.
            st.session_state.retained_web_document_dataset = node_filtered_dataset.filter(
                doc_level_filter, **dataset_options
            )

            # The discarded set is the complement of the retained set, taken
            # over the node-filtered dataset and matched on "original_idx".
            idx_retained_docs = set(st.session_state.retained_web_document_dataset["original_idx"])

            def was_discarded(web_document):
                return web_document["original_idx"] not in idx_retained_docs

            st.session_state.discarded_web_document_dataset = node_filtered_dataset.filter(
                was_discarded, **dataset_options
            )

        st.balloons()
        elapsed_seconds = round(time() - start_time)
        st.success(f"Filtering done in {timedelta(seconds=elapsed_seconds)} (HH:MM:SS)!")