def choose_filtering_options()

in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]


    def choose_filtering_options(self):
        """Render the "Filtering options" section and store the user's choices on ``self``.

        Always set:
            ``text_key``: the selected text type, lower-cased with spaces replaced
            by underscores (``"Extracted text"`` -> ``"extracted_text"``), and one
            ``should_remove_*`` boolean per filter checkbox (all unticked by default).

        Set only while the matching checkbox is ticked:
            ``valid_formats``; the ``{original,rendered}_{width,height}_{min,max}_cutoff``
            values and ``aspect_ratio_max_cutoff``; ``number_words_{min,max}_cutoff``;
            ``special_character_ratio_max_cutoff``; ``repetition_ratio_max_cutoff``;
            ``clip_score_min_cutoff``. Initial widget values are read from
            ``self.filtering_params``.
        """

        def count_input(label, param_key, min_value):
            # Integer-valued cutoff with no upper bound, pre-filled from the config.
            value = self.filtering_params[param_key]
            return st.number_input(label, min_value=min_value, max_value=None, value=value, step=1)

        def ratio_input(label, param_key):
            # Streamlit requires min_value/max_value/value/step to share one numeric
            # type; coercing to float keeps an integer-valued config entry (e.g. 0 or 1)
            # from raising, as is already done for the aspect ratio cutoff.
            value = float(self.filtering_params[param_key])
            return st.number_input(label, min_value=0.0, max_value=1.0, value=value, step=0.01)

        st.header("Filtering options")

        text_keys = ["Formatted filename", "Alt text", "Extracted text"]
        text_key = st.selectbox("Choose the type of text to pair with images", text_keys, index=2)
        # Turn the human-readable label into a snake_case key.
        self.text_key = text_key.lower().replace(" ", "_")

        st.write("-----")

        self.should_remove_images_not_in_simplified_dom_trees = st.checkbox(
            "Remove images not in simplified DOM trees", value=False
        )

        st.write("-----")

        self.should_remove_images_not_in_valid_formats = st.checkbox("Remove images not in valid formats", value=False)
        if self.should_remove_images_not_in_valid_formats:
            self.valid_formats = st.multiselect(
                "Valid formats",
                options=list(self.filtering_params["valid_formats"]),
                default=self.filtering_params["valid_formats"],
            )

        st.write("-----")

        self.should_remove_images_not_in_valid_sizes = st.checkbox("Remove images not in valid sizes", value=False)
        if self.should_remove_images_not_in_valid_sizes:
            # One column per (dimension, bound) pair; each column stacks the
            # original-size input above the rendered-size input.
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                self.original_width_min_cutoff = count_input(
                    "Minimum original width", "original_width_min_cutoff", 1
                )
                self.rendered_width_min_cutoff = count_input(
                    "Minimum rendered width", "rendered_width_min_cutoff", 1
                )
            with col2:
                self.original_width_max_cutoff = count_input(
                    "Maximum original width", "original_width_max_cutoff", 1
                )
                self.rendered_width_max_cutoff = count_input(
                    "Maximum rendered width", "rendered_width_max_cutoff", 1
                )
            with col3:
                self.original_height_min_cutoff = count_input(
                    "Minimum original height", "original_height_min_cutoff", 1
                )
                self.rendered_height_min_cutoff = count_input(
                    "Minimum rendered height", "rendered_height_min_cutoff", 1
                )
            with col4:
                self.original_height_max_cutoff = count_input(
                    "Maximum original height", "original_height_max_cutoff", 1
                )
                self.rendered_height_max_cutoff = count_input(
                    "Maximum rendered height", "rendered_height_max_cutoff", 1
                )
            # Rendered below the four columns; bounds differ from the 0..1 ratios,
            # so it does not go through ratio_input.
            self.aspect_ratio_max_cutoff = st.number_input(
                "Maximum aspect ratio",
                min_value=1.0,
                max_value=None,
                value=float(self.filtering_params["aspect_ratio_max_cutoff"]),
                step=0.5,
            )

        st.write("-----")

        self.should_remove_texts_not_in_valid_number_words = st.checkbox(
            "Remove texts not having a valid number of words", value=False
        )
        if self.should_remove_texts_not_in_valid_number_words:
            col1, col2 = st.columns(2)
            with col1:
                self.number_words_min_cutoff = count_input("Minimum number of words", "number_words_min_cutoff", 0)
            with col2:
                self.number_words_max_cutoff = count_input("Maximum number of words", "number_words_max_cutoff", 0)

        st.write("-----")

        self.should_remove_texts_with_too_high_special_character_ratio = st.checkbox(
            "Remove texts with a too high special character ratio", value=False
        )
        if self.should_remove_texts_with_too_high_special_character_ratio:
            self.special_character_ratio_max_cutoff = ratio_input(
                "Maximum special character ratio", "special_character_ratio_max_cutoff"
            )

        st.write("-----")

        self.should_remove_texts_with_too_high_repetition_ratio = st.checkbox(
            "Remove texts with a too high repetition ratio", value=False
        )
        if self.should_remove_texts_with_too_high_repetition_ratio:
            self.repetition_ratio_max_cutoff = ratio_input("Maximum repetition ratio", "repetition_ratio_max_cutoff")

        st.write("-----")

        self.should_remove_pairs_with_too_low_clip_score = st.checkbox(
            "Remove pairs with a too low CLIP score", value=False
        )
        if self.should_remove_pairs_with_too_low_clip_score:
            self.clip_score_min_cutoff = ratio_input("Minimum CLIP score", "clip_score_min_cutoff")