in vision/m4/sourcing/data_collection/visualization/pair_visualization.py [0:0]
def choose_filtering_options(self):
    """Draw the "Filtering options" panel and store the user's selections on ``self``.

    The chosen text type is stored in ``self.text_key`` (label lower-cased, with
    spaces replaced by underscores). Every filter has a ``should_remove_*``
    checkbox that starts unchecked; its threshold widgets are only drawn, and the
    matching attributes only assigned, while that checkbox is ticked. Threshold
    defaults are read from ``self.filtering_params`` under the same name as the
    attribute being set.
    """

    def draw_rule():
        # Horizontal rule separating two filter sections.
        st.write("-----")

    def ask_number(attr_name, label, lower, upper, increment, convert=None):
        # Draw one numeric input and store the answer on ``self.<attr_name>``.
        # The default is looked up lazily so ``filtering_params`` is only
        # touched when the enclosing section is actually shown.
        initial = self.filtering_params[attr_name]
        if convert is not None:
            initial = convert(initial)
        answer = st.number_input(
            label,
            min_value=lower,
            max_value=upper,
            value=initial,
            step=increment,
        )
        setattr(self, attr_name, answer)

    st.header("Filtering options")

    # --- Which text field is paired with the images -----------------------
    text_choices = ["Formatted filename", "Alt text", "Extracted text"]
    picked_label = st.selectbox("Choose the type of text to pair with images", text_choices, index=2)
    self.text_key = picked_label.lower().replace(" ", "_")

    # --- Images missing from the simplified DOM trees ---------------------
    draw_rule()
    self.should_remove_images_not_in_simplified_dom_trees = st.checkbox(
        "Remove images not in simplified DOM trees", value=False
    )

    # --- Image formats ----------------------------------------------------
    draw_rule()
    self.should_remove_images_not_in_valid_formats = st.checkbox("Remove images not in valid formats", value=False)
    if self.should_remove_images_not_in_valid_formats:
        self.valid_formats = st.multiselect(
            "Valid formats",
            options=list(self.filtering_params["valid_formats"]),
            default=self.filtering_params["valid_formats"],
        )

    # --- Image sizes ------------------------------------------------------
    draw_rule()
    self.should_remove_images_not_in_valid_sizes = st.checkbox("Remove images not in valid sizes", value=False)
    if self.should_remove_images_not_in_valid_sizes:
        # One entry per column; each column stacks the "original" input
        # above the "rendered" one.
        size_inputs_by_column = (
            (
                ("original_width_min_cutoff", "Minimum original width"),
                ("rendered_width_min_cutoff", "Minimum rendered width"),
            ),
            (
                ("original_width_max_cutoff", "Maximum original width"),
                ("rendered_width_max_cutoff", "Maximum rendered width"),
            ),
            (
                ("original_height_min_cutoff", "Minimum original height"),
                ("rendered_height_min_cutoff", "Minimum rendered height"),
            ),
            (
                ("original_height_max_cutoff", "Maximum original height"),
                ("rendered_height_max_cutoff", "Maximum rendered height"),
            ),
        )
        for column, column_inputs in zip(st.columns(4), size_inputs_by_column):
            with column:
                for attr_name, label in column_inputs:
                    ask_number(attr_name, label, 1, None, 1)
        # NOTE(review): drawn below the four columns rather than inside the
        # last one -- confirm this matches the intended layout.
        # The default is cast to float so it agrees with the float bounds/step.
        ask_number("aspect_ratio_max_cutoff", "Maximum aspect ratio", 1.0, None, 0.5, convert=float)

    # --- Number of words --------------------------------------------------
    draw_rule()
    self.should_remove_texts_not_in_valid_number_words = st.checkbox(
        "Remove texts not having a valid number of words", value=False
    )
    if self.should_remove_texts_not_in_valid_number_words:
        left, right = st.columns(2)
        with left:
            ask_number("number_words_min_cutoff", "Minimum number of words", 0, None, 1)
        with right:
            ask_number("number_words_max_cutoff", "Maximum number of words", 0, None, 1)

    # --- Special character ratio ------------------------------------------
    draw_rule()
    self.should_remove_texts_with_too_high_special_character_ratio = st.checkbox(
        "Remove texts with a too high special character ratio", value=False
    )
    if self.should_remove_texts_with_too_high_special_character_ratio:
        ask_number("special_character_ratio_max_cutoff", "Maximum special character ratio", 0.0, 1.0, 0.01)

    # --- Repetition ratio -------------------------------------------------
    draw_rule()
    self.should_remove_texts_with_too_high_repetition_ratio = st.checkbox(
        "Remove texts with a too high repetition ratio", value=False
    )
    if self.should_remove_texts_with_too_high_repetition_ratio:
        ask_number("repetition_ratio_max_cutoff", "Maximum repetition ratio", 0.0, 1.0, 0.01)

    # --- CLIP score -------------------------------------------------------
    draw_rule()
    self.should_remove_pairs_with_too_low_clip_score = st.checkbox(
        "Remove pairs with a too low CLIP score", value=False
    )
    if self.should_remove_pairs_with_too_low_clip_score:
        ask_number("clip_score_min_cutoff", "Minimum CLIP score", 0.0, 1.0, 0.01)