vision/m4/sourcing/data_collection/visualization/global_visualization.py [72:267]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        )
        self.extractor = TextMediaPairsExtractor(
            dom_tree_simplificator=self.dom_tree_simplificator,
            pre_extraction_simplificator=self.pre_extraction_simplificator_merge_texts,
            also_extract_images_not_in_simplified_dom_tree=True,
            extract_clip_scores=True,
        )

    def visualization(self):
        """Render the whole page: the title, then each section in display order.

        Both mode-specific renderers are always invoked; each of them only draws
        something when `self.mode` matches the mode it is responsible for.
        """
        page_title = (
            "Visualization of DOM tree simplification strategies, "
            "web document rendering, and text-image pair extractions"
        )
        st.title(page_title)

        # The mode and the document have to be chosen before the renderers run.
        sections = (
            self.choose_mode,
            self.choose_example,
            self.simplification_mode,
            self.extraction_mode,
        )
        for render_section in sections:
            render_section()

    def choose_mode(self):
        """Show the mode selector and store the selected mode in `self.mode`.

        "Extraction" is the entry selected by default.
        """
        available_modes = ["Simplification", "Extraction"]
        default_position = available_modes.index("Extraction")

        st.header("Mode")
        self.mode = st.selectbox(
            label="Choose a mode",
            options=available_modes,
            index=default_position,
        )

    def choose_example(self):
        """Let the user pick the document to visualize and store it in `self.current_example`.

        The document index comes from a number input bounded by 0 and
        `self.num_docs - 1`. Clicking "Select a random document" overwrites that
        index with a random one in the same (inclusive) range.

        The index lives in `st.session_state` under a fixed widget key so that a
        random pick survives later reruns. Passing the random index only as the
        `value` default of the number input does not work: on the next rerun
        (e.g. after ticking a checkbox) the button is no longer reported as
        clicked, the default falls back to 0, and the displayed document silently
        snaps back to the first one.
        """
        index_key = "choose_example_document_index"

        st.header("Document")

        # Seed the state once; afterwards the number input (or the button) owns it.
        if index_key not in st.session_state:
            st.session_state[index_key] = 0

        # The state of a keyed widget may only be assigned before that widget is
        # instantiated in the current run, so the button is handled first.
        if st.button("Select a random document"):
            st.session_state[index_key] = random.randint(a=0, b=self.num_docs - 1)

        # No `value=` here: the current value is read from the session state.
        idx = st.number_input(
            f"Select a document among the first {self.num_docs} ones",
            min_value=0,
            max_value=self.num_docs - 1,
            step=1,
            help=f"Index between 0 and {self.num_docs-1}",
            key=index_key,
        )
        self.current_example = self.examples[idx]

    def get_dom_viz_html(self, html):
        """Return the DOM-visualization page rendered for the given HTML string.

        The HTML is parsed with `make_selectolax_tree`, its `script` tags are
        stripped, and the HTML of the resulting body is passed as `body_html`
        to `self.dom_viz_template.render`.
        """
        parsed_tree = make_selectolax_tree(html)
        # Scripts are stripped before the body is serialized for the template.
        parsed_tree.strip_tags(["script"])
        return self.dom_viz_template.render(body_html=parsed_tree.body.html)

    def simplification_mode(self):
        """Render the "Simplification" page; do nothing in any other mode.

        Three sections are shown, each comparing the raw HTML of the current
        example (left column) with its simplified version (right column):
        the rendered webpages, the DOM tree visualizations, and the HTML codes.
        """
        if self.mode != "Simplification":
            return

        raw_html = self.current_example["html"]
        page_url = self.current_example["url"]
        simplified_html = self.dom_tree_simplificator(raw_html, type_return="str")

        def show_side_by_side(raw_title, simplified_title, to_component_html, height):
            # Raw variant on the left, simplified variant on the right. The content
            # is built inside the column, right after its subheader is written.
            left_column, right_column = st.columns(2)
            panels = (
                (left_column, raw_title, raw_html),
                (right_column, simplified_title, simplified_html),
            )
            for column, title, html_source in panels:
                with column:
                    st.subheader(title)
                    st.components.v1.html(to_component_html(html_source), height=height, scrolling=True)

        # Section 1: the pages as rendered from their HTML.
        st.header("Rendered webpage")
        st.markdown(f"Webpage url: {page_url}")
        show_side_by_side(
            "Raw html rendering",
            "Simplified html rendering",
            lambda html_source: html_source,
            450,
        )

        # Section 2: the DOM tree visualizations.
        st.header("DOM trees")
        show_side_by_side("Raw DOM tree", "Simplified DOM tree", self.get_dom_viz_html, 600)

        # Section 3: the HTML source, wrapped in <xmp> so it is displayed as text.
        st.header("HTML codes")
        show_side_by_side(
            "Raw HTML code",
            "Simplified HTML code",
            lambda html_source: "<xmp>" + html_source + "</xmp>",
            450,
        )

    def extraction_mode(self):
        if self.mode == "Extraction":
            current_html = self.current_example["html"]
            current_url = self.current_example["url"]

            simplified_current_html_tree = self.dom_tree_simplificator(current_html, type_return="selectolax_tree")
            simplified_current_html = simplified_current_html_tree.html

            current_list_nodes_not_merge_texts = self.pre_extraction_simplificator_not_merge_texts(
                simplified_current_html_tree, page_url=current_url
            )
            current_list_nodes_merge_texts = self.pre_extraction_simplificator_merge_texts(
                simplified_current_html_tree, page_url=current_url
            )

            extracted_images = self.extractor(html_str=current_html, page_url=current_url)

            # For simplicity, only doing this replacement on the extracted images.
            # Doing that before the extraction (i.e. in the DOM simplification) would be possible but would require
            # more significant changes
            replacement_dict = {
                elem["unformatted_src"]: elem["src"]
                for elem in extracted_images
                if elem["src"] != elem["unformatted_src"]
            }

            def replace_relative_paths(html_string):
                if replacement_dict:
                    for k, v in replacement_dict.items():
                        html_string = html_string.replace(k, v)
                return html_string

            def display_rendered_webpages():
                st.header("Rendered webpage")
                st.markdown(f"Webpage url: {current_url}")

                display_raw_html_rendering = st.checkbox("Raw html rendering", value=True)
                display_simplified_html_rendering = st.checkbox("Simplified html rendering", value=True)
                col1, col2 = st.columns(2)
                with col1:
                    display_pre_extraction_visualization = st.checkbox(
                        "Web document rendering (pre-extraction visualization)", value=True
                    )
                with col2:
                    if display_pre_extraction_visualization:
                        merge_text_nodes = st.checkbox("Merge text nodes", value=True)

                list_display_pages = [
                    [display_raw_html_rendering, "raw_html_rendering"],
                    [display_simplified_html_rendering, "simplified_html_rendering"],
                    [display_pre_extraction_visualization, "pre_extraction_visualization"],
                ]
                list_display_pages = [
                    page_to_display
                    for should_display_page, page_to_display in list_display_pages
                    if should_display_page
                ]

                def display_specific_rendered_webpage(page_to_display, col):
                    with col:
                        if page_to_display == "raw_html_rendering":
                            st.subheader("Raw html rendering")
                            st.components.v1.html(replace_relative_paths(current_html), height=800, scrolling=True)
                        elif page_to_display == "simplified_html_rendering":
                            st.subheader("Simplified html rendering")
                            st.components.v1.html(
                                replace_relative_paths(simplified_current_html), height=800, scrolling=True
                            )
                        elif page_to_display == "pre_extraction_visualization":
                            st.subheader("Web document rendering (pre-extraction visualization)")

                            def list_nodes_to_visu():
                                if not merge_text_nodes:
                                    list_nodes = current_list_nodes_not_merge_texts
                                    reduce_levels = {
                                        v: i + 1
                                        for i, v in enumerate(sorted(list(set([node.level for node in list_nodes]))))
                                    }
                                    last_level = None
                                    markdown = ""
                                    for node in list_nodes:
                                        if node.tag in ["-text", "img"]:
                                            current_level = reduce_levels[node.level]
                                            if last_level != current_level:
                                                markdown += (
                                                    "#" * min(current_level, 6) + f" Level: {current_level}\n\n"
                                                )
                                                last_level = current_level
                                                path_in_tree_str = [tag for tag, _ in node.path_in_tree]
                                                markdown += f"**{'/'.join(path_in_tree_str)}**\n\n"
                                            if node.tag == "-text":
                                                markdown += f"{node.text}\n\n"
                                            elif node.tag == "img":
                                                markdown += f"![img]({node.media_info['src']})\n\n"
                                    st.markdown(markdown)

                                else:
                                    list_nodes = current_list_nodes_merge_texts
                                    for node in list_nodes:
                                        if node.tag == "-text":
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


vision/m4/sourcing/data_collection/visualization/wikipedia/global_visualization.py [71:266]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        )
        self.extractor = TextMediaPairsExtractor(
            dom_tree_simplificator=self.dom_tree_simplificator,
            pre_extraction_simplificator=self.pre_extraction_simplificator_merge_texts,
            also_extract_images_not_in_simplified_dom_tree=True,
            extract_clip_scores=True,
        )

    def visualization(self):
        """Render the whole page: the title, then each section in display order.

        Both mode-specific renderers are always invoked; each of them only draws
        something when `self.mode` matches the mode it is responsible for.
        """
        page_title = (
            "Visualization of DOM tree simplification strategies, "
            "web document rendering, and text-image pair extractions"
        )
        st.title(page_title)

        # The mode and the document have to be chosen before the renderers run.
        sections = (
            self.choose_mode,
            self.choose_example,
            self.simplification_mode,
            self.extraction_mode,
        )
        for render_section in sections:
            render_section()

    def choose_mode(self):
        """Show the mode selector and store the selected mode in `self.mode`.

        "Extraction" is the entry selected by default.
        """
        available_modes = ["Simplification", "Extraction"]
        default_position = available_modes.index("Extraction")

        st.header("Mode")
        self.mode = st.selectbox(
            label="Choose a mode",
            options=available_modes,
            index=default_position,
        )

    def choose_example(self):
        """Let the user pick the document to visualize and store it in `self.current_example`.

        The document index comes from a number input bounded by 0 and
        `self.num_docs - 1`. Clicking "Select a random document" overwrites that
        index with a random one in the same (inclusive) range.

        The index lives in `st.session_state` under a fixed widget key so that a
        random pick survives later reruns. Passing the random index only as the
        `value` default of the number input does not work: on the next rerun
        (e.g. after ticking a checkbox) the button is no longer reported as
        clicked, the default falls back to 0, and the displayed document silently
        snaps back to the first one.
        """
        index_key = "choose_example_document_index"

        st.header("Document")

        # Seed the state once; afterwards the number input (or the button) owns it.
        if index_key not in st.session_state:
            st.session_state[index_key] = 0

        # The state of a keyed widget may only be assigned before that widget is
        # instantiated in the current run, so the button is handled first.
        if st.button("Select a random document"):
            st.session_state[index_key] = random.randint(a=0, b=self.num_docs - 1)

        # No `value=` here: the current value is read from the session state.
        idx = st.number_input(
            f"Select a document among the first {self.num_docs} ones",
            min_value=0,
            max_value=self.num_docs - 1,
            step=1,
            help=f"Index between 0 and {self.num_docs-1}",
            key=index_key,
        )
        self.current_example = self.examples[idx]

    def get_dom_viz_html(self, html):
        """Return the DOM-visualization page rendered for the given HTML string.

        The HTML is parsed with `make_selectolax_tree`, its `script` tags are
        stripped, and the HTML of the resulting body is passed as `body_html`
        to `self.dom_viz_template.render`.
        """
        parsed_tree = make_selectolax_tree(html)
        # Scripts are stripped before the body is serialized for the template.
        parsed_tree.strip_tags(["script"])
        return self.dom_viz_template.render(body_html=parsed_tree.body.html)

    def simplification_mode(self):
        """Render the "Simplification" page; do nothing in any other mode.

        Three sections are shown, each comparing the raw HTML of the current
        example (left column) with its simplified version (right column):
        the rendered webpages, the DOM tree visualizations, and the HTML codes.
        """
        if self.mode != "Simplification":
            return

        raw_html = self.current_example["html"]
        page_url = self.current_example["url"]
        simplified_html = self.dom_tree_simplificator(raw_html, type_return="str")

        def show_side_by_side(raw_title, simplified_title, to_component_html, height):
            # Raw variant on the left, simplified variant on the right. The content
            # is built inside the column, right after its subheader is written.
            left_column, right_column = st.columns(2)
            panels = (
                (left_column, raw_title, raw_html),
                (right_column, simplified_title, simplified_html),
            )
            for column, title, html_source in panels:
                with column:
                    st.subheader(title)
                    st.components.v1.html(to_component_html(html_source), height=height, scrolling=True)

        # Section 1: the pages as rendered from their HTML.
        st.header("Rendered webpage")
        st.markdown(f"Webpage url: {page_url}")
        show_side_by_side(
            "Raw html rendering",
            "Simplified html rendering",
            lambda html_source: html_source,
            450,
        )

        # Section 2: the DOM tree visualizations.
        st.header("DOM trees")
        show_side_by_side("Raw DOM tree", "Simplified DOM tree", self.get_dom_viz_html, 600)

        # Section 3: the HTML source, wrapped in <xmp> so it is displayed as text.
        st.header("HTML codes")
        show_side_by_side(
            "Raw HTML code",
            "Simplified HTML code",
            lambda html_source: "<xmp>" + html_source + "</xmp>",
            450,
        )

    def extraction_mode(self):
        if self.mode == "Extraction":
            current_html = self.current_example["html"]
            current_url = self.current_example["url"]

            simplified_current_html_tree = self.dom_tree_simplificator(current_html, type_return="selectolax_tree")
            simplified_current_html = simplified_current_html_tree.html

            current_list_nodes_not_merge_texts = self.pre_extraction_simplificator_not_merge_texts(
                simplified_current_html_tree, page_url=current_url
            )
            current_list_nodes_merge_texts = self.pre_extraction_simplificator_merge_texts(
                simplified_current_html_tree, page_url=current_url
            )

            extracted_images = self.extractor(html_str=current_html, page_url=current_url)

            # For simplicity, only doing this replacement on the extracted images.
            # Doing that before the extraction (i.e. in the DOM simplification) would be possible but would require
            # more significant changes
            replacement_dict = {
                elem["unformatted_src"]: elem["src"]
                for elem in extracted_images
                if elem["src"] != elem["unformatted_src"]
            }

            def replace_relative_paths(html_string):
                if replacement_dict:
                    for k, v in replacement_dict.items():
                        html_string = html_string.replace(k, v)
                return html_string

            def display_rendered_webpages():
                st.header("Rendered webpage")
                st.markdown(f"Webpage url: {current_url}")

                display_raw_html_rendering = st.checkbox("Raw html rendering", value=True)
                display_simplified_html_rendering = st.checkbox("Simplified html rendering", value=True)
                col1, col2 = st.columns(2)
                with col1:
                    display_pre_extraction_visualization = st.checkbox(
                        "Web document rendering (pre-extraction visualization)", value=True
                    )
                with col2:
                    if display_pre_extraction_visualization:
                        merge_text_nodes = st.checkbox("Merge text nodes", value=True)

                list_display_pages = [
                    [display_raw_html_rendering, "raw_html_rendering"],
                    [display_simplified_html_rendering, "simplified_html_rendering"],
                    [display_pre_extraction_visualization, "pre_extraction_visualization"],
                ]
                list_display_pages = [
                    page_to_display
                    for should_display_page, page_to_display in list_display_pages
                    if should_display_page
                ]

                def display_specific_rendered_webpage(page_to_display, col):
                    with col:
                        if page_to_display == "raw_html_rendering":
                            st.subheader("Raw html rendering")
                            st.components.v1.html(replace_relative_paths(current_html), height=800, scrolling=True)
                        elif page_to_display == "simplified_html_rendering":
                            st.subheader("Simplified html rendering")
                            st.components.v1.html(
                                replace_relative_paths(simplified_current_html), height=800, scrolling=True
                            )
                        elif page_to_display == "pre_extraction_visualization":
                            st.subheader("Web document rendering (pre-extraction visualization)")

                            def list_nodes_to_visu():
                                if not merge_text_nodes:
                                    list_nodes = current_list_nodes_not_merge_texts
                                    reduce_levels = {
                                        v: i + 1
                                        for i, v in enumerate(sorted(list(set([node.level for node in list_nodes]))))
                                    }
                                    last_level = None
                                    markdown = ""
                                    for node in list_nodes:
                                        if node.tag in ["-text", "img"]:
                                            current_level = reduce_levels[node.level]
                                            if last_level != current_level:
                                                markdown += (
                                                    "#" * min(current_level, 6) + f" Level: {current_level}\n\n"
                                                )
                                                last_level = current_level
                                                path_in_tree_str = [tag for tag, _ in node.path_in_tree]
                                                markdown += f"**{'/'.join(path_in_tree_str)}**\n\n"
                                            if node.tag == "-text":
                                                markdown += f"{node.text}\n\n"
                                            elif node.tag == "img":
                                                markdown += f"![img]({node.media_info['src']})\n\n"
                                    st.markdown(markdown)

                                else:
                                    list_nodes = current_list_nodes_merge_texts
                                    for node in list_nodes:
                                        if node.tag == "-text":
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -