def __init__()

in obelics/visualization/global_visualization.py [0:0]


    def __init__(self, num_docs, dom_viz_template_path):
        """Load the example documents, the DOM template and the extraction pipeline.

        Args:
            num_docs: Number of documents taken from the head of the streamed
                dataset and kept in ``self.examples``.
            dom_viz_template_path: Path of the template file. Its content is
                read as UTF-8 text and wrapped in ``Template``.
        """
        self.num_docs = num_docs

        @st.experimental_memo  # it is caching but is incredibly slow when N is big.
        def load_examples(num_docs):
            # Use any dataset of html files containing columns "html" and "url".
            dataset_name = "bs-modeling-metadata/c4-en-html-with-metadata"
            # Arguments shared by both attempts; only the auth token differs.
            stream_kwargs = {"streaming": True, "split": "train"}
            try:
                dataset = load_dataset(
                    dataset_name,
                    use_auth_token=True,
                    **stream_kwargs,
                )
            except FileNotFoundError:
                # This is how the DOMDOM Spaces should get access to the data:
                # retry with the read token stored in the Streamlit secrets.
                dataset = load_dataset(
                    dataset_name,
                    use_auth_token=st.secrets["DOMDOM_READ_TOKEN"],
                    **stream_kwargs,
                )
            # Materialize the first `num_docs` records of the stream as a list.
            return list(dataset.take(num_docs))

        self.examples = load_examples(num_docs)

        def load_dom_viz_template(dom_viz_template_path):
            # Explicit encoding: without it the platform default is used, which
            # is not UTF-8 on every system.
            with open(dom_viz_template_path, "r", encoding="utf-8") as file:
                template_string = file.read()
            return Template(template_string)

        self.dom_viz_template = load_dom_viz_template(dom_viz_template_path)

        # DOM simplification with every option switched on.
        self.dom_tree_simplificator = DOMTreeSimplificator(
            strip_multiple_linebreaks=True,
            strip_multiple_spaces=True,
            remove_html_comments=True,
            replace_line_break_tags=True,
            unwrap_tags=True,
            strip_tags=True,
            strip_special_divs=True,
            remove_dates=True,
            remove_empty_leaves=True,
            unnest_nodes=True,
            remake_tree=True,
        )
        # Two pre-extraction variants that differ only in
        # `merge_consecutive_text_nodes`.
        self.pre_extraction_simplificator_not_merge_texts = PreExtractionSimplificator(
            only_text_image_nodes=True,
            format_texts=True,
            merge_consecutive_text_nodes=False,
        )
        self.pre_extraction_simplificator_merge_texts = PreExtractionSimplificator(
            only_text_image_nodes=True,
            format_texts=True,
            merge_consecutive_text_nodes=True,
        )
        # The extractor is wired to the text-merging variant.
        self.extractor = TextMediaPairsExtractor(
            dom_tree_simplificator=self.dom_tree_simplificator,
            pre_extraction_simplificator=self.pre_extraction_simplificator_merge_texts,
            also_extract_images_not_in_simplified_dom_tree=True,
            extract_clip_scores=True,
        )