def __init__()

in vision/data/datasets_processing_scripts/clean_m4_prelimenary_experiments/explore/global_visualization.py [0:0]


    def __init__(self, num_docs, dom_viz_template_path):
        """Prepare the examples, the template and the simplificators.

        The constructor, in order: reads a CSS rule from an `st.text_input`
        widget, loads the hard-coded dataset shard and keeps the documents
        whose HTML has at least one node matching that rule, reads the DOM
        visualization template from disk, and instantiates two
        `DOMTreeSimplificator` and two `PreExtractionSimplificator` objects.

        Args:
            num_docs: Stored on the instance and forwarded to the cached
                loader. The loader does not sub-select with it at the moment
                (the `select` call is commented out).
            dom_viz_template_path: Path of the text file whose content is
                wrapped in a `Template`.
        """
        self.num_docs = num_docs

        # The rule typed here decides which documents are kept below.
        css_rule = st.text_input("CSS rule", value="[class~='more-link']")

        def filter_doc_with_matching_class(example, css_rule):
            """Return True when `example["html"]` has a node matching `css_rule`."""
            first_match = HTMLParser(example["html"]).css_first(css_rule)
            return bool(first_match)

        @st.cache_resource  # it is caching but is incredibly slow when N is big.
        def load_examples(num_docs, css_rule):
            """Load the shard from disk and keep the documents matching `css_rule`.

            `num_docs` is accepted but not applied: the `select` step that
            would use it is disabled (see the commented-out line).
            """
            dataset = load_from_disk(
                "/home/lucile/data/web_document_dataset_45M_sharded_ftered_2_line_deduplicated_with_html/train/shard_215"
            )
            # dataset = dataset.select(range(num_docs))
            print(f"Loaded {len(dataset)} examples.")

            keep_matching_doc = partial(filter_doc_with_matching_class, css_rule=css_rule)
            dataset = dataset.filter(keep_matching_doc, num_proc=32)
            # Debugging alternative, keeping a single known document:
            # dataset = dataset.filter(lambda x: x["document_url"] == "https://acrusteaten.com/tag/meatless-monday/", num_proc=32)

            # Same message as above, this time reporting the size after filtering.
            print(f"Loaded {len(dataset)} examples.")
            return dataset

        self.examples = load_examples(num_docs, css_rule)

        # Read the whole template file, then build the Template once the file is closed.
        with open(dom_viz_template_path, "r") as template_file:
            template_source = template_file.read()
        self.dom_viz_template = Template(template_source)

        # Options common to both DOM tree simplificators; v2 only adds CSS rules on top.
        shared_dom_options = {
            "strip_multiple_linebreaks": True,
            "strip_multiple_spaces": True,
            "remove_html_comments": True,
            "replace_line_break_tags": True,
            "unwrap_tags": True,
            "strip_tags": True,
            "strip_special_divs": True,
            "remove_dates": True,
            "remove_empty_leaves": True,
            "unnest_nodes": True,
            "remake_tree": True,
        }

        self.dom_tree_simplificator_v1 = DOMTreeSimplificator(**shared_dom_options)

        self.dom_tree_simplificator_v2 = DOMTreeSimplificator(
            **shared_dom_options,
            css_rules=[
                "[class~='footer']",
                "[class~='site-info']",
            ],
            css_rules_replace_with_text={"[class~='more-link']": "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED"},
        )

        def build_pre_extraction_simplificator(merge_texts):
            """Build a PreExtractionSimplificator differing only in text-node merging."""
            return PreExtractionSimplificator(
                only_text_image_nodes=True,
                format_texts=True,
                merge_consecutive_text_nodes=merge_texts,
                interesting_attributes_set_cat=InterestingAttributesSetCategory.WIKIPEDIA,
            )

        self.pre_extraction_simplificator_not_merge_texts = build_pre_extraction_simplificator(False)
        self.pre_extraction_simplificator_merge_texts = build_pre_extraction_simplificator(True)