in vision/data/datasets_processing_scripts/clean_m4_prelimenary_experiments/explore/global_visualization.py [0:0]
def __init__(self, num_docs, dom_viz_template_path):
    """Load the filtered examples, the template and the simplificators.

    Renders a Streamlit text input asking for a CSS rule, loads the dataset
    shard from disk and keeps only the examples whose HTML contains a node
    matching that rule, reads the template file, and instantiates the
    DOM-tree and pre-extraction simplificators stored on the instance.

    Args:
        num_docs: Number of documents requested by the caller. It is stored
            on the instance; the shard is currently NOT subsampled with it
            (the corresponding `select` is disabled below).
        dom_viz_template_path: Path of the text file whose content is wrapped
            in the `Template` stored as `self.dom_viz_template`.
    """
    self.num_docs = num_docs
    css_rule = st.text_input("CSS rule", value="[class~='more-link']")

    def filter_doc_with_matching_class(example, css_rule):
        """Return True when the example's HTML has a node matching `css_rule`."""
        tree = HTMLParser(example["html"])
        return tree.css_first(css_rule) is not None

    @st.cache_resource  # it is caching but is incredibly slow when N is big.
    def load_examples(css_rule):
        """Load the shard and keep the examples matching `css_rule`.

        Only `css_rule` is part of the cache key: `num_docs` does not influence
        the result, so it must not trigger a recomputation of the filter.
        """
        dataset = load_from_disk(
            "/home/lucile/data/web_document_dataset_45M_sharded_ftered_2_line_deduplicated_with_html/train/shard_215"
        )
        # Subsampling is disabled on purpose so the whole shard is filtered:
        # dataset = dataset.select(range(num_docs))
        print(f"Loaded {len(dataset)} examples.")
        dataset = dataset.filter(partial(filter_doc_with_matching_class, css_rule=css_rule), num_proc=32)
        # Debugging aid to inspect a single document:
        # dataset = dataset.filter(lambda x: x["document_url"] == "https://acrusteaten.com/tag/meatless-monday/", num_proc=32)
        # This second message reports the size AFTER filtering.
        print(f"Loaded {len(dataset)} examples.")
        return dataset

    self.examples = load_examples(css_rule)

    def load_dom_viz_template(dom_viz_template_path):
        """Read the template file and wrap its content in a `Template`."""
        # Explicit encoding: the default depends on the locale of the host.
        with open(dom_viz_template_path, "r", encoding="utf-8") as file:
            template_string = file.read()
        return Template(template_string)

    self.dom_viz_template = load_dom_viz_template(dom_viz_template_path)

    # Options shared by both DOM-tree simplificators; v2 only adds CSS rules.
    dom_tree_simplificator_kwargs = {
        "strip_multiple_linebreaks": True,
        "strip_multiple_spaces": True,
        "remove_html_comments": True,
        "replace_line_break_tags": True,
        "unwrap_tags": True,
        "strip_tags": True,
        "strip_special_divs": True,
        "remove_dates": True,
        "remove_empty_leaves": True,
        "unnest_nodes": True,
        "remake_tree": True,
    }
    self.dom_tree_simplificator_v1 = DOMTreeSimplificator(**dom_tree_simplificator_kwargs)
    self.dom_tree_simplificator_v2 = DOMTreeSimplificator(
        **dom_tree_simplificator_kwargs,
        css_rules=[
            "[class~='footer']",
            "[class~='site-info']",
        ],
        css_rules_replace_with_text={"[class~='more-link']": "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED"},
    )

    # Options shared by both pre-extraction simplificators; they differ only
    # in whether consecutive text nodes are merged.
    pre_extraction_simplificator_kwargs = {
        "only_text_image_nodes": True,
        "format_texts": True,
        "interesting_attributes_set_cat": InterestingAttributesSetCategory.WIKIPEDIA,
    }
    self.pre_extraction_simplificator_not_merge_texts = PreExtractionSimplificator(
        merge_consecutive_text_nodes=False,
        **pre_extraction_simplificator_kwargs,
    )
    self.pre_extraction_simplificator_merge_texts = PreExtractionSimplificator(
        merge_consecutive_text_nodes=True,
        **pre_extraction_simplificator_kwargs,
    )