in obelics/visualization/global_visualization.py [0:0]
def __init__(self, num_docs, dom_viz_template_path):
    """Load the example documents, the DOM template and the processing pipeline.

    Args:
        num_docs: Number of documents taken from the start of the streamed
            dataset (passed to ``dataset.take``).
        dom_viz_template_path: Path of the text file whose content is
            wrapped in a ``Template`` and stored as ``self.dom_viz_template``.
    """
    self.num_docs = num_docs

    @st.experimental_memo  # it is caching but is incredibly slow when N is big.
    def load_examples(num_docs):
        """Return the first ``num_docs`` examples of the dataset as a list."""

        def open_stream(auth_token):
            # Use any dataset of html files containing columns "html" and "url"
            return load_dataset(
                "bs-modeling-metadata/c4-en-html-with-metadata",
                streaming=True,
                split="train",
                use_auth_token=auth_token,
            )

        try:
            # First attempt: rely on the locally stored credentials.
            dataset = open_stream(True)
        except FileNotFoundError:
            # This is how the DOM DOM Spaces should get access to the data:
            # the read token comes from the Streamlit secrets. The secret is
            # only looked up here, so it is not required when the first
            # attempt succeeds.
            dataset = open_stream(st.secrets["DOMDOM_READ_TOKEN"])
        # Materialize the streamed prefix so it can be indexed and cached.
        return list(dataset.take(num_docs))

    self.examples = load_examples(num_docs)

    def load_dom_viz_template(dom_viz_template_path):
        """Read the template file and wrap its content in a ``Template``."""
        # Explicit encoding: without it, decoding depends on the host locale.
        with open(dom_viz_template_path, "r", encoding="utf-8") as file:
            template_string = file.read()
        return Template(template_string)

    self.dom_viz_template = load_dom_viz_template(dom_viz_template_path)

    # DOM tree simplification with every option switched on.
    self.dom_tree_simplificator = DOMTreeSimplificator(
        strip_multiple_linebreaks=True,
        strip_multiple_spaces=True,
        remove_html_comments=True,
        replace_line_break_tags=True,
        unwrap_tags=True,
        strip_tags=True,
        strip_special_divs=True,
        remove_dates=True,
        remove_empty_leaves=True,
        unnest_nodes=True,
        remake_tree=True,
    )

    # Two pre-extraction simplificators that differ only in whether
    # consecutive text nodes are merged.
    self.pre_extraction_simplificator_not_merge_texts = PreExtractionSimplificator(
        only_text_image_nodes=True,
        format_texts=True,
        merge_consecutive_text_nodes=False,
    )
    self.pre_extraction_simplificator_merge_texts = PreExtractionSimplificator(
        only_text_image_nodes=True,
        format_texts=True,
        merge_consecutive_text_nodes=True,
    )

    # The extractor is wired to the text-merging variant.
    self.extractor = TextMediaPairsExtractor(
        dom_tree_simplificator=self.dom_tree_simplificator,
        pre_extraction_simplificator=self.pre_extraction_simplificator_merge_texts,
        also_extract_images_not_in_simplified_dom_tree=True,
        extract_clip_scores=True,
    )