vision/m4/sourcing/data_collection/utils/utils.py (16 lines of code) (raw):

from datasets import load_dataset
from selectolax.parser import HTMLParser


def load_dataset_html(shuffle=False, buffer_size=10000, seed=42):
    """Return an iterator over the streamed C4 HTML-with-metadata train split.

    The dataset "bs-modeling-metadata/c4-en-html-with-metadata" is opened with
    ``streaming=True`` and ``use_auth_token=True``.
    NOTE(review): presumably this requires Hugging Face credentials to be
    configured locally -- confirm against the dataset's access settings.

    Args:
        shuffle: When truthy, ``.shuffle`` is applied to the stream before
            it is turned into an iterator.
        buffer_size: Forwarded to ``.shuffle`` as ``buffer_size``; ignored
            when ``shuffle`` is falsy.
        seed: Forwarded to ``.shuffle`` as ``seed``; ignored when ``shuffle``
            is falsy.

    Returns:
        The result of calling ``iter()`` on the (optionally shuffled) stream.
    """
    stream = load_dataset(
        "bs-modeling-metadata/c4-en-html-with-metadata",
        streaming=True,
        split="train",
        use_auth_token=True,
    )
    if shuffle:
        stream = stream.shuffle(buffer_size=buffer_size, seed=seed)
    return iter(stream)


def make_selectolax_tree(html_str):
    """Build and return a selectolax ``HTMLParser`` from ``html_str``."""
    return HTMLParser(html_str)