in obelics/processors/web_document_extractor.py [0:0]
def __init__(
self,
html_dataset,
dom_tree_simplificator,
pre_extraction_simplificator,
path_save_dir_dataset,
num_proc,
path_save_file_image_urls,
path_save_dir_downloaded_images,
thread_count,
number_sample_per_shard,
image_size,
resize_mode,
path_save_dir_tmp_datasets_images,
path_save_dir_dataset_images,
path_save_file_map_url_idx,
num_proc_urls_to_images,
path_save_dir_sharded_dataset,
shard_size,