in obelics/processors/web_document_extractor.py [0:0]
def urls_to_images(self, reload_files=False):
    """Run the `urls_to_images` step and store its result in `self.dataset`.

    The JSON file at `self.path_save_file_map_url_idx` is always (re)loaded
    into `self.map_url_idx`. The module-level `urls_to_images` helper is then
    called with `self.dataset`, `self.dataset_images`, `self.map_url_idx` and
    `self.num_proc_urls_to_images`, and its return value replaces
    `self.dataset`.

    Args:
        reload_files: When true, `self.dataset` and `self.dataset_images` are
            reloaded from `self.path_save_dir_dataset` and
            `self.path_save_dir_dataset_images` before running the step. Use
            this when the method is called on its own, without the previous
            steps having populated those attributes.

    Raises:
        AttributeError: If `reload_files` is false and `self.dataset` or
            `self.dataset_images` has not been defined yet.
    """
    with open(self.path_save_file_map_url_idx) as f:
        self.map_url_idx = json.load(f)

    if reload_files:
        # Useful when this method is called independently without
        # the previous ones, so we need to load some files
        logger.info("Starting reloading variables for the step urls_to_images")
        self.dataset = load_from_disk(self.path_save_dir_dataset)
        self.dataset_images = load_from_disk(self.path_save_dir_dataset_images)
        logger.info("Finished reloading variables for the step urls_to_images")
    else:
        # `map_url_idx` was just assigned above, so only these two attributes
        # can be missing. Fail here with an actionable message rather than
        # letting the call below raise a bare AttributeError.
        missing = [name for name in ("dataset", "dataset_images") if not hasattr(self, name)]
        if missing:
            raise AttributeError(
                f"Missing attribute(s) {missing}. "
                "Set `reload_files=True` if you're calling this method alone to define the missing variables"
            )

    # Inside the method body this name resolves to the module-level helper,
    # not to this method.
    self.dataset = urls_to_images(
        dataset=self.dataset,
        dataset_images=self.dataset_images,
        map_url_idx=self.map_url_idx,
        num_proc=self.num_proc_urls_to_images,
    )