in obelics/processors/web_document_extractor.py
import json
import os
import tarfile

from datasets import Dataset


def process_one_tar(args):
    (tar_path, idx_tar) = args
    with tarfile.open(tar_path) as tar_file:
        tar_members = tar_file.getmembers()
        name_to_url = {}
        name_to_img = {}
        url_to_img = {}
        for tar_member in tar_members:
            if tar_member.name.endswith(".jpg"):
                # Raw image bytes, keyed by the member name without its extension
                name = tar_member.name.replace(".jpg", "")
                tar_member_file = tar_file.extractfile(tar_member)
                img = tar_member_file.read()
                tar_member_file.close()
                name_to_img[name] = img
            elif tar_member.name.endswith(".json"):
                # Metadata sidecar: download status and original URL of the image
                name = tar_member.name.replace(".json", "")
                tar_member_file = tar_file.extractfile(tar_member)
                json_val = json.loads(tar_member_file.read())
                status = json_val["status"]
                url = json_val["url"]
                tar_member_file.close()
                if status == "success":  # Should always happen with the webdataset format, not with the parquet one
                    name_to_url[name] = url
        # Map each successfully downloaded URL to its image bytes
        for name in name_to_url:
            url_to_img[name_to_url[name]] = name_to_img[name]
        new_urls_indexed = list(url_to_img.keys())
        new_datasets_images = Dataset.from_dict(
            {"url": list(url_to_img.keys()), "image": list(url_to_img.values())}
        )
        # We need to save the new dataset and then reload it, since `from_dict` keeps the dataset
        # in RAM and does not use disk space.
        # `path_save_dir_tmp_datasets_images` is expected to be defined elsewhere in the module.
        new_datasets_images.save_to_disk(os.path.join(path_save_dir_tmp_datasets_images, str(idx_tar)))
        return new_urls_indexed
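
A minimal usage sketch, assuming the tar shards sit in a local directory and that a multiprocessing pool drives process_one_tar over them; the shard directory, pool size, and the collect_urls_from_shards helper are hypothetical and not part of the original module:

import glob
import multiprocessing
import os


def collect_urls_from_shards(shard_dir, num_proc=4):
    # Hypothetical driver: enumerate the .tar shards and index each one in parallel.
    tar_paths = sorted(glob.glob(os.path.join(shard_dir, "*.tar")))
    args = [(tar_path, idx_tar) for idx_tar, tar_path in enumerate(tar_paths)]
    with multiprocessing.Pool(processes=num_proc) as pool:
        # Each worker returns the list of URLs it indexed for its shard.
        urls_per_shard = pool.map(process_one_tar, args)
    # Flatten the per-shard URL lists into a single list.
    return [url for urls in urls_per_shard for url in urls]

Each call writes one temporary Arrow dataset per shard under path_save_dir_tmp_datasets_images/<idx_tar>, so the returned URLs can later be matched back to the saved image datasets.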