def process_one_tar()

in obelics/processors/web_document_extractor.py [0:0]


    import json
    import os
    import tarfile

    from datasets import Dataset


    def process_one_tar(args):
        # `path_save_dir_tmp_datasets_images` is a directory path defined at module level
        # in web_document_extractor.py
        (tar_path, idx_tar) = args
        with tarfile.open(tar_path) as tar_file:
            tar_members = tar_file.getmembers()
            name_to_url = {}
            name_to_img = {}
            url_to_img = {}
            for tar_member in tar_members:
                # Image payloads are stored as raw bytes under <name>.jpg
                if tar_member.name.endswith(".jpg"):
                    name = tar_member.name.replace(".jpg", "")
                    tar_member_file = tar_file.extractfile(tar_member)
                    img = tar_member_file.read()
                    tar_member_file.close()
                    name_to_img[name] = img
                # Download metadata (status and original URL) is stored under <name>.json
                elif tar_member.name.endswith(".json"):
                    name = tar_member.name.replace(".json", "")
                    tar_member_file = tar_file.extractfile(tar_member)
                    json_val = json.loads(tar_member_file.read())
                    status = json_val["status"]
                    url = json_val["url"]
                    tar_member_file.close()
                    if status == "success":  # Should always happen with the webdataset format, not with parquet
                        name_to_url[name] = url
            # Pair each successfully downloaded URL with its image bytes
            for name in name_to_url:
                url_to_img[name_to_url[name]] = name_to_img[name]
            new_urls_indexed = list(url_to_img.keys())
            new_datasets_images = Dataset.from_dict(
                {"url": list(url_to_img.keys()), "image": list(url_to_img.values())}
            )
            # We need to save the new dataset and then reload it, since `from_dict` keeps the dataset
            # in RAM and does not use disk space
            new_datasets_images.save_to_disk(os.path.join(path_save_dir_tmp_datasets_images, str(idx_tar)))
            return new_urls_indexed
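
This helper reads one webdataset-style tar shard (jpg/json pairs, as produced by img2dataset), pairs each successfully downloaded URL with its image bytes, saves the pairs as a per-shard `Dataset` on disk, and returns the list of indexed URLs. Below is a minimal sketch of how it might be driven over several shards; the shard paths, pool size, and the final flatten-and-concatenate step are illustrative assumptions, not code taken from the repository.

    import os
    from multiprocessing import Pool

    from datasets import concatenate_datasets, load_from_disk

    # Assumes this runs in the same module, so that `process_one_tar` and the
    # module-level `path_save_dir_tmp_datasets_images` are both in scope.
    tar_paths = ["images/00000.tar", "images/00001.tar"]  # hypothetical img2dataset shards

    # Each worker processes one tar shard and writes its per-shard dataset to disk
    with Pool(processes=2) as pool:
        urls_per_tar = pool.map(process_one_tar, [(path, idx) for idx, path in enumerate(tar_paths)])

    # Flatten the returned URL lists and reload the per-shard datasets into a single table
    urls_indexed = [url for urls in urls_per_tar for url in urls]
    image_dataset = concatenate_datasets(
        [load_from_disk(os.path.join(path_save_dir_tmp_datasets_images, str(idx))) for idx in range(len(tar_paths))]
    )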