def urls_to_images()

in obelics/processors/web_document_extractor.py [0:0]


def urls_to_images(dataset, dataset_images, map_url_idx, num_proc, some_urls_are_already_retrieved=False):
    """Swap the image URLs of every example for the matching image payloads.

    Each non-empty URL is looked up in `map_url_idx`. A hit produces
    `{"path": None, "bytes": dataset_images[idx]["image"]}`; a miss produces `None`.
    Every example additionally receives the counters `num_found` and `num_not_found`.

    Args:
        dataset: Object exposing `.features` and `.map(...)` (presumably a Hugging Face
            `datasets.Dataset` -- TODO confirm). When `some_urls_are_already_retrieved`
            is False, its "images" column holds the URLs. Otherwise "images_urls" holds
            the URLs and "images" holds the entries retrieved so far (`None` where missing).
        dataset_images: Indexable collection whose items expose an "image" entry; that
            entry is stored under the "bytes" key of the produced image dict.
        map_url_idx: Mapping from a URL to its index in `dataset_images`.
        num_proc: Forwarded unchanged to `dataset.map`.
        some_urls_are_already_retrieved: If True, only entries of "images" that are
            `None` are looked up again, using the URLs stored in "images_urls".

    Returns:
        The result of `dataset.map`, with the columns "images", "images_urls",
        "num_found" and "num_not_found" set.

    Raises:
        ValueError: If `some_urls_are_already_retrieved` is True and `dataset.features`
            lacks "images_urls" or "images".
    """
    if some_urls_are_already_retrieved and not (
        "images_urls" in dataset.features and "images" in dataset.features
    ):
        raise ValueError(
            "If some urls are already retrieved, the dataset must contain the features 'images_urls' and 'images'"
        )

    def fetch_image(url):
        # Unknown URLs simply yield no image.
        if url in map_url_idx:
            return {"path": None, "bytes": dataset_images[map_url_idx[url]]["image"]}
        return None

    def count_urls(urls):
        # Only URLs that are neither None nor the empty string are counted.
        return sum(1 for url in urls if not (url is None or url == ""))

    def attach_counts(example, url_count):
        # Every non-None entry of "images" counts as found, whatever its origin.
        found = sum(1 for img in example["images"] if img is not None)
        example["num_found"] = found
        example["num_not_found"] = url_count - found
        return example

    def replace_urls_from_images_col(example):
        # The URLs currently live in "images": keep a copy before overwriting them.
        example["images_urls"] = deepcopy(example["images"])
        url_count = count_urls(example["images_urls"])

        resolved = []
        for url in example["images"]:
            resolved.append(fetch_image(url) if url else None)
        example["images"] = resolved

        return attach_counts(example, url_count)

    def fill_missing_from_images_urls_col(example):
        url_count = count_urls(example["images_urls"])

        merged = []
        for current, url in zip(example["images"], example["images_urls"]):
            if current is not None:
                # Already retrieved earlier: keep it untouched.
                merged.append(current)
            elif url:
                merged.append(fetch_image(url))
            else:
                merged.append(None)
        example["images"] = merged

        return attach_counts(example, url_count)

    if some_urls_are_already_retrieved:
        per_example_func = fill_missing_from_images_urls_col
    else:
        per_example_func = replace_urls_from_images_col

    logger.info("Starting replacing urls by images")

    new_features = deepcopy(dataset.features)
    new_features["images"] = Sequence(Image())
    new_features["images_urls"] = Sequence(Value("string"))
    for counter_column in ("num_found", "num_not_found"):
        new_features[counter_column] = Value("int32")

    dataset = dataset.map(
        per_example_func,
        features=new_features,
        num_proc=num_proc,
        load_from_cache_file=False,
    )
    logger.info("Finished replacing urls by images")
    return dataset