def get_dataset()

in distilvit/_datasets/coco.py [0:0]


def get_dataset(feature_extractor_model, text_decoder_model, args):
    """Fetch the COCO files, load the 2017 configuration and tokenize it.

    Every entry of the module-level ``urls`` is passed to ``download_file``
    with ``<args.cache_dir>/coco`` as the destination. That same directory is
    then handed to ``load_dataset`` as ``data_dir``, and the loaded dataset is
    run through a ``DatasetTokenizer``.

    Args:
        feature_extractor_model: Forwarded unchanged to ``DatasetTokenizer``.
        text_decoder_model: Forwarded unchanged to ``DatasetTokenizer``.
        args: Object exposing a ``cache_dir`` attribute; the COCO files live
            in its ``coco`` subdirectory.

    Returns:
        The value produced by calling the tokenizer with ``"coco"`` and the
        loaded dataset.

    NOTE(review): any on-disk caching of the tokenized result presumably
    happens inside ``DatasetTokenizer``; nothing in this function writes it.
    """
    coco_root = os.path.join(args.cache_dir, "coco")

    for archive_url in urls:
        print(f"Downloading {archive_url}...")
        download_file(archive_url, coco_root)
    print("Download complete.")

    # Kept as a function-scope import, performed only after the downloads.
    from datasets import load_dataset

    raw_dataset = load_dataset(
        "ydshieh/coco_dataset_script",
        "2017",
        data_dir=coco_root,
        # The dataset is defined by a loading script, which must be allowed
        # to execute.
        trust_remote_code=True,
    )

    tokenize = DatasetTokenizer(
        feature_extractor_model,
        text_decoder_model,
        caption_column="caption",
        image_column="image_path",
        image_preprocessor_cls=CocoImagePreprocessor,
    )
    return tokenize("coco", raw_dataset)