in distilvit/_datasets/coco.py [0:0]
def get_dataset(feature_extractor_model, text_decoder_model, args):
    """Fetch the COCO archives and return the tokenized dataset.

    Every entry of the module-level ``urls`` is passed to ``download_file``
    with ``<args.cache_dir>/coco`` as the target directory. The "2017"
    configuration of ``ydshieh/coco_dataset_script`` is then loaded from that
    directory and run through a ``DatasetTokenizer`` registered under the
    name ``"coco"``.

    Args:
        feature_extractor_model: Forwarded unchanged to ``DatasetTokenizer``.
        text_decoder_model: Forwarded unchanged to ``DatasetTokenizer``.
        args: Object exposing a ``cache_dir`` attribute (base cache path).

    Returns:
        Whatever the ``DatasetTokenizer`` call returns for the loaded dataset.
        NOTE(review): presumably the tokenizer persists its result on disk
        for reuse -- confirm against ``DatasetTokenizer``.
    """
    coco_dir = os.path.join(args.cache_dir, "coco")

    for archive_url in urls:
        print(f"Downloading {archive_url}...")
        download_file(archive_url, coco_dir)
    print("Download complete.")

    # Imported lazily, only once the archives have been fetched.
    from datasets import load_dataset

    # trust_remote_code=True lets `datasets` run the loading script shipped
    # with the dataset repository.
    raw_dataset = load_dataset(
        "ydshieh/coco_dataset_script",
        "2017",
        data_dir=coco_dir,
        trust_remote_code=True,
    )

    tokenize = DatasetTokenizer(
        feature_extractor_model,
        text_decoder_model,
        caption_column="caption",
        image_column="image_path",
        image_preprocessor_cls=CocoImagePreprocessor,
    )
    return tokenize("coco", raw_dataset)