in distilvit/curate.py [0:0]
def main(args):
    """Transform one text, or the whole dataset test split, with TextConverter.

    When ``args.text`` is truthy, that text is passed to
    ``TextConverter.transform_one`` and the result is printed.

    Otherwise the ``test`` split of ``DATASET_NAME`` is loaded (only its
    first 100 rows when ``args.test_sample`` is truthy), run through
    ``TextConverter.process_batch`` in batches of ``BATCH_SIZE``, has its
    caption columns renamed, and is saved to ``./dataset``. The result is
    also pushed to the Hub unless ``args.test_sample`` is set.

    Args:
        args: Object exposing ``text`` and ``test_sample`` attributes; it is
            also handed as-is to ``TextConverter``.
    """
    llm_converter = TextConverter(args)

    # Single-text mode: transform, print, and stop.
    if args.text:
        result = llm_converter.transform_one(args.text)
        print(f"Transformed Text: {result}")
        return

    # Imported only on the dataset path.
    from datasets import load_dataset, DatasetDict

    split = "test[:100]" if args.test_sample else "test"
    dataset = load_dataset(DATASET_NAME, split=split)

    # One worker on macOS, four elsewhere.
    # NOTE(review): presumably works around multiprocessing issues on
    # macOS -- confirm the reason.
    num_proc = 1 if platform.system() == "Darwin" else 4
    dataset = dataset.map(
        llm_converter.process_batch,
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=num_proc,
    )

    dataset = dataset.rename_column("original_caption", "original_alt_text")
    dataset = dataset.rename_column("caption", "alt_text")

    dataset_dict = DatasetDict({"test": dataset})
    dataset_dict.save_to_disk("./dataset")

    # Sample runs are kept local; only full runs are published.
    if not args.test_sample:
        dataset_dict.push_to_hub("mozilla/flickr30k-transformed-captions")