def main()

in distilvit/curate.py [0:0]


def main(args):
    """Run the caption transformation on a single text or on the test split.

    When ``args.text`` is truthy, only that text is transformed and the
    result is printed. Otherwise the ``test`` split of ``DATASET_NAME`` is
    loaded, mapped through ``TextConverter.process_batch``, has its caption
    columns renamed, and is saved to ``./dataset``. Unless
    ``args.test_sample`` is set, the result is also pushed to the Hub.

    Args:
        args: Options object. It is passed as-is to ``TextConverter`` and
            this function reads its ``text`` and ``test_sample`` attributes.
    """
    llm_converter = TextConverter(args)

    # Single-text mode: transform, print, and stop before touching datasets.
    if args.text:
        result = llm_converter.transform_one(args.text)
        print(f"Transformed Text: {result}")
        return

    # Imported lazily so the single-text path never imports `datasets`.
    from datasets import load_dataset, DatasetDict

    # A test sample only processes the first 100 rows of the split.
    split = "test[:100]" if args.test_sample else "test"
    dataset = load_dataset(DATASET_NAME, split=split)

    # One worker on macOS, four elsewhere.
    # NOTE(review): presumably multi-process map is problematic on macOS --
    # confirm the reason and record it here.
    num_proc = 1 if platform.system() == "Darwin" else 4

    dataset = dataset.map(
        llm_converter.process_batch,
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=num_proc,
    )

    # Publish the columns under their alt-text names.
    dataset = dataset.rename_column("original_caption", "original_alt_text")
    dataset = dataset.rename_column("caption", "alt_text")

    dataset_dict = DatasetDict({"test": dataset})
    dataset_dict.save_to_disk("./dataset")

    # Sample runs stay local; only full runs are pushed to the Hub.
    if not args.test_sample:
        dataset_dict.push_to_hub("mozilla/flickr30k-transformed-captions")