def save_dataset_to_txt()

in phi3/dataset-preparation/train_tokenizer.py


import os

from datasets import load_dataset


def save_dataset_to_txt(txt_column, txt_dir, hf_dataset_id):
    """Dump the text column of each split of a Hugging Face dataset to plain-text files."""
    dataset = load_dataset(hf_dataset_id)
    os.makedirs(txt_dir, exist_ok=True)
    # Write one file per split (e.g. train.txt, test.txt), one document per line.
    for split_key in dataset.keys():
        doc_path = f"{txt_dir}/{split_key}.txt"
        with open(doc_path, "w", encoding="utf-8") as f:
            for doc in dataset[split_key][txt_column]:
                f.write(doc + "\n")
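
For context, a minimal usage sketch; the dataset ID and column name below are hypothetical placeholders, not values taken from the repository:

# Hypothetical values for illustration only.
save_dataset_to_txt(
    txt_column="text",
    txt_dir="txt_corpus",
    hf_dataset_id="your-username/your-dataset",
)

Each split is then written to txt_corpus/<split>.txt, ready to be fed into tokenizer training.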