in phi3/dataset-preparation/train_tokenizer.py [0:0]
import os

from datasets import load_dataset


def save_dataset_to_txt(txt_column, txt_dir, hf_dataset_id):
    """Dump each split of a Hugging Face dataset to a plain-text file, one document per line."""
    dataset = load_dataset(hf_dataset_id)
    os.makedirs(txt_dir, exist_ok=True)
    for split_key in dataset.keys():
        # One .txt file per split, e.g. train.txt, validation.txt.
        doc_path = f"{txt_dir}/{split_key}.txt"
        with open(doc_path, 'w', encoding='utf-8') as f:
            for doc in dataset[split_key][txt_column]:
                f.write(doc + '\n')
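

# Usage sketch: the dataset id, column name, and output directory below are
# illustrative assumptions, not values taken from this repo.
if __name__ == "__main__":
    save_dataset_to_txt(
        txt_column="text",                  # column holding the raw documents
        txt_dir="data/txt",                 # where the per-split .txt files are written
        hf_dataset_id="your-hf-dataset-id", # any Hugging Face dataset with a text column
    )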