phi3/dataset-preparation/1_data-preparation-basic.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "6defcbd8-3255-4c28-b0be-1dfdfe826dee",
"metadata": {},
"source": [
"# Basic Dataset preparation with chunking\n"
]
},
{
"cell_type": "markdown",
"id": "e9ae2e91-92ab-45ad-a389-900f89c3e05a",
"metadata": {},
"source": [
"## 1. Concatenate Multiple datasets\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fa7e88b-59a6-4403-a6b7-32ec05929ba8",
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = \"dataset\"\n",
"!rm -rf $DATA_DIR \n",
"os.makedirs(DATA_DIR, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb3e4684-73dc-4d67-8654-ad2cb8659a8f",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import random\n",
"from datasets import load_dataset, concatenate_datasets\n",
"\n",
"def formatting_en_func(examples):\n",
" convos = examples[\"conversations\"]\n",
" texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]\n",
" return { \"text\" : texts, }\n",
"\n",
"\n",
"def formatting_ko_func(example):\n",
" if example[\"input\"] == \"\":\n",
" text = f\"<s><|user|>\\n{example['instruction']}<|end|>\\n<|assistant|>\\n{example['output']}<|end|>\"\n",
" else:\n",
" text = f\"<s><|system|>\\n{example['input']}<|end|>\\n<|user|>\\n{example['instruction']}<|end|>\\n<|assistant|>\\n{example['output']}<|end|>\"\n",
" #example[\"text\"] = text\n",
" return { \"text\" : text }\n",
"\n",
"def formatting_guanaco_func(examples):\n",
" txt = examples[\"text\"]\n",
" splits = txt.split(\"### \")\n",
" txt = \"<s>\"\n",
" for s in splits[1:]:\n",
" s = s.replace('Human: ', '<|user|>\\n') \n",
" s = s.replace('Assistant: ', '<|assistant|>\\n')\n",
" s = s + '<|end|>\\n'\n",
" txt += s\n",
" return { \"text\" : txt }\n",
"\n",
"\n",
"dataset_ko1 = load_dataset(\"kyujinpy/KOR-OpenOrca-Platypus-v3\", split=\"train[:1%]\")\n",
"dataset_ko2 = load_dataset(\"kyujinpy/KOR-gugugu-platypus-set\", split=\"train[:1%]\")\n",
"dataset_ko3 = load_dataset(\"nlpai-lab/openassistant-guanaco-ko\", split=\"train[:1%]\")\n",
"\n",
"dataset_ko1 = dataset_ko1.map(formatting_ko_func, remove_columns=dataset_ko1.features, batched=False)\n",
"dataset_ko2 = dataset_ko2.map(formatting_ko_func, remove_columns=dataset_ko2.features, batched=False)\n",
"dataset_ko3 = dataset_ko3.map(formatting_guanaco_func, remove_columns=['id'], batched=False)\n",
"\n",
"dataset = concatenate_datasets([dataset_ko1, dataset_ko2, dataset_ko3])\n",
"dataset = dataset.shuffle(seed=42)"
]
},
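{
"cell_type": "markdown",
"id": "1a2b3c4d-5e6f-4a7b-8c9d-0e1f2a3b4c5d",
"metadata": {},
"source": [
"A quick sanity check plus a minimal chunking sketch. The tokenizer checkpoint (`microsoft/Phi-3-mini-4k-instruct`) and the `MAX_TOKENS` budget below are illustrative assumptions; adjust them to the model you plan to fine-tune.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b3c4d5e-6f7a-4b8c-9d0e-1f2a3b4c5d6e",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: inspect the concatenated dataset and one formatted sample.\n",
"print(f\"Number of examples: {len(dataset)}\")\n",
"print(dataset[0][\"text\"])\n",
"\n",
"# Minimal chunking sketch (assumed tokenizer; it is not loaded elsewhere in this notebook).\n",
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"microsoft/Phi-3-mini-4k-instruct\")\n",
"MAX_TOKENS = 2048  # hypothetical per-example token budget\n",
"\n",
"def chunk_text(example):\n",
"    # Split an over-long formatted 'text' into windows of at most MAX_TOKENS tokens,\n",
"    # then keep only the first window (simple truncation-style chunking).\n",
"    ids = tokenizer(example[\"text\"], add_special_tokens=False)[\"input_ids\"]\n",
"    chunks = [ids[i:i + MAX_TOKENS] for i in range(0, len(ids), MAX_TOKENS)]\n",
"    return {\"text\": tokenizer.decode(chunks[0]), \"num_chunks\": len(chunks)}\n",
"\n",
"# Example usage (uncomment to apply):\n",
"# dataset = dataset.map(chunk_text)"
]
},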
{
"cell_type": "code",
"execution_count": null,
"id": "55b28c22-15ba-496d-92ac-9de4b856f936",
"metadata": {},
"outputs": [],
"source": [
"dataset = dataset.train_test_split(test_size=0.2)\n",
"train_dataset = dataset['train']\n",
"train_dataset.to_json(f\"{DATA_DIR}/train_example1.jsonl\")\n",
"test_dataset = dataset['test']\n",
"test_dataset.to_json(f\"{DATA_DIR}/eval_example1.jsonl\")"
]
},
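{
"cell_type": "markdown",
"id": "3c4d5e6f-7a8b-4c9d-8e1f-2a3b4c5d6e7f",
"metadata": {},
"source": [
"Optional round-trip check: reload the saved JSONL splits with the `datasets` JSON loader to confirm the row counts and the `text` column survived serialization.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d5e6f7a-8b9c-4d0e-9f2a-3b4c5d6e7f8a",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Reload both splits from disk and inspect one training example.\n",
"reloaded = load_dataset(\n",
"    \"json\",\n",
"    data_files={\n",
"        \"train\": f\"{DATA_DIR}/train_example1.jsonl\",\n",
"        \"eval\": f\"{DATA_DIR}/eval_example1.jsonl\",\n",
"    },\n",
")\n",
"print(reloaded)\n",
"print(reloaded[\"train\"][0][\"text\"][:300])"
]
},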
{
"cell_type": "markdown",
"id": "87be1f8b-ce30-4973-af20-cb9a943214e5",
"metadata": {},
"source": [
"<br>\n",
"\n",
"## 2. Convert to OpenAI chat format\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "375544a2-70f9-4f08-aef2-5286b571499f",
"metadata": {},
"outputs": [],
"source": [
"def convert_to_oai_format(data):\n",
"\n",
" formatted_data = []\n",
" for message in data:\n",
" msg = {\"messages\": [\n",
" {\"role\":\"system\",\n",
" \"content\":\"\"\"You are an AI assistant. Please reply users' answer using polite,clear and respectful language in Korean.\"\"\"\n",
" },\n",
" {\"role\":\"user\",\n",
" \"content\" :message[\"instruction\"]\n",
" },\n",
" {\"role\":\"assistant\",\n",
" \"content\": message[\"output\"]\n",
" }]\n",
" }\n",
" formatted_data.append(msg)\n",
" random.shuffle(formatted_data)\n",
" \n",
" return formatted_data\n",
"\n",
"def save_jsonl(dictionary_data, file_name):\n",
" with open(file_name, 'w', encoding='UTF-8-sig') as outfile:\n",
" for entry in dictionary_data:\n",
" json.dump(entry, outfile, ensure_ascii=False)\n",
" outfile.write('\\n')"
]
},
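{
"cell_type": "markdown",
"id": "5e6f7a8b-9c0d-4e1f-aa3b-4c5d6e7f8a9b",
"metadata": {},
"source": [
"A small smoke test of `convert_to_oai_format` on a hypothetical record (the `instruction`/`output` field names match the KOR-OpenOrca-Platypus-v3 columns loaded below), just to show the resulting `messages` structure.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c-0d1e-4f2a-bb4c-5d6e7f8a9b0c",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical single-record input; prints the OpenAI chat format produced above.\n",
"sample = [{\"instruction\": \"Say hello.\", \"output\": \"Hello! How can I help you today?\"}]\n",
"print(json.dumps(convert_to_oai_format(sample)[0], ensure_ascii=False, indent=2))"
]
},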
{
"cell_type": "code",
"execution_count": null,
"id": "ee858b1a-9923-45a0-b3a6-bc91f5fa5154",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"dataset = load_dataset(\"kyujinpy/KOR-OpenOrca-Platypus-v3\", split=\"train[:1%]\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f996bdcf-8138-411d-be8d-84c95cc34cc5",
"metadata": {},
"outputs": [],
"source": [
"dataset = dataset.train_test_split(test_size=0.2) \n",
"formatted_train_data = convert_to_oai_format(dataset['train'])\n",
"formatted_valid_data = convert_to_oai_format(dataset['test'])\n",
"save_jsonl(formatted_train_data, f\"{DATA_DIR}/train_example2.jsonl\")\n",
"save_jsonl(formatted_train_data, f\"{DATA_DIR}/valid_example2.jsonl\")"
]
}
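,
{
"cell_type": "markdown",
"id": "7a8b9c0d-1e2f-4a3b-8c5d-6e7f8a9b0c1d",
"metadata": {},
"source": [
"Minimal validation sketch: parse the saved chat-format training file back in and check that every example carries the expected `system`/`user`/`assistant` roles in order.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9c0d1e-2f3a-4b4c-9d6e-7f8a9b0c1d2e",
"metadata": {},
"outputs": [],
"source": [
"# Read the training JSONL back; utf-8-sig handles the BOM written by save_jsonl.\n",
"with open(f\"{DATA_DIR}/train_example2.jsonl\", encoding=\"utf-8-sig\") as f:\n",
"    examples = [json.loads(line) for line in f]\n",
"\n",
"print(f\"{len(examples)} training examples\")\n",
"assert all([m[\"role\"] for m in ex[\"messages\"]] == [\"system\", \"user\", \"assistant\"] for ex in examples)\n",
"print(examples[0][\"messages\"][1][\"content\"][:200])"
]
}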
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK v2",
"language": "python",
"name": "python310-sdkv2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}