phi3/dataset-preparation/1_data-preparation-basic.ipynb (189 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6defcbd8-3255-4c28-b0be-1dfdfe826dee",
   "metadata": {},
   "source": [
    "# Basic Dataset preparation with chunking\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9ae2e91-92ab-45ad-a389-900f89c3e05a",
   "metadata": {},
   "source": [
    "## 1. Concatenate Multiple datasets\n",
    "\n",
    "---\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fa7e88b-59a6-4403-a6b7-32ec05929ba8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live here so the notebook survives Restart Kernel -> Run All.\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "\n",
    "from datasets import concatenate_datasets, load_dataset\n",
    "\n",
    "DATA_DIR = \"dataset\"\n",
    "SEED = 42  # one seed for every shuffle/split so reruns produce the same files\n",
    "\n",
    "# Recreate the output directory from scratch. Pure-Python replacement for\n",
    "# `!rm -rf $DATA_DIR`, which needs a POSIX shell.\n",
    "shutil.rmtree(DATA_DIR, ignore_errors=True)\n",
    "os.makedirs(DATA_DIR, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb3e4684-73dc-4d67-8654-ad2cb8659a8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def formatting_en_func(examples):\n",
    "    \"\"\"Render a batch of `conversations` into chat-template text.\n",
    "\n",
    "    Expects `examples[\"conversations\"]` to be a list of conversations and\n",
    "    returns `{\"text\": [...]}` with one rendered string per conversation.\n",
    "\n",
    "    NOTE: this relies on a global `tokenizer` (an object exposing\n",
    "    `apply_chat_template`) that this notebook never defines, and the function\n",
    "    is not called anywhere below. Define `tokenizer` before using it.\n",
    "    \"\"\"\n",
    "    convos = examples[\"conversations\"]\n",
    "    texts = [\n",
    "        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)\n",
    "        for convo in convos\n",
    "    ]\n",
    "    return {\"text\": texts}\n",
    "\n",
    "\n",
    "def formatting_ko_func(example):\n",
    "    \"\"\"Turn one instruction/input/output record into a single chat string.\n",
    "\n",
    "    The result uses the `<|system|>`, `<|user|>`, `<|assistant|>` and `<|end|>`\n",
    "    markers. A non-empty `input` is emitted as the system turn; an empty or\n",
    "    missing (`None`) `input` produces no system turn.\n",
    "    \"\"\"\n",
    "    if not example[\"input\"]:\n",
    "        text = f\"<s><|user|>\\n{example['instruction']}<|end|>\\n<|assistant|>\\n{example['output']}<|end|>\"\n",
    "    else:\n",
    "        text = f\"<s><|system|>\\n{example['input']}<|end|>\\n<|user|>\\n{example['instruction']}<|end|>\\n<|assistant|>\\n{example['output']}<|end|>\"\n",
    "    return {\"text\": text}\n",
    "\n",
    "\n",
    "def formatting_guanaco_func(examples):\n",
    "    \"\"\"Rewrite one `### Human: ... ### Assistant: ...` transcript with chat markers.\n",
    "\n",
    "    Despite the plural parameter name, this handles a single record (it is\n",
    "    mapped with `batched=False`). Text before the first `### ` is discarded;\n",
    "    every following piece becomes one turn terminated by `<|end|>`.\n",
    "    \"\"\"\n",
    "    turns = examples[\"text\"].split(\"### \")[1:]\n",
    "    rendered = [\n",
    "        turn.replace(\"Human: \", \"<|user|>\\n\").replace(\"Assistant: \", \"<|assistant|>\\n\") + \"<|end|>\\n\"\n",
    "        for turn in turns\n",
    "    ]\n",
    "    return {\"text\": \"<s>\" + \"\".join(rendered)}\n",
    "\n",
    "\n",
    "# Only the first 1% of each training split is loaded to keep the example fast.\n",
    "dataset_ko1 = load_dataset(\"kyujinpy/KOR-OpenOrca-Platypus-v3\", split=\"train[:1%]\")\n",
    "dataset_ko2 = load_dataset(\"kyujinpy/KOR-gugugu-platypus-set\", split=\"train[:1%]\")\n",
    "dataset_ko3 = load_dataset(\"nlpai-lab/openassistant-guanaco-ko\", split=\"train[:1%]\")\n",
    "\n",
    "# After mapping, every dataset carries a single `text` column so they can be concatenated.\n",
    "dataset_ko1 = dataset_ko1.map(formatting_ko_func, remove_columns=dataset_ko1.column_names, batched=False)\n",
    "dataset_ko2 = dataset_ko2.map(formatting_ko_func, remove_columns=dataset_ko2.column_names, batched=False)\n",
    "dataset_ko3 = dataset_ko3.map(formatting_guanaco_func, remove_columns=[\"id\"], batched=False)\n",
    "\n",
    "dataset = concatenate_datasets([dataset_ko1, dataset_ko2, dataset_ko3])\n",
    "dataset = dataset.shuffle(seed=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55b28c22-15ba-496d-92ac-9de4b856f936",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bind the split to its own name so this cell can be re-run: `dataset` stays a\n",
    "# Dataset instead of being overwritten by the DatasetDict the split returns.\n",
    "dataset_splits = dataset.train_test_split(test_size=0.2, seed=SEED)\n",
    "train_dataset = dataset_splits[\"train\"]\n",
    "test_dataset = dataset_splits[\"test\"]\n",
    "train_dataset.to_json(f\"{DATA_DIR}/train_example1.jsonl\")\n",
    "test_dataset.to_json(f\"{DATA_DIR}/eval_example1.jsonl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87be1f8b-ce30-4973-af20-cb9a943214e5",
   "metadata": {},
   "source": [
    "<br>\n",
    "\n",
    "## 2. Convert to OpenAI chat format\n",
    "\n",
    "---\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "375544a2-70f9-4f08-aef2-5286b571499f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# System message attached to every converted example (text kept verbatim).\n",
    "DEFAULT_SYSTEM_PROMPT = (\n",
    "    \"You are an AI assistant. \\n\"\n",
    "    \"Please reply users' answer using polite,clear and respectful language in Korean.\"\n",
    ")\n",
    "\n",
    "\n",
    "def convert_to_oai_format(data, system_prompt=DEFAULT_SYSTEM_PROMPT, seed=None):\n",
    "    \"\"\"Convert instruction/output records into OpenAI-style chat examples.\n",
    "\n",
    "    Args:\n",
    "        data: Iterable of mappings with \"instruction\" and \"output\" keys.\n",
    "        system_prompt: Text placed in the system message of every example.\n",
    "        seed: When given, shuffle with a private `random.Random(seed)` so the\n",
    "            order is reproducible; when None, use the global `random` state.\n",
    "\n",
    "    Returns:\n",
    "        A shuffled list of `{\"messages\": [system, user, assistant]}` dicts.\n",
    "    \"\"\"\n",
    "    formatted_data = [\n",
    "        {\n",
    "            \"messages\": [\n",
    "                {\"role\": \"system\", \"content\": system_prompt},\n",
    "                {\"role\": \"user\", \"content\": message[\"instruction\"]},\n",
    "                {\"role\": \"assistant\", \"content\": message[\"output\"]},\n",
    "            ]\n",
    "        }\n",
    "        for message in data\n",
    "    ]\n",
    "    # Shuffle exactly once, after the list is complete.\n",
    "    shuffler = random if seed is None else random.Random(seed)\n",
    "    shuffler.shuffle(formatted_data)\n",
    "    return formatted_data\n",
    "\n",
    "\n",
    "def save_jsonl(dictionary_data, file_name):\n",
    "    \"\"\"Write each entry of `dictionary_data` as one JSON object per line.\n",
    "\n",
    "    Non-ASCII characters are written as-is (`ensure_ascii=False`). The\n",
    "    `UTF-8-sig` codec puts a byte-order mark at the start of the file.\n",
    "    NOTE(review): the BOM looks intentional for the downstream fine-tuning\n",
    "    service -- confirm before switching to plain UTF-8.\n",
    "    \"\"\"\n",
    "    with open(file_name, \"w\", encoding=\"UTF-8-sig\") as outfile:\n",
    "        for entry in dictionary_data:\n",
    "            json.dump(entry, outfile, ensure_ascii=False)\n",
    "            outfile.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee858b1a-9923-45a0-b3a6-bc91f5fa5154",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the raw records: section 2 needs the original instruction/output columns.\n",
    "dataset = load_dataset(\"kyujinpy/KOR-OpenOrca-Platypus-v3\", split=\"train[:1%]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f996bdcf-8138-411d-be8d-84c95cc34cc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate name keeps `dataset` a Dataset, so this cell is safe to re-run.\n",
    "oai_splits = dataset.train_test_split(test_size=0.2, seed=SEED)\n",
    "formatted_train_data = convert_to_oai_format(oai_splits[\"train\"], seed=SEED)\n",
    "formatted_valid_data = convert_to_oai_format(oai_splits[\"test\"], seed=SEED)\n",
    "save_jsonl(formatted_train_data, f\"{DATA_DIR}/train_example2.jsonl\")\n",
    "# The validation file must receive the held-out records, not the training ones.\n",
    "save_jsonl(formatted_valid_data, f\"{DATA_DIR}/valid_example2.jsonl\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10 - SDK v2",
   "language": "python",
   "name": "python310-sdkv2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}