seed/make_qa_csv.ipynb (238 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate a synthetic QnA dataset from CSV\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"# Make the shared lab-preparation helpers importable from this notebook.\n",
"import os\n",
"import sys\n",
"\n",
"LAB_PREP_DIR = \"../../0_lab_preparation\"\n",
"sys.path.append(os.path.abspath(LAB_PREP_DIR))\n",
"\n",
"from common import check_kernel\n",
"check_kernel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"aoai_api_endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
"aoai_api_key = os.getenv(\"AZURE_OPENAI_API_KEY\")\n",
"# Fall back to the legacy variable names when the AOAI-specific ones are unset/empty.\n",
"aoai_api_version = os.getenv(\"AZURE_OPENAI_API_VERSION\") or os.getenv(\"OPENAI_API_VERSION\")\n",
"aoai_deployment_name = os.getenv(\"AZURE_OPENAI_DEPLOYMENT_NAME\") or os.getenv(\"DEPLOYMENT_NAME\")\n",
"\n",
"# SECURITY: never echo the raw API key into cell output — notebook outputs\n",
"# get committed and shared. Print a masked prefix just to confirm it loaded.\n",
"masked_key = f\"{aoai_api_key[:4]}***\" if aoai_api_key else None\n",
"print(f\"aoai_api_endpoint: {aoai_api_endpoint}\")\n",
"print(f\"aoai_api_key: {masked_key}\")\n",
"print(f\"aoai_api_version: {aoai_api_version}\")\n",
"print(f\"aoai_deployment_name: {aoai_deployment_name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil, random\n",
"from langchain_community.document_loaders.csv_loader import CSVLoader\n",
"from util.preprocess import convert_html_to_md\n",
"from util.common_utils import get_language_code\n",
"\n",
"# Source CSV with virtual store information used to seed the QnA dataset.\n",
"raw_data_dir = \"../raw_data\"\n",
"file_path = f\"{raw_data_dir}/csv/en-store-information-virtual.csv\"\n",
"\n",
"DOMAIN = \"CS (Customer Support)\"\n",
"# You can change your language here. e.g., \"Korean\", \"Japanese\", \"Chinese\"\n",
"LANGUAGE = \"English\"\n",
"LANGUAGE_CODE = get_language_code(LANGUAGE)\n",
"print(f\"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Read & Preprocess CSV file\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"# Parse the store-information CSV; each row becomes one Document.\n",
"csv_args = {\"delimiter\": \",\", \"quotechar\": '\"'}\n",
"loader = CSVLoader(file_path=file_path, csv_args=csv_args)\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert each row's HTML content into Markdown for cleaner LLM input.\n",
"preprocessed_docs = [convert_html_to_md(doc.page_content) for doc in docs]\n",
"\n",
"print(preprocessed_docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Construct QnA Pairs\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from util.qa import CustomQADataGenerator\n",
"\n",
"# Model settings for the synthetic QnA generator.\n",
"model_config = dict(\n",
"    deployment=aoai_deployment_name,\n",
"    model=\"gpt-4o\",\n",
"    max_tokens=2000,\n",
")\n",
"\n",
"# Prompt templates are picked per target language (e.g. ./prompt_template/en).\n",
"qa_generator = CustomQADataGenerator(\n",
"    model_config=model_config,\n",
"    templates_dir=f\"./prompt_template/{LANGUAGE_CODE}\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"from collections import Counter\n",
"from typing import Dict\n",
"import os\n",
"from azure.ai.generative.synthetic.qa import QAType\n",
"\n",
"# Cap the number of simultaneous Azure OpenAI calls.\n",
"concurrency = 6\n",
"sem = asyncio.Semaphore(concurrency)\n",
"\n",
"# QAType.CONVERSATION is also supported; LONG_ANSWER yields single-turn QnA.\n",
"qa_type = QAType.LONG_ANSWER\n",
"\n",
"async def generate_async(text: str) -> Dict:\n",
"    # Bound concurrency with the semaphore, then delegate to the generator.\n",
"    async with sem:\n",
"        return await qa_generator.generate_async(\n",
"            text=text,\n",
"            qa_type=qa_type,\n",
"            num_questions=3,  # questions generated per document\n",
"        )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_batch = preprocessed_docs\n",
"# return_exceptions=True keeps the whole batch running even if one call fails;\n",
"# failures come back as Exception objects in the results list.\n",
"results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)\n",
"\n",
"question_answer_list = []\n",
"for i, result in enumerate(results):\n",
"    if isinstance(result, Exception):\n",
"        # Name the failing document before re-raising so the error is traceable\n",
"        # (an anonymous raise would discard which input caused it).\n",
"        print(f\"QnA generation failed for document index {i}\")\n",
"        raise result  # exception raised inside generate_async()\n",
"    question_answer_list.append(result[\"question_answers\"])\n",
"\n",
"print(\"Successfully generated QAs\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Save to jsonl for fine-tuning\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from util.common_utils import convert_to_oai_format, save_jsonl\n",
"\n",
"output_dir = './dataset'\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# The system message anchors the SME persona for every training example.\n",
"system_prompt_msg = f\"\"\"You are the SME (Subject Matter Expert) in {DOMAIN}. Please answer the questions accurately. If the question is in {LANGUAGE}, write your answer in {LANGUAGE}.\"\"\"\n",
"\n",
"save_filename = \"store-info\"\n",
"# Convert QnA pairs into the OpenAI chat fine-tuning message format and persist.\n",
"oai_qa_pair = convert_to_oai_format(question_answer_list, system_prompt_msg=system_prompt_msg)\n",
"save_jsonl(oai_qa_pair, f\"{output_dir}/{save_filename}-oai.jsonl\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py312-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}