seed/make_qa_csv.ipynb (238 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate a synthetic QnA dataset from CSV\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"# Make the shared lab-preparation helpers importable from this notebook.\n",
"import os\n",
"import sys\n",
"\n",
"LAB_PREP_DIR = \"../../0_lab_preparation\"\n",
"sys.path.append(os.path.abspath(LAB_PREP_DIR))\n",
"\n",
"from common import check_kernel\n",
"check_kernel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"aoai_api_endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
"aoai_api_key = os.getenv(\"AZURE_OPENAI_API_KEY\")\n",
"# Fall back to the legacy variable names when the AOAI-specific ones are unset/empty.\n",
"aoai_api_version = os.getenv(\"AZURE_OPENAI_API_VERSION\") or os.getenv(\"OPENAI_API_VERSION\")\n",
"aoai_deployment_name = os.getenv(\"AZURE_OPENAI_DEPLOYMENT_NAME\") or os.getenv(\"DEPLOYMENT_NAME\")\n",
"\n",
"# SECURITY: never echo the raw API key into cell output — notebook outputs\n",
"# get committed and shared. Print a masked prefix just to confirm it loaded.\n",
"masked_key = f\"{aoai_api_key[:4]}***\" if aoai_api_key else None\n",
"print(f\"aoai_api_endpoint: {aoai_api_endpoint}\")\n",
"print(f\"aoai_api_key: {masked_key}\")\n",
"print(f\"aoai_api_version: {aoai_api_version}\")\n",
"print(f\"aoai_deployment_name: {aoai_deployment_name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil, random\n",
"from langchain_community.document_loaders.csv_loader import CSVLoader\n",
"from util.preprocess import convert_html_to_md\n",
"from util.common_utils import get_language_code\n",
"\n",
"# Source CSV with virtual store information used to seed the QnA dataset.\n",
"raw_data_dir = \"../raw_data\"\n",
"file_path = f\"{raw_data_dir}/csv/en-store-information-virtual.csv\"\n",
"\n",
"DOMAIN = \"CS (Customer Support)\"\n",
"# You can change your language here. e.g., \"Korean\", \"Japanese\", \"Chinese\"\n",
"LANGUAGE = \"English\"\n",
"LANGUAGE_CODE = get_language_code(LANGUAGE)\n",
"print(f\"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Read & Preprocess CSV file\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"\n",
"# Parse the store-information CSV; each row becomes one Document.\n",
"csv_args = {\"delimiter\": \",\", \"quotechar\": '\"'}\n",
"loader = CSVLoader(file_path=file_path, csv_args=csv_args)\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert each row's HTML content into Markdown for cleaner LLM input.\n",
"preprocessed_docs = [convert_html_to_md(doc.page_content) for doc in docs]\n",
"\n",
"print(preprocessed_docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Construct QnA Pairs\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from util.qa import CustomQADataGenerator\n",
"\n",
"# Model settings for the synthetic QnA generator.\n",
"model_config = dict(\n",
"    deployment=aoai_deployment_name,\n",
"    model=\"gpt-4o\",\n",
"    max_tokens=2000,\n",
")\n",
"\n",
"# Prompt templates are picked per target language (e.g. ./prompt_template/en).\n",
"qa_generator = CustomQADataGenerator(\n",
"    model_config=model_config,\n",
"    templates_dir=f\"./prompt_template/{LANGUAGE_CODE}\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"from collections import Counter\n",
"from typing import Dict\n",
"import os\n",
"from azure.ai.generative.synthetic.qa import QAType\n",
"\n",
"# Cap the number of simultaneous Azure OpenAI calls.\n",
"concurrency = 6\n",
"sem = asyncio.Semaphore(concurrency)\n",
"\n",
"# QAType.CONVERSATION is also supported; LONG_ANSWER yields single-turn QnA.\n",
"qa_type = QAType.LONG_ANSWER\n",
"\n",
"async def generate_async(text: str) -> Dict:\n",
"    # Bound concurrency with the semaphore, then delegate to the generator.\n",
"    async with sem:\n",
"        return await qa_generator.generate_async(\n",
"            text=text,\n",
"            qa_type=qa_type,\n",
"            num_questions=3,  # questions generated per document\n",
"        )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_batch = preprocessed_docs\n",
"# return_exceptions=True keeps the whole batch running even if one call fails;\n",
"# failures come back as Exception objects in the results list.\n",
"results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)\n",
"\n",
"question_answer_list = []\n",
"for i, result in enumerate(results):\n",
"    if isinstance(result, Exception):\n",
"        # Name the failing document before re-raising so the error is traceable\n",
"        # (an anonymous raise would discard which input caused it).\n",
"        print(f\"QnA generation failed for document index {i}\")\n",
"        raise result  # exception raised inside generate_async()\n",
"    question_answer_list.append(result[\"question_answers\"])\n",
"\n",
"print(\"Successfully generated QAs\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Save to jsonl for fine-tuning\n",
"\n",
"---\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from util.common_utils import convert_to_oai_format, save_jsonl\n",
"\n",
"output_dir = './dataset'\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# The system message anchors the SME persona for every training example.\n",
"system_prompt_msg = f\"\"\"You are the SME (Subject Matter Expert) in {DOMAIN}. Please answer the questions accurately. If the question is in {LANGUAGE}, write your answer in {LANGUAGE}.\"\"\"\n",
"\n",
"save_filename = \"store-info\"\n",
"# Convert QnA pairs into the OpenAI chat fine-tuning message format and persist.\n",
"oai_qa_pair = convert_to_oai_format(question_answer_list, system_prompt_msg=system_prompt_msg)\n",
"save_jsonl(oai_qa_pair, f\"{output_dir}/{save_filename}-oai.jsonl\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py312-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}