1_synthetic-qa-generation/glan-instruct/glan_tutorial.ipynb (831 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Open Source implementation of GLAN (Generalized Instruction Tuning) using the Azure OpenAI API.\n", "\n", "This Jupyter notebook is recommended for workshop/education only. It is recommended to run `generate.py` when creating a dataset.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Preparation\n", "\n", "---\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "import time\n", "import uuid\n", "import random\n", "import openai\n", "import markdown\n", "import textwrap\n", "from tqdm import tqdm\n", "from bs4 import BeautifulSoup\n", "from datasets import load_dataset\n", "from dotenv import load_dotenv\n", "from openai import AzureOpenAI, RateLimitError\n", "\n", "load_dotenv() # take environment variables from .env.\n", "\n", "client = AzureOpenAI(\n", " azure_endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\"),\n", " api_key = os.getenv(\"AZURE_OPENAI_API_KEY\"),\n", " api_version = os.getenv(\"AZURE_OPENAI_API_VERSION\")\n", ")\n", "\n", "LANGUAGE = \"English\"\n", "MODEL_NAME = \"gpt-4o\"\n", "MODEL_NAME_FOR_ANSWER = \"gpt-4o\"\n", "\n", "MAX_NUMBER_OF_FIELDS = 1\n", "MAX_NUMBER_OF_SUBJECTS = 2\n", "MAX_NUMBER_OF_SUBTOPICS = 3\n", "MAX_NUMBER_OF_SESSION_NAME = 3\n", "NUM_ITERATIONS = 1\n", "NUM_QUESTIONS_PER_ITERATION = 3\n", "QUESTION_MAX_TOKENS = 256\n", "QUESTION_BACTH_SIZE = 3\n", "ANSWER_BACTH_SIZE = 3\n", "OUTPUT_DIR = \"./outputs\"\n", "UUID = str(uuid.uuid4())[:4]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Generate Disciplines\n", "\n", "---\n", "\n", "Generate a taxonomy of human knowledge and capabilities. Disciplines derived from taxonomy are used to create subjects.<br>\n", "You can have GPT automatically create disciplines (in this case, human verification is required), or you can use the disciplines (`disciplines.txt`) we created.\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def format_timespan(seconds):\n", " hours = seconds // 3600\n", " minutes = (seconds - hours*3600) // 60\n", " remaining_seconds = seconds - hours*3600 - minutes*60\n", " timespan = f\"{hours} hours {minutes} minutes {remaining_seconds:.4f} seconds.\"\n", " return timespan\n", "\n", "\n", "def read_text_to_list(file_path):\n", " data_list = []\n", " \n", " with open(file_path, 'r') as file:\n", " for line in file:\n", " cleaned_line = line.strip()\n", " if cleaned_line:\n", " data_list.append(cleaned_line)\n", " \n", " return data_list\n", "\n", "\n", "def save_list_to_text(file_path, data_list):\n", " with open(file_path, 'w') as file:\n", " for item in data_list:\n", " file.write(f\"{item}\\n\")\n", "\n", "\n", "def generate_taxonomy(max_number_of_fields=10, model_name=\"gpt-4o\", **kwargs):\n", " \"\"\"\n", " Generate a taxonomy of human knowledge and capabilities.\n", " \"\"\"\n", "\n", " prompt = f\"\"\"\n", " Create a taxonomy of human knowledge and capabilities. Break it down into fields, sub-fields, and disciplines.\n", " Limit the number of fields to a maximum of {max_number_of_fields}.\n", "\n", " Provide the result in JSON format with the following structure:\n", " {{\n", " \"fields\": [\n", " {{\n", " \"field_name\": \"Field Name\",\n", " \"sub_fields\": [\n", " {{\n", " \"sub_field_name\": \"Sub-field Name\",\n", " \"disciplines\": [\"Discipline 1\", \"Discipline 2\", ...]\n", " }},\n", " ...\n", " ]\n", " }},\n", " ...\n", " ]\n", " }}\n", "\n", " Examples of `field_name` are Natural Sciences, Humanities or Service.\n", " Examples of `sub_field_name` are Chemistry, Sociology or Retailing.\n", " \"\"\"\n", " response = client.chat.completions.create(\n", " model=model_name,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " response_format = {'type': \"json_object\"},\n", " **kwargs \n", " )\n", " taxonomy = response.choices[0].message.content\n", " try:\n", " taxonomy_json = json.loads(taxonomy)\n", " except json.JSONDecodeError:\n", " taxonomy_json = {\"error\": \"Failed to parse JSON\"}\n", "\n", " key = next(iter(taxonomy_json))\n", " disciplines = [discipline for field in taxonomy_json[key] for sub_field in field['sub_fields'] for discipline in sub_field['disciplines']]\n", " \n", " return taxonomy_json, disciplines\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "===== Generate a Taxonomy of human knowledge and capabilities\n", "Generating taxonomy took 3.5342 seconds.\n" ] } ], "source": [ "print(f\"===== Generate a Taxonomy of human knowledge and capabilities\")\n", "t0 = time.time()\n", "taxonomy_json, disciplines = generate_taxonomy(max_number_of_fields=MAX_NUMBER_OF_FIELDS, model_name=\"gpt-4o\", temperature=0.5)\n", "t1 = time.time()\n", "print(f\"Generating taxonomy took {t1 - t0:.4f} seconds.\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Disciplines have been saved to disciplines_custom.txt\n" ] } ], "source": [ "file_path = 'disciplines_custom.txt'\n", "save_list_to_text(file_path, disciplines)\n", "print(f\"Disciplines have been saved to {file_path}\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Classical Mechanics',\n", " 'Quantum Mechanics',\n", " 'Thermodynamics',\n", " 'Electromagnetism',\n", " 'Optics']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "disciplines[:5]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Machine Learning', 'Organic Chemistry']\n" ] } ], "source": [ "file_path = 'disciplines_sample.txt' \n", "disciplines = read_text_to_list(file_path)\n", "print(disciplines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Generate Question set\n", "\n", "---\n", "\n", "In order to generate questions, the implementation follows the process below.\n", "\n", "- Create subjects: Create a list of subjects for each discipline.\n", "- Create syllabus: Design the curriculum for each subject, breaking it down into sessions and key concepts.\n", "- Create questions: Generates cquestions varying difficulty levels based on the generated session and key concepts.\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def validate_subjects_json_structure(data):\n", " \"\"\"\n", " Check if the JSON data has the correct structure for the subjects.\n", " \"\"\"\n", " # Check if the top-level key \"subjects\" exists and is a list\n", " if \"subjects\" not in data or not isinstance(data[\"subjects\"], list):\n", " return False\n", " \n", " # Iterate through each subject to validate its structure\n", " for subject in data[\"subjects\"]:\n", " # Check if each subject is a dictionary\n", " if not isinstance(subject, dict):\n", " return False\n", " # Check if required keys exist in each subject and have the correct types\n", " if \"subject\" not in subject or not isinstance(subject[\"subject\"], str):\n", " return False\n", " if \"level\" not in subject or not isinstance(subject[\"level\"], int):\n", " return False\n", " if \"subtopics\" not in subject or not isinstance(subject[\"subtopics\"], list):\n", " return False\n", " # Check if each item in \"subtopics\" is a string\n", " if not all(isinstance(subtopic, str) for subtopic in subject[\"subtopics\"]):\n", " return False\n", " \n", " return True\n", "\n", "\n", "def generate_subjects(discipline, max_number_of_subjects=2, max_number_of_subtopics=5, model_name=\"gpt-4o\", **kwargs):\n", " \"\"\"\n", " Generate a list of subjects for a given discipline. Please refer to section 2.2 of the paper.\n", " \"\"\"\n", "\n", " prompt = f\"\"\"\n", " You are an expert in {discipline}. Create a comprehensive list of subjects a student should learn under this discipline. \n", " For each subject, provide the level (e.g., 100, 200, 300, 400, 500, 600, 700, 800, 900) and include key subtopics in JSON format.\n", " {{ \n", " \"subjects\": [\n", " {{\n", " 'subject': 'Introduction to Computer Science',\n", " 'level': 100,\n", " 'subtopics': [\n", " 'Basic Programming',\n", " 'Software Development Fundamentals',\n", " 'Computer Organization'\n", " ]\n", " }}, \n", " ...\n", " ]\n", " }}\n", " Limit the number of `subjects` to a maximum of {max_number_of_subjects}. \n", " Limit the number of `subtopics` to a maximum of {max_number_of_subtopics} for each `subject`. \n", " \"\"\"\n", " t0 = time.time() \n", " response = client.chat.completions.create(\n", " model=model_name,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " response_format = {'type': \"json_object\"},\n", " **kwargs \n", " )\n", " subjects = response.choices[0].message.content\n", "\n", " subjects_json = json.loads(subjects)\n", " if not validate_subjects_json_structure(subjects_json):\n", " print(\"Failed to parse JSON. Trying again.\")\n", " subjects_json = generate_subjects(discipline, max_number_of_subjects, max_number_of_subtopics, model_name, **kwargs)\n", "\n", " t1 = time.time()\n", " print(f\"Generating subjects took {t1 - t0:.4f} seconds.\")\n", " \n", " return subjects_json\n", "\n", "\n", "def generate_syllabus(subject, level, subtopics, max_number_of_session_name=5, model_name=\"gpt-4o\", **kwargs):\n", " \"\"\"\n", " Generate a syllabus for a given subject at a specific level. Please refer to section 2.3 of the paper.\n", " \"\"\"\n", " prompt = f\"\"\"\n", " You are an expert in creating educational syllabi. Create a detailed syllabus for the subject \"{subject}\" at the {level} level. \n", " The syllabus should be broken down into multiple class sessions, each covering different key concepts. \n", " The subtopics for this subject include: {subtopics}. Provide the syllabus in JSON format with the following structure in JSON format:\n", "\n", " {{ \n", " \"syllabus\": [\n", " {{\n", " \"session_name\": \"Session 1 Name\",\n", " \"description\": \"Brief description of the session\",\n", " \"key_concepts\": [\"Key concept 1\", \"Key concept 2\", ...]\n", " }},\n", " ...\n", " ]\n", " }} \n", " Limit the number of `session_name` to a maximum of {max_number_of_session_name}. \n", " \"\"\"\n", " t0 = time.time()\n", " response = client.chat.completions.create(\n", " model=model_name,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " response_format = {'type': \"json_object\"},\n", " **kwargs \n", " )\n", "\n", " output = response.choices[0].message.content.strip()\n", " #print(textwrap.indent(output, '\\t'))\n", " \n", " try:\n", " syllabus_json = json.loads(output)\n", " key = next(iter(syllabus_json))\n", " syllabus = syllabus_json[key]\n", " except json.JSONDecodeError:\n", " print(\"Failed to parse JSON\")\n", " return None, None\n", "\n", " # Extract class details\n", " class_sessions = [session['session_name'] for session in syllabus]\n", " key_concepts = [session['key_concepts'] for session in syllabus]\n", " t1 = time.time()\n", " print(f\"\\tGenerating syllabus took {t1 - t0:.4f} seconds.\")\n", "\n", " return class_sessions, key_concepts\n", "\n", "\n", "def sample_class_sessions_and_key_concepts(class_sessions, key_concepts, single_session=True):\n", " \"\"\"\n", " Sample class sessions and key concepts to generate questions of varying difficulty.\n", "\n", " class_sessions: List of class sessions\n", " key_concepts: List of key concepts for each session.\n", " single_session: Whether to sample from a single session or multiple sessions.\n", " :return: Combination of sampled class sessions and core concepts\n", " \"\"\"\n", " if single_session:\n", " session_index = random.randint(0, len(class_sessions) - 1)\n", " selected_session = class_sessions[session_index]\n", " num_concepts = min(5, len(key_concepts[session_index]))\n", " selected_key_concepts = random.sample(key_concepts[session_index], k=random.randint(1, num_concepts))\n", " else:\n", " if len(class_sessions) < 2:\n", " raise ValueError(\"Not enough sessions for multi-session sampling\")\n", " session_indices = random.sample(range(len(class_sessions)), k=2)\n", " selected_sessions = [class_sessions[i] for i in session_indices]\n", " combined_key_concepts = key_concepts[session_indices[0]] + key_concepts[session_indices[1]]\n", " num_concepts = min(5, len(combined_key_concepts))\n", " selected_key_concepts = random.sample(combined_key_concepts, k=random.randint(2, num_concepts))\n", " \n", " return selected_session if single_session else selected_sessions, selected_key_concepts\n", "\n", "\n", "def generate_questions(\n", " class_sessions, key_concepts, subject, level, subtopics, model_name=\"gpt-4o\", \n", " num_iterations=2, num_questions_per_iteration=5, max_tokens=2048, batch_size=4, language=\"Korean\", **kwargs\n", " ):\n", " \"\"\"\n", " Generate questions based on class sessions and key concepts using LangChain pipeline. Please refer to section 2.4 of the paper.\n", " \"\"\"\n", "\n", " from langchain_openai import AzureChatOpenAI\n", " from langchain_core.prompts import PromptTemplate\n", " from langchain_core.output_parsers import StrOutputParser, BaseOutputParser\n", " #from langchain_core.pydantic_v1 import BaseModel, Field\n", " from pydantic import BaseModel, Field\n", "\n", " llm = AzureChatOpenAI(\n", " max_tokens=max_tokens,\n", " openai_api_version=\"2024-09-01-preview\",\n", " azure_deployment=model_name,\n", " **kwargs \n", " )\n", "\n", " class CustomOutputParser(BaseOutputParser):\n", " def parse(self, text: str):\n", " cleaned_text = text.strip()\n", " return {\"question\": cleaned_text}\n", "\n", "\n", " prompt = PromptTemplate.from_template(\n", " \"\"\"Based on the class session(s) {selected_class_sessions} and key concepts {selected_key_concepts}, generate a homework question.\n", " A question must be less than {max_tokens} tokens.\n", " Write in {language}.\n", " \"\"\"\n", " )\n", " chain = prompt | llm | CustomOutputParser() \n", "\n", " questions = []\n", " for idx in range(num_iterations):\n", " t0 = time.time()\n", " print(f\"\\t\\t===== Generating Question: Iteration {idx}\")\n", " selected_class_sessions, selected_key_concepts = sample_class_sessions_and_key_concepts(class_sessions, key_concepts, single_session=True)\n", "\n", " batch_inputs = [{\n", " \"selected_class_sessions\": selected_class_sessions,\n", " \"selected_key_concepts\": selected_key_concepts,\n", " \"max_tokens\": max_tokens,\n", " \"language\": language\n", " } for _ in range(num_questions_per_iteration)]\n", "\n", " metadata = {\"subject\": subject, \"level\": level, \"subtopics\": subtopics}\n", "\n", " with tqdm(total=len(batch_inputs), desc=\"\\t\\tProcessing Questions\") as pbar:\n", " for i in range(0, len(batch_inputs), batch_size):\n", " minibatch = batch_inputs[i:i+batch_size]\n", " questions_ = chain.batch(minibatch, {\"max_concurrency\": batch_size})\n", " for q in questions_:\n", " q.update(metadata)\n", " questions.extend(questions_)\n", " pbar.set_postfix({\"current_batch\": f\"{i//batch_size + 1}/{(len(batch_inputs) + (batch_size-1))//batch_size}\"})\n", "\n", " pbar.update(len(minibatch))\n", " \n", " t1 = time.time()\n", " print(f\"\\t\\tIteration {idx} took {t1 - t0:.4f} seconds.\")\n", "\n", " return questions" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "====================================================================================================\n", "===== [Discipline 0] Generating Subjects for discipline: Machine Learning\n", "====================================================================================================\n", "Generating subjects took 1.6345 seconds.\n", "Number of subjects is 2\n", "\t====================================================================================================\n", "\t===== [Subject 0] Generating Syllabus: Discipline: Machine Learning - Subject: Introduction to Computer Science - Level: 100\n", "\t====================================================================================================\n", "\tGenerating syllabus took 4.3651 seconds.\n", "\tNumber of class sessions is 4\n", "\t\t===== Generating Question: Iteration 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t\tProcessing Questions: 100%|██████████| 3/3 [00:02<00:00, 1.42it/s, current_batch=1/1]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\t\tIteration 0 took 2.1132 seconds.\n", "\t====================================================================================================\n", "\t===== [Subject 1] Generating Syllabus: Discipline: Machine Learning - Subject: Machine Learning Fundamentals - Level: 200\n", "\t====================================================================================================\n", "\tGenerating syllabus took 2.3797 seconds.\n", "\tNumber of class sessions is 3\n", "\t\t===== Generating Question: Iteration 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t\tProcessing Questions: 100%|██████████| 3/3 [00:03<00:00, 1.10s/it, current_batch=1/1]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\t\tIteration 0 took 3.3149 seconds.\n", "====================================================================================================\n", "===== [Discipline 1] Generating Subjects for discipline: Organic Chemistry\n", "====================================================================================================\n", "Generating subjects took 1.5246 seconds.\n", "Number of subjects is 2\n", "\t====================================================================================================\n", "\t===== [Subject 0] Generating Syllabus: Discipline: Organic Chemistry - Subject: Organic Chemistry I - Level: 200\n", "\t====================================================================================================\n", "\tGenerating syllabus took 3.4074 seconds.\n", "\tNumber of class sessions is 3\n", "\t\t===== Generating Question: Iteration 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t\tProcessing Questions: 100%|██████████| 3/3 [00:03<00:00, 1.12s/it, current_batch=1/1]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\t\tIteration 0 took 3.3638 seconds.\n", "\t====================================================================================================\n", "\t===== [Subject 1] Generating Syllabus: Discipline: Organic Chemistry - Subject: Organic Chemistry II - Level: 300\n", "\t====================================================================================================\n", "\tGenerating syllabus took 3.6107 seconds.\n", "\tNumber of class sessions is 3\n", "\t\t===== Generating Question: Iteration 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t\tProcessing Questions: 100%|██████████| 3/3 [00:05<00:00, 1.74s/it, current_batch=1/1]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\t\tIteration 0 took 5.2338 seconds.\n", "Generating Question dataset took 0.0 hours 0.0 minutes 31.0906 seconds.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "t0 = time.time()\n", "all_questions = []\n", "\n", "for idx1, discipline in enumerate(disciplines):\n", " print(\"====================================================================================================\")\n", " print(f\"===== [Discipline {idx1}] Generating Subjects for discipline: {discipline}\") \n", " print(\"====================================================================================================\")\n", " subjects_json = generate_subjects(\n", " discipline, \n", " max_number_of_subjects=MAX_NUMBER_OF_SUBJECTS, \n", " max_number_of_subtopics=MAX_NUMBER_OF_SUBTOPICS, \n", " model_name=MODEL_NAME, \n", " temperature=1.0, \n", " top_p=0.95\n", " )\n", " \n", " print(f\"Number of subjects is {len(subjects_json['subjects'])}\")\n", " for idx2, s in enumerate(subjects_json[\"subjects\"]):\n", " subject = s['subject']\n", " level = s['level']\n", " subtopics = \", \".join(s['subtopics'])\n", " print(\"\\t====================================================================================================\")\n", " print(f\"\\t===== [Subject {idx2}] Generating Syllabus: Discipline: {discipline} - Subject: {subject} - Level: {level}\") \n", " print(\"\\t====================================================================================================\")\n", "\n", " class_sessions, key_concepts = generate_syllabus(\n", " subject, \n", " level, \n", " subtopics,\n", " max_number_of_session_name=MAX_NUMBER_OF_SESSION_NAME, \n", " model_name=MODEL_NAME, \n", " temperature=1.0, \n", " top_p=0.95\n", " )\n", " print(f\"\\tNumber of class sessions is {len(class_sessions)}\")\n", "\n", " questions = generate_questions(\n", " class_sessions, \n", " key_concepts, \n", " subject, \n", " level, \n", " subtopics,\n", " model_name=MODEL_NAME, \n", " num_iterations=NUM_ITERATIONS,\n", " num_questions_per_iteration=NUM_QUESTIONS_PER_ITERATION, \n", " max_tokens=QUESTION_MAX_TOKENS, \n", " batch_size=QUESTION_BACTH_SIZE,\n", " language=LANGUAGE\n", " )\n", " all_questions.extend(questions)\n", "\n", "t1 = time.time()\n", "timespan = format_timespan(t1 - t0)\n", "print(f\"Generating Question dataset took {timespan}\")\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "import jsonlines\n", "\n", "num_questions = len(all_questions)\n", "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", "filename = f\"{OUTPUT_DIR}/GLAN_Questions_{LANGUAGE}_{num_questions}_Samples_{UUID}.jsonl\"\n", "\n", "with jsonlines.open(filename, mode='w') as writer:\n", " for question in all_questions:\n", " writer.write(question)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Generate Answers\n", "\n", "---\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def generate_answers(all_questions, model_name=\"gpt-4o\", max_tokens=1024, batch_size=5, **kwargs):\n", " \"\"\"\n", " Generate answers to the questions using LangChain pipeline. Please refer to section 2.4 of the paper.\n", " \"\"\"\n", " from langchain.schema.output_parser import StrOutputParser\n", " from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate\n", " from langchain_openai import AzureChatOpenAI\n", "\n", " llm = AzureChatOpenAI(\n", " temperature=0, \n", " max_tokens=max_tokens,\n", " openai_api_version=\"2024-09-01-preview\",\n", " azure_deployment=model_name, \n", " )\n", "\n", " system_prompt = \"\"\"Answer the question. Keep the answer short and concise. The topic, level, and subtopic of this question are as follows.\n", "\n", " ## Subject: {subject}\n", " ## Level: {level}\n", " ## Subtopics: {subtopics}\n", "\n", " Respond \"DO NOT KNOW\" if not sure about the answer.\n", " Answer must be less than 700 tokens.\n", " \"\"\"\n", " system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)\n", " human_prompt = [\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"{question}\"\n", " },\n", " ]\n", " human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)\n", "\n", " prompt = ChatPromptTemplate.from_messages(\n", " [\n", " system_message_template,\n", " human_message_template\n", " ]\n", " )\n", "\n", " chain = prompt | llm | StrOutputParser()\n", " print(f\"===== Generating Answers\")\n", " t0 = time.time()\n", " all_answers = []\n", " with tqdm(total=len(all_questions), desc=\"Processing Answers\") as pbar:\n", " for i in range(0, len(all_questions), batch_size):\n", " minibatch = all_questions[i:i+batch_size]\n", " answers = chain.batch(minibatch, {\"max_concurrency\": batch_size})\n", " all_answers.extend(answers)\n", " pbar.set_postfix({\"current_batch\": f\"{i//batch_size + 1}/{(len(all_questions) + (batch_size-1))//batch_size}\"})\n", " pbar.update(len(minibatch))\n", " t1 = time.time()\n", " timespan = format_timespan(t1 - t0)\n", " print(f\"Generating Answer dataset took {timespan}\")\n", "\n", " return all_answers " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "===== Generating Answers\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Processing Answers: 100%|██████████| 12/12 [00:13<00:00, 1.14s/it, current_batch=4/4]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Generating Answer dataset took 0.0 hours 0.0 minutes 13.6741 seconds.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "all_answers = generate_answers(\n", " all_questions, \n", " model_name=MODEL_NAME_FOR_ANSWER, \n", " max_tokens=256, \n", " batch_size=ANSWER_BACTH_SIZE\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Save the generated instruction dataset\n", "\n", "---\n", "\n", "Save the question and answer pair as jsonl. You can also push it to the Hugging Face hub.\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "import jsonlines\n", "\n", "instructions = []\n", "for q, a in zip(all_questions, all_answers):\n", " if a not in \"DO NOT KNOW\":\n", " q.update({\"answer\": a})\n", " instructions.append(q)\n", "\n", "num_instructions = len(instructions)\n", "new_filename = filename.replace(\"Questions\", \"Instructions\")\n", "\n", "with jsonlines.open(new_filename, mode='w') as writer:\n", " for instruction in instructions:\n", " writer.write(instruction)" ] } ], "metadata": { "kernelspec": { "display_name": "py310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }