# Generate QnA synthetic dataset from multiple PDFs - Image-heavy PDF


In [None]:
%load_ext autoreload
%autoreload 2

import os, sys
# Everything in the working directory path before the first "slm-innovator-lab"
# is reused as the prefix of the shared lab-preparation folder, which is then
# put on sys.path so that `common` can be imported from it.
lab_prep_dir = f"{os.getcwd().partition('slm-innovator-lab')[0]}slm-innovator-lab/0_lab_preparation"
sys.path.append(os.path.abspath(lab_prep_dir))

from common import check_kernel

# Run the kernel check provided by the lab-preparation helpers.
check_kernel()

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

def mask_secret(value, visible=4):
    """Return a printable stand-in for a secret that hides most of its characters.

    Args:
        value: The secret string, or None/empty when it is not configured.
        visible: How many trailing characters may be shown for identification.

    Returns:
        "<not set>" when value is empty or None; otherwise a string of the same
        length as value in which everything except the last `visible`
        characters is replaced by "*". Secrets no longer than `visible` are
        masked completely.
    """
    if not value:
        return "<not set>"
    if len(value) <= visible:
        return "*" * len(value)
    return "*" * (len(value) - visible) + value[-visible:]


# Azure OpenAI connection settings, read from the environment (populated from
# .env by load_dotenv above).
aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
# Fall back to the alternative variable names when the AZURE_* ones are
# missing or empty.
aoai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or os.getenv("OPENAI_API_VERSION")
aoai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME") or os.getenv("DEPLOYMENT_NAME")

print(f"aoai_api_endpoint: {aoai_api_endpoint}")
# Never echo the real key: cell output is stored inside the notebook file and
# is easily committed or shared by accident.
print(f"aoai_api_key: {mask_secret(aoai_api_key)}")
print(f"aoai_api_version: {aoai_api_version}")
print(f"aoai_deployment_name: {aoai_deployment_name}")

In [None]:
import time
import glob
import pandas as pd
import shutil, random
from langchain_community.document_loaders.csv_loader import CSVLoader
from util.preprocess import convert_html_to_md, remove_short_sentences, remove_small_images
from util.common_utils import get_language_code

# Notebook-wide settings for the generated dataset.
DOMAIN = "Advertising"
LANGUAGE = "English" # You can change your language here. e.g., "Korean", "Japanese", "Chinese"
LANGUAGE_CODE = get_language_code(LANGUAGE)
print(f"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}")

# Input PDFs live under ../raw_data/pdf; intermediate JSONL files go to dataset_tmp.
raw_data_dir = "../raw_data"
pdf_dir = f"{raw_data_dir}/pdf"
dataset_tmp_dir = "dataset_tmp"

# Only the image-heavy PDFs ("img-*.pdf") are handled by this notebook.
# glob returns matches in arbitrary order; sorting keeps the position of each
# file in the list (and anything numbered from it) stable across runs.
all_files = sorted(glob.glob(os.path.join(pdf_dir, "img-*.pdf")))
print(all_files)

In [None]:
import json
import fitz
from glob import glob
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

from util.preprocess import encode_image_base64
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from util.qa_pair import get_qna_prompt_template, QAPair
from util.common_utils import convert_to_oai_format, save_jsonl

# Upper bound on the number of tokens the model may generate per response.
max_tokens = 1024

# Chat model client built from the API version and deployment name read from
# the environment earlier in the notebook; temperature is fixed at 0.
llm = AzureChatOpenAI(
    azure_deployment=aoai_deployment_name,
    openai_api_version=aoai_api_version,
    max_tokens=max_tokens,
    temperature=0,
)

## 2. Preprocess each PDF file

---


In [None]:
# --- Settings and chains that do not depend on the PDF being processed -------
# They are built once here instead of on every loop iteration.

image_path = "./image"      # scratch folder holding the rendered pages of the current PDF
clip_x, clip_y = 30, 30     # margin trimmed from every page edge before rendering

# Image-summary chain: one rendered page in, one text summary out.
human_prompt_main = f"Given image, give a concise summary in {LANGUAGE}. Don't insert any XML tag such as <text> and </text> when answering."
# Use the notebook-wide DOMAIN rather than a hard-coded product category so the
# summaries match the domain used for the final dataset.
system_prompt = f"You are an assistant tasked with describing table or image, specialized in {DOMAIN}."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            # Pages are written as .jpg below, so declare them as JPEG.
            # NOTE(review): assumes encode_image_base64 returns the file bytes
            # unchanged (base64-encoded) - confirm against util.preprocess.
            "url": "data:image/jpeg;base64," + "{image_base64}",
        },
    },
    {
        "type": "text",
        "text": human_prompt_main
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)
summary_prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template
    ]
)
summarize_chain = summary_prompt | llm | StrOutputParser()

# QnA chain: one page summary in, parsed question/answer pairs out.
parser = JsonOutputParser(pydantic_object=QAPair)
prompt = get_qna_prompt_template()
chain = prompt | llm | parser

# System message stored with every sample of the fine-tuning dataset.
system_prompt_msg = f"""You are the SME (Subject Matter Expert) in {DOMAIN}. Please answer the questions accurately. If the question is in {LANGUAGE}, write your answer in {LANGUAGE}."""

os.makedirs(dataset_tmp_dir, exist_ok=True)

# --- Per-PDF pipeline: render pages -> summarize -> generate QnA -> save ------
for idx, file_path in enumerate(all_files):

    print(f"\n##### Idx {idx} - Processing {file_path}...")

    # Start from an empty folder so pages of a previous PDF never leak into this one.
    if os.path.isdir(image_path):
        shutil.rmtree(image_path)
    os.makedirs(image_path, exist_ok=True)

    # Render every page except the cover to an image. The paths are collected
    # while writing, in page order, so no directory listing is needed afterwards.
    images = []
    with fitz.open(file_path) as pdf:  # closes the document even if rendering fails
        if pdf.page_count < 2:
            print(f"Skipping {file_path}: no pages left after removing the cover page.")
            continue
        pdf.delete_page(0) # 1st page is the cover page, so we delete it.

        for i, page in enumerate(pdf):
            # page.rect unpacks to its two corner points: (x0, y0) and (x1, y1).
            x0, y0, x1, y1 = page.rect
            page.set_cropbox(fitz.Rect(x0 + clip_x, y0 + clip_y, x1 - clip_x, y1 - clip_y))
            page_image = f"{image_path}/page_{i:03d}.jpg"
            page.get_pixmap().save(page_image)
            images.append(page_image)

    ### Generate image summaries
    print(f"### Generating image summaries using LLM - path: {file_path}")
    start = time.time()

    base64_images = [encode_image_base64(img_path) for img_path in images]
    image_summaries = summarize_chain.batch(base64_images, {"max_concurrency": 8})
    image_summaries = remove_short_sentences(image_summaries)

    end = time.time()
    print(f"Elapsed {end - start:.5f} sec for generating image summaries using LLM")

    ### Generate QA pair
    print(f"### Generating QA pairs using LLM - path: {file_path}")
    start = time.time()

    input_batch = [
        {"context": summary, "domain": DOMAIN, "num_questions": "3"}
        for summary in image_summaries
    ]
    qa_pair = chain.batch(input_batch, {"max_concurrency": 8})

    end = time.time()
    print(f"Elapsed {end - start:.5f} sec for generating QA pairs using LLM")

    ### Save to jsonl for fine-tuning
    print(f"### Saving QA pairs to jsonl")
    oai_qa_pair = convert_to_oai_format(qa_pair, system_prompt_msg=system_prompt_msg)
    save_jsonl(oai_qa_pair, f"{dataset_tmp_dir}/{idx}-oai.jsonl")

### Merge the generated jsonl files into a single jsonl file.


In [None]:
import os, shutil, random
from util.preprocess import convert_html_to_md
import json
import glob
import pandas as pd

# Merge every per-PDF "*-oai.jsonl" file from the temporary folder into one
# dataset file. A dedicated name is used for the file list so that the list of
# PDFs held in `all_files` is not overwritten; sorting makes the order of the
# merged samples reproducible.
jsonl_files = sorted(glob.glob(os.path.join(dataset_tmp_dir, "*-oai.jsonl")))

result = []
for f in jsonl_files:
    with open(f, "r", encoding="utf-8-sig") as infile:
        for line_no, line in enumerate(infile, start=1):
            if not line.strip():
                continue  # blank lines carry no sample
            try:
                result.append(json.loads(line)) # read each line of the file
            except ValueError as err:
                # Best effort: report the broken line and keep merging the rest.
                print(f"Skipping malformed line {line_no} in {f}: {err}")

save_filename = "advertising-multiple"

output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)  # the folder may not exist on a fresh checkout

# "utf-8-sig" writes a byte-order mark at the start of the file; it is kept on
# purpose because Azure OpenAI fine-tuning expects UTF-8 training files with a BOM.
with open(f"{output_dir}/{save_filename}-oai.jsonl", "w", encoding="utf-8-sig") as outfile:
    outfile.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in result)

In [None]:
!rm -rf pdf_image_tmp pdf_mixed_tmp outputs_tmp images {dataset_tmp_dir}