# Generate QnA synthetic dataset from a Complex PDF using Unstructured

### Overview

We process the PDF by dividing it into three parts.

-   **Text-heavy** - Text-heavy PDFs can be processed with open-source libraries, without the need for toolkits like Azure AI Document Intelligence or Unstructured.
-   **Image-heavy** - For image-heavy PDFs, each page can be converted to an image and summarized by a multimodal LLM like GPT-4o.
-   **Mixed** - After reading the document with Unstructured, we replace the image descriptions inside the figure tags with text summarized by a multimodal LLM. (Often the image descriptions are blank or have only a short caption.)

![summary](../imgs/summary-creating-qna-pdf.png)


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `util` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

def mask_secret(value, visible=4):
    """Return a display-safe rendering of a secret.

    Args:
        value: The secret string, or None/empty when it is not configured.
        visible: Number of trailing characters to reveal.

    Returns:
        "<not set>" for a missing value, "****" when the value is too short to
        reveal anything safely, otherwise "****" followed by the last
        `visible` characters.
    """
    if not value:
        return "<not set>"
    if visible <= 0 or len(value) <= visible * 2:
        return "****"
    return "****" + value[-visible:]


# Azure OpenAI connection settings, read from the environment (.env is loaded above).
aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
# Fall back to the alternative variable names when the AZURE_* ones are unset or empty.
aoai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or os.getenv("OPENAI_API_VERSION")
aoai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME") or os.getenv("DEPLOYMENT_NAME")

print(f"aoai_api_endpoint: {aoai_api_endpoint}")
# Never echo the raw key: notebook outputs are saved with the file and often committed.
print(f"aoai_api_key: {mask_secret(aoai_api_key)}")
print(f"aoai_api_version: {aoai_api_version}")
print(f"aoai_deployment_name: {aoai_deployment_name}")

## 1. Read & Preprocess PDF file

---


### Split the PDFs into individual pages


In [None]:
import shutil, random
import openai
from unstructured.cleaners.core import clean_bullets, clean_extra_whitespace, remove_punctuation
from langchain_community.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader, UnstructuredAPIFileLoader
from langchain_community.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from util.common_utils import get_language_code

# Input/output locations: the source PDF lives under raw_data_dir, and the
# per-content-type PDFs produced later are written to splitted_raw_data_dir.
raw_data_dir, splitted_raw_data_dir = "../raw_data", "splitted_raw_data"
file_path = f"{raw_data_dir}/pdf/en-imagenet-training-wrote-by-daekeun.pdf"

# Subject area of the document and the language used for the generated text.
# LANGUAGE can be changed here, e.g. "Korean", "Japanese", "Chinese".
DOMAIN, LANGUAGE = "Distributed training on Cloud", "English"
LANGUAGE_CODE = get_language_code(LANGUAGE)
print(f"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}")

(Optional) Use only a portion of the PDF document for testing. If the document has many pages or only partial processing is required, cut out and save just the pages you need.


In [None]:
import fitz

# Copy selected page ranges of the source PDF into standalone files named
# {raw_data_dir}/part<idx>.pdf.
#
# Each entry is (from_page, to_page) and is passed straight to insert_pdf().
# PyMuPDF page numbers are 0-based and the range is inclusive, so (1, 15)
# selects the 2nd through the 16th page of the source document.
split_pages = [(1, 15)]

# Context managers close both documents even if insert/save raises.
with fitz.open(file_path) as doc1:
    for idx, (from_page, to_page) in enumerate(split_pages):
        # Start from a new empty PDF and copy the requested range into it.
        with fitz.open() as doc2:
            doc2.insert_pdf(doc1, from_page=from_page, to_page=to_page)
            doc2.save(f"{raw_data_dir}/part{idx}.pdf")

In [None]:
from util.common_utils import delete_folder_and_make_folder
from util.preprocess import remove_short_sentences, remove_small_images, analyze_pdf_page_content, split_pdf

# To process only the excerpt saved by the optional cell above, point
# file_path at f"{raw_data_dir}/part0.pdf" before running this cell.

# Classify the pages; the result is iterated below as {content type: pages}.
analyzed_pdf_result = analyze_pdf_page_content(file_path)
# Recreate the output folder before writing the per-type PDFs.
delete_folder_and_make_folder(splitted_raw_data_dir)

print("### PDF Content Analysis Result:")
for page_kind in analyzed_pdf_result:
    page_numbers = analyzed_pdf_result[page_kind]
    print(f"{page_kind} pages: {page_numbers}")
    # One PDF per content type, e.g. <splitted_raw_data_dir>/<content type>.pdf
    target_pdf = f"{splitted_raw_data_dir}/{page_kind}.pdf"
    split_pdf(file_path, target_pdf, page_numbers)

### Case 1: Mixed page (Images and text mixed appropriately)

After reading the document with UnstructuredFileLoader, we replace the image descriptions inside the figure tags with text summarized by a multimodal LLM. (Often the image descriptions are blank or have only a short caption.)


In [None]:
%%time

pdf_mixed_path = f"{splitted_raw_data_dir}/Mixed.pdf"

# Sizing knobs. chunk_size/chunk_overlap are reused by the tiktoken split and
# max_tokens by the LLM set-up in later cells; image_dir receives the
# extracted image files.
chunk_size = 1500
new_after_n_chars = 1200
combine_text_under_n_chars = 1000
chunk_overlap = 100
max_tokens = 1024
image_dir = "./images"

# Options passed to Unstructured through the LangChain loader.
unstructured_options = {
    "chunking_strategy": "by_title",
    "extract_image_block_types": ["Image", "Table"],
    # Other model names noted by the author: "detectron2_onnx", "yolox".
    "hi_res_model_name": "yolox_quantized",
    "extract_images_in_pdf": True,
    # Example file types for this option: ['pdf', 'jpg', 'png', 'xls', 'xlsx', 'heic'].
    # The author's alternative was skip_infer_table_types=True (to get tables as HTML).
    # NOTE(review): this is the string '[]', not an empty list — confirm that is
    # what the installed unstructured version expects.
    "skip_infer_table_types": '[]',
    "extract_image_block_output_dir": image_dir,
    # False: write images to extract_image_block_output_dir instead of the payload.
    "extract_image_block_to_payload": False,
    "max_characters": chunk_size,
    "new_after_n_chars": new_after_n_chars,
    # Text shorter than this many characters is combined.
    "combine_text_under_n_chars": combine_text_under_n_chars,
    "languages": ["kor+eng"],
}

loader = UnstructuredFileLoader(
    file_path=pdf_mixed_path,
    mode="elements",
    post_processors=[clean_bullets, clean_extra_whitespace, remove_punctuation],
    **unstructured_options,
)
docs = loader.load()

In [None]:
# Filter the extracted image files with a dimension threshold of 16
# (see remove_small_images); the returned list is what gets summarized later.
images = remove_small_images(image_dir, image_dim_thres=16)

# Partition the loaded elements by their "category" metadata: tables vs. everything else.
# NOTE(review): `tables` is only counted here; later cells visible in this
# notebook build chunks from `texts` and image summaries only.
tables = [element for element in docs if element.metadata["category"] == "Table"]
texts = [element for element in docs if element.metadata["category"] != "Table"]

print (f' # texts: {len(texts)} \n # tables: {len(tables)} \n # images: {len(images)}')

#### Summarize images


In [None]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

# Multimodal chat model used to describe the images extracted from the mixed pages.
llm = AzureChatOpenAI(
    temperature=0,
    max_tokens=max_tokens,
    openai_api_version=aoai_api_version,
    azure_deployment=aoai_deployment_name,
)

# Use the notebook-level DOMAIN and LANGUAGE settings instead of hard-coded
# values, so the summaries match the document and the language of the QnA
# prompts. Both strings are used as prompt templates, so DOMAIN and LANGUAGE
# must not contain curly braces.
system_prompt = f"You are an assistant tasked with describing table or image, specialized in {DOMAIN}."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            # {image_base64} is the only template variable of the prompt.
            "url": "data:image/png;base64," + "{image_base64}",
        },
    },
    {
        "type": "text",
        "text": f"Given image, give a concise summary in {LANGUAGE}. Don't insert any XML tag such as <text> and </text> when answering.",
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template,
    ]
)

# prompt -> model -> plain string; the chain is later invoked with one
# base64-encoded image string per input.
summarize_chain = prompt | llm | StrOutputParser()

In [None]:
%%time
from util.preprocess import encode_image_base64
#images = glob(os.path.join(image_path, "*.jpg"))
# Base64-encode every kept image, summarize them with at most 3 concurrent
# LLM calls, then pass the summaries through remove_short_sentences.
base64_images = list(map(encode_image_base64, images))
image_summaries = remove_short_sentences(
    summarize_chain.batch(base64_images, config={"max_concurrency": 3})
)

In [None]:
from util.preprocess import split_text_using_tiktoken

# Re-split the text elements using the chunk size/overlap configured for the loader cell.
texts_tiktoken = split_text_using_tiktoken(texts, chunk_size, chunk_overlap)

# Mixed-page corpus = image summaries followed by the text chunks.
mixed_chunks = image_summaries + texts_tiktoken
num_mixed_chunks = len(mixed_chunks)
print("Length of splits (mixed case): " + str(num_mixed_chunks))

### Case 2: Text-heavy

Text-heavy PDFs can be processed with open-source libraries, without the need for toolkits like Azure AI Document Intelligence or Unstructured.


In [None]:
if "Text" not in analyzed_pdf_result:
    # No text-heavy pages were detected: contribute nothing to the corpus.
    text_chunks = []
else:
    from langchain_community.document_loaders.pdf import PyMuPDFLoader
    from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

    # Load the text-only PDF written by the split step.
    pdf_text_path = f"{splitted_raw_data_dir}/Text.pdf"
    loader = PyMuPDFLoader(pdf_text_path)
    documents = loader.load()

    # Split into 1200-character chunks with a 200-character overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
    split_documents = text_splitter.split_documents(documents)

    # Preview the first three chunks only.
    for idx, chunk in enumerate(split_documents[:3]):
        print(f"Chunk {idx}\n{chunk}")
        print("="*80)

    # Keep only the raw text of each chunk for QnA generation.
    text_chunks = [piece.page_content for piece in split_documents]
    print("Length of splits (text-heay case): " + str(len(text_chunks)))

### Case 3: Image-heavy

For image-heavy PDFs, each page can be converted to an image and summarized by a multimodal LLM like GPT-4o.

### Preprocess Image


In [None]:
if "Image" in analyzed_pdf_result:
    import fitz
    from glob import glob

    # Fresh scratch folder for the rendered pages.
    image_dir = "./pdf_image_tmp"
    delete_folder_and_make_folder(image_dir)

    pdf_image_path = f"{splitted_raw_data_dir}/Image.pdf"
    # Margin, in page coordinate units, trimmed from every edge before rendering.
    # The author's alternative setting was (10, 45).
    clip_x, clip_y = 10, 10

    # The context manager closes the document even if rendering or saving fails.
    with fitz.open(pdf_image_path) as doc:
        for i, page in enumerate(doc):
            # page.rect unpacks to its two corner points, not to a width and height.
            x0, y0, x1, y1 = page.rect
            clip = fitz.Rect(x0 + clip_x, y0 + clip_y, x1 - clip_x, y1 - clip_y)
            page.set_cropbox(clip)
            pix = page.get_pixmap()
            pix.save(f"{image_dir}/page_{i:03d}.jpg")

    # Zero-padded file names keep the sorted list in page order.
    images = sorted(glob(os.path.join(image_dir, "*.jpg")))

In [None]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

# Multimodal chat model used to summarize each rendered page image.
max_tokens = 1024
llm = AzureChatOpenAI(
    temperature=0,
    max_tokens=max_tokens,
    openai_api_version=aoai_api_version,
    azure_deployment=aoai_deployment_name,
)

human_prompt_main = f"Given image, give a concise summary in {LANGUAGE}. Don't insert any XML tag such as <text> and </text> when answering."

# Use the notebook-level DOMAIN instead of a hard-coded product area so the
# system prompt matches the document being processed. The string is used as a
# prompt template, so DOMAIN (like LANGUAGE above) must not contain curly braces.
system_prompt = f"You are an assistant tasked with describing table or image, specialized in {DOMAIN}."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            # {image_base64} is the only template variable of the prompt.
            # NOTE(review): the pages are saved as .jpg while the data URL
            # declares image/png — confirm the service tolerates the mismatch.
            "url": "data:image/png;base64," + "{image_base64}",
        },
    },
    {
        "type": "text",
        "text": human_prompt_main,
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template,
    ]
)

# prompt -> model -> plain string.
summarize_chain = prompt | llm | StrOutputParser()

In [None]:
%%time
if "Image" not in analyzed_pdf_result:
    # No image-heavy pages were detected: contribute nothing to the corpus.
    image_summaries = []
else:
    from util.preprocess import encode_image_base64

    # Base64-encode each rendered page and summarize with at most 8 concurrent
    # LLM calls, then pass the summaries through remove_short_sentences.
    base64_images = list(map(encode_image_base64, images))
    image_summaries = remove_short_sentences(
        summarize_chain.batch(base64_images, config={"max_concurrency": 8})
    )
    print("Length of image_summaries (image-heavy case): " + str(len(image_summaries)))


## 2. Construct QnA Pairs

---

### Option 1.

Leverage the `azure-ai-generative` package. The `QADataGenerator` class in this package makes it easy to generate QnA synthetic questions. However, using this class as is has the disadvantage of not being able to use custom prompts, so we inherited from it and created the `CustomQADataGenerator` class.


In [None]:
from util.qa import CustomQADataGenerator
# Model settings handed to the QnA generator.
model_config = dict(
    deployment=aoai_deployment_name,
    model="gpt-4o-mini",
    max_tokens=2000,
)

# Prompt templates are looked up in a per-language folder.
templates_dir = f"./prompt_template/{LANGUAGE_CODE}"
qa_generator = CustomQADataGenerator(model_config=model_config, templates_dir=templates_dir)

In [None]:
import asyncio
from collections import Counter
from typing import Dict
import os
from azure.ai.generative.synthetic.qa import QAType
concurrency = 6  # number of concurrent calls
sem = asyncio.Semaphore(concurrency)

# Alternative noted by the author: QAType.CONVERSATION
qa_type = QAType.LONG_ANSWER


async def generate_async(text: str, num_questions: int = 3) -> Dict:
    """Generate QnA pairs for one text chunk, bounded by the shared semaphore.

    Args:
        text: Source text the questions and answers are generated from.
        num_questions: Number of questions to generate for this text.

    Returns:
        Whatever `qa_generator.generate_async` returns; later cells read its
        "question_answers" entry.
    """
    # The semaphore caps how many generator calls are in flight at once.
    async with sem:
        return await qa_generator.generate_async(
            text=text,
            qa_type=qa_type,
            num_questions=num_questions,
        )

In [None]:
input_batch = mixed_chunks + text_chunks + image_summaries
results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)

question_answer_list = []
for result in results:
    if isinstance(result, Exception):
        raise result  # exception raised inside generate_async()
    question_answer_list.append(result["question_answers"])

print("Successfully generated QAs")

In [None]:
# Display the QnA pairs generated for the first input chunk
# (raises IndexError when the list is empty).
question_answer_list[0]

### Option 2.

You write the entire sequence of code to create a QnA dataset without using a separate toolkit.


In [None]:
def first_env(*names):
    """Return the value of the first environment variable in `names` that is set and non-empty.

    Args:
        *names: Environment variable names, in priority order.

    Returns:
        The first non-empty value found, or None when none of them is set.
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    return None


# Re-read the Azure OpenAI settings so Option 2 can be run on its own.
aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
# Keep the same fallback names as the first cell; otherwise a configuration
# that relies on them would be overwritten with None here.
aoai_api_version = first_env("AZURE_OPENAI_API_VERSION", "OPENAI_API_VERSION")
aoai_deployment_name = first_env("AZURE_OPENAI_DEPLOYMENT_NAME", "DEPLOYMENT_NAME")

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from util.qa_pair import get_qna_prompt_template, QAPair

# Chat model settings for QnA generation (deterministic, up to 1024 output tokens).
llm_settings = {
    "temperature": 0,
    "max_tokens": 1024,
    "openai_api_version": aoai_api_version,
    "azure_deployment": aoai_deployment_name,
}
llm = AzureChatOpenAI(**llm_settings)

# Language-specific QnA prompt, and a JSON parser built from the QAPair model.
prompt = get_qna_prompt_template(LANGUAGE)
parser = JsonOutputParser(pydantic_object=QAPair)

# prompt -> model -> parsed JSON
chain = prompt | llm | parser

In [None]:
# One prompt input per chunk, in the order: mixed chunks, text chunks, image summaries.
# Each input asks for three questions about the chunk in the configured DOMAIN.
input_batch = [
    {"context": chunk, "domain": DOMAIN, "num_questions": "3"}
    for chunk_group in (mixed_chunks, text_chunks, image_summaries)
    for chunk in chunk_group
]

In [None]:
%%time
# Run the QnA chain over every input with at most 5 concurrent requests.
qa_pair = chain.batch(input_batch, config={"max_concurrency": 5})

## 3. Save to jsonl

---

If you want to augment the dataset, you can try Evolve-Instruct or other data augmentation techniques.<br>
Please refer to `../evolve-instruct` and `../glan-instruct` for more details.


In [None]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Make sure the output folder exists.
output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

save_filename = "advertising"
output_path = f"{output_dir}/{save_filename}-oai.jsonl"

# System message passed to the converter along with the QnA pairs.
system_prompt_msg = f"""You are the SME (Subject Matter Expert) in {DOMAIN}. Please answer the questions accurately. If the question is in {LANGUAGE}, write your answer in {LANGUAGE}."""

# NOTE(review): only the Option 1 result (question_answer_list) is converted and
# saved. The Option 2 result (qa_pair) is not written; the author's disabled
# alternative was save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl").
oai_qa_pair = convert_to_oai_format(question_answer_list, system_prompt_msg=system_prompt_msg)
save_jsonl(oai_qa_pair, output_path)

### Clean up


In [None]:
# Delete the intermediate folders (split PDFs, rendered page images, extracted
# images, plus two temp folders not created in the cells shown here).
# The ./dataset output folder is not in the list and is kept.
!rm -rf {splitted_raw_data_dir} pdf_image_tmp pdf_mixed_tmp outputs_tmp images