# Overview
This notebook provides the following: 

1. Parses PDFs that contain text and tables with [Azure Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence/). Each PDF is saved as a JSON file so that it can be loaded into Elasticsearch. 
2. Loads JSON files into Elasticsearch. This notebook uses the elasticsearch python client to create an index with E5 and ELSER semantic_text mappings. 
3. Once the data is loaded into Elasticsearch, you can ask questions in Playground and get answers grounded in truth. The index "id" field uses the following naming convention: PDF_FILENAME.pdf_PAGENUMBER. That allows you to see PDF and page number in the "document sources" link.

**This notebook cannot be used to parse PDF images.**

# Install python dependencies

In [None]:
!pip install elasticsearch python-dotenv tqdm azure-core azure-ai-documentintelligence requests httpx

# Create a .env file that has the following entries. 

## Elasticsearch 
- You must have a functional elasticsearch environment that has an `enterprise` level license
- The fastest way to get up and running is to use the [Elastic Serverless - Get started](https://www.elastic.co/guide/en/serverless/current/elasticsearch-get-started.html) guide

```
ES_URL=?
ES_API_KEY=?
```

## Azure AI Document Intelligence

```
AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT=?
AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY=?
```

# Create input and output folders

- `./pdf` - place your PDF files in this input folder
- `./json` - the parser writes one JSON file per PDF into this output folder

In [None]:
import os

# Input PDFs are read from the first folder; parsed JSON is written to the second.
input_folder_pdf, output_folder_pdf = "./pdf", "./json"
folders = [input_folder_pdf, output_folder_pdf]


def create_folders_if_not_exist(folders):
    """Make sure every directory listed in *folders* exists.

    Missing directories (including intermediate ones) are created; existing
    ones are left untouched. One status line is printed per folder.
    """
    for path in folders:
        os.makedirs(path, exist_ok=True)
        print(f"Folder '{path}' created or already exists.")


create_folders_if_not_exist(folders)

# Download PDF files

- This notebook downloads 4 recent Elastic SEC 10-Q quarterly reports
- If you already have PDF files, feel free to place them in the `./pdf` folder 

In [None]:
import os
import requests


def download_pdf(url, directory="./pdf", filename=None, timeout=60):
    """Download the file at *url* and save it inside *directory*.

    Args:
        url: Address fetched with an HTTP GET request.
        directory: Folder to save into; created if it does not exist.
        filename: Name of the saved file. Defaults to the last ``/``-separated
            segment of *url*.
        timeout: Seconds to wait on the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        None. The outcome is reported with ``print``; network errors and
        non-200 responses are reported rather than raised.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Keep the function's report-and-continue style for network failures
        # (timeouts, DNS errors, refused connections) as well.
        print(f"Failed to download file from {url}: {error}")
        return

    if response.status_code != 200:
        print(f"Failed to download file from {url}")
        return

    if filename is None:
        filename = url.split("/")[-1]
    filepath = os.path.join(directory, filename)
    with open(filepath, "wb") as file:
        file.write(response.content)
    print(f"Downloaded {filepath}")


print("Downloading 4 recent 10-Q reports for Elastic NV.")
base_url = "https://s201.q4cdn.com/217177842/files/doc_financials"

# (path below base_url, local filename) for each report, newest first.
quarterly_reports = [
    ("2025/q2/e5aa7a0a-6f56-468d-a5bd-661792773d71.pdf", "elastic-10Q-Q2-2025.pdf"),
    ("2025/q1/18656e06-8107-4423-8e2b-6f2945438053.pdf", "elastic-10Q-Q1-2025.pdf"),
    ("2024/q4/9949f03b-09fb-4941-b105-62a304dc1411.pdf", "elastic-10Q-Q4-2024.pdf"),
    ("2024/q3/7e60e3bd-ff50-4ae8-ab12-5b3ae19420e6.pdf", "elastic-10Q-Q3-2024.pdf"),
]

for report_path, report_filename in quarterly_reports:
    download_pdf(f"{base_url}/{report_path}", filename=report_filename)

# Set Azure AI Document Intelligence Imports and Environment Variables

In [None]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import json
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()

# Azure connection settings come from the environment (populated from .env by
# load_dotenv); each value is None when its variable is not set.
AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT = os.environ.get(
    "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT"
)
AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY = os.environ.get(
    "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY"
)

# Parse paragraphs using AnalyzeResult

This function extracts the paragraph text via an AnalyzeResult on a PDF file.

In [None]:
def parse_paragraphs(analyze_result):
    """Collect paragraph text per page, leaving out text that belongs to tables.

    Args:
        analyze_result: Analysis result exposing ``paragraphs`` and ``tables``
            (either may be ``None``). Paragraphs provide ``content``, ``spans``
            and ``bounding_regions``; tables provide ``cells`` whose ``spans``
            carry an ``offset``.

    Returns:
        A ``(page_content, table_offsets)`` tuple. ``page_content`` maps a page
        number to a list of ``{"content_text": ...}`` dicts in document order.
        ``table_offsets`` is the list of table-cell span offsets used for the
        filtering.
    """
    # The offsets must be known *before* paragraphs are examined; starting
    # from an empty list would make the table filter below a no-op and table
    # text would be emitted twice (as paragraphs and as table cells).
    # Tables without a bounding region are ignored here because they cannot be
    # placed on a page, so their text is better kept as ordinary paragraphs.
    tables = getattr(analyze_result, "tables", None) or []
    table_offsets = [
        span.offset
        for table in tables
        if table.bounding_regions
        for cell in (table.cells or [])
        for span in (cell.spans or [])
    ]
    # NOTE(review): assumes a paragraph inside a table cell starts at the same
    # offset as one of that cell's spans - confirm against real results.
    known_table_offsets = set(table_offsets)

    page_content = {}
    for paragraph in analyze_result.paragraphs or []:
        spans = paragraph.spans or []
        if not spans or any(span.offset in known_table_offsets for span in spans):
            continue
        # Record the paragraph once per bounding region rather than once per
        # (span, region) pair, so multi-span paragraphs are not duplicated.
        for region in paragraph.bounding_regions or []:
            page_content.setdefault(region.page_number, []).append(
                {"content_text": paragraph.content}
            )
    return page_content, table_offsets

# Parse tables using AnalyzeResult

This function extracts the table cell text via an AnalyzeResult on a PDF file and records each cell's span offset.

In [None]:
def parse_tables(analyze_result, table_offsets):
    """Render every table as text and group the results by page.

    Args:
        analyze_result: Analysis result exposing ``tables`` (may be ``None``).
            Each table provides ``bounding_regions`` and ``cells``; each cell
            provides ``row_index``, ``column_index``, ``content`` and ``spans``.
        table_offsets: List that is extended in place with the span offset of
            every table cell that was rendered.

    Returns:
        A dict mapping a page number to a list of ``{"content_text": ...}``
        dicts, one per table, where the text holds one
        ``Cell [row, column]: content`` line per cell.
    """
    page_content = {}

    for table in analyze_result.tables or []:
        regions = table.bounding_regions or []
        if not regions:
            # Without a bounding region there is no page to attach the table
            # to; skip it instead of reusing the previous table's page.
            continue

        # A table with several bounding regions is filed under the page of
        # its last region.
        page_number = regions[-1].page_number

        # Walk the cells exactly once. Looping over them per bounding region
        # would repeat every cell (and offset) for multi-region tables.
        table_data = []
        for cell in table.cells or []:
            table_offsets.extend(span.offset for span in (cell.spans or []))
            table_data.append(
                f"Cell [{cell.row_index}, {cell.column_index}]: {cell.content}"
            )

        page_content.setdefault(page_number, []).append(
            {"content_text": "\n".join(table_data)}
        )

    return page_content

# Combine paragraph and table text

In [None]:
def combine_paragraphs_tables(filepath, paragraph_content, table_content):
    """Merge per-page paragraph and table text into one record per page.

    Args:
        filepath: Path of the source PDF; only its base name is stored.
        paragraph_content: Dict mapping page number to a list of
            ``{"content_text": ...}`` dicts.
        table_content: Dict with the same shape holding table text.

    Returns:
        A list of ``{"page_number", "content_text", "pdf_file"}`` dicts in
        ascending page order. Within a page, paragraph text comes before table
        text, entries are newline-separated, and surrounding whitespace is
        stripped.
    """
    pdf_file = os.path.basename(filepath)
    structured_data = []

    # Sort the pages: iterating the raw set union yields an arbitrary order
    # (e.g. page 8 before page 1), which made the output order unpredictable.
    for page_number in sorted(set(paragraph_content) | set(table_content)):
        page_items = [
            *paragraph_content.get(page_number, []),
            *table_content.get(page_number, []),
        ]
        page_text = "\n".join(item["content_text"] for item in page_items)
        structured_data.append(
            {
                "page_number": page_number,
                "content_text": page_text.strip(),
                "pdf_file": pdf_file,
            }
        )

    return structured_data

# Bring it all together

In [None]:
# Gather every ".pdf" entry of the input folder as a path.
pdf_files = []
for entry in os.listdir(input_folder_pdf):
    if entry.endswith(".pdf"):
        pdf_files.append(os.path.join(input_folder_pdf, entry))

azure_credential = AzureKeyCredential(AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY)
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT,
    credential=azure_credential,
    connection_timeout=600,
)

for filepath in tqdm(pdf_files, desc="Parsing PDF files"):
    # Read the whole PDF up front; its bytes are sent with the request.
    with open(filepath, "rb") as file:
        pdf_bytes = file.read()

    # Submit the document to the "prebuilt-layout" model and wait for the result.
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", AnalyzeDocumentRequest(bytes_source=pdf_bytes)
    )
    analyze_result: AnalyzeResult = poller.result()

    # Paragraphs first, then tables, then one merged record per page.
    paragraph_content, table_offsets = parse_paragraphs(analyze_result)
    table_content = parse_tables(analyze_result, table_offsets)
    structured_data = combine_paragraphs_tables(
        filepath, paragraph_content, table_content
    )

    # Serialize before opening the output file.
    json_output = json.dumps(structured_data, indent=4)

    # The output file reuses the PDF's base name with a ".json" extension.
    filename_without_ext = os.path.splitext(os.path.basename(filepath))[0]
    output_json_file = f"{output_folder_pdf}/{filename_without_ext}.json"
    with open(output_json_file, "w") as json_file:
        json_file.write(json_output)

# Set imports for the elasticsearch client and environment variables

In [None]:
import json
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from tqdm import tqdm
import os

load_dotenv()

# Elasticsearch connection settings come from the environment (populated from
# .env by load_dotenv); each value is None when its variable is not set.
ES_URL = os.environ.get("ES_URL")
ES_API_KEY = os.environ.get("ES_API_KEY")

# Client shared by the cells below; requests use a 300 second timeout.
es = Elasticsearch(
    hosts=ES_URL,
    api_key=ES_API_KEY,
    request_timeout=300,
)

# Create index in Elastic Cloud Serverless

In [None]:
index_name = "pdf-chat"


def _semantic_text_field(inference_id):
    """Return a ``semantic_text`` field mapping that uses *inference_id*."""
    return {"type": "semantic_text", "inference_id": inference_id}


# "page_content" holds the raw page text and is copied into the two
# semantic_text fields, one per inference endpoint.
index_body = {
    "mappings": {
        "properties": {
            "page_content": {
                "type": "text",
                "copy_to": ["page_content_sparse", "page_content_dense"],
            },
            "page_content_sparse": _semantic_text_field(".elser-2-elasticsearch"),
            "page_content_dense": _semantic_text_field(
                ".multilingual-e5-small-elasticsearch"
            ),
            "page_number": {"type": "text"},
            "pdf_file": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        }
    }
}

# Drop any existing index of the same name so the mapping is created fresh.
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted successfully.")

response = es.indices.create(index=index_name, body=index_body)
acknowledged = "acknowledged" in response and response["acknowledged"]
if acknowledged:
    print(f"Index '{index_name}' created successfully.")
elif "error" in response:
    print(f"Failed to create: '{index_name}'")
    print(f"Error: {response['error']['reason']}")
else:
    print(f"Index '{index_name}' already exists.")

In [None]:
# Only pick up the parser's ".json" output. Anything else in the folder (for
# example hidden files or checkpoint folders) would fail to open or parse.
# Sorting keeps the indexing order predictable.
files = sorted(
    name for name in os.listdir(output_folder_pdf) if name.endswith(".json")
)

for file in tqdm(files, desc="Indexing PDF docs"):
    with open(os.path.join(output_folder_pdf, file), encoding="utf-8") as f:
        data = json.load(f)

    for page in tqdm(data, desc=f"Processing {file}"):
        doc = {
            "page_content": page["content_text"],
            "page_number": page["page_number"],
            "pdf_file": page["pdf_file"],
        }
        # Document id convention: PDF_FILENAME.pdf_PAGENUMBER.
        # Named doc_id so the builtin id() is not shadowed for later cells.
        doc_id = f"{page['pdf_file']}_{page['page_number']}"
        es.index(index=index_name, id=doc_id, body=json.dumps(doc))

# Prompt List

1. Compare/contrast subscription revenue for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?
2. Provide an Income Taxes summary for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?
3. How has the balance sheet changed for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?