# function_app/bp_multimodal_doc_intel_processing.py
import json
import logging
import os
from typing import Optional
import azure.functions as func
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from src.components.doc_intelligence import (
    DefaultDocumentFigureProcessor,
    DefaultDocumentPageProcessor,
    DocumentIntelligenceProcessor,
    PageDocumentListSplitter,
    convert_processed_di_doc_chunks_to_markdown,
)
from src.helpers.data_loading import load_visual_obj_bytes_to_pil_imgs_dict
logger = logging.getLogger(__name__)
load_dotenv()
bp_multimodal_doc_intel_processing = func.Blueprint()
FUNCTION_ROUTE = "multimodal_doc_intel_processing"
DOC_INTEL_ENDPOINT = os.getenv("DOC_INTEL_ENDPOINT")
# Create the client for Document Intelligence
DOC_INTEL_MODEL_ID = "prebuilt-layout"  # Set Document Intelligence model ID
# Set up the Document Intelligence v4.0 preview client. This will allow us to
# use the latest features of the Document Intelligence service. Check out the
# Document Intelligence Processor Walkthrough Notebook for more information
# (within the `notebooks` folder).
di_client = DocumentIntelligenceClient(
    endpoint=DOC_INTEL_ENDPOINT,
    credential=DefaultAzureCredential(),
    api_version="2024-07-31-preview",
)
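# If managed identity is not available (e.g. when running locally), the client
# could instead be constructed with an API key. This is only a sketch; the
# `DOC_INTEL_API_KEY` environment variable name is an assumption and is not used
# elsewhere in this repo:
#
#     from azure.core.credentials import AzureKeyCredential
#
#     di_client = DocumentIntelligenceClient(
#         endpoint=DOC_INTEL_ENDPOINT,
#         credential=AzureKeyCredential(os.getenv("DOC_INTEL_API_KEY")),
#         api_version="2024-07-31-preview",
#     )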
class FunctionRequestModel(BaseModel):
    """
    Defines the schema that is expected in the request body. We'll use this to
    ensure that the request contains the correct values and structure.
    """

    file: bytes = Field(description="The file to be processed")
    include_page_images_after_content: bool = Field(
        description="Whether to include page images after the text content of each page"
    )
    extract_and_crop_inline_figures: bool = Field(
        description="Whether to extract and crop inline figures"
    )
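# NOTE: In this function the request arrives as multipart form data with a "json"
# part (the options above) and a "file" part (the document bytes). The handler
# below also reads an optional `pages_per_chunk` value from the "json" part,
# which is not declared on this model.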
class FunctionResponseModel(BaseModel):
    """
    Defines the schema that will be returned by the function. We'll use this to
    ensure that the response contains the correct values and structure, and
    to allow a partially filled response to be returned in case of an error.
    """

    processed_di_result_markdown: str = Field(
        description="Markdown text of the processed Document Intelligence result."
    )
    raw_di_response: dict = Field(description="Raw Document Intelligence response.")
    di_time_taken_secs: Optional[float] = Field(
        description="The time taken to process the document with Document Intelligence.",
    )
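# As a sketch only (the handler below returns plain Markdown rather than this
# model), a partially filled instance could be serialized and returned as JSON,
# e.g. with pydantic v2's `model_dump_json()` (or `.json()` on pydantic v1):
#
#     output_model = FunctionResponseModel(
#         processed_di_result_markdown="",
#         raw_di_response={},
#         di_time_taken_secs=None,
#     )
#     return func.HttpResponse(
#         body=output_model.model_dump_json(),
#         mimetype="application/json",
#         status_code=200,
#     )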
@bp_multimodal_doc_intel_processing.route(route=FUNCTION_ROUTE)
def multimodal_doc_intel_processing(req: func.HttpRequest) -> func.HttpResponse:
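    """
    Processes an incoming document with Azure Document Intelligence and returns
    the processed content as a single Markdown string. The request is expected
    to be multipart form data containing a `file` part (the document bytes) and
    a `json` part (the processing options described by `FunctionRequestModel`).
    """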
    logger.info(f"Python HTTP trigger function `{FUNCTION_ROUTE}` received a request.")
    try:
        # Load and validate input data
        error_text = "Error while loading and validating the input data."
        error_code = 422
        # Check the request body
        request_json_content = json.loads(req.files["json"].read().decode("utf-8"))
        include_page_images_after_content = request_json_content.get(
            "include_page_images_after_content", False
        )
        extract_and_crop_inline_figures = request_json_content.get(
            "extract_and_crop_inline_figures", False
        )
        # Now construct a splitter which can separate the outputs into page-based chunks
        pages_per_chunk = request_json_content.get("pages_per_chunk", 3)
        page_chunk_splitter = PageDocumentListSplitter(pages_per_chunk=pages_per_chunk)
        file_bytes = req.files["file"].read()
        file_mime_type = req.files["file"].content_type
        # Create the Doc Intelligence result processor. This can be configured to
        # process the raw Doc Intelligence result into a format that is easier
        # to work with downstream.
        doc_intel_result_processor = DocumentIntelligenceProcessor(
            page_processor=DefaultDocumentPageProcessor(
                page_img_order="after" if include_page_images_after_content else None,
            ),
            figure_processor=DefaultDocumentFigureProcessor(
                output_figure_img=extract_and_crop_inline_figures
            ),
        )
        # Process the document with Document Intelligence
        error_text = "An error occurred while processing the document."
        error_code = 422
        # Load content as images
        doc_page_imgs = load_visual_obj_bytes_to_pil_imgs_dict(
            file_bytes, file_mime_type, starting_idx=1, pdf_img_dpi=100
        )
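        # NOTE: `doc_page_imgs` is assumed to map 1-based page numbers to PIL images
        # (based on `starting_idx=1` above); these are passed to the processor so that
        # page and figure images can be cropped and embedded alongside the text content.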
        # Get the Document Intelligence result
        poller = di_client.begin_analyze_document(
            model_id=DOC_INTEL_MODEL_ID,
            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
        )
        di_result = poller.result()
        # Process the result into Documents containing the content of every element
        processed_content_docs = doc_intel_result_processor.process_analyze_result(
            analyze_result=di_result,
            doc_page_imgs=doc_page_imgs,
            on_error="raise",
        )
        # Chunk the content by page
        page_chunked_content_docs = page_chunk_splitter.split_document_list(
            processed_content_docs
        )
        # Merge adjacent text content together (reducing the number of objects)
        merged_page_chunked_content_docs = (
            doc_intel_result_processor.merge_adjacent_text_content_docs(
                page_chunked_content_docs
            )
        )
        # Convert the chunks into a single Markdown string
        di_processed_md = convert_processed_di_doc_chunks_to_markdown(
            merged_page_chunked_content_docs
        )
        return func.HttpResponse(
            body=di_processed_md,
            mimetype="text/plain",
            status_code=200,
        )
    except Exception as e:
        logger.exception(e)
        return func.HttpResponse(error_text, status_code=error_code)
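# Example of how this endpoint might be called (a sketch for local testing only;
# the host/port assume the default local Azure Functions settings, and the file
# path is hypothetical):
#
#     import json
#     import requests
#
#     with open("example.pdf", "rb") as f:
#         response = requests.post(
#             "http://localhost:7071/api/multimodal_doc_intel_processing",
#             files={
#                 "file": ("example.pdf", f, "application/pdf"),
#                 "json": (
#                     "json",
#                     json.dumps(
#                         {
#                             "include_page_images_after_content": True,
#                             "extract_and_crop_inline_figures": True,
#                             "pages_per_chunk": 3,
#                         }
#                     ),
#                     "application/json",
#                 ),
#             },
#         )
#     print(response.text)  # The processed Markdown string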