in function_app/bp_pii_redaction.py [0:0]
def redact_pii_pdf(req: func.HttpRequest) -> func.HttpResponse:
    """Redact PII from a PDF supplied in the HTTP request body.

    Pipeline:
      1. Validate the request (Content-Type and non-empty body) and load the PDF.
      2. Render the input pages to images and extract the raw text, using the
         method selected by ``TEXT_EXTRACTION_METHOD``.
      3. Run PII recognition over the extracted text.
      4. Replace each recognised entity with ``<<CATEGORY>>`` in both the text
         and the PDF, and render the redacted pages to images.
      5. Return the populated response model as JSON.

    Args:
        req: The incoming HTTP request. Its body is passed as-is to the PDF
            loader (and to Document Intelligence when that method is selected).

    Returns:
        - A plain-text response with status 422 when the Content-Type is not
          ``application/pdf`` or the body is empty.
        - A JSON response with status 200 and ``success=True`` when every step
          completes.
        - A JSON response with ``success=False`` and ``error_text`` set when any
          step raises: status 422 if the failure happens while loading the PDF,
          500 for failures in later steps. Fields populated before the failure
          are still included.
    """
    logging.info(
        f"Python HTTP trigger function `{PDF_FUNCTION_ROUTE}` received a request."
    )
    output_model = PDFFunctionReponseModel(success=False)
    # error_text and error_code are updated as we move through the pipeline so
    # that, if a step fails, they describe the step that failed. They are set
    # up (together with the timer) before the `try` so the exception handler
    # can always rely on them being bound.
    error_text = "An error occurred while reading the PDF file"
    error_code = 422
    func_timer = MeasureRunTime()
    func_timer.start()
    try:
        ### 1. Check the request body
        # Compare only the media type: media types are case-insensitive and
        # the header may carry parameters (e.g. "application/pdf; charset=...").
        mime_type = req.headers.get("Content-Type")
        media_type = (mime_type or "").split(";", 1)[0].strip().lower()
        if media_type != "application/pdf":
            return func.HttpResponse(
                "This function only supports a Content-Type of 'application/pdf'. Supplied file is of type {}".format(
                    mime_type
                ),
                status_code=error_code,
            )
        req_body = req.get_body()
        if not req_body:
            # NOTE(review): the message mentions base64, but the body is handed
            # to the PDF loader below without any decoding in this function --
            # confirm the intended wording.
            return func.HttpResponse(
                "Please provide a base64 encoded PDF in the request body.",
                status_code=error_code,
            )
        pdf = load_pymupdf_pdf(pdf_bytes=req_body)

        ### 2. Extract the text and images
        error_text = "An error occurred during text & image extraction."
        error_code = 500
        # NOTE(review): the positional args 80 and 0 are presumably render
        # settings (e.g. DPI / rotation) -- confirm against the helper.
        input_doc_pdf_page_imgs = [
            pil_img_to_base64_bytes(pymupdf_pdf_page_to_img_pil(page, 80, 0))
            for page in pdf
        ]
        output_model.input_doc_pdf_page_imgs = input_doc_pdf_page_imgs
        # Extract the text from the PDF
        if TEXT_EXTRACTION_METHOD is PDFTextExtractionMethod.PYMUPDF:
            # The first positional parameter of get_text() is the output
            # format option (a string), not a page number, so call it with
            # its default to get plain text. Pages are joined with form feeds.
            raw_text = "\f".join(page.get_text() for page in pdf)
        elif TEXT_EXTRACTION_METHOD is PDFTextExtractionMethod.DOCUMENT_INTELLIGENCE:
            poller = di_client.begin_analyze_document(
                model_id=DOC_INTEL_MODEL_ID,
                analyze_request=AnalyzeDocumentRequest(bytes_source=req_body),
            )
            di_result = poller.result()
            output_model.di_raw_response = di_result.as_dict()
            raw_text = di_result.content
        else:
            raise ValueError(
                f"Invalid text extraction method: {TEXT_EXTRACTION_METHOD}"
            )

        ### 3. Recognise PII in the text using Azure AI Language service
        error_text = "An error occurred during PII recognition."
        documents = [raw_text]
        pii_result = text_analytics_client.recognize_pii_entities(
            documents=documents,
        )
        output_model.pii_raw_response = [str(doc_result) for doc_result in pii_result]
        # Only one document was submitted, so only the first result is relevant.
        pii_result_doc = pii_result[0]
        if pii_result_doc.is_error:
            raise Exception("An error occurred during PII recognition")

        ### 4. Replace the PII entities with '<<CATEGORY>>' text.
        # This gives us a more readable output than the redacted text from the
        # raw API response (which simply replaces PII with asterisks).
        replacements = {
            entity.text: f"<<{entity.category}>>" for entity in pii_result_doc.entities
        }
        output_model.redacted_text = replace_text(raw_text, replacements)
        redacted_pdf = replace_text_in_pdf(pdf, replacements)
        output_model.redacted_pdf = b64encode(redacted_pdf.tobytes()).decode("utf-8")
        redacted_pdf_page_imgs = [
            pil_img_to_base64_bytes(pymupdf_pdf_page_to_img_pil(page, 80, 0))
            for page in redacted_pdf
        ]
        output_model.redacted_pdf_page_imgs = redacted_pdf_page_imgs

        ### 5. All steps completed successfully, set success=True and return the final result
        output_model.success = True
        output_model.func_time_taken_secs = func_timer.stop()
        return func.HttpResponse(
            body=output_model.model_dump_json(),
            mimetype="application/json",
            status_code=200,
        )
    except Exception:
        # If an error occurred at any stage, return the partial response. Update the error_text
        # field to contain the error message, and ensure success=False.
        output_model.success = False
        output_model.error_text = error_text
        output_model.func_time_taken_secs = func_timer.stop()
        # logging.exception records the traceback of the active exception.
        logging.exception(output_model.error_text)
        return func.HttpResponse(
            body=output_model.model_dump_json(),
            mimetype="application/json",
            status_code=error_code,
        )