def redact_pii_pdf()

in function_app/bp_pii_redaction.py [0:0]


def redact_pii_pdf(req: func.HttpRequest) -> func.HttpResponse:
    """Redact PII from a PDF supplied in the HTTP request body.

    Pipeline:
      1. Validate the request (Content-Type and non-empty body) and load the PDF.
      2. Render the input pages to images and extract the raw text, using the
         method selected by ``TEXT_EXTRACTION_METHOD``.
      3. Run PII recognition over the extracted text.
      4. Replace each recognised entity with a ``<<CATEGORY>>`` placeholder in
         both the text and the PDF, and render the redacted pages to images.
      5. Return the populated response model as JSON.

    Args:
        req: The incoming HTTP request. The body must hold the PDF bytes and
            the ``Content-Type`` header must be exactly ``application/pdf``.

    Returns:
        * A plain-text response with status 422 when the Content-Type is not
          ``application/pdf`` or the body is empty.
        * A JSON response with status 200 on success.
        * A JSON response holding the partial result, ``success=False`` and
          ``error_text`` describing the failed stage when any step raises.
          The status is 422 if the failure happened while loading the PDF
          and 500 for every later stage.
    """
    logging.info(
        f"Python HTTP trigger function `{PDF_FUNCTION_ROUTE}` received a request."
    )
    output_model = PDFFunctionReponseModel(success=False)

    # error_text and error_code are updated as we move through the pipeline so
    # that, if a step fails, they reflect what has failed. They are bound
    # (together with the timer) *before* the try block so the except handler
    # can always read them; if every step succeeds they are never used.
    error_text = "An error occurred while reading the PDF file"
    error_code = 422
    func_timer = MeasureRunTime()

    try:
        func_timer.start()

        ### 1. Check the request body
        # Check mime_type of the request data
        mime_type = req.headers.get("Content-Type")
        if mime_type != "application/pdf":
            return func.HttpResponse(
                f"This function only supports a Content-Type of 'application/pdf'. Supplied file is of type {mime_type}",
                status_code=error_code,
            )

        req_body = req.get_body()
        if not req_body:
            return func.HttpResponse(
                "Please provide a base64 encoded PDF in the request body.",
                status_code=error_code,
            )
        pdf = load_pymupdf_pdf(pdf_bytes=req_body)

        ### 2. Extract the text and images
        error_text = "An error occurred during text & image extraction."
        error_code = 500

        output_model.input_doc_pdf_page_imgs = [
            pil_img_to_base64_bytes(pymupdf_pdf_page_to_img_pil(page, 80, 0))
            for page in pdf
        ]
        # Extract the text from the PDF
        if TEXT_EXTRACTION_METHOD is PDFTextExtractionMethod.PYMUPDF:
            # Page.get_text() takes an output-format option as its first
            # argument (not a page number), so call it with the default
            # plain-text option. Pages are joined with a form feed.
            raw_text = "\f".join(page.get_text() for page in pdf)
        elif TEXT_EXTRACTION_METHOD is PDFTextExtractionMethod.DOCUMENT_INTELLIGENCE:
            poller = di_client.begin_analyze_document(
                model_id=DOC_INTEL_MODEL_ID,
                analyze_request=AnalyzeDocumentRequest(bytes_source=req_body),
            )
            di_result = poller.result()
            output_model.di_raw_response = di_result.as_dict()
            raw_text = di_result.content
        else:
            raise ValueError(
                f"Invalid text extraction method: {TEXT_EXTRACTION_METHOD}"
            )

        ### 3. Recognise PII in the text
        error_text = "An error occurred during PII recognition."
        pii_result = text_analytics_client.recognize_pii_entities(
            documents=[raw_text],
        )
        output_model.pii_raw_response = [str(doc_result) for doc_result in pii_result]
        # A single document was submitted, so only the first result is relevant.
        pii_result_doc = pii_result[0]
        if pii_result_doc.is_error:
            raise Exception("An error occurred during PII recognition")

        ### 4. Replace the PII entities with '<<CATEGORY>>' text.
        # This gives us a more readable output than the redacted text from the
        # raw API response (which simply replaces PII with asterisks).
        replacements = {
            entity.text: f"<<{entity.category}>>" for entity in pii_result_doc.entities
        }
        output_model.redacted_text = replace_text(raw_text, replacements)
        redacted_pdf = replace_text_in_pdf(pdf, replacements)
        output_model.redacted_pdf = b64encode(redacted_pdf.tobytes()).decode("utf-8")
        output_model.redacted_pdf_page_imgs = [
            pil_img_to_base64_bytes(pymupdf_pdf_page_to_img_pil(page, 80, 0))
            for page in redacted_pdf
        ]

        ### 5. All steps completed successfully, set success=True and return the final result
        output_model.success = True
        output_model.func_time_taken_secs = func_timer.stop()
        return func.HttpResponse(
            body=output_model.model_dump_json(),
            mimetype="application/json",
            status_code=200,
        )
    except Exception:
        # If an error occurred at any stage, return the partial response. Update the error_text
        # field to contain the error message, and ensure success=False.
        output_model.success = False
        output_model.error_text = error_text
        output_model.func_time_taken_secs = func_timer.stop()
        # logging.exception also records the active traceback.
        logging.exception(output_model.error_text)
        return func.HttpResponse(
            body=output_model.model_dump_json(),
            mimetype="application/json",
            status_code=error_code,
        )