def get_doc()

in sql-pdf-python/src/cloud-functions/process_docai/main.py [0:0]


# MIME types this handler forwards to Document AI; anything else gets an
# error reply instead of a processing attempt.
_ACCEPTED_FILE_TYPES = frozenset(
    {
        "application/pdf",
        "image/png",
        "image/gif",
        "image/tiff",
        "image/jpeg",
        "image/webp",
        "image/bmp",
    }
)


def _extract_entities(document):
    """Flatten a processed document's entities into parallel column lists.

    Each entity is emitted first, immediately followed by its direct
    properties (sub-entities). Only one level of properties is visited.

    Args:
        document: Object exposing ``entities``; every entity/property must
            expose ``type_``, ``mention_text``, ``normalized_value.text`` and
            ``confidence``.

    Returns:
        A dict with the keys "Type", "Raw Value", "Normalized Value" and
        "Confidence", each mapping to a list of equal length. Confidence is
        rendered as a whole-number percentage string (e.g. "97%").
    """
    types = []
    raw_values = []
    normalized_values = []
    confidence = []

    def add(item):
        types.append(item.type_)
        raw_values.append(item.mention_text)
        normalized_values.append(item.normalized_value.text)
        confidence.append(f"{item.confidence:.0%}")

    for entity in document.entities:
        add(entity)
        # Properties (sub-entities) carry their own confidence scores.
        for prop in entity.properties:
            add(prop)

    return {
        "Type": types,
        "Raw Value": raw_values,
        "Normalized Value": normalized_values,
        "Confidence": confidence,
    }


def get_doc(request):
    """Process each requested file with Document AI and return its entities.

    The JSON body must contain a "calls" list; every call is a sequence of
    ``[uri, content_type, location, processor_id]``. One reply is produced per
    call, in order.
    NOTE(review): the {"calls": ...} / {"replies": ...} shape looks like the
    BigQuery remote function contract -- confirm against the caller.

    Args:
        request: HTTP request object exposing ``get_json(silent=True)``.

    Returns:
        A JSON string ``{"replies": [...]}`` where each reply is itself a
        JSON-encoded string: the extracted-entity columns for supported
        content types, or ``[{"output": "Cannot parse the file type"}]``
        for unsupported ones.
    """
    request_json = request.get_json(silent=True)
    calls = request_json["calls"]

    replies = []
    # One client per regional endpoint, reused across calls in this request.
    clients = {}

    for call in calls:
        uri = call[0]
        content_type = call[1]
        location = call[2]
        processor_id = call[3]

        print("Uri:", uri)
        print("Content:", content_type)
        print("Processor_id:", processor_id)
        print("Location:", location)

        if content_type not in _ACCEPTED_FILE_TYPES:
            # Reject before touching Document AI: no client is needed here.
            replies.append([{"output": "Cannot parse the file type"}])
            continue

        docai_client = clients.get(location)
        if docai_client is None:
            opts = ClientOptions(
                api_endpoint=f"{location}-documentai.googleapis.com"
            )
            docai_client = documentai.DocumentProcessorServiceClient(
                client_options=opts
            )
            clients[location] = docai_client

        processor = docai_client.processor_path(
            project_number, location, processor_id
        )

        # presumably get_gcs_file returns the raw file bytes -- verify helper
        file_content = get_gcs_file(uri)
        raw_document = documentai.RawDocument(
            content=file_content, mime_type=content_type
        )
        # Named distinctly so the incoming HTTP ``request`` is not shadowed.
        process_request = documentai.ProcessRequest(
            name=processor, raw_document=raw_document
        )
        response = docai_client.process_document(request=process_request)
        document = response.document

        print("Length:", len(document.entities))

        replies.append(_extract_entities(document))

    # Each reply is serialized on its own, then the envelope is serialized,
    # so the caller receives a list of JSON strings.
    return json.dumps({"replies": [json.dumps(extracts) for extracts in replies]})