in sql-pdf-python/src/cloud-functions/process_docai/main.py [0:0]
# MIME types forwarded to Document AI; any other content type gets an error
# reply instead of a processing call.
_ACCEPTED_FILE_TYPES = frozenset(
    {
        "application/pdf",
        "image/png",
        "image/gif",
        "image/tiff",
        "image/jpeg",
        "image/webp",
        "image/bmp",
    }
)


def _collect_entity_fields(document):
    """Flatten a processed document's entities into four parallel lists.

    Every entity is emitted first, immediately followed by its properties
    (sub-entities), so index ``i`` in each list describes the same item.

    Args:
        document: Object exposing ``entities``; each entity exposes ``type_``,
            ``mention_text``, ``normalized_value.text``, ``confidence`` and
            ``properties`` (items with the same four fields).

    Returns:
        A dict with the keys ``"Type"``, ``"Raw Value"``,
        ``"Normalized Value"`` and ``"Confidence"`` (confidence rendered as a
        whole-number percentage string).
    """
    types = []
    raw_values = []
    normalized_values = []
    confidence = []
    for entity in document.entities:
        # The entity itself, then its properties, share the same field layout.
        for item in (entity, *entity.properties):
            types.append(item.type_)
            raw_values.append(item.mention_text)
            normalized_values.append(item.normalized_value.text)
            confidence.append(f"{item.confidence:.0%}")
    return {
        "Type": types,
        "Raw Value": raw_values,
        "Normalized Value": normalized_values,
        "Confidence": confidence,
    }


def get_doc(request):
    """Run Document AI over each requested file and return the extracted entities.

    The request body must be JSON of the form ``{"calls": [[uri, content_type,
    location, processor_id], ...]}``. One reply is produced per call, in order.
    NOTE(review): this request/response shape looks like the BigQuery
    remote-function protocol -- confirm against the caller.

    Relies on names defined elsewhere in this module: ``ClientOptions``,
    ``documentai``, ``project_number``, ``get_gcs_file`` and ``json``.

    Args:
        request: HTTP request object exposing ``get_json(silent=True)``.

    Returns:
        A JSON string ``{"replies": [...]}`` where each reply is itself a
        JSON-encoded string: either the entity dict built by
        ``_collect_entity_fields`` or, for an unsupported content type,
        ``[{"output": "Cannot parse the file type"}]``.

    Raises:
        TypeError: If the body could not be parsed as JSON, because
            ``get_json(silent=True)`` then yields no dict to index.
        KeyError: If the body has no ``"calls"`` key.
    """
    request_json = request.get_json(silent=True)
    replies = []
    # One client per regional endpoint, created lazily and reused across calls.
    clients = {}
    for call in request_json["calls"]:
        uri = call[0]
        content_type = call[1]
        location = call[2]
        processor_id = call[3]
        print("Uri:", uri)
        print("Content:", content_type)
        print("Processor_id:", processor_id)
        print("Location:", location)

        if content_type not in _ACCEPTED_FILE_TYPES:
            # Reject before touching Document AI: no client is needed here.
            replies.append([{"output": "Cannot parse the file type"}])
            continue

        docai_client = clients.get(location)
        if docai_client is None:
            opts = ClientOptions(
                api_endpoint=f"{location}-documentai.googleapis.com"
            )
            docai_client = documentai.DocumentProcessorServiceClient(
                client_options=opts
            )
            clients[location] = docai_client
        processor = docai_client.processor_path(
            project_number, location, processor_id
        )

        file = get_gcs_file(uri)
        raw_document = documentai.RawDocument(content=file, mime_type=content_type)
        # Named distinctly so the incoming HTTP ``request`` is not shadowed.
        process_request = documentai.ProcessRequest(
            name=processor, raw_document=raw_document
        )
        response = docai_client.process_document(request=process_request)
        document = response.document
        print("Length:", len(document.entities))
        replies.append(_collect_entity_fields(document))

    return json.dumps({"replies": [json.dumps(extracts) for extracts in replies]})