in microservices/extraction_service/src/utils/extract_entities.py [0:0]
def specialized_parser_extraction(parser_details: dict, gcs_doc_path: str,
                                  doc_type: str, context: str):
    """Run a specialized Document AI parser on a GCS document and extract entities.

    Downloads the PDF from GCS, sends it to the configured Document AI
    processor, strips attributes listed in DOCAI_ATTRIBUTES_TO_IGNORE from
    the JSON response, then maps the remaining fields through the
    doc-type/context entity mapping via ``entities_extraction``.

    Parameters
    ----------
    parser_details: dict
        Parser info; must contain "location" and "processor_id" (the full
        processor resource name:
        projects/<project>/locations/<location>/processors/<processor-id>).
    gcs_doc_path: str
        GCS URI of the PDF document.
    doc_type: str
        Document type used to select the entity mapping.
    context: str
        Mapping context; falls back to the "all" mapping when the context
        is unknown or has no entry for this doc_type.

    Returns
    -------
    list of dict
        Entity dicts with entity, value, confidence and manual_extraction
        information (as produced by ``entities_extraction``).
    """
    location = parser_details["location"]
    processor_id = parser_details["processor_id"]
    opts = {}
    if location == "eu":
        # EU-hosted processors must be addressed via the regional endpoint.
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    blob = download_pdf_gcs(gcs_uri=gcs_doc_path)
    document = {
        "content": blob.download_as_bytes(),
        "mime_type": "application/pdf"
    }
    # Configure the process request; "processor_id" is already the full
    # processor resource name expected in the "name" field.
    request = {"name": processor_id, "raw_document": document}
    Logger.info("Specialized parser extraction api called")
    # Send request to the Document AI parser.
    result = client.process_document(request=request)
    parser_doc_data = result.document
    # Convert the proto-plus Document message to a plain dict via JSON.
    json_string = proto.Message.to_json(parser_doc_data)
    data = json.loads(json_string)
    # Remove unnecessary attributes from the parser response.
    for each_attr in DOCAI_ATTRIBUTES_TO_IGNORE:
        if "." in each_attr:
            # Nested attribute ("parent.child"): drop the child key from
            # every element of the parent list. maxsplit=1 tolerates attrs
            # containing more than one dot (plain split would raise
            # ValueError on unpacking).
            parent_attr, child_attr = each_attr.split(".", 1)
            # Default to [] so a missing parent attribute is a no-op; the
            # previous default of 0 crashed with TypeError on len(0).
            for item in data.get(parent_attr, []):
                item.pop(child_attr, None)
        else:
            data.pop(each_attr, None)
    # Get the mapping for this context; fall back to the "all" mapping when
    # the context is unknown (or {} guards the .get chain against None) or
    # has no entry for this doc_type.
    docai_entity_mapping_by_context = DOCAI_ENTITY_MAPPING.get(context) or {}
    mapping_dict = docai_entity_mapping_by_context.get(
        doc_type) or DOCAI_ENTITY_MAPPING["all"][doc_type]
    # Extract the configured entities from the pruned response.
    extracted_entity_dict = entities_extraction(data, mapping_dict, doc_type)
    # Flatten to a list of entity dicts for the caller.
    specialized_parser_entity_list = list(extracted_entity_dict.values())
    Logger.info("Required entities created from Specialized parser response")
    return specialized_parser_entity_list