in microservices/extraction_service/src/utils/extract_entities.py [0:0]
def extract_entities(gcs_doc_path: str, doc_type: str, context: str):
    """
    Extract entities from a document with the parser configured for its type.

    The parser entry for ``doc_type`` is looked up in ``PARSER_CONFIG``. A
    parser whose ``parser_type`` is ``"FORM_PARSER_PROCESSOR"`` is run through
    the form parser; every other parser type is run through the specialized
    parser. The extracted entities are then mapped to standard entity names,
    post-processed, and scored.

    Parameters
    ----------
    gcs_doc_path: Document gcs path
    doc_type: Type of document. Ex: unemployment_form, driver_license, etc.
    context: context, passed through unchanged to the parser functions

    Returns
    -------
    On success, a 3-tuple of:
      - the extracted entities (per the original documentation: a list of
        dicts having entity, value, confidence and manual_extraction
        information),
      - the document extraction confidence,
      - the extraction status.
    None when no parser is configured for ``doc_type``.
    """
    parser_cfg = PARSER_CONFIG.get(doc_type)

    # Guard clause: nothing to extract without a parser for this doc type.
    if not parser_cfg:
        Logger.error(f"Parser not available for this document:{doc_type}")
        return None

    parser_name = parser_cfg["parser_name"]
    parser_type = parser_cfg["parser_type"]

    if parser_type == "FORM_PARSER_PROCESSOR":
        Logger.info(f"Form parser extraction started for"
                    f" this document:{doc_type}")
        # NOTE(review): the trailing 300 is presumably a timeout in seconds;
        # confirm against form_parser_extraction's signature.
        raw_entities, flag = form_parser_extraction(
            parser_cfg, gcs_doc_path, doc_type, context, 300)
    else:
        Logger.info(f"Specialized parser extraction "
                    f"started for this document:{doc_type}")
        # The specialized parser returns only the entities, so the flag that
        # is later handed to the accuracy calculation is fixed to True here.
        flag = True
        raw_entities = specialized_parser_extraction(
            parser_cfg, gcs_doc_path, doc_type, context)

    # Standardize entity names for this parser.
    mapped_entities = standard_entity_mapping(raw_entities, parser_name)

    # Post-processing: convert to the processing format, transform, then
    # convert back into the structure expected by the database.
    processing_input = get_json_format_for_processing(mapped_entities)
    _, transformed = data_transformation(processing_input)
    db_ready_entities = correct_json_format_for_db(
        transformed, mapped_entities)

    # Extraction accuracy calculation.
    confidence, status = extraction_accuracy_calc(db_ready_entities, flag)

    Logger.info(f"Extraction completed for this document:{doc_type}")
    return db_ready_entities, confidence, status