def extract_entities()

in microservices/extraction_service/src/utils/extract_entities.py [0:0]


def extract_entities(gcs_doc_path: str, doc_type: str, context: str):
  """
  Extract, standardize and post-process the entities of one document.

  The parser to use is looked up in PARSER_CONFIG by document type. A
  parser whose type is "FORM_PARSER_PROCESSOR" goes through the form
  parser extraction; every other parser type goes through the
  specialized parser extraction.

  Parameters
  ----------
  gcs_doc_path: GCS path of the document
  doc_type: Type of document. Ex: unemployment_form, driver_license, etc.
  context: context forwarded unchanged to the parser extraction call

  Returns
  -------
    None when PARSER_CONFIG has no (non-empty) entry for doc_type.
    Otherwise a tuple of three items:
      - list of dicts having entity, value, confidence and
        manual_extraction information
      - document extraction confidence
      - extraction status
    The last two are whatever extraction_accuracy_calc returns.
  """

  parser_details = PARSER_CONFIG.get(doc_type)

  # Guard clause: nothing can be extracted without a configured parser.
  if not parser_details:
    Logger.error(f"Parser not available for this document:{doc_type}")
    return None

  parser_name = parser_details["parser_name"]
  parser_type = parser_details["parser_type"]

  if parser_type == "FORM_PARSER_PROCESSOR":
    Logger.info("Form parser extraction started for "
                f"this document:{doc_type}")
    # NOTE(review): the meaning of the trailing 300 is not visible here
    # (presumably a timeout) - confirm against form_parser_extraction.
    raw_entities, flag = form_parser_extraction(
        parser_details, gcs_doc_path, doc_type, context, 300)
  else:
    Logger.info("Specialized parser extraction started "
                f"for this document:{doc_type}")
    # The specialized path returns only the entities; flag is fixed to True.
    flag = True
    raw_entities = specialized_parser_extraction(
        parser_details, gcs_doc_path, doc_type, context)

  # Standardize the entity names for this parser.
  mapped_entities = standard_entity_mapping(raw_entities, parser_name)

  # Post-processing: reshape for the transformation step, transform, then
  # fold the transformed output back into the DB-ready entity structure.
  processing_input = get_json_format_for_processing(mapped_entities)
  _, transformed_output = data_transformation(processing_input)
  db_ready_entities = correct_json_format_for_db(
      transformed_output, mapped_entities)

  # Extraction accuracy calculation.
  accuracy_result = extraction_accuracy_calc(db_ready_entities, flag)
  document_extraction_confidence, extraction_status = accuracy_result

  Logger.info(f"Extraction completed for this document:{doc_type}")
  return (db_ready_entities,
          document_extraction_confidence,
          extraction_status)