# specialized_parser_extraction()
# From: microservices/extraction_service/src/utils/extract_entities.py

def specialized_parser_extraction(parser_details: dict, gcs_doc_path: str,
                                  doc_type: str, context: str):
  """Run specialized Document AI parser extraction for one document.

    Sends the PDF at ``gcs_doc_path`` to the configured specialized parser,
    strips ignored attributes from the parser response, selects the entity
    mapping for the given context/doc_type, and returns the extracted
    entities.

    Parameters
    ----------
    parser_details: dict with parser info such as ``processor_id`` (the full
        processor resource name) and ``location``.
    gcs_doc_path: GCS URI of the PDF document to process.
    doc_type: Document type key used to select the entity mapping.
    context: Context key used to select the entity mapping; falls back to
        the "all" mapping when no context-specific mapping exists.

    Returns: Specialized parser response - list of dicts having entity,
     value, confidence and manual_extraction information.
    -------
  """
  location = parser_details["location"]
  processor_id = parser_details["processor_id"]

  # Use the regional API endpoint when the processor lives in the EU.
  opts = {}
  if location == "eu":
    opts = {"api_endpoint": "eu-documentai.googleapis.com"}

  client = documentai.DocumentProcessorServiceClient(client_options=opts)
  # processor_id already holds the full resource name, e.g.:
  # projects/project-id/locations/location/processors/processor-id
  name = processor_id
  blob = download_pdf_gcs(gcs_uri=gcs_doc_path)

  document = {
      "content": blob.download_as_bytes(),
      "mime_type": "application/pdf"
  }
  # Configure the process request and send it to the parser.
  request = {"name": name, "raw_document": document}
  Logger.info("Specialized parser extraction api called")
  result = client.process_document(request=request)
  parser_doc_data = result.document
  # Convert the proto response to a plain dict for post-processing.
  json_string = proto.Message.to_json(parser_doc_data)
  data = json.loads(json_string)

  # Remove unnecessary entities from the parser output. A dotted name
  # ("parent.child") removes the child key from every element of the
  # parent list; a plain name removes the top-level key.
  for each_attr in DOCAI_ATTRIBUTES_TO_IGNORE:
    if "." in each_attr:
      parent_attr, child_attr = each_attr.split(".", 1)
      # Default to [] so a missing parent attribute is a no-op
      # (a non-list default here would crash the iteration).
      for parent_item in data.get(parent_attr, []):
        parent_item.pop(child_attr, None)
    else:
      data.pop(each_attr, None)

  # Get corresponding mapping dict for the specific context, or fall back
  # to "all". The `or {}` guards against an unknown context key, which
  # would otherwise make the .get(doc_type) call raise AttributeError.
  docai_entity_mapping_by_context = DOCAI_ENTITY_MAPPING.get(context) or {}
  mapping_dict = docai_entity_mapping_by_context.get(
      doc_type) or DOCAI_ENTITY_MAPPING["all"][doc_type]

  # Extract entities and flatten to a list of entity dicts.
  extracted_entity_dict = entities_extraction(data, mapping_dict, doc_type)
  specialized_parser_entity_list = list(extracted_entity_dict.values())

  Logger.info("Required entities created from Specialized parser response")
  return specialized_parser_entity_list