# form_parser_entities_mapping()
# from microservices/extraction_service/src/utils/utils_functions.py

def _null_entity_dict(entity_name):
  """Return a placeholder entity record with all extraction fields null."""
  return {"entity": entity_name, "value": None,
          "extraction_confidence": None,
          "manual_extraction": False,
          "corrected_value": None,
          "value_coordinates": None,
          "key_coordinates": None,
          "page_no": None,
          "page_width": None,
          "page_height": None
          }


def _filled_entity_dict(df, entity_name, row_idx):
  """Build an entity record from row ``row_idx`` of the parser dataframe.

  Raises IndexError/KeyError/TypeError/ValueError when the row is missing
  or a field cannot be converted; the caller falls back to a null record.
  """
  return {"entity": entity_name,
          "value": df["value"][row_idx],
          "extraction_confidence": float(df["value_confidence"][row_idx]),
          "manual_extraction": False,
          "corrected_value": None,
          "value_coordinates": [float(i) for i in
                                df["value_coordinates"][row_idx]],
          "key_coordinates": [float(i) for i in
                              df["key_coordinates"][row_idx]],
          "page_no": int(df["page_no"][row_idx]),
          "page_width": int(df["page_width"][row_idx]),
          "page_height": int(df["page_height"][row_idx])
          }


def form_parser_entities_mapping(form_parser_entity_list, mapping_dict,
                                 form_parser_text, json_folder):
  """
    Form parser entity mapping function

    Parameters
    ----------
    form_parser_entity_list: Extracted form parser entities before mapping
            (list of dicts with "key", "value", "value_confidence",
            coordinate and page fields)
    mapping_dict: Mapping dictionary have info of default, derived entities
            along with desired keys ("default_entities" required,
            "derived_entities" and "table_entities" optional)
    form_parser_text: full document text, used for regex-derived entities
    json_folder: folder containing parser JSON output, used for table
            entity extraction

    Returns: tuple of (required entities - list of dicts having entity,
            value, confidence and manual_extraction information -,
            duplicate-key flag from check_duplicate_keys)
    -------
  """
  # extract entities information from config files
  default_entities = mapping_dict.get("default_entities")
  derived_entities = mapping_dict.get("derived_entities")
  table_entities = mapping_dict.get("table_entities")
  flag = check_duplicate_keys(default_entities, form_parser_entity_list)

  df = pd.DataFrame(form_parser_entity_list)
  required_entities_list = []
  # loop through one by one default entities mentioned in the config file
  for each_ocr_key, each_ocr_val in default_entities.items():
    try:
      idx_list = df.index[df["key"] == each_ocr_key].tolist()
    except KeyError:
      # "key" column absent (e.g. empty parser output) -> no matches
      idx_list = []
    # loop for matched records of mapping dictionary
    for idx, each_val in enumerate(each_ocr_val):
      if idx_list:
        try:
          temp_dict = _filled_entity_dict(df, each_val, idx_list[idx])
        except (IndexError, KeyError, TypeError, ValueError):
          # fewer parser matches than configured aliases, or a malformed
          # record: keep the entity but with null extraction fields
          Logger.info("Key not found in parser output,"
                      " so filling null value")
          temp_dict = _null_entity_dict(each_val)
      else:
        # filling null value if parser didn't extract
        temp_dict = _null_entity_dict(each_val)
      required_entities_list.append(temp_dict)
  Logger.info("Default entities created from Form parser response")

  if derived_entities:
    # this function can be used for all docs, if derived entities
    # are extracted by using regex pattern
    parser_data = {"text": form_parser_text}
    derived_entities_op_dict = derived_entities_extraction(parser_data,
                                                           derived_entities)
    required_entities_list.extend(list(derived_entities_op_dict.values()))
    Logger.info("Derived entities created from Form parser response")

  if table_entities:
    # scan the parser JSON files until one yields table entities
    table_response = None
    for json_file in os.listdir(json_folder):
      json_path = os.path.join(json_folder, json_file)
      table_extract_obj = TableExtractor(json_path)
      table_response = table_extract_obj.get_entities(table_entities)
      if table_response and isinstance(table_response, list):
        required_entities_list.extend(table_response)
        break
    if table_response is None:
      Logger.error("No table data found")

  return required_entities_list, flag