form_parser_entities_mapping()

Defined in microservices/prior_learning_assessment/src/services/extraction/utils_functions.py


def _null_entity_dict(entity_name):
  """Return a placeholder entity dict for a key the parser did not extract."""
  return {
      "entity": entity_name,
      "value": None,
      "extraction_confidence": None,
      "manual_extraction": False,
      "corrected_value": None,
      "value_coordinates": None,
      "key_coordinates": None,
      "page_no": None,
      "page_width": None,
      "page_height": None
  }


def _matched_entity_dict(df, idx, entity_name):
  """Build an entity dict from the form-parser dataframe row at ``idx``."""
  return {
      "entity": entity_name,
      "value": df["value"][idx],
      "extraction_confidence": float(df["value_confidence"][idx]),
      "manual_extraction": False,
      "corrected_value": None,
      "value_coordinates": [float(i) for i in df["value_coordinates"][idx]],
      "key_coordinates": [float(i) for i in df["key_coordinates"][idx]],
      "page_no": int(df["page_no"][idx]),
      "page_width": int(df["page_width"][idx]),
      "page_height": int(df["page_height"][idx])
  }


def form_parser_entities_mapping(form_parser_entity_list, mapping_dict,
                                 form_parser_text, json_folder):
  """
    Form parser entity mapping function

    Parameters
    ----------
    form_parser_entity_list: Extracted form parser entities before mapping;
            list of dicts carrying "key", "value", confidence, coordinates
            and page metadata
    mapping_dict: Mapping dictionary having info of default, derived and
            table entities along with desired keys
    form_parser_text: Full document text, used for regex-based derived
            entity extraction
    json_folder: Folder containing the parser JSON files, used for table
            extraction

    Returns: tuple of
            (required entities - list of dicts having entity, value,
             confidence and manual_extraction information,
             duplicate-key flag reported by check_duplicate_keys)
    -------
  """
  # extract entities information from config files
  default_entities = mapping_dict.get("default_entities")
  derived_entities = mapping_dict.get("derived_entities")
  table_entities = mapping_dict.get("table_entities")
  flag = check_duplicate_keys(default_entities, form_parser_entity_list)

  df = pd.DataFrame(form_parser_entity_list)
  # an empty parser response yields a dataframe without a "key" column;
  # fall back to an empty candidate list so every default entity is null-filled
  # instead of raising KeyError
  key_list = df["key"].tolist() if "key" in df.columns else []
  required_entities_list = []
  # loop through one by one default entities mentioned in the config file;
  # guard against a missing "default_entities" config section
  for each_ocr_key, each_ocr_val in (default_entities or {}).items():
    idx_list = []
    for val in each_ocr_val:
      # extractOne returns None on an empty candidate list, so guard first
      extracted_one = process.extractOne(val, key_list) if key_list else None
      # fuzzy-match threshold of 90 tolerates minor OCR noise in key text
      if extracted_one and extracted_one[1] >= 90:
        idx_list = df.index[df["key"] == extracted_one[0]].tolist()
        break

    if idx_list:
      temp_dict = _matched_entity_dict(df, idx_list[0], each_ocr_key)
    else:
      # filling null value if parser didn't extract
      temp_dict = _null_entity_dict(each_ocr_key)
    required_entities_list.append(temp_dict)
  print("Default entities created from Form parser response")
  if derived_entities:
    # this function can be used for all docs, if derived entities
    # are extracted by using regex pattern
    parser_data = {"text": form_parser_text}
    derived_entities_op_dict = derived_entities_extraction(parser_data,
                                                           derived_entities)
    required_entities_list.extend(list(derived_entities_op_dict.values()))
    print("Derived entities created from Form parser response")

  # NOTE: print statements in this section need to be removed after
  # integration of the table-extracted values into the function's return value
  if table_entities:
    table_response = None
    files = os.listdir(json_folder)
    for json_file in files:
      json_path = os.path.join(json_folder, json_file)
      table_extract_obj = TableExtractor(json_path)
      final_table_list = table_extract_obj.filter_table(table_entities)
      print("Filtered list of tables")
      print(final_table_list)
      table_response = table_extract_obj.course_extract(final_table_list,
                                                        table_entities)
      print("Extracted course details")
      print(table_response)
      # validating if there is at least one prior experience that was extracted
      if table_response and table_response[0]["keys"]:
        extracted_entities = extract_entities_from_table_response(
          table_response)
        if extracted_entities:
          required_entities_list.extend(extracted_entities)
          required_entities_list = separate_out_PE(required_entities_list)
          # stop after the first JSON file that yields experience data
          break
      else:
        print("No experience data found from tables")

  # downstream consumers expect a list of lists; wrap a flat result once
  if not all(isinstance(item, list) for item in required_entities_list):
    required_entities_list = [required_entities_list]
  print("Checking final entity list after addition of table data")
  print(required_entities_list)
  return required_entities_list, flag