in microservices/extraction_service/src/utils/utils_functions.py [0:0]
def _null_entity_dict(entity_name):
    """Return the placeholder record used when no value could be extracted."""
    return {"entity": entity_name, "value": None,
            "extraction_confidence": None,
            "manual_extraction": False,
            "corrected_value": None,
            "value_coordinates": None,
            "key_coordinates": None,
            "page_no": None,
            "page_width": None,
            "page_height": None
            }


def _entity_dict_from_row(df, row_idx, entity_name):
    """Build an extracted-entity record from row ``row_idx`` of ``df``.

    Raises
    ------
    KeyError, IndexError, TypeError, ValueError
        When the row/column is missing or a field cannot be converted;
        the caller falls back to a null record in that case.
    """
    return {"entity": entity_name, "value": df["value"][row_idx],
            "extraction_confidence": float(df["value_confidence"][row_idx]),
            "manual_extraction": False,
            "corrected_value": None,
            "value_coordinates": [float(i) for i in
                                  df["value_coordinates"][row_idx]],
            "key_coordinates": [float(i) for i in
                                df["key_coordinates"][row_idx]],
            "page_no": int(df["page_no"][row_idx]),
            "page_width": int(df["page_width"][row_idx]),
            "page_height": int(df["page_height"][row_idx])
            }


def form_parser_entities_mapping(form_parser_entity_list, mapping_dict,
                                 form_parser_text, json_folder):
    """Map raw form-parser entities onto the configured output schema.

    Parameters
    ----------
    form_parser_entity_list: extracted form parser entities before mapping;
        each record is expected to carry "key", "value", "value_confidence",
        coordinate and page fields (assumed from usage — confirm with caller)
    mapping_dict: mapping config with "default_entities" and, optionally,
        "derived_entities" and "table_entities"
    form_parser_text: full document text, used for regex-derived entities
    json_folder: folder holding parser JSON output, used for table extraction

    Returns: tuple of (required entities - list of dicts having entity,
    value, confidence and manual_extraction information; duplicate-key flag
    from check_duplicate_keys)
    -------
    """
    # extract entities information from config files
    default_entities = mapping_dict.get("default_entities")
    derived_entities = mapping_dict.get("derived_entities")
    table_entities = mapping_dict.get("table_entities")
    flag = check_duplicate_keys(default_entities, form_parser_entity_list)
    df = pd.DataFrame(form_parser_entity_list)
    required_entities_list = []
    # loop one by one through the default entities from the config file
    for each_ocr_key, each_ocr_val in default_entities.items():
        try:
            idx_list = df.index[df["key"] == each_ocr_key].tolist()
        except KeyError:
            # "key" column absent (e.g. empty parser output)
            idx_list = []
        # loop for matched records of mapping dictionary
        for idx, each_val in enumerate(each_ocr_val):
            if idx_list:
                try:
                    # creating response from the matched dataframe row
                    temp_dict = _entity_dict_from_row(df, idx_list[idx],
                                                      each_val)
                except (KeyError, IndexError, TypeError, ValueError):
                    Logger.info("Key not found in parser output,"
                                " so filling null value")
                    temp_dict = _null_entity_dict(each_val)
            else:
                # filling null value if parser didn't extract
                temp_dict = _null_entity_dict(each_val)
            required_entities_list.append(temp_dict)
    Logger.info("Default entities created from Form parser response")
    if derived_entities:
        # this function can be used for all docs, if derived entities
        # are extracted by using regex pattern
        parser_data = {"text": form_parser_text}
        derived_entities_op_dict = derived_entities_extraction(
            parser_data, derived_entities)
        required_entities_list.extend(list(derived_entities_op_dict.values()))
        Logger.info("Derived entities created from Form parser response")
    if table_entities:
        # scan parser JSON files; first file yielding a non-empty list wins
        table_response = None
        for json_file in os.listdir(json_folder):
            json_path = os.path.join(json_folder, json_file)
            table_extract_obj = TableExtractor(json_path)
            table_response = table_extract_obj.get_entities(table_entities)
            if table_response and isinstance(table_response, list):
                required_entities_list.extend(table_response)
                break
        if table_response is None:
            Logger.error("No table data found")
    return required_entities_list, flag