in microservices/prior_learning_assessment/src/services/extraction/utils_functions.py [0:0]
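# NOTE: assumed third-party imports for this excerpt; in the full module these
# are expected at the top of the file. fuzzywuzzy may be swapped for rapidfuzz,
# whose process.extractOne is call-compatible here. Helpers such as
# check_duplicate_keys, derived_entities_extraction, TableExtractor,
# extract_entities_from_table_response, and separate_out_PE are assumed to be
# defined elsewhere in this module or package.
import os

import pandas as pd
from fuzzywuzzy import process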
def form_parser_entities_mapping(form_parser_entity_list, mapping_dict,
form_parser_text, json_folder):
"""
Form parser entity mapping function
Parameters
----------
form_parser_entity_list: Extracted form parser entities before mapping
mapping_dict: Mapping dictionary have info of default, derived entities
along with desired keys
Returns: required entities - list of dicts having entity, value, confidence
and manual_extraction information
-------
"""
# extract entities information from config files
default_entities = mapping_dict.get("default_entities")
derived_entities = mapping_dict.get("derived_entities")
table_entities = mapping_dict.get("table_entities")
flag = check_duplicate_keys(default_entities, form_parser_entity_list)
df = pd.DataFrame(form_parser_entity_list)
key_list = df["key"].tolist()
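    # each form parser entity dict is expected to provide the columns read
    # below: key, value, value_confidence, value_coordinates, key_coordinates,
    # page_no, page_width, and page_height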
required_entities_list = []
    # loop through the default entities from the config file one by one
for each_ocr_key, each_ocr_val in default_entities.items():
idx_list = []
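        # fuzzy-match each configured alias against the extracted keys and
        # accept the first alias scoring at least 90 on the 0-100
        # fuzzywuzzy similarity scale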
for val in each_ocr_val:
extracted_one = process.extractOne(val, key_list)
if extracted_one[1] >= 90:
idx_list = df.index[df["key"] == extracted_one[0]].tolist()
break
        if idx_list:
            idx = idx_list[0]
            temp_dict = {
                "entity": each_ocr_key,
                "value": df["value"][idx],
                "extraction_confidence": float(df["value_confidence"][idx]),
                "manual_extraction": False,
                "corrected_value": None,
                "value_coordinates":
                    [float(i) for i in df["value_coordinates"][idx]],
                "key_coordinates":
                    [float(i) for i in df["key_coordinates"][idx]],
                "page_no": int(df["page_no"][idx]),
                "page_width": int(df["page_width"][idx]),
                "page_height": int(df["page_height"][idx])
            }
else:
            # fill in null values when the parser did not extract this entity
temp_dict = {
"entity": each_ocr_key,
"value": None,
"extraction_confidence": None,
"manual_extraction": False,
"corrected_value": None,
"value_coordinates": None,
"key_coordinates": None,
"page_no": None,
"page_width": None,
"page_height": None
}
required_entities_list.append(temp_dict)
print("Default entities created from Form parser response")
if derived_entities:
        # derived entities are extracted from the full text with regex
        # patterns, so this path can be reused for all document types
        parser_data = {"text": form_parser_text}
derived_entities_op_dict = derived_entities_extraction(parser_data,
derived_entities)
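        # derived_entities_extraction is defined elsewhere in this module; it
        # is assumed to run each configured pattern over parser_data["text"]
        # and return {entity_name: entity_dict} in the same shape as the
        # default-entity dicts built above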
required_entities_list.extend(list(derived_entities_op_dict.values()))
print("Derived entities created from Form parser response")
    # The print statements in this section need to be removed after the
    # table-extracted values are integrated into the function's return value.
if table_entities:
table_response = None
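        # iterate over the parser's JSON output files; the loop stops at the
        # first file that yields usable table data (see the break below)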
files = os.listdir(json_folder)
for json_file in files:
json_path = os.path.join(json_folder, json_file)
table_extract_obj = TableExtractor(json_path)
final_table_list = table_extract_obj.filter_table(table_entities)
print("Filtered list of tables")
print(final_table_list)
            table_response = table_extract_obj.course_extract(
                final_table_list, table_entities)
print("Extracted course details")
print(table_response)
            # validate that at least one prior experience was extracted
if table_response and table_response[0]["keys"]:
extracted_entities = extract_entities_from_table_response(
table_response)
if extracted_entities:
required_entities_list.extend(extracted_entities)
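                # separate_out_PE (defined elsewhere) is assumed to split the
                # flat entity list into one sub-list per prior experience (PE)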
required_entities_list = separate_out_PE(required_entities_list)
break
else:
print("No experience data found from tables")
if not all(isinstance(item, list) for item in required_entities_list):
required_entities_list = [required_entities_list]
print("Checking final entity list after addition of table data")
print(required_entities_list)
return required_entities_list, flag
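
# A hypothetical usage sketch for the function above; the entity names,
# aliases, and folder path are illustrative assumptions, not project fixtures:
#
#     required_entities, duplicate_flag = form_parser_entities_mapping(
#         form_parser_entity_list=parsed_entities,
#         mapping_dict={
#             "default_entities": {"name": ["Name", "Full Name"]},
#             "derived_entities": None,
#             "table_entities": None,
#         },
#         form_parser_text=full_text,
#         json_folder="/tmp/parser_json",
#     )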