in microservices/extraction_service/src/utils/utils_functions.py [0:0]
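# NOTE: module-level imports assumed from the rest of utils_functions.py;
# this excerpt also relies on the module's own Logger, check_int and
# consolidate_coordinates helpers, which are defined elsewhere in the file.
import os
from functools import reduce

import numpy as np
import pandas as pd
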
def standard_entity_mapping(desired_entities_list, parser_name):
"""
This function changes entity name to standard names and also
create consolidated entities like name and date
Parameters
----------
desired_entities_list: List of default and derived entities
parser_name: name of the parser
Returns: Standard entities list
-------
"""
    # convert the extracted entities json to a pandas dataframe
    df_json = pd.DataFrame.from_dict(desired_entities_list)
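    # df_json is assumed to hold one row per extracted entity, e.g.:
    #   entity          value         extraction_confidence  manual_extraction
    #   "invoice date"  "2023-01-05"  0.91                   False
    # (illustrative values only, not taken from a real extraction)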
    # read the entity standardization lookup csv
    entity_standardization = os.path.join(
        os.path.dirname(__file__), "entity-standardization.csv")
    entities_standardization_csv = pd.read_csv(entity_standardization)
    entities_standardization_csv.dropna(how="all", inplace=True)
    # keep the first record in case of duplicate entities
    entities_standardization_csv.drop_duplicates(
        subset=["entity"], keep="first", inplace=True)
    entities_standardization_csv.reset_index(drop=True, inplace=True)
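    # the csv is expected to provide at least the "entity" and
    # "standard_entity_name" columns used below, e.g.:
    #   entity,standard_entity_name
    #   invoice date,invoice_date
    #   inv no,invoice_number
    # (the rows shown are illustrative, not from the actual csv)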
    # build a lookup dictionary from the standardization dataframe:
    # raw entity name (key) -> standard entity name (value)
    dict_lookup = dict(
        zip(entities_standardization_csv["entity"],
            entities_standardization_csv["standard_entity_name"]))
    # get all the entity names (key column) from the json as a list
    key_list = list(df_json["entity"])
    # replace each entity name with its standard name; drop rows whose
    # entity has no standard mapping
    for index, item in enumerate(key_list):
        if item in dict_lookup:
            # .loc avoids the chained-assignment pitfall of
            # df_json["entity"][index] = ...
            df_json.loc[index, "entity"] = dict_lookup[item]
        else:
            df_json = df_json.drop(index)
    df_json.reset_index(inplace=True, drop=True)
    # coerce "extraction_confidence" from object to numeric
    # (non-numeric values become NaN)
    df_json["extraction_confidence"] = pd.to_numeric(
        df_json["extraction_confidence"], errors="coerce")
    # consolidate the "value" column per entity: numeric parts are joined
    # with "/" (e.g. date components), text parts with a space (see the
    # check_int sketch after this function)
    df_conc = df_json.groupby("entity")["value"].apply(
        lambda x: "/".join([v.strip() for v in x if v]) if check_int(x)
        else " ".join([v.strip() for v in x if v])).reset_index()
    # average the extraction confidence per entity
    df_av = df_json.groupby(["entity"])["extraction_confidence"].mean()\
        .reset_index().round(2)
    # take the mode for categorical variables
    df_manual_extraction = df_json.groupby(["entity"])["manual_extraction"]\
        .agg(pd.Series.mode).reset_index()
    df_corrected_value = df_json.groupby(["entity"])["corrected_value"]\
        .mean().reset_index().round(2)
if parser_name == "FormParser":
df_page_no = df_json.groupby(["entity"])[group_by_columns[4]].mean()\
.reset_index().round(2)
df_page_width = df_json.groupby(["entity"])[group_by_columns[5]].mean()\
.reset_index().round(2)
df_page_height = df_json.groupby(["entity"])[group_by_columns[6]].mean()\
.reset_index().round(2)
# co-ordinate consolidation
df_key_coordinates = df_json.groupby("entity")[group_by_columns[7]].apply(
consolidate_coordinates).reset_index()
df_value_coordinates = df_json.groupby("entity")[group_by_columns[8]].apply(
consolidate_coordinates).reset_index()
dfs = [df_conc, df_av, df_manual_extraction, df_corrected_value,
df_page_no, df_page_width, df_page_height,
df_key_coordinates, df_value_coordinates]
else:
dfs = [df_conc, df_av, df_manual_extraction, df_corrected_value]
    # merge all per-entity aggregates back together on "entity"
    df_final = reduce(lambda left, right: pd.merge(left, right, on="entity"),
                      dfs)
    # normalize empty strings to NaN, then NaN to None for serialization
    df_final = df_final.replace(r"^\s*$", np.nan, regex=True)
    df_final = df_final.replace({np.nan: None})
    # return the consolidated entities as a list of dicts
    extracted_entities_final_json = df_final.to_dict("records")
    Logger.info("Entities standardization completed")
    return extracted_entities_final_json
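

# Hypothetical sketch of the check_int helper used during value consolidation
# above; the real implementation lives elsewhere in utils_functions.py. It is
# assumed to report whether every value in the grouped series parses as an
# integer, which decides between "/" and " " as the join separator.
def check_int_sketch(values):
    """Return True if every non-empty value looks like an integer."""
    return all(str(v).strip().lstrip("-").isdigit() for v in values if v)


# Minimal usage sketch under assumptions: the sample entity names below must
# exist in entity-standardization.csv to survive the mapping step, and
# "OCRParser" is a hypothetical parser_name that takes the non-FormParser
# branch (no page/coordinate consolidation).
if __name__ == "__main__":
    sample_entities = [
        {"entity": "invoice date", "value": "2023-01-05",
         "extraction_confidence": 0.91, "manual_extraction": False,
         "corrected_value": 0.0},
        {"entity": "inv no", "value": "12345",
         "extraction_confidence": 0.88, "manual_extraction": False,
         "corrected_value": 0.0},
    ]
    print(standard_entity_mapping(sample_entities, "OCRParser"))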