in microservices/extraction_service/src/utils/table_extractor.py [0:0]
def get_entities(self, table_entities):
    """
    Extract entities from one table according to a user-supplied table config.

    The table is looked up in ``self.master_dict`` in one of three ways,
    depending on which of ``page_num`` / ``table_num`` are positive integers
    (a missing or non-integer value is treated as 0, i.e. "not specified"):

    * both given: the table is read from
      ``self.master_dict[page_num][table_num]`` and its header names must
      score at least 0.70 against ``headers`` via
      ``TableExtractor.compare_lists``;
    * neither given: all of ``self.master_dict`` is searched with
      ``TableExtractor.get_table_using_header``;
    * only ``page_num`` given: only ``self.master_dict[page_num]`` is searched.

    Any other combination (for example ``table_num`` without ``page_num``,
    or negative values) is rejected as a configuration error.

    Args:
        table_entities (dict): user specified table parameters. Keys read
            here: ``isheader``, ``headers``, ``table_num``, ``page_num`` and
            ``entity_extraction`` (an iterable of dicts with the keys
            ``entity_suffix``, ``row_no`` and ``col``).
    Returns:
        out (list): one dict per successfully extracted entity. Each dict is
            an independent copy of the table cell, extended with ``entity``,
            ``key_coordinates``, ``page_height``, ``page_width`` and
            ``page_no``. Entities whose lookup fails are logged as warnings
            and skipped. If the table cannot be located, the result of
            ``self.table_not_found(table_entities)`` is returned instead.
    """
    # Guard clause: without a header there is nothing to match against.
    if not table_entities["isheader"]:
        Logger.error("No header present in the table. Table not extracted.")
        return self.table_not_found(table_entities)

    inp_header = table_entities["headers"]

    # Anything that is not an int (including an absent key) means
    # "not specified" and is normalised to 0.
    requested_table = table_entities.get("table_num")
    requested_page = table_entities.get("page_num")
    table_num = requested_table if isinstance(requested_table, int) else 0
    page_num = requested_page if isinstance(requested_page, int) else 0

    # Minimum compare_lists score for the table to be accepted.
    # NOTE(review): presumably a 0..1 similarity ratio - confirm against
    # TableExtractor.compare_lists.
    min_header_score = 0.70

    try:
        if table_num > 0 and page_num > 0:
            # Exact address given: fetch the table and validate its header.
            table_dict = self.master_dict[page_num][table_num]
            columns = [val[0] for val in table_dict["headers"]]
            score = TableExtractor.compare_lists(columns, inp_header)
            if score < min_header_score:
                Logger.error("Table does not match with the headers provided")
                return self.table_not_found(table_entities)
        elif table_num == 0 and page_num >= 0:
            # No table number: locate the table by its header, either on
            # every page (page_num == 0) or on the requested page only.
            if page_num == 0:
                search_scope = self.master_dict
            elif page_num in self.master_dict:
                search_scope = self.master_dict[page_num]
            else:
                Logger.error("page not found")
                return self.table_not_found(table_entities)
            table_data = TableExtractor.get_table_using_header(
                search_scope, inp_header)
            # The helper can return a falsy value when nothing matches;
            # check explicitly instead of relying on a failed unpacking.
            if not table_data:
                return self.table_not_found(table_entities)
            table_dict, columns = table_data
        else:
            Logger.error("Operation cannot be performed. Check your config")
            return self.table_not_found(table_entities)
    except Exception as e:
        # Best-effort boundary: any lookup failure (bad key, malformed
        # table structure, helper error) is reported as "table not found".
        Logger.error(e)
        return self.table_not_found(table_entities)

    out = []
    for user_inp in table_entities["entity_extraction"]:
        try:
            suffix, row = user_inp["entity_suffix"], user_inp["row_no"]
            col = user_inp["col"]
            row_dict = table_dict[row]["rows"]
            if suffix in (None, ""):
                suffix = ""
            # The separating space is kept even for an empty suffix so that
            # entity names stay identical to what callers already receive.
            entity_name = f"{columns[col]} {suffix}"
            # Copy the cell BEFORE adding keys so the table data held in
            # self.master_dict is never modified by an extraction.
            entity_data = deepcopy(row_dict[col])
            entity_data.update(deepcopy({
                "entity": entity_name,
                "key_coordinates": table_dict["headers"][col][2],
                # NOTE(review): page dimensions are read from
                # master_dict[0] for every page - confirm this is intended.
                "page_height": self.master_dict[0]["height"],
                "page_width": self.master_dict[0]["width"],
                "page_no": table_dict["page_num"],
            }))
            out.append(entity_data)
        except Exception as e:
            # One bad entity spec must not abort the remaining ones.
            Logger.warning(e)
            continue
    return out