in microservices/prior_learning_assessment/src/services/extraction/table_extractor.py [0:0]
def table_attributes(self):
    """
    Collect every table of the parsed document into ``self.master_dict``
    and flatten the result into ``self.table_list``.

    Reads ``self.data``. When it has a "pages" entry, each page's optional
    "tables" list is scanned; only tables that carry both "headerRows" and
    "bodyRows" are processed. The text of a cell is obtained with
    ``TableExtractor.get_text(cell["layout"], self.data)``, whose result is
    unpacked as a ``(value, confidence)`` pair.

    Side effects:
        self.master_dict[page_index] = {
            table_index: {
                "headers": [(text, confidence), ...],
                "page_num": page_index,
                row_index: {"rows": {col_index: {"value": ...,
                                                 "extraction_confidence": ...}}},
            }
        }
        self.table_list = [
            {"table_id": table_index,
             "header_data": [(text, confidence), ...],
             "row_data": [[(value, confidence), ...], ...]},
            ...
        ]

    ``self.master_dict`` is not cleared here, so ``self.table_list`` is
    built from everything it holds, including entries that were present
    before the call.

    Returns:
        None on the normal path and when ``self.data`` has no "pages" key
        (in which case ``self.table_list`` is left untouched);
        the string "Table Empty !!!" when a ValueError is raised while the
        body rows of a table are being read.
    """
    if "pages" not in self.data:
        print("no data found in table")
        return None

    for pg_num, page in enumerate(self.data["pages"]):
        page_data = {}
        for table_num, table in enumerate(page.get("tables", ())):
            # A table is only usable when it has both a header and a body.
            if "bodyRows" not in table or "headerRows" not in table:
                continue
            # Every header row rebuilds table_data from scratch, so for a
            # table with several header rows the last one is the one kept.
            for hrow in table["headerRows"]:
                header_row = [
                    TableExtractor.get_text(cell["layout"], self.data)
                    for cell in hrow["cells"]
                ]
                # Collapse runs of whitespace/newlines inside header text.
                columns = [
                    (None if val is None else " ".join(val.split()), conf)
                    for val, conf in header_row
                ]
                table_data = {"headers": columns, "page_num": pg_num}
                try:
                    for row_num, row in enumerate(table["bodyRows"]):
                        row_data = [
                            TableExtractor.get_text(cell["layout"], self.data)
                            for cell in row["cells"]
                        ]
                        # Read at most one cell per header column, and only
                        # the cells that exist: a body row with fewer cells
                        # than the header used to raise an uncaught
                        # IndexError. The dict is created per row so a short
                        # row never inherits values from the previous row.
                        row_cells = {}
                        for i_col, cell_text in enumerate(
                                row_data[:len(header_row)]):
                            entity_val, conf = cell_text
                            row_cells[i_col] = {
                                "value": entity_val,
                                "extraction_confidence": conf,
                            }
                        table_data[row_num] = {"rows": row_cells}
                except ValueError as e:
                    print(e)
                    return "Table Empty !!!"
                page_data[table_num] = table_data
        self.master_dict[pg_num] = page_data

    # NOTE(review): the debug prints below are kept so stdout output is
    # unchanged; consider routing them through the project logger.
    print("Checking the tabel extractor")
    # Convert the nested dict into a simpler list-of-tables format.
    final = []
    for page in self.master_dict.values():
        for table_num, table in page.items():
            obj = {
                "table_id": table_num,
                "header_data": table["headers"],
                "row_data": [],
            }
            for key, rows in table.items():
                # Every key other than the two metadata keys is a row index.
                if key in ("headers", "page_num"):
                    continue
                row_arr = []
                for cell in rows.values():
                    print("Inside the final loop")
                    for row_val in cell.values():
                        row_arr.append(
                            (row_val["value"],
                             row_val["extraction_confidence"]))
                print("Appending the row_arr")
                obj["row_data"].append(row_arr)
            final.append(obj)
    print(final)
    self.table_list = final