in microservices/extraction_service/src/utils/table_extractor.py [0:0]
def table_attributes(self):
    """
    Collect header and body-row data for every table in ``self.data``.

    Walks ``self.data["pages"]`` and, for each page that has a "tables"
    key, stores an entry in ``self.master_dict`` keyed by the page index.
    That entry maps each table index to a dict holding:

    * "headers": list of ``(text, confidence, coordinates)`` tuples as
      returned by ``TableExtractor.get_text``, with runs of whitespace in
      the text collapsed to single spaces (a ``None`` text is kept as-is);
    * "page_num": index of the page the table is on;
    * one key per body-row index, whose value is
      ``{"rows": {column_index: cell_dict}}`` where ``cell_dict`` has the
      keys "value", "extraction_confidence", "value_coordinates",
      "manual_extraction" (always False) and "corrected_value" (always
      None).

    The page entry additionally carries the page "height" and "width".

    Tables lacking "headerRows" or "bodyRows" are skipped. When a table
    has several header rows, each one rebuilds the table entry, so the
    last header row determines what is stored.

    Returns:
      None normally (results are written to ``self.master_dict``), or the
      string "Table Empty !!!" if a ValueError is raised while reading the
      body rows of a table.
    """
    if "pages" not in self.data:
        Logger.error("no data found in table")
        return None

    for pg_num, page in enumerate(self.data["pages"]):
        if "tables" not in page:
            continue

        page_data = {}
        for table_num, table in enumerate(page["tables"]):
            # Only tables with both a header and a body can be extracted.
            if "bodyRows" not in table or "headerRows" not in table:
                continue

            for hrow in table["headerRows"]:
                header_row = [
                    TableExtractor.get_text(cell["layout"], self.data)
                    for cell in hrow["cells"]
                ]
                # Normalise whitespace in header text. A header cell with
                # no text (None) is still stored as a 3-tuple so every
                # entry in "headers" has the same shape; the previous
                # code passed three positional arguments to list.append,
                # which raises TypeError.
                columns = [
                    (val if val is None else " ".join(val.split()), conf, cord)
                    for val, conf, cord in header_row
                ]
                table_data = {"headers": columns, "page_num": pg_num}
                col_data = {}
                try:
                    for row_num, row in enumerate(table["bodyRows"]):
                        row_data = [
                            TableExtractor.get_text(cell["layout"], self.data)
                            for cell in row["cells"]
                        ]
                        # Columns are driven by the header width.
                        # NOTE: a body row with fewer cells than the
                        # header raises IndexError here, which is not
                        # caught by the handler below.
                        for i_col in range(len(header_row)):
                            entity_val, conf, coordinates = row_data[i_col]
                            col_data[i_col] = {
                                "value": entity_val,
                                "extraction_confidence": conf,
                                "value_coordinates": coordinates,
                                "manual_extraction": False,
                                "corrected_value": None,
                            }
                        # col_data is reused across rows, so snapshot it.
                        table_data[row_num] = {"rows": deepcopy(col_data)}
                except ValueError as e:
                    Logger.error(e)
                    return "Table Empty !!!"
                page_data[table_num] = table_data

        page_data["height"] = page["dimension"]["height"]
        page_data["width"] = page["dimension"]["width"]
        self.master_dict[pg_num] = page_data

    return None