def table_attributes()

in microservices/prior_learning_assessment/src/services/extraction/table_extractor.py [0:0]


  def table_attributes(self):
    """
    Extract every table found in ``self.data`` and store the results.

    For each page that has a "tables" entry, every table carrying both
    "headerRows" and "bodyRows" is converted into a dict shaped like
    ``{"headers": [(text, conf), ...], "page_num": pg_num,
    row_num: {"rows": {col_idx: {"value": ...,
    "extraction_confidence": ...}}}}`` and stored under
    ``self.master_dict[pg_num][table_num]``. After each such page,
    ``self.table_list`` is rebuilt from the whole of ``self.master_dict``
    as a flat list of ``{"table_id", "header_data", "row_data"}`` dicts,
    where ``row_data`` is a list of rows and each row is a list of
    ``(value, confidence)`` tuples.

    Tables without any header row are skipped. When a table has several
    header rows, the last one defines the columns. Body cells beyond the
    number of header columns are ignored.

    Returns:
      None on success, and also (after printing a message) when
      ``self.data`` has no "pages" key. The string "Table Empty !!!" when
      a body row cannot be read against the header columns (for example it
      has fewer cells than the header); extraction stops at that point and
      the current page is not stored.
    """
    if "pages" not in self.data:
      print("no data found in table")
      return None

    for pg_num, page in enumerate(self.data["pages"]):
      if "tables" not in page:
        continue

      page_data = {}
      for table_num, table in enumerate(page["tables"]):
        if "bodyRows" not in table or "headerRows" not in table:
          continue
        # Without a header row there is no column layout to read the body
        # against; skip the table instead of storing unrelated data for it.
        if not table["headerRows"]:
          continue

        # get_text is unpacked as a (text, confidence) pair per cell.
        header_row = [
          TableExtractor.get_text(cell["layout"], self.data)
          for cell in table["headerRows"][-1]["cells"]
        ]
        # Collapse runs of whitespace in header text; None passes through.
        columns = [
          (val if val is None else " ".join(val.split()), conf)
          for val, conf in header_row
        ]
        table_data = {"headers": columns, "page_num": pg_num}
        n_cols = len(header_row)

        try:
          for row_num, row in enumerate(table["bodyRows"]):
            row_data = [
              TableExtractor.get_text(cell["layout"], self.data)
              for cell in row["cells"]
            ]
            # Indexing (rather than zip) on purpose: a row shorter than
            # the header must fail here instead of being truncated.
            row_cells = {}
            for i_col in range(n_cols):
              entity_val, conf = row_data[i_col]
              row_cells[i_col] = {
                "value": entity_val,
                "extraction_confidence": conf
              }
            table_data[row_num] = {"rows": row_cells}
        except (ValueError, IndexError) as e:
          # IndexError: empty/short body row; ValueError: malformed cell pair.
          print(e)
          return "Table Empty !!!"

        page_data[table_num] = table_data

      self.master_dict[pg_num] = page_data

      # Convert the nested dict to a simpler list format. This is redone
      # after every stored page so that table_list always mirrors
      # master_dict, even if a later page aborts the extraction.
      final = []
      for stored_page in self.master_dict.values():
        for stored_num, stored_table in stored_page.items():
          obj = {
            "table_id": stored_num,
            "header_data": stored_table["headers"],
            "row_data": [],
          }
          for key, row in stored_table.items():
            # Every key other than the two metadata keys is a row number.
            if key in ("headers", "page_num"):
              continue
            for cells in row.values():
              obj["row_data"].append([
                (cell["value"], cell["extraction_confidence"])
                for cell in cells.values()
              ])
          final.append(obj)
      self.table_list = final

    return None