def get_entities()

in microservices/extraction_service/src/utils/table_extractor.py [0:0]


  def get_entities(self, table_entities):
    """
    Extract data from a table based on user specific inputs for that table.

    The table is located in one of three ways, depending on which of
    "page_num" / "table_num" are positive integers (any non-integer value
    is treated as 0, i.e. "not provided"):
      * both provided: the table is read from
        self.master_dict[page_num][table_num] and its header names must
        score at least 0.70 against the supplied headers
        (TableExtractor.compare_lists).
      * neither provided: the whole of self.master_dict is searched by
        header (TableExtractor.get_table_using_header).
      * only page provided: only self.master_dict[page_num] is searched
        by header.
    Any other combination is rejected.

    Args:
      table_entities (dict): user specified table parameters. Keys read
        here: "isheader", "headers", "table_num", "page_num" and
        "entity_extraction" (an iterable of dicts with the keys
        "entity_suffix", "row_no" and "col").

    Returns:
      out(list): extracted entities, one dict per "entity_extraction" item
        that could be processed; items that raise are logged as warnings
        and skipped. When the table has no header, cannot be located or an
        error occurs while locating it, the result of
        self.table_not_found(table_entities) is returned instead.
    """

    if not table_entities["isheader"]:
      Logger.error("No header present in the table. Table not extracted.")
      return self.table_not_found(table_entities)

    inp_header = table_entities["headers"]
    table_num = table_entities["table_num"]
    if not isinstance(table_num, int):
      table_num = 0
    page_num = table_entities["page_num"]
    if not isinstance(page_num, int):
      page_num = 0

    try:
      if table_num > 0 and page_num > 0:
        table_dict = self.master_dict[page_num][table_num]
        columns = [val[0] for val in table_dict["headers"]]
        if TableExtractor.compare_lists(columns, inp_header) < 0.70:
          Logger.error( "Table does not match with the headers provided")
          return self.table_not_found(table_entities)
      # if no table and page info provided.Iterate over all the pages to find
      # the table based on header
      elif page_num == 0 and table_num == 0:
        table_data = TableExtractor.get_table_using_header(
          self.master_dict, inp_header)
        if not table_data:
          return self.table_not_found(table_entities)
        table_dict, columns = table_data
      # only the page is known: search that page's tables by header
      elif page_num > 0 and table_num == 0:
        if page_num not in self.master_dict:
          Logger.error( "page not found")
          return self.table_not_found(table_entities)
        table_data = TableExtractor.get_table_using_header(
          self.master_dict[page_num], inp_header)
        # Handle "no matching table" explicitly, as the all-pages branch
        # does, instead of relying on a failed tuple-unpack being caught
        # by the broad except below.
        if not table_data:
          return self.table_not_found(table_entities)
        table_dict, columns = table_data
      else:
        Logger.error("Operation cannot be performed. Check your config")
        return self.table_not_found(table_entities)
    except Exception as e:
      Logger.error(e)
      return self.table_not_found(table_entities)

    out = []
    for user_inp in table_entities["entity_extraction"]:
      try:
        suffix, row = user_inp["entity_suffix"], user_inp["row_no"]
        col = user_inp["col"]
        row_dict = table_dict[row]["rows"]
        if suffix in (None, ""):
          suffix = ""
        # NOTE(review): with an empty suffix the name keeps a trailing
        # space; left as-is because consumers may rely on the exact string.
        entity_name = f"{columns[col]} {suffix}"
        cell = row_dict[col]
        # NOTE(review): master_dict[0] is read for the page dimensions -
        # presumably a document-level entry; confirm against the builder.
        annotations = {
          "entity": entity_name,
          "key_coordinates": table_dict["headers"][col][2],
          "page_height": self.master_dict[0]["height"],
          "page_width": self.master_dict[0]["width"],
          "page_no": table_dict["page_num"],
        }
        # Annotate a copy of the cell rather than the cell itself, so the
        # data held in self.master_dict is never modified (and is not left
        # half-annotated if one of the lookups above raises).
        entity_data = deepcopy(cell)
        entity_data.update(deepcopy(annotations))
        out.append(entity_data)
      except Exception as e:
        # Best effort: a bad extraction item is logged and skipped.
        Logger.warning(e)
        continue
    return out