def __fill_sql_from_textract_json()

in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]


    def __fill_sql_from_textract_json(self):
        """Collect TWord rows from the parsed Textract document and bulk-insert them.

        Iterates over every page of ``self.doc`` and builds four row lists:

        * selection elements: one ``TWord`` per SELECTION_ELEMENT block returned
          by ``self.trp2_doc.get_blocks_by_type`` for the matching page,
        * form fields: a ``TWord`` for each field value (referencing the key id)
          followed by a ``TWord`` for the key (referencing the value id, or ""
          when the field has no value),
        * lines: one ``TWord`` per line whose normalized text is non-empty,
        * words: one ``TWord`` per word; when normalization leaves no text the
          original word text is stored instead.

        Page numbers stored on the rows are 1-based (``idx + 1``).

        All four lists are written through ``self.ocrdb.insert_bulk``. When
        ``self.ocrdb`` is falsy nothing is inserted and an error is logged.
        """
        logger.debug("__fill_sql_from_textract_json")
        word_list: List[TWord] = []
        line_list: List[TWord] = []
        forms_list: List[TWord] = []
        selection_elements: List[TWord] = []

        for idx, page in enumerate(self.doc.pages):
            logger.debug(f"page: {idx}")
            page_number = idx + 1

            # Accumulate across pages. Rebinding the list on each iteration
            # would keep only the last page's selection elements.
            selection_elements_tblocks = self.trp2_doc.get_blocks_by_type(
                block_type_enum=t2.TextractBlockTypes.SELECTION_ELEMENT, page=self.trp2_doc.pages[idx])
            logger.debug(f"selection_elements_tblocks: {selection_elements_tblocks}")
            page_selection_elements = [self.get_TWord_from_TBlock(b) for b in selection_elements_tblocks]
            logger.debug(f"selection_elements: {[s.text for s in page_selection_elements]}")
            selection_elements.extend(page_selection_elements)

            for field in page.form.fields:
                if not field.key:
                    logger.warning(f"field.key is None: {field.key}")
                    continue

                logger.debug(f"field-key: {field.key}")
                # The key row points at the value's id; it stays "" when there is no value.
                reference = ""
                if field.value:
                    forms_list.append(
                        TWord(trp_word=field.value,
                              reference=field.key.id,
                              doc_width=self.doc_width,
                              doc_height=self.doc_height,
                              page_number=page_number))
                    reference = field.value.id
                else:
                    logger.warning(f"field.value is None: {field.value}")

                forms_list.append(
                    TWord(trp_word=field.key,
                          reference=reference,
                          doc_width=self.doc_width,
                          doc_height=self.doc_height,
                          page_number=page_number))

            for line in page.lines:
                line_text = make_alphanum_and_lower_for_non_numbers(line.text)
                if line_text:
                    xmin, ymin, xmax, ymax = self.get_coords_from_geo(line)
                    line_list.append(
                        TWord(text=line_text,
                              text_type='line',
                              original_text=line.text,
                              page_number=page_number,
                              confidence=line.confidence,
                              xmin=xmin,
                              ymin=ymin,
                              xmax=xmax,
                              ymax=ymax,
                              id=line.id,
                              doc_width=self.doc_width,
                              doc_height=self.doc_height,
                              child_relationships=",".join([x.id for x in line.words])))

                # Words are stored even when their parent line was skipped above.
                for word in line.words:
                    # If normalization leaves no text, store the original text instead.
                    text = make_alphanum_and_lower_for_non_numbers(word.text) or word.text
                    xmin, ymin, xmax, ymax = self.get_coords_from_geo(word)
                    word_list.append(
                        TWord(text=text,
                              original_text=word.text,
                              text_type='word',
                              page_number=page_number,
                              confidence=word.confidence,
                              xmin=xmin,
                              ymin=ymin,
                              xmax=xmax,
                              ymax=ymax,
                              id=word.id,
                              doc_width=self.doc_width,
                              doc_height=self.doc_height))

        if not self.ocrdb:
            logger.error("no ocrdb")
            return

        # Insert order kept from the original: words, lines, selection elements, forms.
        for rows in (word_list, line_list, selection_elements, forms_list):
            self.ocrdb.insert_bulk(textract_doc_uuid=self.textract_doc_uuid, rows=rows)