in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]
def __fill_sql_from_textract_json(self):
    """Flatten the parsed Textract document into TWord rows and bulk-insert them into ``self.ocrdb``.

    For every page of ``self.doc`` (page numbers are stored 1-based, i.e. ``idx + 1``) four kinds of
    rows are collected:

    * selection elements, read from ``self.trp2_doc`` for the page at the same index,
    * form fields: one row for the value (referencing the key's id) and one row for the key
      (referencing the value's id, or ``""`` when the field has no value),
    * lines whose normalized text is non-empty,
    * every word of every line; when normalization leaves no text, the original text is stored instead.

    All four collections accumulate across pages and are written with one ``insert_bulk`` call each
    after the last page has been processed. If ``self.ocrdb`` is falsy, nothing is written and an
    error is logged.
    """
    logger.debug("__fill_sql_from_textract_json")
    word_list: List[TWord] = []
    line_list: List[TWord] = []
    forms_list: List[TWord] = []
    selection_elements: List[TWord] = []

    for idx, page in enumerate(self.doc.pages):
        page_number = idx + 1
        logger.debug(f"page: {idx}")

        # Selection elements come from the trp2 document. They are accumulated (not re-assigned)
        # so that elements from earlier pages are not lost when a later page is processed.
        selection_elements_tblocks = self.trp2_doc.get_blocks_by_type(
            block_type_enum=t2.TextractBlockTypes.SELECTION_ELEMENT, page=self.trp2_doc.pages[idx])
        logger.debug(f"selection_elements_tblocks: {selection_elements_tblocks}")
        page_selection_elements = [self.get_TWord_from_TBlock(b) for b in selection_elements_tblocks]
        logger.debug(f"selection_elements: {[s.text for s in page_selection_elements]}")
        selection_elements.extend(page_selection_elements)

        # Form fields: key and value rows point at each other through ``reference``.
        for field in page.form.fields:
            if not field.key:
                logger.warning(f"field.key is None: {field.key}")
                continue
            logger.debug(f"field-key: {field.key}")
            reference = ""
            if field.value:
                forms_list.append(
                    TWord(trp_word=field.value,
                          reference=field.key.id,
                          doc_width=self.doc_width,
                          doc_height=self.doc_height,
                          page_number=page_number))
                reference = field.value.id
            else:
                logger.warning(f"field.value is None: {field.value}")
            # The key row is always stored; its reference stays "" when there is no value.
            forms_list.append(
                TWord(trp_word=field.key,
                      reference=reference,
                      doc_width=self.doc_width,
                      doc_height=self.doc_height,
                      page_number=page_number))

        for line in page.lines:
            line_text = make_alphanum_and_lower_for_non_numbers(line.text)
            xmin, ymin, xmax, ymax = self.get_coords_from_geo(line)
            # Lines are only stored when something is left after normalization.
            if line_text:
                line_list.append(
                    TWord(text=line_text,
                          text_type='line',
                          original_text=line.text,
                          page_number=page_number,
                          confidence=line.confidence,
                          xmin=xmin,
                          ymin=ymin,
                          xmax=xmax,
                          ymax=ymax,
                          id=line.id,
                          doc_width=self.doc_width,
                          doc_height=self.doc_height,
                          child_relationships=",".join([x.id for x in line.words])))

            # NOTE(review): words are processed for every line, including lines whose normalized
            # text is empty - confirm this matches the intended nesting of the original code.
            for word in line.words:
                # If no text is left after normalization, store the original text instead.
                text = make_alphanum_and_lower_for_non_numbers(word.text) or word.text
                xmin, ymin, xmax, ymax = self.get_coords_from_geo(word)
                word_list.append(
                    TWord(text=text,
                          original_text=word.text,
                          text_type='word',
                          page_number=page_number,
                          confidence=word.confidence,
                          xmin=xmin,
                          ymin=ymin,
                          xmax=xmax,
                          ymax=ymax,
                          id=word.id,
                          doc_width=self.doc_width,
                          doc_height=self.doc_height))

    if not self.ocrdb:
        logger.error("no ocrdb")
        return

    # Same insertion order as before: words, lines, selection elements, forms.
    for rows in (word_list, line_list, selection_elements, forms_list):
        self.ocrdb.insert_bulk(textract_doc_uuid=self.textract_doc_uuid, rows=rows)