in overlayer/textractoverlayer/t_overlay.py [0:0]
def get_bounding_boxes(textract_json: dict, overlay_features: List[Textract_Types],
                       document_dimensions: DocumentDimensions) -> List[BoundingBox]:
    """Collect bounding boxes for the requested element types of a Textract response.

    The response is parsed with ``trp.Document`` and walked page by page
    (page numbers start at 1). For every page the boxes are appended in this
    order: lines (each line immediately followed by its words), form fields
    (key before value), then tables (each table immediately followed by its
    cells). Elements that evaluate as falsy are skipped.

    Args:
        textract_json: Textract response, passed unchanged to ``trp.Document``.
        overlay_features: Element types to emit boxes for. ``FORM`` selects
            both ``KEY`` and ``VALUE`` boxes; ``KEY`` / ``VALUE`` select only
            their own kind.
        document_dimensions: Forwarded unchanged to every ``BoundingBox``.

    Returns:
        The list of ``BoundingBox`` objects in traversal order. Table and cell
        boxes carry the fixed texts ``"table"`` and ``"cell"``; all other
        boxes carry the text of the underlying element.
    """
    doc = trp.Document(textract_json)
    bounding_box_list: List[BoundingBox] = []

    # The requested features do not change while the document is walked, so
    # resolve each membership test once instead of per line / field / table.
    # Plain membership is used (rather than any() over the matching members)
    # so the result does not depend on the truthiness of the enum members.
    want_line = Textract_Types.LINE in overlay_features
    want_word = Textract_Types.WORD in overlay_features
    want_form = Textract_Types.FORM in overlay_features
    want_key = want_form or Textract_Types.KEY in overlay_features
    want_value = want_form or Textract_Types.VALUE in overlay_features
    want_table = Textract_Types.TABLE in overlay_features
    want_cell = Textract_Types.CELL in overlay_features

    def add_box(element, box_type, page_number: int, text) -> None:
        # Single place where a parsed element is turned into a BoundingBox.
        bounding_box_list.append(
            BoundingBox(geometry=element.geometry,
                        document_dimensions=document_dimensions,
                        box_type=box_type,
                        page_number=page_number,
                        confidence=element.confidence,
                        text=text))

    for page_number, page in enumerate(doc.pages, start=1):
        if want_line or want_word:
            for line in page.lines:
                if want_line and line:
                    add_box(line, Textract_Types.LINE, page_number, line.text)
                if want_word:
                    for word in line.words:
                        if word:
                            add_box(word, Textract_Types.WORD, page_number, word.text)

        # page.form is only touched when a form-related feature is requested.
        if want_key or want_value:
            for field in page.form.fields:
                if want_key and field and field.key:
                    add_box(field.key, Textract_Types.KEY, page_number, field.key.text)
                if want_value and field and field.value:
                    add_box(field.value, Textract_Types.VALUE, page_number, field.value.text)

        if want_table or want_cell:
            for table in page.tables:
                if want_table:
                    add_box(table, Textract_Types.TABLE, page_number, "table")
                if want_cell:
                    for row in table.rows:
                        for cell in row.cells:
                            if cell:
                                add_box(cell, Textract_Types.CELL, page_number, "cell")

    return bounding_box_list