in tpipelinegeofinder/textractgeofinder/tword.py [0:0]
def __init__(
        self,
        text: str = None,
        original_text: str = None,
        text_type: str = None,  # word, line or phrase
        confidence: float = None,
        id: str = None,
        xmin: int = None,
        ymin: int = None,
        xmax: int = None,
        ymax: int = None,
        page_number: int = None,
        ocrdb_row=None,
        trp_word: trpWord = None,
        doc_width: int = None,
        doc_height: int = None,
        child_relationships: str = "",
        reference: str = None,
        resolver: str = None):
    """Build a word from exactly one of three sources.

    Exactly one of ``text``, ``ocrdb_row`` or ``trp_word`` must be truthy:

    * ``text``: every attribute is passed explicitly. ``text_type``, ``id``,
      ``confidence``, the four coordinates, ``page_number``, ``doc_width``
      and ``doc_height`` are required. ``text`` and ``text_type`` are stored
      lower-cased. ``original_text``, ``resolver`` and ``reference`` are only
      set as attributes when they are truthy.
    * ``ocrdb_row``: an indexable row; positions 1..14 are copied to
      ``page_number``, ``text_type``, ``text``, ``original_text``,
      ``confidence``, ``xmin``, ``ymin``, ``xmax``, ``ymax``, ``id``,
      ``doc_width``, ``doc_height``, ``child_relationships`` and
      ``reference`` (in that order). ``resolver`` is not set in this mode.
    * ``trp_word``: a parsed-response object (``trpWord``, ``FieldKey``,
      ``FieldValue`` or ``Line``). ``doc_width``, ``doc_height`` and
      ``page_number`` must also be given; the bounding box is multiplied by
      the document size and rounded to produce integer coordinates.

    resolver: textract, tquery, table, forms

    Raises:
        ValueError: if zero or more than one source is given, if a required
            value is missing in ``text`` mode, or if ``doc_width``,
            ``doc_height`` or ``page_number`` is missing in ``trp_word`` mode.
    """
    # A source counts as "given" when it is truthy, so an empty string or an
    # empty row is treated the same as not passing it at all.
    sources_given = sum(1 for source in (text, ocrdb_row, trp_word) if source)
    if sources_given > 1:
        raise ValueError("Only can take one, text or trp_word or word_position or ocrdb_row.")
    if sources_given == 0:
        raise ValueError("Have to pass in one of text or trp_word or word_position.")

    if text:
        # Collect every missing requirement so the caller sees all of them
        # in a single error instead of fixing them one at a time.
        missing_values: List[str] = []
        if text_type:
            self.text_type = text_type.lower()
        else:
            missing_values.append("text_type")
        # text is known to be truthy here, so no "missing" check is needed.
        self.text = text.lower()
        if id:
            self.id = id
        else:
            missing_values.append("id")
        if original_text:
            self.original_text = original_text
        # Compare against None explicitly: a confidence of 0 is a real
        # value, not a missing one.
        if confidence is None:
            missing_values.append("confidence")
        else:
            self.confidence: float = confidence
        # Same for coordinates: 0 is a valid position.
        if xmin is None or ymin is None or xmax is None or ymax is None:
            missing_values.append("xmin ymin xmax or ymax")
        else:
            self.xmin: int = xmin
            self.ymin: int = ymin
            self.xmax: int = xmax
            self.ymax: int = ymax
        if page_number:
            self.page_number: int = page_number
        else:
            missing_values.append("page_number")
        if resolver:
            self.resolver = resolver
        if doc_width and doc_height:
            self.doc_width = doc_width
            self.doc_height = doc_height
        else:
            missing_values.append("doc_width or doc_height")
        self.child_relationships = child_relationships
        if reference:
            self.reference = reference
        if missing_values:
            raise ValueError(f"missing: {missing_values}")

    if ocrdb_row:
        # Positional layout of the row; index 0 is not used here.
        self.page_number = ocrdb_row[1]
        self.text_type = ocrdb_row[2]
        self.text = ocrdb_row[3]
        self.original_text = ocrdb_row[4]
        self.confidence = ocrdb_row[5]
        self.xmin = ocrdb_row[6]
        self.ymin = ocrdb_row[7]
        self.xmax = ocrdb_row[8]
        self.ymax = ocrdb_row[9]
        self.id = ocrdb_row[10]
        self.doc_width = ocrdb_row[11]
        self.doc_height = ocrdb_row[12]
        self.child_relationships = ocrdb_row[13]
        self.reference = ocrdb_row[14]

    if trp_word:
        if not (doc_width and doc_height and page_number):
            # Adjacent literals instead of a backslash continuation inside
            # the string, so source indentation never leaks into the message.
            raise ValueError(
                f"when using trp_word, need doc_width and doc_height and page_number parameters as well. "
                f"doc_width: {doc_width}, doc_height: {doc_height}, page_number: {page_number}")
        if isinstance(trp_word, (FieldKey, FieldValue)):
            self.text = trp_word.text.lower()
            self.text_type = 'KEY' if isinstance(trp_word, FieldKey) else 'VALUE'
            self.original_text = trp_word.text
            if reference:
                self.reference = reference
        if isinstance(trp_word, trpWord):
            self.text = trp_word.text.lower()
            self.text_type = 'word'
            self.original_text = trp_word.text
        self.confidence = trp_word.confidence
        # The bounding box values are multiplied by the document size to get
        # integer coordinates (presumably they are page-relative ratios --
        # confirm against the parser's geometry type).
        bounding_box = trp_word.geometry.boundingBox
        self.xmin = round(bounding_box.left * doc_width)
        self.ymin = round(bounding_box.top * doc_height)
        # xmax/ymax are derived from the already-rounded xmin/ymin.
        self.xmax = round(self.xmin + (bounding_box.width * doc_width))
        self.ymax = round(self.ymin + (bounding_box.height * doc_height))
        self.page_number = page_number
        if resolver:
            self.resolver = resolver
        self.id = trp_word.id
        self.doc_width = doc_width
        self.doc_height = doc_height
        if isinstance(trp_word, Line):
            # A line keeps the ids of its words as a comma-separated string.
            self.child_relationships = ",".join([word.id for word in trp_word.words])
        else:
            self.child_relationships = ""