in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]
def find_intersect_value(self,
                         word_left: str,
                         word_up: str,
                         word_up_minus_x: int = 0,
                         word_up_plus_x: int = 0,
                         text_type: str = 'word',
                         stop_words: "list[str]" = None,
                         area_selection: AreaSelection = None) -> "list[TWord]":
    """
    Return the entries located at the intersection of a left label and an upper label.

    Both labels are lower-cased and passed through ``find_phrase_in_lines``.
    For every occurrence of the left label, the closest occurrence of the
    upper label is selected, and every entry of ``text_type`` whose center
    point falls inside the cell spanned by the two labels is collected.

    :param word_left: word/phrase marking the row (matched lower-cased)
    :param word_up: word/phrase marking the column (matched lower-cased)
    :param word_up_minus_x: amount subtracted from the upper label's xmin to
        widen the column to the left
    :param word_up_plus_x: amount added to the upper label's xmax to widen
        the column to the right
    :param text_type: value the ``text_type`` column of a result must equal
    :param stop_words: accepted for interface compatibility; not used here
    :param area_selection: forwarded unchanged to every ``ocrdb.execute`` call
    :return: all entries found, over every occurrence of the left label;
        empty when nothing matched
    """
    logger.debug(f"word_left: {word_left}, word_up: {word_up}")

    # Both query fragments are constant, so they are built once up front.
    # NOTE(review): each starts with " and", so ocrdb.execute presumably
    # appends them to a base WHERE clause - confirm against ocrdb.
    #
    # Candidates for the upper label: same text, same page, ymin smaller and
    # xmin larger than the left label's ("higher and to the right"), ordered
    # by squared distance between the (xmin, ymin) corners, closest first.
    nearest_up_query = (" and text=?\n"
                        "and ymin < ?\n"
                        "and xmin > ?\n"
                        "and page_number = ?\n"
                        "order by ( ((? - xmin) * (? - xmin)) + ((? - ymin) * (? - ymin))) asc\n")
    # Entries whose center lies within the column of the upper label
    # (x range) and the row of the left label (y range).
    cell_query = (" and ((xmin + xmax) / 2) < ?\n"
                  "and ((xmin + xmax) / 2) > ?\n"
                  "and ((ymin + ymax) / 2) > ?\n"
                  "and ((ymin + ymax) / 2) < ?\n"
                  "and text_type=?\n"
                  "and page_number = ?\n")

    word_left = word_left.lower()
    word_up = word_up.lower()
    left_hits = self.find_phrase_in_lines(word_left)
    # Only logged below; the call is kept because it may have side effects
    # the later text lookup relies on (not visible from this method).
    up_hits = self.find_phrase_in_lines(word_up)
    logger.debug(f"word_left_list: {left_hits} \n word_up_list: {up_hits}")

    matches: "list[TWord]" = []
    # TODO: one query instead of loop would be better
    for left in left_hits:
        up_candidates = self.ocrdb.execute(query=nearest_up_query,
                                           params=[
                                               word_up, left.ymin, left.xmin, left.page_number,
                                               left.xmin, left.xmin, left.ymin, left.ymin
                                           ],
                                           area_selection=area_selection,
                                           textract_doc_uuid=self.textract_doc_uuid)
        if not up_candidates or len(up_candidates) < 1:
            # no upper label for this occurrence of the left label
            continue

        logger.debug(f"found a word_up: {up_candidates}")
        nearest_up = up_candidates[0]
        cell_params = [
            nearest_up.xmax + word_up_plus_x,
            nearest_up.xmin - word_up_minus_x,
            left.ymin,
            left.ymax,
            text_type,
            left.page_number,
        ]
        matches.extend(
            self.ocrdb.execute(query=cell_query,
                               params=cell_params,
                               textract_doc_uuid=self.textract_doc_uuid,
                               area_selection=area_selection))

    if matches:
        logger.info(f"word_left: {word_left}, word_up: {word_up}, result_tuples: {[p for p in matches]}")
    return matches