in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]
def __find_phrase_on_page(self,
                          phrase_words: List[str],
                          min_textdistance: float = 0.9,
                          page_number: int = 1,
                          number_of_other_words_allowed: int = 0,
                          area_selection: AreaSelection = None,
                          exclude_ids=None) -> List[TWord]:
    """Find occurrences of a multi-word phrase on one page.

    Every word on the page that matches ``phrase_words[0]`` is used as a
    starting point. For each remaining phrase word the search first looks at
    the single next word to the right of the current word; if that does not
    match, it looks at the word nearest to the starting word inside an area
    below the starting word. A starting point yields a result only when every
    phrase word was matched in order.

    Args:
        phrase_words: The words of the phrase, in reading order.
        min_textdistance: A candidate word is accepted when
            ``get_diff_for_alphanum_words`` returns a value strictly greater
            than this threshold. Also forwarded to ``find_word_on_page``.
        page_number: Page to search; forwarded to ``find_word_on_page`` and
            used for the below-area selection.
        number_of_other_words_allowed: Not used by this method.
        area_selection: Only written to the debug log by this method.
        exclude_ids: Forwarded to ``find_word_on_page``.

    Returns:
        One ``TWord`` per matched phrase, each built with
        ``TWord.combine_multiple_words_to_phrase``. Empty when nothing
        matched or when ``phrase_words`` is empty.

    Side effects:
        When at least one phrase is found, the results are passed to
        ``self.ocrdb.insert_bulk`` so that they are stored for future requests.
    """
    logger.debug(
        f"find_phrase_on_page: phrase_words: {phrase_words}, min_textdistance: {min_textdistance}, area_selection: {area_selection}"
    )
    # An empty phrase cannot match anything; bail out instead of failing on
    # phrase_words[0] below.
    if not phrase_words:
        return []

    valid_combinations: List[List[TWord]] = []
    # Find candidates for the first word, then walk to the right and down.
    first_word_twords: List[TWord] = self.find_word_on_page(phrase_words[0],
                                                            page_number=page_number,
                                                            min_textdistance=min_textdistance,
                                                            exclude_ids=exclude_ids)
    logger.debug(f"find_phrase_on_page - first_word_twords: {first_word_twords}")
    for first_word_option in first_word_twords:
        logger.debug(f"find_phrase_on_page - trying to find phrase starting with: {first_word_option}")
        # lower_left_word tracks the left-most, lowest word matched so far.
        lower_left_word = first_word_option
        valid_combination: List[TWord] = [first_word_option]
        # Search area for line-wrapped words: starts at the bottom edge of the
        # first word and extends three word-heights past its right edge.
        # NOTE(review): the lower y bound uses self.doc_width; a height-based
        # bound may have been intended - confirm against the class attributes.
        # NOTE(review): this area is built once per starting word and is not
        # recomputed when lower_left_word moves down - confirm this is intended.
        below_area: AreaSelection = AreaSelection(
            top_left=t2.TPoint(x=lower_left_word.xmin, y=lower_left_word.ymax),
            lower_right=t2.TPoint(x=lower_left_word.xmax + lower_left_word.height * 3, y=self.doc_width),
            page_number=page_number)
        current_word = first_word_option
        for word in phrase_words[1:]:
            logger.debug(f"find_phrase_on_page - looking for word: {word} with current_word: {current_word}")
            words_to_right = self.get_words_to_the_right(anker=TGeoFinder.get_area_selection_for_twords(
                [current_word]),
                                                         number_of_words_to_return=1)
            logger.debug(f"find_phrase_on_page - words to the right: {words_to_right}")
            if words_to_right and get_diff_for_alphanum_words(word1=words_to_right[0].text,
                                                              word2=word) > min_textdistance:
                logger.debug(f"find_phrase_on_page - found word_to_right: {words_to_right[0]}")
                current_word = words_to_right[0]
                valid_combination.append(current_word)
                # found one, next word to check
                continue

            # Nothing suitable to the right: look in the area below instead.
            logger.debug(
                f"find_phrase_on_page - trying to find below word: {word} from lower_left: {lower_left_word}")
            words_below = self.get_twords_in_area(area_selection=below_area)
            logger.debug(f"find_phrase_on_page - found words_below: {words_below}")
            if words_below:
                # Only the nearest candidate matters. Comparing by distance
                # alone avoids falling back to comparing TWord objects when two
                # candidates are equidistant; ties keep the first candidate.
                nearest_below = min(words_below,
                                    key=lambda candidate: candidate.euclid_distance(first_word_option))
                if get_diff_for_alphanum_words(word1=nearest_below.text, word2=word) > min_textdistance:
                    logger.debug(f"find_phrase_on_page - found word_below: {nearest_below}")
                    valid_combination.append(nearest_below)
                    lower_left_word = nearest_below
                    current_word = nearest_below
                    continue

            logger.debug(f"find_phrase_on_page - did not find word right or below for {word}")
            break
        else:
            # The loop ran to completion, i.e. every phrase word was matched.
            logger.debug(f"find_phrase_on_page - found valid combination: {valid_combination}")
            valid_combinations.append(valid_combination)

    found_phrases: List[TWord] = [
        TWord.combine_multiple_words_to_phrase(list(combination)) for combination in valid_combinations
    ]
    logger.debug(f"find_phrase_on_page: result: {found_phrases}")
    # store for future requests
    if found_phrases:
        self.ocrdb.insert_bulk(textract_doc_uuid=self.textract_doc_uuid, rows=found_phrases)
    return found_phrases