def __find_phrase_on_page()

in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]


    def __find_phrase_on_page(self,
                              phrase_words: List[str],
                              min_textdistance: float = 0.9,
                              page_number: int = 1,
                              number_of_other_words_allowed: int = 0,
                              area_selection: AreaSelection = None,
                              exclude_ids=None) -> List[TWord]:
        """Find occurrences of a multi-word phrase on one page.

        Candidates for the first phrase word are looked up with
        ``find_word_on_page``. For every candidate, each following phrase
        word is matched by first checking the next word to the right of the
        last matched word and, if that fails, the nearest word in an area
        below the left-most word of the current line (a line break inside
        the phrase). A candidate is kept only when every phrase word was
        matched this way.

        Args:
            phrase_words: The words of the phrase, in reading order.
            min_textdistance: Threshold the value returned by
                ``get_diff_for_alphanum_words`` has to exceed (strictly) for
                a word to count as a match; also forwarded to
                ``find_word_on_page`` for the first word.
            page_number: Page to search on.
            number_of_other_words_allowed: Currently not used by this method.
            area_selection: Currently only logged; it does not restrict the
                search in this method.
            exclude_ids: Forwarded to ``find_word_on_page`` for the first
                word only.

        Returns:
            One ``TWord`` per matched occurrence, built with
            ``TWord.combine_multiple_words_to_phrase``. A non-empty result is
            also stored through ``self.ocrdb.insert_bulk``. An empty
            ``phrase_words`` yields an empty list.
        """
        logger.debug(
            f"find_phrase_on_page: phrase_words: {phrase_words}, min_textdistance: {min_textdistance}, area_selection: {area_selection}"
        )
        # Nothing to search for; avoids an IndexError on phrase_words[0].
        if not phrase_words:
            return []

        valid_combinations: List[List[TWord]] = list()
        # find first words and then walk to right and down and lower_left_word is always the left-most and lowest
        first_word_twords: List[TWord] = self.find_word_on_page(phrase_words[0],
                                                                page_number=page_number,
                                                                min_textdistance=min_textdistance,
                                                                exclude_ids=exclude_ids)
        logger.debug(f"find_phrase_on_page - first_word_twords: {first_word_twords}")

        for first_word_option in first_word_twords:
            logger.debug(f"find_phrase_on_page - trying to find phrase starting with: {first_word_option}")
            # Left-most word of the line currently being walked; moves down on a line break.
            lower_left_word = first_word_option
            # Last matched word; the search to the right continues from here.
            current_word = first_word_option
            valid_combination: List[TWord] = [first_word_option]

            found_combination = True
            for word in phrase_words[1:]:
                logger.debug(f"find_phrase_on_page - looking for word: {word} with current_word: {current_word}")
                words_to_right = self.get_words_to_the_right(anker=TGeoFinder.get_area_selection_for_twords(
                    [current_word]),
                                                             number_of_words_to_return=1)
                logger.debug(f"find_phrase_on_page - words to the right: {words_to_right}")
                if words_to_right and get_diff_for_alphanum_words(word1=words_to_right[0].text,
                                                                  word2=word) > min_textdistance:
                    logger.debug(f"find_phrase_on_page - found word_to_right: {words_to_right[0]}")
                    current_word = words_to_right[0]
                    valid_combination.append(current_word)
                    # found one, next word to check
                    continue

                # Not on the same line: look below the start of the current line.
                # The area is rebuilt from the current lower_left_word on every
                # fallback so that phrases spanning more than two lines keep
                # moving down instead of re-inspecting the second line.
                # NOTE(review): the lower edge is a y coordinate but is taken
                # from self.doc_width - confirm whether the page height was intended.
                below_area: AreaSelection = AreaSelection(
                    top_left=t2.TPoint(x=lower_left_word.xmin, y=lower_left_word.ymax),
                    lower_right=t2.TPoint(x=lower_left_word.xmax + lower_left_word.height * 3, y=self.doc_width),
                    page_number=page_number)
                logger.debug(
                    f"find_phrase_on_page - trying to find below word: {word} from lower_left: {lower_left_word}")
                words_below = self.get_twords_in_area(area_selection=below_area)
                logger.debug(f"find_phrase_on_page - found words_below: {words_below}")
                if words_below:
                    # Only the closest word matters. min() with a key never
                    # compares the TWord objects themselves, so equal
                    # distances cannot raise a TypeError (the first of the
                    # tied words wins).
                    nearest_below = min(
                        words_below,
                        key=lambda candidate, anchor=lower_left_word: candidate.euclid_distance(anchor))
                    if get_diff_for_alphanum_words(word1=nearest_below.text, word2=word) > min_textdistance:
                        logger.debug(f"find_phrase_on_page - found word_below: {nearest_below}")
                        valid_combination.append(nearest_below)
                        lower_left_word = nearest_below
                        current_word = nearest_below
                        continue

                logger.debug(f"find_phrase_on_page - did not find word right or below for {word}")
                found_combination = False
                break

            if found_combination:
                logger.debug(f"find_phrase_on_page - found valid combination: {valid_combination}")
                valid_combinations.append(valid_combination)

        # Merge the words of every complete match into a single phrase TWord.
        found_phrases: List[TWord] = [
            TWord.combine_multiple_words_to_phrase(combination) for combination in valid_combinations
        ]

        # store for future requests
        logger.debug(f"find_phrase_on_page: result: {found_phrases}")
        if found_phrases:
            self.ocrdb.insert_bulk(textract_doc_uuid=self.textract_doc_uuid, rows=found_phrases)
        return found_phrases