def find_phrase_on_page()

in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]


    def find_phrase_on_page(self,
                            phrase: str,
                            min_textdistance: float = 0.8,
                            page_number: int = 1,
                            number_of_other_words_allowed: int = 0,
                            area_selection: AreaSelection = None,
                            exclude_ids: List[str] = None) -> List[TWord]:
        """Find occurrences of ``phrase`` on a page; returns new phrases, regardless of orientation.

        Lookup order (the first non-empty result is returned):

        1. Phrases already stored in ``self.ocrdb`` whose text equals the
           normalized form of ``phrase``
           (``make_alphanum_and_lower_for_non_numbers``).
        2. A search using the phrase split on spaces and cleaned up with
           ``TGeoFinder.clean_up_phrase_words``.
        3. A search for each alternative returned by
           ``TGeoFinder.get_phrase_combinations``, in order.

        Args:
            phrase: The text to look for; split on single spaces into words.
            min_textdistance: Forwarded unchanged to the internal phrase search.
            page_number: Page to search on; used for both the DB lookup and
                the search.
            number_of_other_words_allowed: Forwarded unchanged to the internal
                phrase search.
            area_selection: Optional area restriction, forwarded to the DB
                lookup and the search.
            exclude_ids: Optional ids to exclude, forwarded to the DB lookup
                and the search.

        Returns:
            The first non-empty result from the steps above. If nothing
            matches, the (falsy) result of the last search that was attempted.

        Raises:
            ValueError: If no words remain after cleaning up ``phrase``.
        """
        # TODO: cannot do the caching this way with area_selection, because when
        # using with area_selection first, it will create a phrase for the area
        # and it will just return the value and not consider the other areas
        phrase_words = TGeoFinder.clean_up_phrase_words(phrase_words=phrase.split(" "))
        logger.debug(f"find_phrase_on_page: phrase_words: {phrase_words}")
        if len(phrase_words) < 1:
            raise ValueError(f"no valid phrase: '{phrase}'")

        # Check whether the phrase is already in the DB.
        found_phrases: List[TWord] = self.ocrdb.select_text(
            textract_doc_uuid=self.textract_doc_uuid,
            page_number=page_number,
            text=make_alphanum_and_lower_for_non_numbers(phrase),
            area_selection=area_selection,
            exclude_ids=exclude_ids)
        if found_phrases:
            logger.debug(f"phrase already there, pull from DB: {found_phrases}")
            return found_phrases

        # First try to find the phrase using the plain word split.
        found_phrases = self.__find_phrase_on_page(
            phrase_words=phrase_words,
            min_textdistance=min_textdistance,
            page_number=page_number,
            number_of_other_words_allowed=number_of_other_words_allowed,
            area_selection=area_selection,
            exclude_ids=exclude_ids)
        if found_phrases:
            return found_phrases

        # Now try the phrase combinations, stopping at the first that matches.
        phrase_combinations = TGeoFinder.get_phrase_combinations(phrase_words)
        logger.debug(f"find_phrase_on_page: phrase_combinations: {phrase_combinations}")
        for phrase_combination in phrase_combinations:
            found_phrases = self.__find_phrase_on_page(
                phrase_words=phrase_combination,
                min_textdistance=min_textdistance,
                page_number=page_number,
                number_of_other_words_allowed=number_of_other_words_allowed,
                area_selection=area_selection,
                exclude_ids=exclude_ids)
            if found_phrases:
                logger.debug(f"find_phrase_on_page: found_phrases: {found_phrases}")
                return found_phrases

        # If really nothing was found, return the last (empty) search result.
        logger.debug(
            f"find_phrase_on_page: found nothing for {phrase} in area: {area_selection} with min_distance:{min_textdistance} on page: {page_number}"
        )
        return found_phrases