in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]
def find_phrase_on_page(self,
                        phrase: str,
                        min_textdistance: float = 0.8,
                        page_number: int = 1,
                        number_of_other_words_allowed: int = 0,
                        area_selection: AreaSelection = None,
                        exclude_ids: List[str] = None) -> List[TWord]:
    """Return the phrases matching ``phrase`` on a page, regardless of orientation.

    The lookup proceeds in three stages and stops at the first stage that
    yields a non-empty result:

    1. Query ``self.ocrdb.select_text`` for the normalized phrase
       (``make_alphanum_and_lower_for_non_numbers(phrase)``).
    2. Search with the cleaned-up words obtained by splitting ``phrase`` on
       single spaces.
    3. Search with each candidate returned by
       ``TGeoFinder.get_phrase_combinations`` in order.

    Args:
        phrase: Text to look for; split on ``" "`` and cleaned via
            ``TGeoFinder.clean_up_phrase_words``.
        min_textdistance: Forwarded unchanged to the private page search.
            NOTE(review): presumably a similarity threshold - confirm there.
        page_number: Page to search; forwarded to the DB lookup and the search.
        number_of_other_words_allowed: Forwarded unchanged to the private
            page search.
        area_selection: Optional area restriction; forwarded to the DB lookup
            and the search.
        exclude_ids: Optional ids to exclude; forwarded to the DB lookup and
            the search.

    Returns:
        The first non-empty result found. If every stage comes back empty,
        the (falsy) result of the last search attempt is returned.

    Raises:
        ValueError: If no words remain after cleaning up ``phrase``.
    """
    # TODO: cannot do the caching this way with area_selection, because when
    # using with area_selection first, it will create a phrase for the area and
    # it will just return the value and not consider the other areas.
    phrase_words = TGeoFinder.clean_up_phrase_words(phrase_words=phrase.split(" "))
    logger.debug(f"find_phrase_on_page: phrase_words: {phrase_words}")
    if len(phrase_words) < 1:
        raise ValueError(f"no valid phrase: '{phrase}'")

    # Stage 1: check whether the phrase is already stored in the DB.
    found_phrases: "list[TWord]" = self.ocrdb.select_text(textract_doc_uuid=self.textract_doc_uuid,
                                                          page_number=page_number,
                                                          text=make_alphanum_and_lower_for_non_numbers(phrase),
                                                          area_selection=area_selection,
                                                          exclude_ids=exclude_ids)
    if found_phrases:
        logger.debug(f"phrase already there, pull from DB: {found_phrases}")
        return found_phrases

    # Arguments shared by every call to the private search below.
    search_kwargs = dict(min_textdistance=min_textdistance,
                         page_number=page_number,
                         number_of_other_words_allowed=number_of_other_words_allowed,
                         area_selection=area_selection,
                         exclude_ids=exclude_ids)

    # Stage 2: try the phrase exactly as split into words.
    found_phrases = self.__find_phrase_on_page(phrase_words=phrase_words, **search_kwargs)
    if found_phrases:
        return found_phrases

    # Stage 3: try the alternative phrase combinations, first hit wins.
    phrase_combinations = TGeoFinder.get_phrase_combinations(phrase_words)
    logger.debug(f"find_phrase_on_page: phrase_combinations: {phrase_combinations}")
    for phrase_combination in phrase_combinations:
        found_phrases = self.__find_phrase_on_page(phrase_words=phrase_combination, **search_kwargs)
        if found_phrases:
            logger.debug(f"find_phrase_on_page: found_phrases: {found_phrases}")
            return found_phrases

    # Nothing matched: hand back the falsy result of the last attempt.
    logger.debug(
        f"find_phrase_on_page: found nothing for {phrase} in area: {area_selection} with min_distance:{min_textdistance} on page: {page_number}"
    )
    return found_phrases