in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]
def find_phrase_in_lines(self, phrase: str, min_textdistance=0.6, page_number: int = 1) -> "List[TWord]":
    """
    Find occurrences of a phrase whose words follow each other within one line.

    phrase = words separated by whitespace

    The OCR DB is asked first; if it already holds entries for the normalized
    phrase, those are returned unchanged. Otherwise every line of every page
    is scanned with a fuzzy, case-insensitive comparison that ignores
    non-alphanumeric characters, and the result of the scan is stored in the
    OCR DB for future requests.

    Args:
        phrase: the phrase to look for.
        min_textdistance: difflib.SequenceMatcher ratio a document word has to
            exceed to count as a match for a phrase word.
        page_number: currently not used, all pages are scanned
            (TODO: check for page_number impl).

    Returns:
        One combined TWord per occurrence; an empty list if the scan found
        nothing.

    Raises:
        ValueError: if the phrase is empty or consists only of whitespace.
    """
    # TODO: Problem: it will not find Current: when the phrase has current and there are other current values in the document without :
    if not phrase:
        raise ValueError(f"no valid phrase: '{phrase}'")
    # split() without argument drops empty "words" caused by repeated,
    # leading or trailing whitespace
    phrase_words = phrase.split()
    if not phrase_words:
        raise ValueError(f"no valid phrase: '{phrase}'")

    # first check if we already did find this phrase and stored it in the DB
    cached_phrases = self.ocrdb.select_text(textract_doc_uuid=self.textract_doc_uuid,
                                            text=make_alphanum_and_lower_for_non_numbers(phrase))
    if cached_phrases:
        return cached_phrases

    alphanum_regex = re.compile(r'[\W_]+')

    def is_similar(phrase_word: str, document_text: str) -> bool:
        # compare lower-cased text with everything but letters/digits removed
        return difflib.SequenceMatcher(isjunk=None,
                                       a=alphanum_regex.sub('', phrase_word.lower()),
                                       b=alphanum_regex.sub('', document_text.lower())).ratio() > min_textdistance

    first_phrase_word = phrase_words[0]
    following_phrase_words = phrase_words[1:]
    found_phrases: "list[TWord]" = []

    # find phrase (words that follow each other) in trp lines
    for current_page_number, page in enumerate(self.doc.pages, start=1):
        for line in page.lines:
            line_words = line.words
            for line_idx, word in enumerate(line_words):
                # NOTE(review): a single-word phrase has never produced a
                # match in this scan; kept that way for compatibility -
                # confirm whether single words should be reported here.
                if not following_phrase_words:
                    break
                # not enough words left on this line for the whole phrase,
                # and later start positions have even fewer
                if line_idx + len(following_phrase_words) >= len(line_words):
                    break
                if not is_similar(first_phrase_word, word.text):
                    continue
                next_words = [line_words[line_idx + 1 + offset] for offset in range(len(following_phrase_words))]
                # every phrase word has to match, in order and without gaps
                if not all(
                        is_similar(phrase_word, next_word.text)
                        for phrase_word, next_word in zip(following_phrase_words, next_words)):
                    continue

                # assume the phrase to be correct: the matched words take
                # over the (lower-cased) text of the phrase
                tword = TWord(trp_word=word,
                              text_type='word',
                              doc_width=self.doc_width,
                              doc_height=self.doc_height,
                              page_number=current_page_number)
                tword.text = first_phrase_word.lower()
                found_words: "list[TWord]" = [tword]
                for phrase_word, next_word in zip(following_phrase_words, next_words):
                    tword = TWord(trp_word=next_word,
                                  doc_width=self.doc_width,
                                  doc_height=self.doc_height,
                                  page_number=current_page_number)
                    tword.text = phrase_word.lower()
                    found_words.append(tword)
                found_phrases.append(TWord.combine_multiple_words_to_phrase(found_words))

    # store for future requests
    self.ocrdb.insert_bulk(textract_doc_uuid=self.textract_doc_uuid, rows=found_phrases)
    return found_phrases