def find_phrase_in_lines()

in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]


    def find_phrase_in_lines(self, phrase: str, min_textdistance=0.6, page_number: int = 1) -> List[TWord]:
        """
        Find fuzzy occurrences of a multi-word phrase within single lines of the document.

        phrase = words separated by space char

        Consecutive words of a line are compared with the phrase words one by one,
        ignoring case and every non-alphanumeric character. A run of words counts as
        a hit only when *each* word pair has a difflib similarity ratio above
        min_textdistance.

        When the OCR DB already returns entries for the phrase they are returned
        unchanged. Otherwise all pages are scanned and the hits (possibly none) are
        handed to the OCR DB for future requests.

        :param phrase: words separated by a space char; empty tokens caused by
                       repeated, leading or trailing spaces are ignored
        :param min_textdistance: similarity ratio every word has to exceed
        :param page_number: currently not used to restrict the search (see TODO below)
        :return: one combined TWord per hit, an empty list when nothing matched
        :raises ValueError: when phrase is empty or contains no words at all
        """
        # first check if we already did find this phrase and stored it in the DB
        # TODO: Problem: it will not find Current: when the phrase has current and there are other current values in the document without :
        if not phrase:
            raise ValueError(f"no valid phrase: '{phrase}")
        # Drop empty tokens: after normalisation they would only ever match
        # punctuation-only words, which is never what the caller asked for.
        phrase_words = [token for token in phrase.split(" ") if token]
        if not phrase_words:
            raise ValueError(f"no valid phrase: '{phrase}")
        # TODO: check for page_number impl
        # NOTE(review): the lookup is keyed on the phrase text only, so results do
        # not depend on min_textdistance or page_number - confirm this is intended.
        stored_phrases: "list[TWord]" = self.ocrdb.select_text(textract_doc_uuid=self.textract_doc_uuid,
                                                               text=make_alphanum_and_lower_for_non_numbers(phrase))
        if stored_phrases:
            return stored_phrases

        alphanum_regex = re.compile(r'[\W_]+')
        # The phrase side of every comparison is loop-invariant, prepare it once.
        lowered_words = [token.lower() for token in phrase_words]
        normalized_words = [alphanum_regex.sub('', token) for token in lowered_words]
        phrase_length = len(lowered_words)

        def is_similar(normalized_phrase_word: str, trp_word) -> bool:
            """Compare a prepared phrase word with the text of a word from the page."""
            candidate = alphanum_regex.sub('', trp_word.text.lower())
            ratio = difflib.SequenceMatcher(isjunk=None, a=normalized_phrase_word, b=candidate).ratio()
            return ratio > min_textdistance

        # Start from a fresh list instead of appending to whatever falsy value the
        # DB lookup returned.
        found_phrases: "list[TWord]" = []

        # NOTE(review): a one-word phrase was never matched by this scan before (a
        # hit was only confirmed by a second word), so it is still served by the
        # DB lookup alone - confirm whether fuzzy one-word hits are wanted.
        pages_to_scan = self.doc.pages if phrase_length > 1 else []

        # find phrase (words that follow each other) in trp lines
        # enumerate() gives every page its own number; the caller's page_number
        # argument is intentionally left alone.
        for current_page_number, page in enumerate(pages_to_scan, start=1):
            for line in page.lines:
                line_words = line.words
                # only start positions that leave room for the whole phrase
                for start_idx in range(len(line_words) - phrase_length + 1):
                    window = [line_words[start_idx + offset] for offset in range(phrase_length)]
                    # every word has to match; all() stops at the first mismatch
                    if not all(is_similar(expected, trp_word)
                               for expected, trp_word in zip(normalized_words, window)):
                        continue

                    # assume the phrase to be correct: store the requested words,
                    # not the (possibly misread) OCR text
                    # NOTE(review): only the first word is created with
                    # text_type='word', as before - confirm whether the follow-up
                    # words should pass it as well.
                    first_word = TWord(trp_word=window[0],
                                       text_type='word',
                                       doc_width=self.doc_width,
                                       doc_height=self.doc_height,
                                       page_number=current_page_number)
                    first_word.text = lowered_words[0]
                    found_words: "list[TWord]" = [first_word]
                    for phrase_word, trp_word in zip(lowered_words[1:], window[1:]):
                        tword = TWord(trp_word=trp_word,
                                      doc_width=self.doc_width,
                                      doc_height=self.doc_height,
                                      page_number=current_page_number)
                        tword.text = phrase_word
                        found_words.append(tword)

                    found_phrase: TWord = TWord.combine_multiple_words_to_phrase(found_words)
                    found_phrases.append(found_phrase)

        # store for future requests
        self.ocrdb.insert_bulk(textract_doc_uuid=self.textract_doc_uuid, rows=found_phrases)
        return found_phrases