def find_intersect_value()

in tpipelinegeofinder/textractgeofinder/tgeofinder.py [0:0]


    def find_intersect_value(self,
                             word_left: str,
                             word_up: str,
                             word_up_minus_x: int = 0,
                             word_up_plus_x: int = 0,
                             text_type: str = 'word',
                             stop_words: "list[str]" = None,
                             area_selection: AreaSelection = None) -> "list[TWord]":
        """
        Collect the entries sitting at the crossing of a left-hand phrase and an upper phrase.

        Both phrases are lower-cased first. For every occurrence of `word_left`
        returned by `find_phrase_in_lines`, the closest entry whose text equals
        `word_up`, whose ymin is smaller and whose xmin is larger than the left
        occurrence (same page) is looked up; "closest" is the squared distance
        between the (xmin, ymin) corners. If one exists, every entry of
        `text_type` on that page is collected whose center lies

        * horizontally strictly between `up.xmin - word_up_minus_x` and
          `up.xmax + word_up_plus_x`, and
        * vertically strictly between the left occurrence's ymin and ymax.

        :param word_left: phrase expected to the left of the wanted value
        :param word_up: phrase expected above the wanted value
        :param word_up_minus_x: amount subtracted from the upper entry's xmin to widen the column to the left
        :param word_up_plus_x: amount added to the upper entry's xmax to widen the column to the right
        :param text_type: value matched against the `text_type` column of the candidates
        :param stop_words: accepted for interface compatibility; not read by this method
        :param area_selection: forwarded unchanged to both `ocrdb.execute` calls
        :return: all matching entries over all occurrences of `word_left` (possibly empty)

        NOTE(review): both SQL snippets start with "and", so `ocrdb.execute` presumably
        appends them to a base statement of its own - confirm against its implementation.
        """
        logger.debug(f"word_left: {word_left}, word_up: {word_up}")

        # The two WHERE-clause fragments never change, so they are spelled out once here.
        nearest_word_up_sql = ''' and text=?
                        and ymin < ?
                        and xmin > ?
                        and page_number = ?
                        order by ( ((? - xmin) * (? - xmin)) + ((? - ymin) * (? - ymin))) asc
                               '''
        center_in_cell_sql = ''' and ((xmin + xmax) / 2) < ?
                            and ((xmin + xmax) / 2) > ?
                            and ((ymin + ymax) / 2) > ?
                            and ((ymin + ymax) / 2) < ?
                            and text_type=?
                            and page_number = ?
                '''

        result_tword_list: "list[TWord]" = []
        word_left = word_left.lower()
        word_up = word_up.lower()
        word_left_list = self.find_phrase_in_lines(word_left)
        # The phrase lookup for word_up only feeds the debug output below.
        word_up_list = self.find_phrase_in_lines(word_up)
        logger.debug(f"word_left_list: {word_left_list} \n word_up_list: {word_up_list}")

        # TODO: one query instead of loop would be better
        for left_anchor in word_left_list:
            page = left_anchor.page_number
            left_x = left_anchor.xmin
            top_y = left_anchor.ymin

            # get ordered list of word_up that are higher and to the right of word_left
            j = self.ocrdb.execute(query=nearest_word_up_sql,
                                   params=[word_up, top_y, left_x, page, left_x, left_x, top_y, top_y],
                                   area_selection=area_selection,
                                   textract_doc_uuid=self.textract_doc_uuid)
            if not j or len(j) < 1:
                # no upper phrase for this left occurrence -> nothing to intersect
                continue

            logger.debug(f"found a word_up: {j}")
            closest_up = j[0]
            # Parameter order follows the placeholders: x upper bound, x lower bound,
            # y lower bound, y upper bound, text type, page.
            cell_params = [
                closest_up.xmax + word_up_plus_x,
                closest_up.xmin - word_up_minus_x,
                top_y,
                left_anchor.ymax,
                text_type,
                page,
            ]
            result_tword_list.extend(
                self.ocrdb.execute(query=center_in_cell_sql,
                                   params=cell_params,
                                   textract_doc_uuid=self.textract_doc_uuid,
                                   area_selection=area_selection))

        if result_tword_list:
            logger.info(f"word_left: {word_left}, word_up: {word_up}, result_tuples: {[p for p in result_tword_list]}")
        return result_tword_list