def combine_multiple_words_to_phrase()

in tpipelinegeofinder/textractgeofinder/tword.py [0:0]


    def combine_multiple_words_to_phrase(tword_list: "list[TWord]") -> "TWord":
        """Merge several words into one phrase-level TWord.

        The returned TWord is built as follows:
          * text: every word's ``text`` joined with single spaces
          * original_text: the truthy ``original_text`` values joined with
            single spaces (empty / None entries are skipped)
          * text_type: always ``'phrase'``
          * xmin / ymin / xmax / ymax: the smallest box enclosing all words
          * confidence: arithmetic mean of the words' confidences
          * page_number, doc_width, doc_height: copied from the first word
            (page_number is converted with ``int``)
          * id: a freshly generated uuid4 string

        Raises:
            ValueError: if ``tword_list`` is empty (or otherwise falsy).
        """
        # Guard clause: nothing to merge.
        if not tword_list:
            raise ValueError("tword_list is empty.")

        # Textual content of the phrase.
        joined_text = " ".join(word.text for word in tword_list)
        joined_original = " ".join(
            word.original_text for word in tword_list if word.original_text
        )

        # Bounding box that encloses every word.
        left = min(word.xmin for word in tword_list)
        right = max(word.xmax for word in tword_list)
        top = min(word.ymin for word in tword_list)
        bottom = max(word.ymax for word in tword_list)

        # Page and document metadata come from the first word only.
        first_word = tword_list[0]
        page = int(first_word.page_number)

        confidences = [word.confidence for word in tword_list]
        mean_confidence = statistics.mean(confidences)

        return TWord(
            page_number=page,
            original_text=joined_original,
            text_type='phrase',
            text=joined_text,
            confidence=mean_confidence,
            xmin=left,
            ymin=top,
            xmax=right,
            ymax=bottom,
            id=str(uuid4()),
            doc_width=first_word.doc_width,
            doc_height=first_word.doc_height,
        )