def getLinesInReadingOrder()

in textract-pipeline/lambda/textractor/python/trp.py [0:0]


    def getLinesInReadingOrder(self):
        """Assign every line to a column and return the lines column by column.

        Each item in ``self._lines`` is matched against the columns found so
        far using only its horizontal extent (``geometry.boundingBox.left``
        and ``.width``).  A line joins the first column for which either

        * the centre of the line lies strictly inside the column, or
        * the centre of the column lies strictly inside the line.

        A line matching no existing column opens a new one whose extent is
        that line's own left/right edges; a column's extent is never widened
        afterwards.

        Returns:
            A list of ``[column_index, text]`` pairs sorted by column index.
            The sort is stable, so lines of one column keep the relative
            order they had in ``self._lines``.
            NOTE(review): presumably ``self._lines`` is already ordered top
            to bottom, which is what makes the result "reading order" --
            confirm against the code that fills it.
        """
        columns = []
        lines = []
        for item in self._lines:
            # The line's horizontal extent does not depend on the column it
            # is compared against, so compute it once per line.
            bbox = item.geometry.boundingBox
            bbox_left = bbox.left
            bbox_right = bbox.left + bbox.width
            bbox_centre = bbox.left + bbox.width / 2

            column_found = False
            for index, column in enumerate(columns):
                # column['right'] is an absolute coordinate (left + width),
                # so the midpoint is the average of both edges -- not
                # left + right / 2, which lands outside the column for any
                # column that does not start at 0.
                column_centre = (column['left'] + column['right']) / 2
                if (column['left'] < bbox_centre < column['right']
                        or bbox_left < column_centre < bbox_right):
                    # Bbox appears inside the column
                    lines.append([index, item.text])
                    column_found = True
                    break
            if not column_found:
                columns.append({'left': bbox_left, 'right': bbox_right})
                lines.append([len(columns) - 1, item.text])

        # list.sort is stable: grouping by column index keeps the original
        # order of the lines inside every column.
        lines.sort(key=lambda x: x[0])
        return lines