def getLinesInReadingOrder()

in src/trp.py [0:0]


    def getLinesInReadingOrder(self):
        """Group this object's lines into columns and list them column by column.

        Lines are taken from ``self._lines`` in their stored order. Each line
        is attached to the first already-known column it matches, where a
        match is either of:

        * the line starts at (almost) the same left position as the column
          and its top is within ``height_tolerance`` line-heights of the
          last line added to that column; or
        * the line looks like the continuation of the column's last line
          that was split in two: (almost) the same top, and its left edge
          sits right next to that last line's right edge.

        A line that matches no column opens a new column.

        The tolerances are compared directly against bounding-box values
        (presumably Textract's page-relative ratios -- TODO confirm).

        Returns:
            list: ``[column_index, text]`` pairs sorted by column index.
            The sort is stable, so lines inside one column keep the
            relative order they had in ``self._lines``.
        """
        starting_point_tolerance = 0.01
        height_tolerance = 3
        same_line_top_tolerance = 0.001
        same_line_spacing_tolerance = 5
        # Largest horizontal gap still treated as "right next to" the
        # previous fragment of the same line.
        same_line_gap = same_line_spacing_tolerance * starting_point_tolerance

        columns = []
        lines = []
        for item in self._lines:
            # Geometry depends only on the item, so compute it once here
            # instead of once per candidate column.
            bbox = item.geometry.boundingBox
            bbox_left = bbox.left
            bbox_right = bbox.left + bbox.width
            bbox_top = bbox.top
            max_top_gap = height_tolerance * bbox.height

            for index, column in enumerate(columns):
                top_gap = abs(bbox_top - column['top'])
                starts_with_column = (
                    abs(bbox_left - column['left']) < starting_point_tolerance
                    and top_gap < max_top_gap
                )
                continues_last_line = (
                    top_gap < same_line_top_tolerance
                    and abs(bbox_left - column['right']) < same_line_gap
                )
                if starts_with_column or continues_last_line:
                    lines.append([index, item.text])
                    # Track the most recently added line so the next
                    # comparison is made against it; 'left' stays fixed at
                    # the column's first line.
                    column['top'] = bbox_top
                    column['right'] = bbox_right
                    break
            else:
                # No existing column matched: this line starts a new one.
                columns.append({
                    'left': bbox_left,
                    'right': bbox_right,
                    'top': bbox_top,
                })
                lines.append([len(columns) - 1, item.text])

        # list.sort is stable: groups by column, keeps in-column order.
        lines.sort(key=lambda x: x[0])
        return lines