in textract-pipeline/lambda/textractor/python/trp.py [0:0]
def getLinesInReadingOrder(self):
    """Group the lines in ``self._lines`` by column and return them column by column.

    Each line is assigned to the first known column it horizontally
    overlaps with; if none matches, the line opens a new column whose
    extent is that line's own bounding box. A column's extent is fixed by
    the line that opened it and is never widened afterwards.

    A line and a column are considered overlapping when either
    the line's horizontal centre lies strictly inside the column, or
    the column's horizontal centre lies strictly inside the line's box.

    Returns:
        A list of ``[column_index, text]`` two-element lists, sorted by
        column index. Column indices follow the order in which columns
        were first encountered. The sort is stable, so within a column the
        lines keep the order they had in ``self._lines``
        (presumably top-to-bottom -- confirm against how ``_lines`` is built).
    """
    columns = []
    lines = []
    for item in self._lines:
        # The line's horizontal extent does not depend on the column being
        # tested, so compute it once per line.
        bbox = item.geometry.boundingBox
        bbox_left = bbox.left
        bbox_right = bbox.left + bbox.width
        bbox_centre = bbox.left + bbox.width / 2
        for index, column in enumerate(columns):
            # 'right' is an absolute coordinate (left + width), not a width,
            # so the midpoint is the average of the two edges.
            column_centre = (column['left'] + column['right']) / 2
            if (column['left'] < bbox_centre < column['right']
                    or bbox_left < column_centre < bbox_right):
                # Bbox appears inside the column
                lines.append([index, item.text])
                break
        else:
            # No existing column matched: this line starts a new one.
            columns.append({'left': bbox_left, 'right': bbox_right})
            lines.append([len(columns) - 1, item.text])
    lines.sort(key=lambda x: x[0])
    return lines