in source/lambda/helper/python/comprehendHelper.py
def extractTextByPages(self,
                       textract,
                       rawPages,
                       numOfPages):
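    """Accumulate LINE-block text page by page into rawPages (one string
    per page, 0-indexed), capping each page at MAX_COMPREHEND_UTF8_PAGE_SIZE
    so it can be sent to Amazon Comprehend. Returns numOfPages unchanged."""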
    # The Textract results JSON file in S3 has a peculiar structure: a
    # top-level list of block lists, each of which appears to be capped at
    # 1000 blocks. A new block list is created and appended once the
    # previous one reaches that limit; it is not clear why. A block list
    # may contain blocks from different document pages.
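    # Illustrative shape (list sizes are examples, not guaranteed):
    #   [ {"Blocks": [<block 1>, ..., <block 1000>]},
    #     {"Blocks": [<block 1001>, ...]},
    #     ... ]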
    for blocklist in textract:
        for block in blocklist['Blocks']:
            # PAGE blocks carry no text, so only LINE blocks are of interest
            if block['BlockType'] == 'LINE':
                # Textract page numbers start at 1 for multi-page documents;
                # a single-page image (e.g. a PNG) carries no page reference,
                # so in that case every block belongs to page 1
                if numOfPages == 1:
                    pageNumber = 1
                else:
                    pageNumber = block['Page']
                # skip blocks from pages beyond the requested page limit
                if pageNumber > numOfPages:
                    continue
                # rawPages is 0-indexed, while Textract pages are 1-based
                pageResultIndex = pageNumber - 1
                # not every block contains text
                if 'Text' in block:
                    # projected size of this page if we append this text
                    # element plus the ". " separator. Comprehend enforces a
                    # UTF-8 size limit, so elements that would exceed it are
                    # dropped.
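                    # note: len() counts characters, not UTF-8 bytes; for
                    # non-ASCII text the encoded size can be larger, so this
                    # check is an approximation of Comprehend's byte limit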
                    projectedSize = (len(rawPages[pageResultIndex])
                                     + len(block['Text']) + 2)
                    # append only if the page stays under the size limit
                    if projectedSize < MAX_COMPREHEND_UTF8_PAGE_SIZE:
                        # separator from the previous text block (note: also
                        # prepended before the first block on a page)
                        rawPages[pageResultIndex] += ". "
                        # the text itself
                        rawPages[pageResultIndex] += block['Text']
    return numOfPages
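
# A minimal usage sketch (illustrative only): this assumes the enclosing
# class is named ComprehendHelper and that the paginated Textract response
# has already been downloaded from S3 and parsed into a Python list.
#
#   import json
#
#   helper = ComprehendHelper()
#   numOfPages = 3
#   rawPages = ["" for _ in range(numOfPages)]
#   with open("textract-results.json") as f:
#       textract = json.load(f)  # top-level list of {"Blocks": [...]} dicts
#   helper.extractTextByPages(textract, rawPages, numOfPages)
#   # rawPages[i] now holds the concatenated LINE text of page i + 1,
#   # capped at MAX_COMPREHEND_UTF8_PAGE_SIZE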