def extractTextByPages()

in source/lambda/helper/python/comprehendHelper.py [0:0]


    def extractTextByPages(self,
                           textract,
                           rawPages,
                           numOfPages):
        """Append the text of every Textract LINE block to its page in rawPages.

        Each accepted text is appended to ``rawPages[page - 1]`` preceded by a
        ". " separator (this includes the first text added to a page). A text
        is dropped when adding it, plus the separator, would bring the page's
        UTF-8 encoded size up to MAX_COMPREHEND_UTF8_PAGE_SIZE or beyond.

        Args:
            textract: iterable of dicts, each holding a list of block dicts
                under the 'Blocks' key. A single list may mix blocks from
                different document pages.
            rawPages: list of strings indexed by page number minus one. It is
                modified in place and needs at least numOfPages entries.
            numOfPages: number of pages to fill. Blocks whose page number lies
                outside 1..numOfPages are skipped.

        Returns:
            numOfPages, unchanged.
        """

        def utf8Size(text):
            # the limit is expressed in UTF-8 bytes, not characters, so
            # len(text) under-counts any non-ASCII content.
            # errors='replace' keeps a stray surrogate from raising here.
            return len(text.encode('utf-8', errors='replace'))

        # running UTF-8 size of each page touched so far, keyed by page index,
        # so that a page is not re-encoded for every block added to it
        pageByteSizes = {}

        # the textract results json file in S3 has a peculiar structure of
        # top-level list of block lists, each appear to have a size limit of 1000.
        # A new one is created and appended when the previous blocklist reaches
        # that limit. A blocklist may contain blocks from different document pages.
        for blocklist in textract:

            for block in blocklist['Blocks']:

                # only LINE blocks are used (PAGE blocks have no text), and
                # some blocks may not contain text at all
                if block['BlockType'] != 'LINE' or 'Text' not in block:
                    continue

                # page numbers start at 1 in Textract. Blocks of a single-page
                # image (e.g. png) may carry no page reference, in which case
                # they belong to page 1.
                pageNumber = block.get('Page', 1)

                # skip pages above the limit; this also applies when the limit
                # is a single page but the document has more. A page number
                # below 1 would wrap around to the end of rawPages, so it is
                # skipped as well.
                if not 1 <= pageNumber <= numOfPages:
                    continue

                # but our storage of page results list start at index 0
                pageResultIndex = pageNumber - 1

                # rawPages may already hold text for this page, so measure it
                # the first time the page is touched
                if pageResultIndex not in pageByteSizes:
                    pageByteSizes[pageResultIndex] = utf8Size(
                        rawPages[pageResultIndex])

                # size of this page if we add this text element + the ". "
                # separator (2 bytes). Excessive elements are dismissed once
                # the size limit is reached.
                projectedSize = (pageByteSizes[pageResultIndex]
                                 + utf8Size(block['Text']) + 2)

                # add if page size allows
                if MAX_COMPREHEND_UTF8_PAGE_SIZE > projectedSize:
                    # separator from the previous text block, then the text
                    rawPages[pageResultIndex] += ". " + block['Text']
                    pageByteSizes[pageResultIndex] = projectedSize

        return numOfPages