in sources/lambda/async/text_extractor.py [0:0]
def __extract_all_pages(self, jobId, textract_result, pages, page_numbers):
    """Extract page content: group Textract LINE blocks into the `pages`
    dict and record the page numbers seen, recursing while the response is
    paginated (i.e. while Textract returns a NextToken).
    """
    # Keep only LINE blocks; they carry the detected text line by line.
    blocks = [x for x in textract_result['Blocks']
              if x['BlockType'] == "LINE"]
    for block in blocks:
        if block['Page'] not in page_numbers:
            # First line seen for this page: create its entry.
            page_numbers.append(block['Page'])
            pages[block['Page']] = {
                "Number": block['Page'],
                "Content": block['Text']
            }
        else:
            # Append subsequent lines to the page's accumulated text.
            pages[block['Page']]['Content'] += " " + block['Text']

    # Textract paginates large results; follow NextToken until exhausted.
    nextToken = textract_result.get("NextToken", "")
    if nextToken:
        textract_result = textract.get_document_text_detection(
            JobId=jobId,
            NextToken=nextToken
        )
        self.__extract_all_pages(jobId,
                                 textract_result,
                                 pages,
                                 page_numbers)
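
# Usage sketch (not from the original file): a minimal, hedged example of how
# a sibling method in the same class could seed the recursion above. The
# method name `__get_document_text` and the returned list shape are
# assumptions for illustration; `textract` is assumed to be the module-level
# boto3 Textract client the method above already relies on, e.g.
#     import boto3
#     textract = boto3.client("textract")
def __get_document_text(self, jobId):
    """Hypothetical entry point: fetch the first batch of results for an
    asynchronous Textract text-detection job and let __extract_all_pages
    follow NextToken for the remaining batches.
    """
    pages = {}
    page_numbers = []
    textract_result = textract.get_document_text_detection(JobId=jobId)
    self.__extract_all_pages(jobId, textract_result, pages, page_numbers)
    # Return the page dicts ordered by page number.
    return [pages[n] for n in sorted(page_numbers)]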