in code/textract_async/textract_processor.py [0:0]
def getJobResults(api, jobId, objectName):
    """Load the Textract result documents that were written to S3 for a job.

    Lists the objects stored in ``textractBucketName`` under the prefix
    ``<objectName>/textract-output/<jobId>``, drops the first listed entry
    (the S3 access file), and parses each remaining object as JSON.

    An earlier revision fetched results by polling the Textract
    ``get_document_text_detection`` / ``get_document_analysis`` APIs and
    following ``NextToken``; results are now read from S3 instead.

    Args:
        api: Name of the Textract start API used for the job. Not referenced
            by the current implementation; kept so existing callers keep
            working.
        jobId: Textract job identifier; forms the tail of the S3 prefix.
        objectName: Leading part of the S3 prefix under which the job's
            ``textract-output`` folder lives.

    Returns:
        A list with one parsed JSON value per result object, in the order
        the listing returned them. Empty when nothing follows the first
        listed entry.
    """
    storage = S3Helper()
    outputPrefix = objectName + "/textract-output/" + jobId
    listedKeys = storage.listObjectsInS3(
        bucketName=textractBucketName,
        bucketPrefix=outputPrefix,
    )

    # The s3 access file always appears first in the listing, so it is
    # skipped by position rather than by name.
    # NOTE(review): if the helper returns keys in S3's lexicographic order,
    # a result part named "10" would precede "2" -- confirm whether callers
    # rely on page order.
    return [
        json.loads(storage.readFromS3(textractBucketName, resultKey))
        for resultKey in listedKeys[1:]
    ]