in code/textract_async/textract_processor.py [0:0]
def processRequest(request):
    """Process a Textract job notification and persist its results.

    Reads the job description from ``request``, updates the pipeline stage
    through ``pipeline_client``, fetches the job results with
    ``getJobResults``, wraps them in an ``OutputGenerator``, writes the
    outputs, and records lineage through ``lineage_client``.

    Args:
        request: Mapping that must provide the keys ``jobStatus``, ``jobId``,
            ``jobTag``, ``jobAPI``, ``bucketName``, ``objectName`` and
            ``callerId``. ``jobTag`` is used as the document ID.

    Returns:
        A dict ``{'statusCode': 200, 'body': <summary string>}``.

    Raises:
        Exception: If ``jobStatus`` is ``'FAILED'``, or if ``getJobResults``
            raises (the original error is attached as ``__cause__``).
        Exception: Whatever ``OutputGenerator`` raises is re-raised unchanged
            after the stage has been marked as failed.
    """
    status = request['jobStatus']
    jobId = request['jobId']
    jobTag = request['jobTag']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    objectName = request['objectName']

    # The pipeline client needs the document context before any stage call.
    pipeline_client.body = {
        "documentId": jobTag,
        "bucketName": bucketName,
        "objectName": objectName,
        "stage": PIPELINE_STAGE
    }

    if status == 'FAILED':
        pipeline_client.stageFailed("Textract job for document ID {}; bucketName {} fileName {}; failed during Textract analysis. Please double check the document quality".format(jobTag, bucketName, objectName))
        raise Exception("Textract Analysis didn't complete successfully")

    pipeline_client.stageInProgress()

    try:
        resultJSON = getJobResults(jobAPI, jobId, objectName)
    except Exception as e:
        pipeline_client.stageFailed("Textract job for document ID {}; bucketName {} filename {} failed during Textract processing. Could not read Textract output files under job Name {}".format(jobTag, bucketName, objectName, jobId))
        # Chain explicitly so the root cause stays attached to the generic
        # error that callers see.
        raise Exception("Textract Analysis didn't complete successfully") from e

    print("Result Textract result objects received: {}".format(len(resultJSON)))

    # Forms and tables are only requested for StartDocumentAnalysis jobs.
    isAnalysisJob = jobAPI == "StartDocumentAnalysis"

    try:
        opg = OutputGenerator(
            documentId=jobTag,
            response=resultJSON,
            bucketName=textractBucketName,
            objectName=objectName,
            forms=isAnalysisJob,
            tables=isAnalysisJob
        )
    except Exception:
        pipeline_client.stageFailed("Could not convert results from Textract into processable object. Try uploading again.")
        # Bare raise re-raises the active exception with its traceback intact.
        raise

    # NOTE(review): failures in writeTextractOutputs / recordLineage below
    # propagate without a stageFailed call -- confirm whether that is intended.
    tagging = "documentId={}".format(jobTag)
    opg.writeTextractOutputs(taggingStr=tagging)

    lineage_client.recordLineage({
        "documentId": jobTag,
        "callerId": request["callerId"],
        "sourceBucketName": bucketName,
        "targetBucketName": textractBucketName,
        "sourceFileName": objectName,
        "targetFileName": objectName
    })

    output = "Processed -> Document: {}, Object: {}/{} processed.".format(jobTag, bucketName, objectName)
    pipeline_client.stageSucceeded()
    print(output)

    return {
        'statusCode': 200,
        'body': output
    }