in source/lambda/jobresultprocessor/lambda_function.py [0:0]
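# processRequest expects a flattened job-completion message with the keys read at the top
# of the function: jobId, jobTag (used as the documentId), jobStatus, jobAPI, bucketName,
# outputBucketName, objectName, outputTable, documentsTable and elasticsearchDomain.
# Helpers and path constants referenced below (AwsHelper, OutputGenerator, ComprehendHelper,
# KendraHelper, datastore, getJobResults, generatePdf, PUBLIC_PATH_S3_PREFIX,
# SERVICE_OUTPUT_PATH_S3_PREFIX, COMPREHEND_PATH_S3_PREFIX, TEXTRACT_PATH_S3_PREFIX,
# KVPAIRS, DOCTEXT) are assumed to be imported/defined elsewhere in this module, along with
# standard imports such as os.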
def processRequest(request):
    output = ""
    print("Request: {}".format(request))

    jobId = request['jobId']
    documentId = request['jobTag']
    jobStatus = request['jobStatus']
    jobAPI = request['jobAPI']
    bucketName = request['bucketName']
    outputBucketName = request['outputBucketName']
    objectName = request['objectName']
    outputTable = request["outputTable"]
    documentsTable = request["documentsTable"]
    elasticsearchDomain = request["elasticsearchDomain"]

    pages = getJobResults(jobAPI, jobId)
print("Result pages recieved: {}".format(len(pages)))
    detectForms = False
    detectTables = False
    if jobAPI == "StartDocumentAnalysis":
        detectForms = True
        detectTables = True

    dynamodb = AwsHelper().getResource("dynamodb")
    ddb = dynamodb.Table(outputTable)
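
    # OutputGenerator (defined elsewhere in this package) walks the Textract result pages
    # and persists the parsed text, forms and tables under outputPath in the output bucket
    # and in the DynamoDB output table; opg_output is assumed to carry the extracted
    # document text and key-value pairs used further below.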
    outputPath = '{}{}/{}'.format(PUBLIC_PATH_S3_PREFIX, documentId, SERVICE_OUTPUT_PATH_S3_PREFIX)
    print("Generating output for DocumentId: {} and storing in {}".format(documentId, outputPath))

    opg = OutputGenerator(documentId, pages, outputBucketName, objectName,
                          detectForms, detectTables, ddb, outputPath, elasticsearchDomain)
    opg_output = opg.run()
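
    # generatePdf is expected to produce a searchable PDF rendition of the original document
    # under outputPath (the "<name>-searchable.pdf" object referenced in the Kendra step below).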
    generatePdf(documentId, bucketName, objectName, outputBucketName, outputPath)
    # generate Comprehend and ComprehendMedical entities
    comprehendOutputPath = "{}{}".format(outputPath, COMPREHEND_PATH_S3_PREFIX)
    print("Comprehend output path: " + comprehendOutputPath)
    maxPages = 100
    comprehendClient = ComprehendHelper()
    responseDocumentName = "{}{}response.json".format(outputPath, TEXTRACT_PATH_S3_PREFIX)
    comprehendAndMedicalEntities = comprehendClient.processComprehend(
        outputBucketName, responseDocumentName, comprehendOutputPath, maxPages)
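
    # processComprehend (helper defined elsewhere) is expected to read the consolidated
    # Textract response.json, run Comprehend / Comprehend Medical over up to maxPages pages,
    # write the entity output under comprehendOutputPath and return the detected entities
    # that are merged and indexed below.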
    # if Kendra is available then let it index the document
    if 'KENDRA_INDEX_ID' in os.environ:
        kendraClient = KendraHelper()
        fileName = os.path.basename(objectName).split(".")[0]
        fileExtension = os.path.basename(objectName).split(".")[1]
        outputDocumentName = "{}{}-searchable.pdf".format(outputPath, fileName)
        kendraClient.indexDocument(os.environ['KENDRA_INDEX_ID'],
                                   os.environ['KENDRA_ROLE_ARN'],
                                   bucketName,
                                   outputDocumentName,
                                   documentId,
                                   fileExtension)
print("DocumentId: {}".format(documentId))
print("Processed Comprehend data: {}".format(comprehendAndMedicalEntities))
    # index document once the comprehend entities and KVPairs have been extracted
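    # Merge the Textract key-value pairs into the Comprehend entity map: new keys are added
    # as-is, and values for existing keys are added to that key's entity set before indexing.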
    for key, val in opg_output[KVPAIRS].items():
        if key not in comprehendAndMedicalEntities:
            comprehendAndMedicalEntities[key] = val
        else:
            comprehendAndMedicalEntities[key].add(val)
    opg.indexDocument(opg_output[DOCTEXT], comprehendAndMedicalEntities)
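
    # Record completion in the documents (tracking) table so the pipeline shows this
    # document as processed.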
    ds = datastore.DocumentStore(documentsTable, outputTable)
    ds.markDocumentComplete(documentId)
output = "Processed -> Document: {}, Object: {}/{} processed.".format(documentId, bucketName, objectName)
return {
'statusCode': 200,
'body': output
}
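
# A minimal, illustrative sketch of how processRequest could be driven from the Textract
# job-completion notification delivered through SNS. It is not taken from this file's actual
# handler: the environment variable names here are assumptions, while the notification fields
# (JobId, JobTag, Status, API, DocumentLocation) follow the documented Textract message format.
#
#   def lambda_handler(event, context):
#       message = json.loads(event['Records'][0]['Sns']['Message'])
#       request = {
#           'jobId': message['JobId'],
#           'jobTag': message['JobTag'],
#           'jobStatus': message['Status'],
#           'jobAPI': message['API'],
#           'bucketName': message['DocumentLocation']['S3Bucket'],
#           'objectName': message['DocumentLocation']['S3ObjectName'],
#           'outputBucketName': os.environ['OUTPUT_BUCKET'],
#           'outputTable': os.environ['OUTPUT_TABLE'],
#           'documentsTable': os.environ['DOCUMENTS_TABLE'],
#           'elasticsearchDomain': os.environ['ES_DOMAIN'],
#       }
#       return processRequest(request)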