in lambda/comprehend.py [0:0]
import json
from urllib.parse import unquote_plus

import boto3
from trp import Document

# Assumed module-level setup (the excerpt starts at the handler): boto3 clients
# used below. `region` and the helpers outputTable, outputForm and connectES
# are defined elsewhere in this file.
s3 = boto3.resource('s3')
textract = boto3.client('textract')
comprehend = boto3.client('comprehend')


def handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    # Get the uploaded object from the S3 event notification
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    print("key is " + key)
    print("bucket is " + bucket)

    text = ""
    textvalues = []
    textvalues_entity = {}
    try:
        # Download the object to Lambda's /tmp scratch space
        local_path = '/tmp/{}'.format(key.split('/')[-1])
        s3.Bucket(bucket).download_file(Key=key, Filename=local_path)

        # Read the document content as bytes for Textract
        with open(local_path, 'rb') as document_file:
            imageBytes = bytearray(document_file.read())
        print("Object downloaded")

        # Extract tables, forms and text with Amazon Textract
        response = textract.analyze_document(
            Document={'Bytes': imageBytes},
            FeatureTypes=["TABLES", "FORMS"])
        document = Document(response)

        table = []
        forms = []
        for page in document.pages:
            table = outputTable(page)
            forms = outputForm(page)
        print(table)

        # Concatenate all LINE blocks into a single text string
        blocks = response['Blocks']
        for block in blocks:
            if block['BlockType'] == 'LINE':
                text += block['Text'] + "\n"
        print(text)

        # Extract key phrases with Amazon Comprehend
        keyphrase_response = comprehend.detect_key_phrases(Text=text, LanguageCode='en')
        KeyPhraseList = keyphrase_response.get("KeyPhrases")
        for s in KeyPhraseList:
            textvalues.append(s.get("Text"))

        # Extract entities with Amazon Comprehend (stored keyed by entity type)
        detect_entity = comprehend.detect_entities(Text=text, LanguageCode='en')
        EntityList = detect_entity.get("Entities")
        for s in EntityList:
            textvalues_entity.update([(s.get("Type").strip('\t\n\r'), s.get("Text").strip('\t\n\r'))])

        # Build the document to index in Elasticsearch
        s3url = 'https://s3.console.aws.amazon.com/s3/object/' + bucket + '/' + key + '?region=' + region
        searchdata = {'s3link': s3url, 'KeyPhrases': textvalues, 'Entity': textvalues_entity,
                      'text': text, 'table': table, 'forms': forms}
        print(searchdata)

        print("connecting to ES")
        es = connectES()
        # es.index(index="resume-search", doc_type="_doc", body=searchdata)
        es.index(index="document", doc_type="_doc", body=searchdata)
        print("data uploaded to Elasticsearch")
        return 'keyphrases Successfully Uploaded'
    except Exception as e:
        print(e)
        print('Error processing object {} from bucket {}'.format(key, bucket))
        raise e
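
The handler leans on helpers that sit outside this excerpt: outputTable and outputForm flatten the Textract results for indexing, and connectES returns an Elasticsearch client. The sketches below are assumptions about their shape, not the file's actual definitions. The first two operate on the trp (amazon-textract-response-parser) Page objects produced by Document(response) above:

def outputTable(page):
    # Hypothetical sketch: flatten every detected table into rows of cell text.
    rows = []
    for table in page.tables:
        for row in table.rows:
            rows.append([cell.text.strip() for cell in row.cells])
    return rows


def outputForm(page):
    # Hypothetical sketch: collect the key/value pairs detected on the page.
    fields = []
    for field in page.form.fields:
        key_text = field.key.text if field.key else ""
        value_text = field.value.text if field.value else ""
        fields.append({key_text: value_text})
    return fields

A minimal connectES sketch, assuming the Amazon Elasticsearch domain endpoint is supplied through a hypothetical ES_DOMAIN environment variable and requests are signed with SigV4 via requests_aws4auth:

import os

import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth


def connectES():
    # Hypothetical sketch: build a SigV4-signed client for the Amazon ES domain.
    host = os.environ['ES_DOMAIN']  # assumed variable name, e.g. "search-....es.amazonaws.com"
    region = os.environ.get('AWS_REGION', 'us-east-1')
    credentials = boto3.Session().get_credentials()
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                       region, 'es', session_token=credentials.token)
    return Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )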