in comprehend.py
import json
import os
import sys
from urllib.parse import unquote_plus

import boto3
import PyPDF2

# AWS clients created at module scope are reused across warm Lambda invocations
s3 = boto3.resource('s3')
comprehend = boto3.client('comprehend')

# connectES() is assumed to be defined elsewhere in this module; a hedged
# sketch of what it might look like appears at the end of this listing.


def handler(event, context):
print("Received event: " + json.dumps(event, indent=2))
# Get the object from the event and show its content type
bucket = event['Records'][0]['s3']['bucket']['name']
key = unquote_plus(event['Records'][0]['s3']['object']['key'])
print("key is"+key)
print("bucket is"+bucket)
    textvalues = []
    textvalues_entity = {}
    text = ""
    try:
        # Download the uploaded PDF to Lambda's writable /tmp storage
        local_path = '/tmp/{}'.format(os.path.basename(key))
        s3.Bucket(bucket).download_file(Key=key, Filename=local_path)
        print("Object downloaded")

        pdfFileObj = open(local_path, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        num_pages = pdfReader.numPages
        print("number of pages")
        print(num_pages)
        count = 0
        extracted_pdftext = ""
        searchable_text = []
        # Read each page and cap the text sent to Comprehend at 5000 characters
        # (sys.getsizeof measures the Python string object, so this check is
        # conservative and may truncate slightly early)
        while count < num_pages:
            pageObj = pdfReader.getPage(count)
            count += 1
            print(count)
            print("-------------iteration starts---------")
            extracted_pdftext = pageObj.extractText()
            if sys.getsizeof(extracted_pdftext) > 5000:
                text = extracted_pdftext[:5000].strip('\t\n\r')
            else:
                text = extracted_pdftext.strip('\t\n\r')
            searchable_text.append(text)
            # Extract key phrases from the page text with Amazon Comprehend
            print(text)
            keyphrase_response = comprehend.detect_key_phrases(Text=text, LanguageCode='en')
            KeyPhraseList = keyphrase_response.get("KeyPhrases")
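            # Each detected key phrase carries a confidence score; the response
            # is shaped like (values illustrative):
            # {"KeyPhrases": [{"Score": 0.97, "Text": "machine learning",
            #                  "BeginOffset": 10, "EndOffset": 26}, ...]}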
            # Keep only key phrases detected with at least 90% confidence
            accuracy = 90.0
            for s in KeyPhraseList:
                score = float(s.get("Score")) * 100
                if score >= accuracy:
                    textvalues.append(s.get("Text").strip('\t\n\r'))
            # Detect named entities in the same page of text
            detect_entity = comprehend.detect_entities(Text=text, LanguageCode='en')
            EntityList = detect_entity.get("Entities")
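            # Entities come back typed; the response is shaped like (values
            # illustrative):
            # {"Entities": [{"Score": 0.99, "Type": "PERSON", "Text": "Jane Doe",
            #                "BeginOffset": 0, "EndOffset": 8}, ...]}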
            # Keep high-confidence entities keyed by type; a later match of the
            # same type overwrites an earlier one
            for s in EntityList:
                score = float(s.get("Score")) * 100
                if score >= accuracy:
                    textvalues_entity.update([(s.get("Type").strip('\t\n\r'), s.get("Text").strip('\t\n\r'))])
        pdfFileObj.close()

        # Console link to the source object, e.g.
        # https://s3.console.aws.amazon.com/s3/object/<bucket>/<key>?region=us-east-1
        s3url = 'https://s3.console.aws.amazon.com/s3/object/' + bucket + '/' + key + '?region=us-east-1'
        searchdata = {'s3link': s3url, 'KeyPhrases': textvalues, 'Entity': textvalues_entity, 'text': searchable_text}
        print(searchdata)
        print("connecting to ES")
        es = connectES()
        es.index(index="resume", doc_type="_doc", body=searchdata)
        print("data uploaded to Elasticsearch")
        return 'keyphrases Successfully Uploaded'
    except Exception as e:
        print(e)
        print('Error processing object {} from bucket {}'.format(key, bucket))
        raise e
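
# connectES() is not shown in this excerpt. A minimal sketch of what it might
# look like, assuming the elasticsearch and requests-aws4auth packages and an
# ES_ENDPOINT environment variable (all assumptions, not part of this file):
#
#   from elasticsearch import Elasticsearch, RequestsHttpConnection
#   from requests_aws4auth import AWS4Auth
#
#   def connectES():
#       credentials = boto3.Session().get_credentials()
#       awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
#                          os.environ['AWS_REGION'], 'es',
#                          session_token=credentials.token)
#       return Elasticsearch(hosts=[{'host': os.environ['ES_ENDPOINT'], 'port': 443}],
#                            http_auth=awsauth, use_ssl=True, verify_certs=True,
#                            connection_class=RequestsHttpConnection)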