in sources/lambda/sync/index.py [0:0]
def _dominant_language_code(languages, default="en"):
    """Return the language code with the highest confidence score.

    Args:
        languages: The ``Languages`` list from a Comprehend
            ``detect_dominant_language`` response; each item carries a
            ``LanguageCode`` and a ``Score``.
        default: Code returned when ``languages`` is empty.

    Returns:
        The ``LanguageCode`` of the highest-scoring entry, or ``default``.
    """
    if not languages:
        return default
    # Rank by confidence; ordering by LanguageCode would merely pick the
    # alphabetically first code, regardless of how likely it is.
    best = max(languages, key=lambda lang: lang['Score'])
    return best['LanguageCode']


def handler(event, context):
    """Extract text and entities from an uploaded S3 document and index them.

    The object referenced by the first record of the S3 event is sent to
    Textract, the detected lines of text are joined into a single string,
    Comprehend is asked for the dominant language and the named entities,
    and the resulting document is posted to ``elastic_url``.

    Args:
        event: S3 event notification. Only ``event['Records'][0]`` is read.
        context: Lambda context object (unused).

    Returns:
        The ``_id`` field of the JSON response returned by the POST.

    Raises:
        KeyError, IndexError: If the event does not have the expected shape
            or the indexing response has no ``_id``.
        requests.HTTPError: If the indexing request returns an error status
            (raised by ``response.raise_for_status()``).
    """
    s3_record = event['Records'][0]['s3']
    source_bucket = s3_record['bucket']['name']
    # The key is URL-decoded ('+' becomes a space) before being used as the
    # object name.
    object_key = urllib.parse.unquote_plus(s3_record['object']['key'])

    textract_result = textract.detect_document_text(
        Document={
            'S3Object': {
                'Bucket': source_bucket,
                'Name': object_key
            }
        }
    )

    # Keep only LINE blocks and join their text; every line is prefixed
    # with a single space (so the content starts with a space).
    page = "".join(" " + block['Text']
                   for block in textract_result['Blocks']
                   if block['BlockType'] == "LINE")

    # Only the first 5000 characters are sent to Comprehend
    # (presumably to stay within its request size limits).
    text = page[:5000]

    selected_entities = []
    # Skip Comprehend entirely when no text was extracted: an empty Text
    # parameter is expected to be rejected by the API, which would fail the
    # whole invocation. The document is still indexed, with no entities.
    if text.strip():
        languages = comprehend.detect_dominant_language(
            Text=text
        )
        dominant_language = _dominant_language_code(languages['Languages'])
        if dominant_language not in ('en', 'es', 'fr', 'de', 'it', 'pt'):
            # TODO (optional): call Amazon translate to get it in english
            # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/translate.html#Translate.Client.translate_text
            dominant_language = "en"

        detected_entities = comprehend.detect_entities(
            Text=text,
            LanguageCode=dominant_language
        )
        selected_entity_types = ("ORGANIZATION", "PERSON", "LOCATION", "DATE")
        # Keep only high-confidence entities of the types we index.
        selected_entities = [entity
                             for entity in detected_entities['Entities']
                             if entity['Score'] > 0.9 and
                             entity['Type'] in selected_entity_types]

    doc = {
        "bucket": source_bucket,
        "document": object_key,
        "content": page,
        "entities": selected_entities
    }
    response = requests.post(elastic_url,
                             auth=awsauth,
                             json=doc,
                             headers=headers)
    response.raise_for_status()
    es_response = response.json()
    print(es_response)
    return es_response["_id"]