def handler()

in sources/lambda/sync/index.py [0:0]


def handler(event, context):
    """Extract text and entities from an uploaded document and index them.

    Reads the bucket name and object key from the first S3 record of
    ``event``, runs Textract text detection on that object, asks Comprehend
    for the dominant language and the named entities of the extracted text,
    then posts the result as a JSON document to ``elastic_url``.

    Only the first record of the event is processed; any further records
    are ignored.

    Args:
        event: Notification payload; must contain
            ``event['Records'][0]['s3']`` with ``bucket.name`` and
            ``object.key``.
        context: Lambda context object (unused).

    Returns:
        The ``_id`` field of the JSON response returned by the index
        endpoint.

    Raises:
        KeyError, IndexError: If ``event`` lacks the expected S3 record.
        requests.HTTPError: If the index endpoint answers with an error
            status (raised by ``response.raise_for_status()``).
    """
    s3_record = event['Records'][0]['s3']
    source_bucket = s3_record['bucket']['name']
    # The object key arrives URL-encoded in the event; decode it before use.
    object_key = urllib.parse.unquote_plus(s3_record['object']['key'])

    textract_result = textract.detect_document_text(
        Document={
            'S3Object': {
                'Bucket': source_bucket,
                'Name': object_key
            }
        }
    )
    # Concatenate every detected LINE, each one prefixed by a single space
    # (so the content starts with a space when at least one line exists).
    page = "".join(" " + block['Text']
                   for block in textract_result['Blocks']
                   if block['BlockType'] == "LINE")

    # Only the first 5000 characters are analysed by Comprehend
    # (presumably to stay within its input size limits).
    text = page[:5000]

    selected_entities = []
    # Comprehend needs non-empty text; when nothing was extracted the
    # document is still indexed, just without entities.
    if text.strip():
        languages = comprehend.detect_dominant_language(
            Text=text
        )
        # Pick the candidate with the highest confidence score, not the
        # alphabetically first language code.
        top_language = max(languages['Languages'],
                           key=lambda k: k['Score'],
                           default=None)
        dominant_language = (top_language['LanguageCode']
                             if top_language else "en")
        if dominant_language not in ['en', 'es', 'fr', 'de', 'it', 'pt']:
            # TODO (optional): call Amazon translate to get it in english
            # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/translate.html#Translate.Client.translate_text
            dominant_language = "en"

        detected_entities = comprehend.detect_entities(
            Text=text,
            LanguageCode=dominant_language
        )
        selected_entity_types = ["ORGANIZATION", "PERSON", "LOCATION", "DATE"]
        # Keep only high-confidence entities of the types of interest.
        selected_entities = [x for x in detected_entities['Entities']
                             if x['Score'] > 0.9 and
                             x['Type'] in selected_entity_types]

    doc = {
        "bucket": source_bucket,
        "document": object_key,
        "content": page,
        "entities": selected_entities
    }

    response = requests.post(elastic_url,
                             auth=awsauth,
                             json=doc,
                             headers=headers)
    response.raise_for_status()

    es_response = response.json()
    print(es_response)
    return es_response["_id"]