def lambda_handler()

in source/lambda_handlers/01-TextractComprehend.py [0:0]


def lambda_handler(event, context):
    """Run Textract on each S3 object in the event and start a Comprehend job.

    For every record in ``event['Records']`` the handler:

    1. calls Textract ``detect_document_text`` on the referenced S3 object,
    2. stores the returned blocks as JSON under ``textract-output/raw/``,
    3. stores the concatenated LINE text under ``textract-output/processed/``,
    4. starts a Comprehend entities detection job on that processed object
       using the custom entity recognizer read from SSM Parameter Store.

    Args:
        event: Event dict; each entry of ``event['Records']`` must provide
            ``['s3']['bucket']['name']`` and ``['s3']['object']['key']``.
        context: Lambda context object (unused).

    Returns:
        int: Always ``0``.

    Raises:
        RuntimeError: If any of the required SSM parameters is not returned
            by ``get_parameters``.
    """
    ssm_client = boto3.client('ssm')
    s3_client = boto3.client('s3')
    textract_client = boto3.client('textract')
    comprehend_client = boto3.client('comprehend')

    # Names of the SSM parameters holding the Comprehend configuration.
    recognizer_param = 'CustomEntityRecognizerARN-TCA2I'
    role_param = 'ComprehendExecutionRole-TCA2I'
    output_bucket_param = 'ComprehendTemporaryDataStoreBucketName-TCA2I'
    required_params = [recognizer_param, role_param, output_bucket_param]

    comprehend_parameters = ssm_client.get_parameters(Names=required_params,
                                                      WithDecryption=True)
    values = {parameter['Name']: parameter['Value']
              for parameter in comprehend_parameters['Parameters']}

    # Fail fast with a clear message instead of hitting an unbound local
    # later, in the middle of processing a record.
    missing = [name for name in required_params if name not in values]
    if missing:
        raise RuntimeError('Missing SSM parameters: ' + ', '.join(missing))

    customer_recognizer_arn = values[recognizer_param]
    comprehend_execution_role_arn = values[role_param]
    comprehend_output_bucket = values[output_bucket_param]

    # Iterate over all S3 records that have been passed to this lambda function.
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        # The object key is URL-decoded ('+' becomes a space) before use.
        key = unquote_plus(record['s3']['object']['key'])

        # Send S3 Object to Textract
        response = textract_client.detect_document_text(
            Document={'S3Object': {'Bucket': bucket, 'Name': key}})
        blocks = response['Blocks']

        # Key without its first folder (e.g. input/) and trailing filetype
        filename = _output_stem(key)

        # Save the JSON response from Textract to a folder in the S3 bucket
        s3_client.put_object(
            Bucket=bucket,
            Key='textract-output/raw/' + filename + '.json',
            Body=json.dumps(blocks)
        )
        print(f'Text Extraction Complete for {bucket}/{key}')

        # Recreate the raw text from the Textract Output
        raw_text = _join_line_text(blocks)

        # Store Processed Data in S3 Bucket
        # NOTE(review): json.dumps wraps the text in double quotes and escapes
        # non-ASCII characters, so the .txt object holds a JSON string literal
        # rather than plain text. Kept as-is because the consumers of this
        # object are not visible from here -- confirm before changing.
        processed_data_key = 'textract-output/processed/' + filename + '.txt'
        s3_client.put_object(
            Bucket=bucket,
            Key=processed_data_key,
            Body=json.dumps(raw_text)
        )

        # Start the Custom Entity Recognition Job
        comprehend_client.start_entities_detection_job(
            InputDataConfig={
                'S3Uri': 's3://' + bucket + '/' + processed_data_key,
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={
                'S3Uri': 's3://' + comprehend_output_bucket + '/comprehend-output/raw/'
            },
            DataAccessRoleArn=comprehend_execution_role_arn,
            # Strip every non-word character (including '/') from the job name.
            JobName=re.sub(r'\W+', '', filename) + '-TextractComprehendA2I',
            EntityRecognizerArn=customer_recognizer_arn,
            LanguageCode='en'
        )

        print("Custom Entity Detection Job Started")
    return 0


def _output_stem(key):
    """Return ``key`` without its first folder and without its file extension.

    ``input/sub/doc.v2.png`` becomes ``sub/doc.v2``. A key with no folder is
    kept whole, and a final segment with no ``.`` is kept unchanged, so the
    result is not emptied or truncated at a dot inside a folder name.
    """
    _, separator, relative = key.partition('/')
    if not separator:
        # No folder prefix to drop.
        relative = key

    directory, slash, basename = relative.rpartition('/')
    stem, dot, _ = basename.rpartition('.')
    if not dot:
        # No extension on the final path segment.
        stem = basename
    return directory + slash + stem


def _join_line_text(blocks):
    """Concatenate the ``Text`` of every LINE block, each followed by a space.

    Selecting LINE blocks explicitly avoids depending on where PAGE and WORD
    blocks appear in the list and never reads ``Text`` from a block type that
    is not selected.
    """
    return "".join(block['Text'] + " "
                   for block in blocks
                   if block['BlockType'] == "LINE")