# def lambda_handler(event, context)
#
# in src/detectEntities.py [0:0]


def lambda_handler(event, context):
    """Classify each transcript under s3://<bucket>/comprehendInput/ with a
    custom Comprehend entity recognizer and write the results to CSV.

    For every object below the ``comprehendInput/`` prefix, a fixed slice of
    the transcript text is sent to the in-service custom entity-recognizer
    endpoint; the highest-scoring entity type (or ``'No entities'``) is
    recorded per transcript, and the resulting table is written to
    ``s3://<bucket>/quicksight/data/entity/entities.csv``.

    Parameters
    ----------
    event : dict
        Lambda invocation event (unused).
    context : LambdaContext
        Lambda runtime context (unused).

    Raises
    ------
    RuntimeError
        If no IN_SERVICE Comprehend endpoint whose ARN contains
        ``'entity-recognizer-endpoint'`` is found.
    """
    s3 = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    comprehend = boto3.client('comprehend')

    bucket = os.environ['entityDetectionBucket']
    t_prefix = 'quicksight/data/entity'

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix='comprehendInput/')

    # Locate the in-service custom entity-recognizer endpoint.
    # FIX: endpointArn was previously left undefined when no endpoint
    # matched, causing a confusing NameError at detect_entities time.
    endpointArn = None
    comprehendEndpoint = comprehend.list_endpoints(
        Filter={
            'Status': 'IN_SERVICE',
        }
    )
    for item in comprehendEndpoint.get('EndpointPropertiesList') or []:
        if 'entity-recognizer-endpoint' in item['EndpointArn']:
            endpointArn = item['EndpointArn']
            break
    if endpointArn is None:
        raise RuntimeError(
            "No IN_SERVICE Comprehend endpoint matching "
            "'entity-recognizer-endpoint' was found"
        )

    cols = ['transcript_name', 'entity_type']
    rows = []  # accumulate rows, then build the DataFrame once (O(n))

    for page in pages:
        # FIX: 'Contents' is absent from a page when the prefix matches no
        # objects; page['Contents'] would raise KeyError on an empty bucket.
        for obj in page.get('Contents', []):
            transcript_file_name = obj['Key'].split('/')[1]
            temp = s3_resource.Object(bucket, obj['Key'])
            transcript_content = temp.get()['Body'].read().decode('utf-8')
            # Fixed slice skips leading boilerplate and caps the request
            # size — presumably tuned to this project's transcripts and
            # Comprehend's per-request text limit (TODO: confirm).
            transcript_truncated = transcript_content[500:1800]
            response = comprehend.detect_entities(
                Text=transcript_truncated,
                LanguageCode='en',
                EndpointArn=endpointArn,
            )
            entities = response['Entities']
            if entities:
                # Highest-scoring entity type; on ties, the first occurrence
                # wins (same tie-breaking as the previous argmax approach).
                entity = max(entities, key=lambda e: e['Score'])['Type']
            else:
                entity = 'No entities'

            # FIX: the original used .strip('en-'), which strips ANY of the
            # characters 'e', 'n', '-' from both ends of the name; the intent
            # is to drop only the leading 'en-' language prefix.
            if transcript_file_name.startswith('en-'):
                transcript_file_name = transcript_file_name[len('en-'):]
            rows.append([transcript_file_name, entity])

    df_ent = pd.DataFrame(rows, columns=cols)
    wr.s3.to_csv(df_ent, path='s3://' + bucket + '/' + t_prefix + '/' + 'entities.csv')