def list_s3_objects()

in lambda/indexer/crawler.py [0:0]


def list_s3_objects(bucketname, media_prefix, metadata_prefix, transcribeopts_prefix):
    """List media files and their companion files in an S3 bucket.

    Every object under ``media_prefix`` is classified as a media file, a
    metadata file, a Transcribe options file, or unsupported (logged and
    skipped).

    Args:
        bucketname: Name of the S3 bucket to list.
        media_prefix: Key prefix under which media files are listed.
        metadata_prefix: Key prefix for metadata files. When it is the empty
            string, metadata files are picked up during the ``media_prefix``
            listing. When it is non-empty, a separate listing of that prefix
            is made, but only if at least one media file was found.
        transcribeopts_prefix: Key prefix for Transcribe options files, with
            the same empty / non-empty semantics as ``metadata_prefix``.

    Returns:
        A list of three dicts
        ``[s3mediaobjects, s3metadataobjects, s3transcribeoptsobjects]``.
        Each dict maps an ``s3://<bucket>/<key>`` media URL to the S3 listing
        entry of the corresponding file. For the metadata and Transcribe
        options dicts the URL is that of the media file the companion file
        references, so all three dicts can be joined on the same key.
    """
    logger.info(
        f"list_s3_objects(bucketname={bucketname}, media_prefix={media_prefix}, "
        f"metadata_prefix={metadata_prefix}, transcribeopts_prefix={transcribeopts_prefix})"
    )
    s3mediaobjects = {}
    s3metadataobjects = {}
    s3transcribeoptsobjects = {}

    logger.info(f"Find media and metadata files under media_prefix: {media_prefix}")
    paginator = S3.get_paginator("list_objects_v2")
    for s3object in _iter_listed_objects(
        paginator, bucketname, media_prefix,
        f"No files found in {bucketname}/{media_prefix}",
    ):
        key = s3object['Key']
        if is_supported_media_file(key):
            logger.info("Supported media file type: " + key)
            s3mediaobjects[_media_url(bucketname, key)] = s3object
        elif metadata_prefix == "" and is_supported_metadata_file(key):
            # No dedicated metadata prefix: metadata lives alongside the media.
            ref_media_key = get_metadata_ref_file_key(key, media_prefix, metadata_prefix)
            logger.info(f"Metadata file: {key}. References media file: {ref_media_key}")
            s3metadataobjects[_media_url(bucketname, ref_media_key)] = s3object
        elif transcribeopts_prefix == "" and is_supported_transcribeopts_file(key):
            # No dedicated options prefix: options files live alongside the media.
            ref_media_key = get_transcribeopts_ref_file_key(key, media_prefix, transcribeopts_prefix)
            logger.info(f"Transcribe options file: {key}. References media file: {ref_media_key}")
            s3transcribeoptsobjects[_media_url(bucketname, ref_media_key)] = s3object
        else:
            logger.info("File type not supported. Skipping: " + key)

    # If media files were found AND metadata_prefix is defined, look for
    # metadata files under metadata_prefix.
    if s3mediaobjects and metadata_prefix:
        logger.info(f"Find Kendra metadata files under metadata_prefix: {metadata_prefix}")
        for s3object in _iter_listed_objects(
            paginator, bucketname, metadata_prefix,
            f"No metadata files found in {bucketname}/{metadata_prefix}",
        ):
            key = s3object['Key']
            if is_supported_metadata_file(key):
                ref_media_key = get_metadata_ref_file_key(key, media_prefix, metadata_prefix)
                logger.info(f"Kendra metadata file: {key}. References media file: {ref_media_key}")
                s3metadataobjects[_media_url(bucketname, ref_media_key)] = s3object
            else:
                logger.info("not a Kendra metadatafile. Skipping: " + key)

    # If media files were found AND transcribeopts_prefix is defined, look for
    # Transcribe options files under transcribeopts_prefix.
    if s3mediaobjects and transcribeopts_prefix:
        logger.info(f"Find Transcribe job options files under transcribeopts_prefix: {transcribeopts_prefix}")
        for s3object in _iter_listed_objects(
            paginator, bucketname, transcribeopts_prefix,
            f"No Transcribe options files found in {bucketname}/{transcribeopts_prefix}",
        ):
            key = s3object['Key']
            if is_supported_transcribeopts_file(key):
                ref_media_key = get_transcribeopts_ref_file_key(key, media_prefix, transcribeopts_prefix)
                logger.info(f"Transcribe options file: {key}. References media file: {ref_media_key}")
                s3transcribeoptsobjects[_media_url(bucketname, ref_media_key)] = s3object
            else:
                logger.info("not a Transcribe options file. Skipping: " + key)

    return [s3mediaobjects, s3metadataobjects, s3transcribeoptsobjects]


def _iter_listed_objects(paginator, bucketname, prefix, empty_message):
    """Yield each listing entry under ``prefix``, logging ``empty_message`` for every page without "Contents"."""
    for page in paginator.paginate(Bucket=bucketname, Prefix=prefix):
        if "Contents" not in page:
            logger.info(empty_message)
            continue
        yield from page["Contents"]


def _media_url(bucketname, key):
    """Build the ``s3://<bucket>/<key>`` URL used as the key of the result dicts."""
    return f"s3://{bucketname}/{key}"