in lambda/indexer/crawler.py [0:0]
def _iter_s3_objects(paginator, bucketname, prefix, no_contents_message):
    """Yield every object entry listed under ``prefix`` in ``bucketname``.

    Pages returned by ``paginator.paginate`` are walked in order. A page
    that has no "Contents" entry yields nothing; ``no_contents_message`` is
    logged for that page instead.
    """
    for page in paginator.paginate(Bucket=bucketname, Prefix=prefix):
        if "Contents" in page:
            yield from page["Contents"]
        else:
            logger.info(no_contents_message)


def list_s3_objects(bucketname, media_prefix, metadata_prefix, transcribeopts_prefix):
    """List media files and their companion files in an S3 bucket.

    Every object under ``media_prefix`` is classified. When
    ``metadata_prefix`` (or ``transcribeopts_prefix``) is the empty string,
    metadata (or Transcribe options) files are picked up from
    ``media_prefix`` during that same pass. When the prefix is non-empty and
    at least one media file was found, the prefix is scanned separately.

    Args:
        bucketname: Name of the bucket to list.
        media_prefix: Key prefix under which media files are searched.
        metadata_prefix: Key prefix for metadata files, or "" to look for
            them under ``media_prefix``.
        transcribeopts_prefix: Key prefix for Transcribe options files, or
            "" to look for them under ``media_prefix``.

    Returns:
        A three-element list
        ``[s3mediaobjects, s3metadataobjects, s3transcribeoptsobjects]``.
        Each element is a dict whose values are the object entries returned
        by the listing. Media entries are keyed by their own
        ``s3://bucket/key`` URL; metadata and Transcribe options entries are
        keyed by the URL of the media key they reference.
    """
    logger.info(
        f"list_s3_objects(bucketname={bucketname}, media_prefix={media_prefix}, "
        f"metadata_prefix={metadata_prefix}, transcribeopts_prefix={transcribeopts_prefix})"
    )
    s3mediaobjects = {}
    s3metadataobjects = {}
    s3transcribeoptsobjects = {}

    logger.info(f"Find media and metadata files under media_prefix: {media_prefix}")
    paginator = S3.get_paginator("list_objects_v2")
    media_listing = _iter_s3_objects(
        paginator, bucketname, media_prefix,
        f"No files found in {bucketname}/{media_prefix}",
    )
    for s3object in media_listing:
        key = s3object['Key']
        if is_supported_media_file(key):
            logger.info("Supported media file type: " + key)
            s3mediaobjects[f"s3://{bucketname}/{key}"] = s3object
        # NOTE(review): a prefix of None satisfies neither the == "" tests
        # here nor the truthiness tests guarding the dedicated scans below,
        # so companion files would never be collected for it. Confirm that
        # callers always pass a string.
        elif metadata_prefix == "" and is_supported_metadata_file(key):
            ref_media_key = get_metadata_ref_file_key(key, media_prefix, metadata_prefix)
            logger.info(f"Metadata file: {key}. References media file: {ref_media_key}")
            s3metadataobjects[f"s3://{bucketname}/{ref_media_key}"] = s3object
        elif transcribeopts_prefix == "" and is_supported_transcribeopts_file(key):
            ref_media_key = get_transcribeopts_ref_file_key(key, media_prefix, transcribeopts_prefix)
            logger.info(f"Transcribe options file: {key}. References media file: {ref_media_key}")
            s3transcribeoptsobjects[f"s3://{bucketname}/{ref_media_key}"] = s3object
        else:
            logger.info("File type not supported. Skipping: " + key)

    # Companion files are only useful alongside media, so the dedicated
    # prefixes are scanned only when at least one media file was found.
    if s3mediaobjects and metadata_prefix:
        logger.info(f"Find Kendra metadata files under metadata_prefix: {metadata_prefix}")
        metadata_listing = _iter_s3_objects(
            paginator, bucketname, metadata_prefix,
            f"No metadata files found in {bucketname}/{metadata_prefix}",
        )
        for s3object in metadata_listing:
            key = s3object['Key']
            if is_supported_metadata_file(key):
                ref_media_key = get_metadata_ref_file_key(key, media_prefix, metadata_prefix)
                logger.info(f"Kendra metadata file: {key}. References media file: {ref_media_key}")
                s3metadataobjects[f"s3://{bucketname}/{ref_media_key}"] = s3object
            else:
                logger.info("not a Kendra metadatafile. Skipping: " + key)

    if s3mediaobjects and transcribeopts_prefix:
        logger.info(f"Find Transcribe job options files under transcribeopts_prefix: {transcribeopts_prefix}")
        transcribeopts_listing = _iter_s3_objects(
            paginator, bucketname, transcribeopts_prefix,
            f"No Transcribe options files found in {bucketname}/{transcribeopts_prefix}",
        )
        for s3object in transcribeopts_listing:
            key = s3object['Key']
            if is_supported_transcribeopts_file(key):
                ref_media_key = get_transcribeopts_ref_file_key(key, media_prefix, transcribeopts_prefix)
                logger.info(f"Transcribe options file: {key}. References media file: {ref_media_key}")
                s3transcribeoptsobjects[f"s3://{bucketname}/{ref_media_key}"] = s3object
            else:
                logger.info("not a Transcribe options file. Skipping: " + key)

    return [s3mediaobjects, s3metadataobjects, s3transcribeoptsobjects]