in lambda/indexer/crawler.py [0:0]
def process_s3_media_object(crawlername, bucketname, s3url, s3object, s3metadataobject, s3transcribeoptsobject, kendra_sync_job_id, role):
    """Reconcile one S3 media object with its tracked file status and (re)start transcription as needed.

    Compares the media object's LastModified timestamp (and those of its optional
    metadata / transcribe-options sidecar objects) against the stored status item,
    then dispatches one of four cases:

    - NEW / DELETED-then-recreated: start a fresh transcription job.
    - MODIFIED (media or transcribe-options changed): restart transcription.
    - METADATA_MODIFIED only: reindex the existing transcript with the new
      metadata if the old transcription job still exists, otherwise retranscribe.
    - UNCHANGED: refresh the status row as done.

    Args:
        crawlername: Crawler name, used to label transcription jobs.
        bucketname: S3 bucket holding the media and sidecar objects.
        s3url: s3:// URL of the media object (also the file-status key).
        s3object: S3 listing entry for the media object (reads 'LastModified', 'Size').
        s3metadataobject: Listing entry for the metadata sidecar, or None.
        s3transcribeoptsobject: Listing entry for the transcribe-options sidecar, or None.
        kendra_sync_job_id: Current Kendra sync job id to record on active status rows.
        role: IAM role passed to the transcription jobs.

    Returns:
        The processed s3url.
    """
    logger.info("process_s3_media_object() - Key: %s", s3url)
    # Single source of truth for the timestamp format used on every *_lastModified value.
    ts_format = "%m:%d:%Y:%H:%M:%S"
    lastModified = s3object['LastModified'].strftime(ts_format)
    size_bytes = s3object['Size']
    metadata_url = None
    metadata_lastModified = None
    transcribeopts_url = None
    transcribeopts_lastModified = None
    if s3metadataobject:
        metadata_url = f"s3://{bucketname}/{s3metadataobject['Key']}"
        metadata_lastModified = s3metadataobject['LastModified'].strftime(ts_format)
    if s3transcribeoptsobject:
        transcribeopts_url = f"s3://{bucketname}/{s3transcribeoptsobject['Key']}"
        transcribeopts_lastModified = s3transcribeoptsobject['LastModified'].strftime(ts_format)
    # Sidecar keyword arguments shared by every put_file_status() call below.
    sidecar_kwargs = dict(
        metadata_url=metadata_url,
        metadata_lastModified=metadata_lastModified,
        transcribeopts_url=transcribeopts_url,
        transcribeopts_lastModified=transcribeopts_lastModified,
    )
    item = get_file_status(s3url)
    if item is None or item.get("status") == "DELETED":
        logger.info("NEW:%s", s3url)
        job_name = start_media_transcription(crawlername, s3url, role, transcribeopts_url)
        if job_name:
            put_file_status(
                s3url, lastModified, size_bytes, duration_secs=None, status="ACTIVE-NEW",
                transcribe_job_id=job_name, transcribe_state="RUNNING", transcribe_secs=None,
                sync_job_id=kendra_sync_job_id, sync_state="RUNNING",
                **sidecar_kwargs
            )
    elif (lastModified != item['lastModified'] or transcribeopts_lastModified != item.get('transcribeopts_lastModified')):
        # Media itself or the transcribe options changed: the old transcript is stale.
        logger.info("MODIFIED:%s", s3url)
        job_name = restart_media_transcription(crawlername, s3url, role, transcribeopts_url)
        if job_name:
            put_file_status(
                s3url, lastModified, size_bytes, duration_secs=None, status="ACTIVE-MODIFIED",
                transcribe_job_id=job_name, transcribe_state="RUNNING", transcribe_secs=None,
                sync_job_id=kendra_sync_job_id, sync_state="RUNNING",
                **sidecar_kwargs
            )
    elif (metadata_lastModified != item.get('metadata_lastModified')):
        # Only the metadata sidecar changed; the transcript content is still valid.
        logger.info("METADATA_MODIFIED:%s", s3url)
        if get_transcription_job(item['transcribe_job_id']):
            # Reindex the existing transcription with the new metadata.
            reindex_existing_doc_with_new_metadata(item['transcribe_job_id'])
            put_file_status(
                s3url, lastModified, size_bytes, duration_secs=None, status="ACTIVE-METADATA_MODIFIED",
                transcribe_job_id=item['transcribe_job_id'], transcribe_state="DONE", transcribe_secs=item['transcribe_secs'],
                sync_job_id=kendra_sync_job_id, sync_state="RUNNING",
                **sidecar_kwargs
            )
        else:
            # Previous transcription gone - retranscribe.
            job_name = restart_media_transcription(crawlername, s3url, role, transcribeopts_url)
            if job_name:
                put_file_status(
                    s3url, lastModified, size_bytes, duration_secs=None, status="ACTIVE-METADATA_MODIFIED",
                    transcribe_job_id=job_name, transcribe_state="RUNNING", transcribe_secs=None,
                    sync_job_id=kendra_sync_job_id, sync_state="RUNNING",
                    **sidecar_kwargs
                )
    else:
        # Nothing changed; carry the previous transcription/sync results forward.
        logger.info("UNCHANGED:%s", s3url)
        put_file_status(
            s3url, lastModified, size_bytes, duration_secs=item['duration_secs'], status="ACTIVE-UNCHANGED",
            transcribe_job_id=item['transcribe_job_id'], transcribe_state="DONE", transcribe_secs=item['transcribe_secs'],
            sync_job_id=item['sync_job_id'], sync_state="DONE",
            **sidecar_kwargs
        )
    return s3url