def process_s3_media_object()

in aws-kendra-transcribe-media-search/lambda/indexer/crawler.py [0:0]


def process_s3_media_object(crawlername, bucketname, s3url, s3object, s3metadataobject, s3transcribeoptsobject, kendra_sync_job_id, role):
    """Classify a crawled S3 media object and record its processing status.

    The object's current timestamps are compared with the record returned by
    ``get_file_status(s3url)`` and exactly one of these paths is taken:

    * NEW - there is no record, or the record's status is ``"DELETED"``:
      a transcription is started with ``start_media_transcription``.
    * MODIFIED - the media object's or the transcribe-options object's
      last-modified stamp differs from the record: the transcription is
      restarted with ``restart_media_transcription``.
    * METADATA_MODIFIED - only the metadata object's last-modified stamp
      differs: the existing document is reindexed when
      ``get_transcription_job`` still returns something truthy for the
      recorded job, otherwise the transcription is restarted.
    * UNCHANGED - nothing differs: the record is rewritten with the stored
      duration, transcribe job and sync job, marked as done.

    Args:
        crawlername: Passed through to the start/restart transcription helpers.
        bucketname: Bucket name used to build ``s3://`` URLs for the metadata
            and transcribe-options objects.
        s3url: URL of the media object; it is the key for the status record.
        s3object: Mapping with a ``'LastModified'`` value supporting
            ``strftime`` and a ``'Size'`` value.
        s3metadataobject: Falsy, or a mapping with ``'Key'`` and
            ``'LastModified'`` describing the metadata object.
        s3transcribeoptsobject: Falsy, or a mapping with ``'Key'`` and
            ``'LastModified'`` describing the transcribe-options object.
        kendra_sync_job_id: Sync job id stored with every record written for
            a new or changed object.
        role: Passed through to the start/restart transcription helpers.

    Returns:
        ``s3url``, unchanged.
    """
    logger.info(f"process_s3_media_object() - Key: {s3url}")
    # Stamps are stored and compared as strings, so one format must be used
    # for every timestamp handled here.
    timestamp_format = "%m:%d:%Y:%H:%M:%S"
    lastModified = s3object['LastModified'].strftime(timestamp_format)
    size_bytes = s3object['Size']
    metadata_url = None
    metadata_lastModified = None
    transcribeopts_url = None
    transcribeopts_lastModified = None
    if s3metadataobject:
        metadata_url = f"s3://{bucketname}/{s3metadataobject['Key']}"
        metadata_lastModified = s3metadataobject['LastModified'].strftime(timestamp_format)
    if s3transcribeoptsobject:
        transcribeopts_url = f"s3://{bucketname}/{s3transcribeoptsobject['Key']}"
        transcribeopts_lastModified = s3transcribeoptsobject['LastModified'].strftime(timestamp_format)

    def save_status(status, duration_secs, transcribe_job_id, transcribe_state, transcribe_secs, sync_job_id, sync_state):
        # Single place that supplies the fields common to every status record.
        put_file_status(
            s3url, lastModified, size_bytes, duration_secs=duration_secs, status=status,
            metadata_url=metadata_url, metadata_lastModified=metadata_lastModified,
            transcribeopts_url=transcribeopts_url, transcribeopts_lastModified=transcribeopts_lastModified,
            transcribe_job_id=transcribe_job_id, transcribe_state=transcribe_state, transcribe_secs=transcribe_secs,
            sync_job_id=sync_job_id, sync_state=sync_state,
        )

    def save_transcription_started(status, job_name):
        # A freshly (re)started job has no duration or transcribe time yet.
        save_status(
            status=status, duration_secs=None,
            transcribe_job_id=job_name, transcribe_state="RUNNING", transcribe_secs=None,
            sync_job_id=kendra_sync_job_id, sync_state="RUNNING",
        )

    item = get_file_status(s3url)
    # In every start/restart path below, a falsy job name leaves the existing
    # status record untouched.
    # NOTE(review): presumably that means the job could not be started - confirm.
    if item is None or item.get("status") == "DELETED":
        logger.info("NEW:" + s3url)
        job_name = start_media_transcription(crawlername, s3url, role, transcribeopts_url)
        if job_name:
            save_transcription_started("ACTIVE-NEW", job_name)
    elif lastModified != item['lastModified'] or transcribeopts_lastModified != item.get('transcribeopts_lastModified'):
        logger.info("MODIFIED:" + s3url)
        job_name = restart_media_transcription(crawlername, s3url, role, transcribeopts_url)
        if job_name:
            save_transcription_started("ACTIVE-MODIFIED", job_name)
    elif metadata_lastModified != item.get('metadata_lastModified'):
        logger.info("METADATA_MODIFIED:" + s3url)
        if get_transcription_job(item['transcribe_job_id']):
            # reindex existing transcription with new metadata
            reindex_existing_doc_with_new_metadata(item['transcribe_job_id'])
            save_status(
                status="ACTIVE-METADATA_MODIFIED", duration_secs=None,
                transcribe_job_id=item['transcribe_job_id'], transcribe_state="DONE",
                transcribe_secs=item['transcribe_secs'],
                sync_job_id=kendra_sync_job_id, sync_state="RUNNING",
            )
        else:
            # previous transcription gone - retranscribe
            job_name = restart_media_transcription(crawlername, s3url, role, transcribeopts_url)
            if job_name:
                save_transcription_started("ACTIVE-METADATA_MODIFIED", job_name)
    else:
        logger.info("UNCHANGED:" + s3url)
        # Keep the stored duration, transcribe job and sync job; only the
        # status label and the current stamps are refreshed.
        save_status(
            status="ACTIVE-UNCHANGED", duration_secs=item['duration_secs'],
            transcribe_job_id=item['transcribe_job_id'], transcribe_state="DONE",
            transcribe_secs=item['transcribe_secs'],
            sync_job_id=item['sync_job_id'], sync_state="DONE",
        )
    return s3url