in aws-kendra-transcribe-media-search/lambda/indexer/jobcomplete.py [0:0]
def _save_file_status(media_s3url, item, *, duration_secs, transcribe_state,
                      transcribe_secs, sync_state):
    """Call ``put_file_status`` for *media_s3url*, reusing unchanged fields of *item*.

    Only the four keyword-only arguments vary between the handler's updates;
    every other field is copied verbatim from the tracked *item* so that all
    call sites pass an identical set of pass-through values.
    """
    put_file_status(
        media_s3url,
        lastModified=item['lastModified'],
        size_bytes=item['size_bytes'],
        duration_secs=duration_secs,
        status=item['status'],
        metadata_url=item['metadata_url'],
        metadata_lastModified=item['metadata_lastModified'],
        transcribeopts_url=item['transcribeopts_url'],
        transcribeopts_lastModified=item['transcribeopts_lastModified'],
        transcribe_job_id=item['transcribe_job_id'],
        transcribe_state=transcribe_state,
        transcribe_secs=transcribe_secs,
        sync_job_id=item['sync_job_id'],
        sync_state=sync_state,
    )


def lambda_handler(event, context):
    """Handle a transcription-job event and index the resulting transcript.

    Reads the job name from ``event['detail']['TranscriptionJobName']``,
    fetches the job via ``get_transcription_job`` and looks up the media file's
    tracked record via ``get_file_status``. Then:

    * job status ``"FAILED"``: records ``transcribe_state="FAILED"`` and
      ``sync_state="NOT_SYNCED"``;
    * any other status: records ``transcribe_state="DONE"``, prepares the
      transcript, indexes it with ``put_document`` and records
      ``sync_state="DONE"`` (or ``"FAILED"`` if preparing/indexing raised).

    ``stop_kendra_sync_job_when_all_done`` is called at the end, except when
    the media file has no tracked record, in which case the handler returns
    early without calling it.

    Args:
        event: Event dict; only ``event['detail']['TranscriptionJobName']``
            is read (the whole event is logged as JSON).
        context: Lambda context object; unused.

    Returns:
        None.
    """
    logger.info("Received event: %s", json.dumps(event))
    job_name = event['detail']['TranscriptionJobName']
    logger.info(f"Transcription job name: {job_name}")

    # get results of Amazon Transcribe job
    logger.info("** Retrieve transcription job **")
    transcription_job = get_transcription_job(job_name)

    if transcription_job is None or 'TranscriptionJob' not in transcription_job:
        logger.error("Unable to retrieve transcription from job.")
    else:
        job = transcription_job['TranscriptionJob']
        job_status = job['TranscriptionJobStatus']
        media_s3url = job['Media']['MediaFileUri']
        item = get_file_status(media_s3url)
        if item is None:
            logger.info("Transcription job for media file not tracked in Indexer Media File table.. possibly this is a job that is not started by MediaSearch indexer")
            # NOTE(review): this early return also skips the stop-sync call
            # below - presumably intentional for jobs not started by the
            # indexer; confirm.
            return

        if job_status == "FAILED":
            # job failed
            failure_reason = job['FailureReason']
            logger.error(f"Transcribe job failed: {job_status} - Reason {failure_reason}")
            _save_file_status(
                media_s3url, item,
                duration_secs=None,
                transcribe_state="FAILED",
                transcribe_secs=None,
                sync_state="NOT_SYNCED",
            )
        else:
            # job completed
            transcript_uri = job['Transcript']['TranscriptFileUri']
            transcribe_secs = get_transcription_job_duration(transcription_job)
            # Update transcribe_state first, keeping the current sync_state,
            # so the transcription result is recorded even if indexing fails.
            _save_file_status(
                media_s3url, item,
                duration_secs=None,
                transcribe_state="DONE",
                transcribe_secs=transcribe_secs,
                sync_state=item['sync_state'],
            )
            try:
                logger.info("** Process transcription and prepare for indexing **")
                duration_secs, text = prepare_transcript(transcript_uri)
                logger.info("** Index transcription document in Kendra **")
                put_document(dsId=DS_ID, indexId=INDEX_ID, s3url=media_s3url, item=item, text=text)
                # Update sync_state
                _save_file_status(
                    media_s3url, item,
                    duration_secs=duration_secs,
                    transcribe_state="DONE",
                    transcribe_secs=transcribe_secs,
                    sync_state="DONE",
                )
            except Exception as e:
                # Broad catch is deliberate: any failure while preparing or
                # indexing is recorded as sync_state="FAILED" rather than
                # aborting the handler. logger.exception keeps the traceback.
                logger.exception("Exception thrown during indexing: " + str(e))
                _save_file_status(
                    media_s3url, item,
                    duration_secs=None,
                    transcribe_state="DONE",
                    transcribe_secs=transcribe_secs,
                    sync_state="FAILED",
                )

    # Finally, stop sync job if no more transcription jobs are pending.
    stop_kendra_sync_job_when_all_done(dsId=DS_ID, indexId=INDEX_ID)