in src/process_transcription_full_text.py [0:0]
def _load_custom_vocabs(vocabulary_info):
    """Load the custom vocabulary mapping referenced by *vocabulary_info*.

    Args:
        vocabulary_info: Mapping that may contain a ``"mapping"`` entry with
            ``"bucket"`` and ``"key"`` items locating a JSON object in S3.

    Returns:
        The JSON-decoded content of the S3 object, or ``None`` when
        *vocabulary_info* has no ``"mapping"`` entry.

    Raises:
        InvalidInputError: The referenced S3 object does not exist.
        botocore.exceptions.ClientError: Any other S3 client failure.
    """
    if "mapping" not in vocabulary_info:
        return None

    vocab_mapping_bucket = vocabulary_info['mapping']['bucket']
    vocab_mapping_key = vocabulary_info['mapping']['key']
    try:
        obj = s3_client.get_object(Bucket=vocab_mapping_bucket, Key=vocab_mapping_key)
        custom_vocabs = json.loads(obj['Body'].read())
    except botocore.exceptions.ClientError as e:
        # get_object reports a missing key as "NoSuchKey"; "404" is what
        # head_object returns. Accept both so the friendly error is raised.
        if e.response['Error']['Code'] in ("404", "NoSuchKey"):
            raise InvalidInputError("The S3 file for custom vocab list does not exist.") from e
        raise

    logger.info("key:%s", vocab_mapping_key)
    logger.info("using custom vocab mapping: \n%s", json.dumps(custom_vocabs, indent=2))
    return custom_vocabs


def process_transcript(transcription_url, podcast_url, vocabulary_info):
    """Download a transcript, detect entities in it and store the result in S3.

    The JSON document at *transcription_url* is fetched, its ``'results'``
    entry is split into chunks by ``chunk_up_transcript``, the chunks are sent
    to ``comprehend.batch_detect_entities`` and the paragraphs plus detected
    entities are written as JSON to the module-level ``bucket``.

    Args:
        transcription_url: URL of the transcription JSON document.
        podcast_url: Not used by this function; kept for interface
            compatibility with existing callers.
        vocabulary_info: Mapping that may contain a ``"mapping"`` entry
            pointing at a custom vocabulary JSON object in S3.

    Returns:
        dict: ``{"bucket": ..., "key": ...}`` locating the written S3 object.

    Raises:
        InvalidInputError: The custom vocabulary S3 object does not exist.
        botocore.exceptions.ClientError: Any other S3 failure while loading
            the custom vocabulary.
    """
    custom_vocabs = _load_custom_vocabs(vocabulary_info)

    # Close the HTTP response deterministically once the body has been read.
    with urlopen(transcription_url) as transcription_response:
        json_data = json.loads(transcription_response.read())
    logger.debug(json.dumps(json_data, indent=4))
    results = json_data['results']
    # Drop the reference to the full document; only 'results' is needed below.
    del json_data

    comprehend_chunks, paragraphs = chunk_up_transcript(custom_vocabs, results)

    # NOTE(review): all chunks go to Comprehend in a single call; this assumes
    # chunk_up_transcript keeps the batch within the service limits - confirm.
    start = time.time()
    detected_entities_response = comprehend.batch_detect_entities(
        TextList=comprehend_chunks, LanguageCode='en')
    round_trip = time.time() - start
    logger.info('End of batch_detect_entities. Took time {:10.4f}\n'.format(round_trip))

    entities = parse_detected_entities_response(detected_entities_response, {})
    # Convert each per-type collection to a list so the result can be passed
    # to json.dumps below.
    entities_as_list = {
        entity_type: list(entities[entity_type]) for entity_type in entities
    }
    # Return value is ignored, so this presumably cleans the dict in place.
    clean_up_entity_results(entities_as_list)
    logger.info(json.dumps(entities_as_list, indent=4))

    # NOTE: key-phrase detection (batch_detect_key_phrases) is currently
    # disabled, so the stored document has no 'key_phrases' entry.
    doc_to_update = {
        'transcript': paragraphs,
        'transcript_entities': entities_as_list,
    }
    logger.info(json.dumps(doc_to_update, indent=4))

    transcript_key = 'podcasts/transcript/' + id_generator() + '.json'
    put_response = s3_client.put_object(
        Body=json.dumps(doc_to_update, indent=2), Bucket=bucket, Key=transcript_key)
    logger.info(json.dumps(put_response, indent=2))
    logger.info("successfully written transcript to s3://%s/%s", bucket, transcript_key)

    # Return the bucket and key of the transcription / comprehend result.
    return {"bucket": bucket, "key": transcript_key}