# process_transcript() — from src/process_transcription_full_text.py

def process_transcript(transcription_url, podcast_url, vocabulary_info):
    """Fetch a Transcribe result, detect entities with Comprehend, and write
    the processed transcript document to S3.

    :param transcription_url: URL of the Transcribe JSON output to download
    :param podcast_url: URL of the source podcast (kept for interface
        compatibility; not used in this function)
    :param vocabulary_info: dict; an optional 'mapping' entry points at an S3
        object ({'bucket': ..., 'key': ...}) holding a custom vocab mapping
    :return: dict with 'bucket' and 'key' locating the written transcript JSON
    :raises InvalidInputError: if the custom vocab mapping object is missing (404)
    """
    custom_vocabs = None
    if "mapping" in vocabulary_info:
        try:
            mapping = vocabulary_info['mapping']
            obj = s3_client.get_object(Bucket=mapping['bucket'], Key=mapping['key'])
            custom_vocabs = json.loads(obj['Body'].read())
            # Lazy %-style args: only formatted if the log level is enabled.
            logger.info("key:%s", mapping['key'])
            logger.info("using custom vocab mapping: \n%s", json.dumps(custom_vocabs, indent=2))
        except botocore.exceptions.ClientError as e:
            # A missing mapping object is a caller error; anything else is unexpected.
            if e.response['Error']['Code'] == "404":
                raise InvalidInputError("The S3 file for custom vocab list does not exist.")
            raise

    response = urlopen(transcription_url)
    json_data = json.loads(response.read())

    logger.debug(json.dumps(json_data, indent=4))
    results = json_data['results']
    del json_data  # free up memory before the Comprehend round-trip

    comprehend_chunks, paragraphs = chunk_up_transcript(custom_vocabs, results)

    start = time.time()
    detected_entities_response = comprehend.batch_detect_entities(
        TextList=comprehend_chunks, LanguageCode='en')
    logger.info('End of batch_detect_entities. Took time %10.4f\n', time.time() - start)

    entities = parse_detected_entities_response(detected_entities_response, {})
    # Entity values arrive as sets, which are not JSON-serializable;
    # convert each one to a list before building the document.
    entities_as_list = {entity_type: list(values)
                        for entity_type, values in entities.items()}

    clean_up_entity_results(entities_as_list)
    # Was a stray print() to stdout — route through the module logger instead.
    logger.debug(json.dumps(entities_as_list, indent=4))

    doc_to_update = {
        'transcript': paragraphs,
        'transcript_entities': entities_as_list,
    }
    logger.info(json.dumps(doc_to_update, indent=4))
    key = 'podcasts/transcript/' + id_generator() + '.json'

    response = s3_client.put_object(Body=json.dumps(doc_to_update, indent=2),
                                    Bucket=bucket, Key=key)
    logger.info(json.dumps(response, indent=2))

    logger.info("successfully written transcript to s3://%s/%s", bucket, key)
    # Return the location so downstream steps can index the document.
    return {"bucket": bucket, "key": key}