in python/ts-to-word.py [0:0]
def generate_document():
    """
    Entrypoint for the command-line interface.
    """
    # Parameter extraction
    cli_parser = argparse.ArgumentParser(prog='ts-to-word',
                                         description='Turn an Amazon Transcribe job output into an MS Word document')
    source_group = cli_parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument('--inputFile', metavar='filename', type=str, help='File containing Transcribe JSON output')
    source_group.add_argument('--inputJob', metavar='job-id', type=str, help='Transcribe job identifier')
    cli_parser.add_argument('--outputFile', metavar='filename', type=str, help='Output file to hold MS Word document')
    cli_parser.add_argument('--sentiment', choices=['on', 'off'], default='off', help='Enables sentiment analysis on each conversational turn via Amazon Comprehend')
    cli_parser.add_argument('--confidence', choices=['on', 'off'], default='off', help='Displays information on word confidence scores throughout the transcript')
    cli_parser.add_argument('--keep', action='store_true', help='Keeps any downloaded job transcript JSON file')
    cli_args = cli_parser.parse_args()
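    # Example invocations (for reference only - the job-id and filename below are placeholders):
    #   python ts-to-word.py --inputJob my-transcribe-job --sentiment on
    #   python ts-to-word.py --inputFile my-transcribe-job-asrOutput.json --confidence on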
    # If we're downloading a job transcript then validate that we have a job, then download it
    if cli_args.inputJob is not None:
        try:
            job_info, job_status = load_transcribe_job_status(cli_args)
        except Exception:
            # Exception, most likely due to the job not existing
            print("NOT FOUND: Requested job-id '{0}' does not exist.".format(cli_args.inputJob))
            exit(-1)

        # If the job hasn't completed then there is no transcript available
        if job_status == "FAILED":
            print("{0}: Requested job-id '{1}' has failed to complete.".format(job_status, cli_args.inputJob))
            exit(-1)
        elif job_status != "COMPLETED":
            print("{0}: Requested job-id '{1}' has not yet completed.".format(job_status, cli_args.inputJob))
            exit(-1)

        # The transcript is available from a signed URL - use the redacted transcript
        # if it exists, otherwise fall back to the non-redacted one
        if "RedactedTranscriptFileUri" in job_info["Transcript"]:
            # Get the redacted transcript
            download_url = job_info["Transcript"]["RedactedTranscriptFileUri"]
        else:
            # Get the non-redacted transcript
            download_url = job_info["Transcript"]["TranscriptFileUri"]
        cli_args.inputFile = cli_args.inputJob + "-asrOutput.json"

        # Try and download the JSON - this will fail if the job delivered it to
        # an S3 bucket, as in that case the service no longer has the results
        try:
            urllib.request.urlretrieve(download_url, cli_args.inputFile)
        except Exception:
            print("UNAVAILABLE: Transcript for job-id '{0}' is not available for download.".format(cli_args.inputJob))
            exit(-1)

        # Set our output filename if one wasn't supplied
        if cli_args.outputFile is None:
            cli_args.outputFile = cli_args.inputJob + ".docx"
    # Load in the JSON file for processing
    json_filepath = Path(cli_args.inputFile)
    if json_filepath.is_file():
        with open(json_filepath.absolute(), "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)
    else:
        print("FAIL: Specified JSON file '{0}' does not exist.".format(cli_args.inputFile))
        exit(-1)
    # If this is a file-input run then try and load the job status (which may no longer exist)
    if cli_args.inputJob is None:
        try:
            # Ensure we don't delete our JSON later, and pick up the job-name from the JSON
            # so that we can look up the job and default the output filename from it
            cli_args.keep = True
            if "results" in json_data:
                cli_args.inputJob = json_data["jobName"]
            else:
                cli_args.inputJob = json_data["JobName"]
            if cli_args.outputFile is None:
                cli_args.outputFile = cli_args.inputJob + ".docx"
            job_info, job_status = load_transcribe_job_status(cli_args)
        except Exception:
            # No job status available - fall back to naming the output after the input file
            cli_args.inputJob = None
            if cli_args.outputFile is None:
                cli_args.outputFile = cli_args.inputFile + ".docx"
            job_info = None

    # Work out which mode we're in, as standard job results look different from analytical
    # ones, then disable Comprehend's sentiment if we're in Analytics mode
    cli_args.analyticsMode = "results" not in json_data
    if cli_args.analyticsMode:
        cli_args.sentiment = 'off'
    # Generate the core transcript
    start = perf_counter()
    speech_segments = create_turn_by_turn_segments(json_data, cli_args)
    # Inject Comprehend-based sentiments into the segment list if required
    if cli_args.sentiment == 'on':
        if job_info is None:
            # Without the job details we don't know the source language, so skip sentiment analysis
            cli_args.sentiment = 'off'
        else:
            # Work out the mapped language code, as Transcribe supports more languages than
            # Comprehend - use the first Comprehend code that the Transcribe code starts with
            sentiment_lang_code = None
            for comprehend_code in SENTIMENT_LANGUAGES:
                if job_info["LanguageCode"].startswith(comprehend_code):
                    sentiment_lang_code = comprehend_code
                    break
            # If we have no match then we cannot perform sentiment analysis
            if sentiment_lang_code is not None:
                generate_sentiment(speech_segments, sentiment_lang_code)
            else:
                cli_args.sentiment = 'off'
    # Write out our file and the performance statistics
    write(cli_args, speech_segments, job_info)
    finish = perf_counter()
    duration = round(finish - start, 2)
    print(f"> Transcript {cli_args.outputFile} written in {duration} seconds.")

    # Finally, remove any temporary downloaded JSON results file
    if (cli_args.inputJob is not None) and (not cli_args.keep):
        os.remove(cli_args.inputFile)