def generate_document()

in python/ts-to-word.py [0:0]


def generate_document():
    """
    Entrypoint for the command-line interface.
    """
    # Parameter extraction
    cli_parser = argparse.ArgumentParser(prog='ts-to-word',
                                         description='Turn an Amazon Transcribe job output into an MS Word document')
    source_group = cli_parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument('--inputFile', metavar='filename', type=str, help='File containing Transcribe JSON output')
    source_group.add_argument('--inputJob', metavar='job-id', type=str, help='Transcribe job identifier')
    cli_parser.add_argument('--outputFile', metavar='filename', type=str, help='Output file to hold MS Word document')
    cli_parser.add_argument('--sentiment', choices=['on', 'off'], default='off', help='Enables sentiment analysis on each conversational turn via Amazon Comprehend')
    cli_parser.add_argument('--confidence', choices=['on', 'off'], default='off', help='Displays information on word confidence scores throughout the transcript')
    cli_parser.add_argument('--keep', action='store_true', help='Keeps any downloaded job transcript JSON file')
    cli_args = cli_parser.parse_args()
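
    # For illustration, typical invocations look like the following (the job name and
    # file names are placeholders, not values taken from this repository):
    #   python ts-to-word.py --inputJob my-transcribe-job --sentiment on
    #   python ts-to-word.py --inputFile my-job-asrOutput.json --confidence on --outputFile call.docx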

    # If we're downloading a job transcript then validate that we have a job, then download it
    if cli_args.inputJob is not None:
        try:
            job_info, job_status = load_transcribe_job_status(cli_args)
        except Exception:
            # Exception, most likely due to the job not existing
            print("NOT FOUND: Requested job-id '{0}' does not exist.".format(cli_args.inputJob))
            exit(-1)

        # If the job hasn't completed then there is no transcript available
        if job_status == "FAILED":
            print("{0}: Requested job-id '{1}' has failed to complete".format(job_status, cli_args.inputJob))
            exit(-1)
        elif job_status != "COMPLETED":
            print("{0}: Requested job-id '{1}' has not yet completed.".format(job_status, cli_args.inputJob))
            exit(-1)
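
        # Assuming job_info mirrors the TranscriptionJob structure returned by the
        # Transcribe GetTranscriptionJob API, the fields used below look roughly like:
        #   {"TranscriptionJobStatus": "COMPLETED", "LanguageCode": "en-US",
        #    "Transcript": {"TranscriptFileUri": "https://...",
        #                   "RedactedTranscriptFileUri": "https://..."}}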

        # The transcript is available from a signed URL - get the redacted version if it exists, otherwise the non-redacted one
        if "RedactedTranscriptFileUri" in job_info["Transcript"]:
            # Get the redacted transcript
            download_url = job_info["Transcript"]["RedactedTranscriptFileUri"]
        else:
            # Get the non-redacted transcript
            download_url = job_info["Transcript"]["TranscriptFileUri"]
        cli_args.inputFile = cli_args.inputJob + "-asrOutput.json"

        # Try and download the JSON - this will fail if the job delivered it to
        # an S3 bucket, as in that case the service no longer has the results
        try:
            urllib.request.urlretrieve(download_url, cli_args.inputFile)
        except Exception:
            print("UNAVAILABLE: Transcript for job-id '{0}' is not available for download.".format(cli_args.inputJob))
            exit(-1)

        # Set our output filename if one wasn't supplied
        if cli_args.outputFile is None:
            cli_args.outputFile = cli_args.inputJob + ".docx"

    # Load in the JSON file for processing
    json_filepath = Path(cli_args.inputFile)
    if json_filepath.is_file():
        json_data = json.load(open(json_filepath.absolute(), "r", encoding="utf-8"))
    else:
        print("FAIL: Specified JSON file '{0}' does not exist.".format(cli_args.inputFile))
        exit(-1)

    # Standard jobs carry a top-level "results" block - if it's missing then assume this is Call Analytics output
    cli_args.analyticsMode = "results" not in json_data
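    # For reference, a standard transcript is shaped roughly like
    #   {"jobName": "...", "results": {"transcripts": [...], "items": [...]}}
    # whereas Call Analytics output (abridged, assumed here) uses keys such as
    #   {"JobName": "...", "Transcript": [...], "ConversationCharacteristics": {...}}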

    # If this is a file-input run then try and load the job status (which may no longer exist)
    if cli_args.inputJob is None:
        try:
            # Ensure we don't delete our JSON later, reset our output file to match the job-name if it's currently blank
            cli_args.keep = True
            if cli_args.outputFile is None:
                if "results" in json_data:
                    cli_args.outputFile = json_data["jobName"] + ".docx"
                    cli_args.inputJob = json_data["jobName"]
                else:
                    cli_args.outputFile = json_data["JobName"] + ".docx"
                    cli_args.inputJob = json_data["JobName"]
            job_info, job_status = load_transcribe_job_status(cli_args)
        except Exception:
            # No job status - need to quickly work out what mode we're in,
            # as standard job results look different from analytical ones
            cli_args.inputJob = None
            cli_args.outputFile = cli_args.inputFile + ".docx"
            cli_args.analyticsMode = "results" not in json_data
            job_info = None

    # Disable Comprehend's sentiment if we're in Analytics mode
    if cli_args.analyticsMode:
        cli_args.sentiment = 'off'

    # Generate the core transcript
    start = perf_counter()
    speech_segments = create_turn_by_turn_segments(json_data, cli_args)
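    # speech_segments is assumed to be an ordered list of per-turn segment objects
    # (speaker, timings, text, word-confidence data) that write() later lays out in
    # the Word document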

    # Inject Comprehend-based sentiments into the segment list if required
    if cli_args.sentiment == 'on':
        # Work out the mapped language code, as Transcribe supports more languages than Comprehend.  Just
        # see if the Transcribe language code starts with any of those that Comprehend supports and use that
        sentiment_lang_code = None
        for comprehend_code in SENTIMENT_LANGUAGES:
            if job_info["LanguageCode"].startswith(comprehend_code):
                sentiment_lang_code = comprehend_code
                break
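
        # e.g. assuming SENTIMENT_LANGUAGES holds Comprehend's base codes ("en", "es", ...),
        # a Transcribe code of "en-AU" maps to "en", while an unsupported language leaves
        # sentiment_lang_code as None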

        # If we have no match then we cannot perform sentiment analysis
        if sentiment_lang_code is not None:
            generate_sentiment(speech_segments, sentiment_lang_code)
        else:
            cli_args.sentiment = 'off'

    # Write out our file and the performance statistics
    write(cli_args, speech_segments, job_info)
    finish = perf_counter()
    duration = round(finish - start, 2)
    print(f"> Transcript {cli_args.outputFile} writen in {duration} seconds.")

    # Finally, remove any temporary downloaded JSON results file
    if (cli_args.inputJob is not None) and (not cli_args.keep):
        os.remove(cli_args.inputFile)
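
The helper functions referenced above (load_transcribe_job_status, generate_sentiment, create_turn_by_turn_segments, write) are defined elsewhere in python/ts-to-word.py. As a rough guide to the contract this entrypoint relies on, below is a minimal sketch of the first two using boto3; the segment attribute names are placeholders, and the real implementations may well differ (for example in how Call Analytics jobs are looked up).

import boto3

def load_transcribe_job_status(cli_args):
    # Sketch only: return the (job_info, job_status) pair that generate_document() expects,
    # where job_info exposes "Transcript" and "LanguageCode"
    transcribe = boto3.client("transcribe")
    job = transcribe.get_transcription_job(TranscriptionJobName=cli_args.inputJob)["TranscriptionJob"]
    return job, job["TranscriptionJobStatus"]

def generate_sentiment(speech_segments, lang_code):
    # Sketch only: tag each conversational turn with a Comprehend sentiment;
    # "segmentText" and "segmentSentiment" are placeholder attribute names
    comprehend = boto3.client("comprehend")
    for segment in speech_segments:
        result = comprehend.detect_sentiment(Text=segment.segmentText, LanguageCode=lang_code)
        segment.segmentSentiment = result["Sentiment"]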