def submitTranscribeJob()

in pca-server/src/pca/pcacommon.py [0:0]


def submitTranscribeJob(bucket, key, langCode, role_arn):
    """
    Submits a job to Transcribe based upon the supplied parameters.  If the language code
    is an empty string then we are doing language detection.
    """
    # Get our boto3 client
    transcribeClient = boto3.client('transcribe')

    # Generate job-name - delete if it already exists
    jobName = generateJobName(key)
    currentJobStatus = checkExistingJobStatus(jobName, transcribeClient)
    uri = 's3://' + bucket + '/' + key

    # If there's a job already running then the input file may have been copied - quit
    if (currentJobStatus == "IN_PROGRESS") or (currentJobStatus == "QUEUED"):
        # Return empty job name
        print("A Transcription job named \'{}\' is already in progress - cannot continue.".format(jobName))
        return ""
    elif currentJobStatus != "":
        # But if an old one exists we can delete it
        transcribeClient.delete_transcription_job(TranscriptionJobName=jobName)

    # Start off our settings blocks
    mediaSettings = {'MediaFileUri': uri}
    jobSettings = {'ChannelIdentification': False}

    # Some settings are specific to language detection
    if langCode == "":
        # No specific code means language detection, so also turn off PII
        # and the output bucket as we have no interest in the transcript
        selectedLanguage = None
        contentRedaction = None
        outputBucket = None
        languageIdentification = True
        languageIdentList = cf.appConfig[cf.CONF_TRANSCRIBE_LANG]
    else:
        # Setup flags to ignore language detection
        selectedLanguage = langCode
        languageIdentList = None

        # Double check that a custom-vocab exists for our language,
        # and they aren't supported for language detection runs
        if cf.appConfig[cf.CONF_VOCABNAME] != "":
            try:
                vocabName = cf.appConfig[cf.CONF_VOCABNAME] + '-' + langCode.lower()
                ourVocab = transcribeClient.get_vocabulary(VocabularyName = vocabName)
                if ourVocab["VocabularyState"] == "READY":
                    # Only use it if it is ready for use
                    jobSettings["VocabularyName"] = vocabName
            except:
                # Doesn't exist - don't use it
                pass

        # Only enable content redaction if it's supported
        if langCode in cf.appConfig[cf.CONF_REDACTION_LANGS]:
            contentRedaction = {'RedactionType': 'PII', 'RedactionOutput': 'redacted_and_unredacted'}
        else:
            contentRedaction = None

        # Define our other full transcript settings
        outputBucket = cf.appConfig[cf.CONF_S3BUCKET_OUTPUT]
        jobSettings["ShowSpeakerLabels"] = True
        jobSettings["MaxSpeakerLabels"] = int(cf.appConfig[cf.CONF_MAX_SPEAKERS])
        jobSettings["ShowAlternatives"] = True
        jobSettings["MaxAlternatives"] = 2

    # Job execution settings - role required is in an environment variable
    executionSettings = {
        "AllowDeferredExecution": True,
        "DataAccessRoleArn": role_arn
    }

    # Should have a clear run at doing the job now
    kwargs = {'TranscriptionJobName': jobName,
              'LanguageCode': selectedLanguage,
              'Media': mediaSettings,
              'OutputBucketName': outputBucket,
              'Settings': jobSettings,
              'JobExecutionSettings': executionSettings,
              'ContentRedaction': contentRedaction,
              'IdentifyLanguage': languageIdentification,
              'LanguageOptions': languageIdentList
    }

    # Start the Transcribe job, removing any 'None' values on the way
    transcribeClient = boto3.client('transcribe')
    response = transcribeClient.start_transcription_job(
        **{k: v for k, v in kwargs.items() if v is not None}
    )

    # Return our job name, as we need to track it
    return jobName