def check_translate_webcaptions()

in source/operators/captions/webcaptions.py [0:0]


def check_translate_webcaptions(event, context):

    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)

    # If the array of target languages only contains the source language
    # then we have nothing to do.
    if webcaptions_object.target_language_codes == [webcaptions_object.source_language_code]:
        print("Skipping check_translate_webcaptions because source language is same as target language")
        operator_object.update_workflow_status("Complete")

    else:
        try:
            translate_jobs = operator_object.metadata["TextTranslateJobPropertiesList"]
            workflow_id = operator_object.workflow_execution_id
            asset_id = operator_object.asset_id
            transcript_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
            bucket = transcript_storage_path['S3Bucket']
            translation_output_path = transcript_storage_path['S3Key']+"webcaptions_translate_output/"

        except KeyError as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError="Missing a required metadata key {e}".format(e=e))
            raise MasExecutionError(operator_object.return_output_object())

        # Check the status of each job
        # - IF ANY job has an error, we fail the workflow and return from the loop
        # - IF ANY job is still running, the workflow is still Executing
        # - If ALL jobs are complete, we reach the end of the loop and the workflow is complete
        for job in translate_jobs:

            try:
                job_id = job["JobId"]
                job_status_list = []

            except KeyError as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(TranslateError="Missing a required metadata key {e}".format(e=e))
                raise MasExecutionError(operator_object.return_output_object())
            try:
                response = translate_client.describe_text_translation_job(
                    JobId=job_id
                )
                print(response)
                job_status = {
                    "JobId": job_id,
                    "Status": response["TextTranslationJobProperties"]["JobStatus"]
                }

            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(TranslateError=str(e), TranslateJobId=job_id)
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if response["TextTranslationJobProperties"]["JobStatus"] in ["IN_PROGRESS", "SUBMITTED"]:
                    operator_object.update_workflow_status("Executing")
                    operator_object.add_workflow_metadata(TextTranslateJobStatusList=job_status_list, AssetId=asset_id, WorkflowExecutionId=workflow_id)
                    return operator_object.return_output_object()
                elif response["TextTranslationJobProperties"]["JobStatus"] in ["FAILED", "COMPLETED_WITH_ERROR", "STOP_REQUESTED", "STOPPED"]:
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(TextTranslateJobStatusList=job_status_list, AssetId=asset_id, WorkflowExecutionId=workflow_id)
                    raise MasExecutionError(operator_object.return_output_object())
                elif response["TextTranslationJobProperties"]["JobStatus"] == "COMPLETED":
                    print("{} is complete".format(job_id))
                    operator_object.add_workflow_metadata(TextTranslateJobStatusList=job_status_list, AssetId=asset_id, WorkflowExecutionId=workflow_id)

        # If we made it here, then all the translate jobs are complete.
        # Convert the translations back to WebCaptions and write them out
        # to the dataplane
        translation_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
        bucket = translation_storage_path['S3Bucket']
        translation_path = translation_storage_path['S3Key']

        webcaptions_collection = []
        for job in translate_jobs:
            try:
                print("Save translation for job {}".format(job["JobId"]))

                translateJobDescription = translate_client.describe_text_translation_job(JobId = job["JobId"])
                translateJobS3Uri = translateJobDescription["TextTranslationJobProperties"]["OutputDataConfig"]["S3Uri"]
                translateJobUrl = urlparse(translateJobS3Uri, allow_fragments = False)
                translateJobLanguageCode = translateJobDescription["TextTranslationJobProperties"]["TargetLanguageCodes"][0]

                translateJobS3Location = {
                    "Uri": translateJobS3Uri,
                    "Bucket": translateJobUrl.netloc,
                    "Key": translateJobUrl.path.strip("/")
                }

                # use input web captions to convert translation output to web captions format
                for outputS3ObjectKey in map(lambda s: s.key, s3_resource.Bucket(translateJobS3Location["Bucket"]).objects.filter(Prefix=translateJobS3Location["Key"] + "/", Delimiter="/")):
                    print("Save translation for each output of job {} output {}".format(job["JobId"], outputS3ObjectKey))

                    outputFilename = ntpath.basename(outputS3ObjectKey)

                    translateOutput = s3_resource.Object(translateJobS3Location["Bucket"], outputS3ObjectKey).get()["Body"].read().decode("utf-8")
                    #inputWebCaptions = get_webcaptions(operator_object, translateJobDescription["TextTranslationJobProperties"]["SourceLanguageCode"])
                    inputWebCaptions = webcaptions_object.GetWebCaptions(translateJobDescription["TextTranslationJobProperties"]["SourceLanguageCode"])
                    outputWebCaptions = webcaptions_object.DelimitedToWebCaptions(inputWebCaptions, translateOutput, webcaptions_object.marker, 15)
                    print(outputS3ObjectKey)
                    (targetLanguageCode, basename, ext) = outputFilename.split(".")
                    #put_webcaptions(operator_object, outputWebCaptions, targetLanguageCode)
                    operator_metadata = webcaptions_object.PutWebCaptions(outputWebCaptions, targetLanguageCode)
                    operator_metadata = webcaptions_object.PutWebCaptions(outputWebCaptions, targetLanguageCode, source="Translate")

                    # Save a copy of the translation text without delimiters.  The translation
                    # process may remove or add important whitespace around caption markers
                    # Try to replace markers with correct spacing (biased toward Latin based languages)
                    if webcaptions_object.contentType == 'text/html':
                        translateOutput = html.unescape(translateOutput)

                    # - delimiter next to space keeps the space
                    translation_text = translateOutput.replace(" "+webcaptions_object.marker, " ")
                    # - delimiter next to contraction (some languages) has no space
                    translation_text = translation_text.replace("'"+webcaptions_object.marker, "")
                    # - all the rest are replaced with a space. This might add an extra space
                    # - but that's better than no space
                    translation_text = translation_text.replace(webcaptions_object.marker, " ")

                    translation_text_key = translation_path+"translation"+"_"+targetLanguageCode+".txt"
                    s3_object = s3_resource.Object(bucket, translation_text_key)
                    s3_object.put(Body=translation_text)

                metadata = {
                    "OperatorName": "TranslateWebCaptions_"+translateJobLanguageCode,
                    "TranslationText": {"S3Bucket": bucket, "S3Key": translation_text_key},
                    #"WebCaptions": operator_metadata,
                    "WorkflowId": workflow_id,
                    "TargetLanguageCode": translateJobLanguageCode
                }
                print(json.dumps(metadata))

                webcaptions_collection.append(metadata)

            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(CaptionsError="Unable to construct path to translate output in S3: {e}".format(e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())

        data = {}
        data["CaptionsCollection"] = webcaptions_collection
        webcaptions_object.PutMediaCollection(operator_object.name, data)

    return operator_object.return_output_object()