in source/operators/captions/webcaptions.py [0:0]
def TranslateWebCaptions(self, inputCaptions, sourceLanguageCode, targetLanguageCodes, terminology_names=None, parallel_data_names=None):
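    """Start one asynchronous Amazon Translate batch job per target language for the given WebCaptions."""
    # Use None defaults instead of mutable [] defaults, then fall back to
    # fresh empty lists (standard Python idiom).
    if terminology_names is None:
        terminology_names = []
    if parallel_data_names is None:
        parallel_data_names = []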
    marker = self.marker
    try:
        translate_role = os.environ['translateRole']
        asset_id = self.operator_object.asset_id
        workflow_id = self.operator_object.workflow_execution_id
        translate_job_name = "MIE_" + asset_id + "_" + workflow_id
        # Convert WebCaptions to text with a marker between caption lines
        inputEntries = map(lambda c: c["caption"], inputCaptions)
        inputDelimited = marker.join(inputEntries)
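        # The marker lets the translated text be split back into individual
        # captions later.  For example, with a hypothetical marker "<SPLIT>",
        # ["Hello.", "World."] becomes "Hello.<SPLIT>World.".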
        transcript_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
        bucket = transcript_storage_path['S3Bucket']
        translation_input_path = transcript_storage_path['S3Key'] + "webcaptions_translate_input/"
        translation_input_uri = 's3://' + bucket + "/" + translation_input_path
        translation_output_path = transcript_storage_path['S3Key'] + "webcaptions_translate_output/"
        translation_output_uri = 's3://' + bucket + "/" + translation_output_path
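        # Save the delimited transcript text to S3 as the input for the translation jobs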
        key = translation_input_path + "transcript_with_caption_markers.txt"
        print("put object {} {}".format(bucket, key))
        s3.put_object(Bucket=bucket, Key=key, Body=inputDelimited)
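        # S3 has no real folders, so write a placeholder object to materialize
        # the output prefix before Translate writes results under it.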
print("create translate output folder if it doesn't exist")
dummy_key = translation_output_path+"/"+"foo"
s3.put_object(Bucket=bucket, Key=dummy_key, Body="foo")
print("Translate inputs")
print("translation_input_uri {}".format(translation_input_uri))
print("translation_output_uri {}".format(translation_output_uri))
print("translate_role {}".format(translate_role))
        # Kick off a job for each target language
        translate_jobs = []
        for targetLanguageCode in targetLanguageCodes:
            # Skip translating to the same language as the source language
            if targetLanguageCode == sourceLanguageCode:
                continue
            print("Starting translation to {}".format(targetLanguageCode))
            # Even though the API accepts a list of targets, Translate
            # currently supports at most one target language per job.
            singletonTargetList = [targetLanguageCode]
            # Derive the job name from asset, workflow, and language so a
            # retry after an error doesn't create a duplicate job.
            job_name = translate_job_name + "_" + targetLanguageCode
            print("JobName: {}".format(job_name))
            terminology_name = []
            if len(terminology_names) > 0:
                # Find a terminology in the list of custom terminologies that
                # defines translations for targetLanguageCode.  If more than
                # one terminology matches, just use the first one in the list.
                for item in terminology_names:
                    if targetLanguageCode in item['TargetLanguageCodes']:
                        terminology_name.append(item['Name'])
                        break
            if len(terminology_name) == 0:
                print("No custom terminology specified.")
            else:
                print("Using custom terminology {}".format(terminology_name))
            parallel_data_name = []
            if len(parallel_data_names) > 0:
                # Find a parallel data set in the list of parallel data names
                # that defines translations for targetLanguageCode.  If more
                # than one matches, just use the first one in the list.
                for item in parallel_data_names:
                    if targetLanguageCode in item['TargetLanguageCodes']:
                        parallel_data_name.append(item['Name'])
                        break
            if len(parallel_data_name) == 0:
                print("No parallel data specified.")
            else:
                print("Using parallel data {}".format(parallel_data_name))
            translation_job_config = {
                "JobName": job_name,
                "InputDataConfig": {
                    'S3Uri': translation_input_uri,
                    'ContentType': self.contentType
                },
                "OutputDataConfig": {
                    'S3Uri': translation_output_uri
                },
                "DataAccessRoleArn": translate_role,
                "SourceLanguageCode": sourceLanguageCode,
                "TargetLanguageCodes": singletonTargetList,
                "TerminologyNames": terminology_name,
            }
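            # An empty TerminologyNames (or ParallelDataNames) list simply
            # means the job runs without that customization.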
            current_region = os.environ['AWS_REGION']
            # Include the Parallel Data configuration only when running in a
            # region where Active Custom Translation is available. Reference:
            # https://docs.aws.amazon.com/translate/latest/dg/customizing-translations-parallel-data.html
            active_custom_translation_supported_regions = ['us-east-1', 'us-west-2', 'eu-west-1']
            if current_region in active_custom_translation_supported_regions:
                translation_job_config["ParallelDataNames"] = parallel_data_name
            # Start the asynchronous batch translation job
            response = translate_client.start_text_translation_job(**translation_job_config)
            jobinfo = {
                "JobId": response["JobId"],
                "TargetLanguageCode": targetLanguageCode
            }
            translate_jobs.append(jobinfo)
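        # Surface the started jobs in workflow metadata so a later step can
        # poll them for completion (see the sketch after this method).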
        self.operator_object.add_workflow_metadata(TextTranslateJobPropertiesList=translate_jobs, TranslateSourceLanguage=sourceLanguageCode)
    except Exception as e:
        self.operator_object.update_workflow_status("Error")
        self.operator_object.add_workflow_metadata(TranslateError="Unable to start translation WebCaptions job: {e}".format(e=str(e)))
        raise MasExecutionError(self.operator_object.return_output_object())
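
# A minimal illustrative sketch (not part of the original operator) of how the
# jobs started above could be polled for completion.  It assumes the same
# module-level boto3 translate_client used in this file; check_translate_jobs
# and its argument shape are hypothetical.
def check_translate_jobs(translate_jobs):
    """Return True once every job in TextTranslateJobPropertiesList has completed."""
    for job in translate_jobs:
        response = translate_client.describe_text_translation_job(JobId=job["JobId"])
        status = response["TextTranslationJobProperties"]["JobStatus"]
        if status in ("FAILED", "COMPLETED_WITH_ERROR", "STOPPED"):
            raise RuntimeError("Translate job {} ended with status {}".format(job["JobId"], status))
        if status != "COMPLETED":
            # At least one job is still SUBMITTED or IN_PROGRESS
            return False
    return True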