# projects/speech2speech_translation/speech2speech.py (231 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translates speech audio from one language to another.
Args:
project_id: The Google Cloud project ID.
gcs_path: The Google Cloud Storage path (gs://bucket_name/dir1/dir2/) to which the audio file is uploaded.
stt_model: The Speech-to-Text transcription model to use.
source_language_code: The language code of the source audio.
target_language: The language to translate the transcript to.
target_language_code: The language code of the target language.
target_voice: The Text-to-Speech voice to use for the translation.
target_voice_gender: The gender of the generated voice. Male or female.
input_audio_file_name: Input audio file name.
input_audio_file_path: Input audio file path.
output_audio_file_name: Output audio file name.
log: Log level.
"""
import argparse
import configparser
import logging
import sys

import s2s_common as s2s
# Name of the .ini config file that main() hands to parse_config_args().
# It is opened exactly as written here, i.e. as a relative path.
CONFIG_FILE = "config.ini"
def parse_config_args(config_file):
    """Parses config.ini and command line args.

    Parses first the .ini -style config file, and then command line args.
    Values from the "parameters" section of the config file are used as the
    defaults of the matching command line options, so CLI args overwrite
    the values from the config file.

    Args:
      config_file: Path to config file.

    Returns:
      The argparse.Namespace holding the parsed arguments, or None if the
      config file cannot be opened or read (missing file, directory,
      permission problem, ...).

    Raises:
      KeyError: If the config file lacks one of the expected parameters.
    """
    config = configparser.ConfigParser()
    # Make sure the section exists even if the file does not define it, so a
    # sparse file fails on the first missing parameter name below.
    config["parameters"] = {}
    try:
        # The context manager closes the file handle as soon as it is parsed.
        with open(config_file, encoding="utf-8") as config_fh:
            config.read_file(config_fh)
    except OSError as e:
        print(f"Config file {config_file} cannot be read: {e}")
        return None

    params = config["parameters"]

    # Config values are always strings. argparse runs string defaults through
    # the option's `type`, so the int options below still end up as ints.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--project_id",
        default=params["project_id"],
        type=str,
        help="The Google Cloud project ID.",
    )
    parser.add_argument(
        "--location",
        default=params["location"],
        type=str,
        help="The Google Cloud location.",
    )
    parser.add_argument(
        "--gcs_path",
        default=params["gcs_path"],
        type=str,
        help="Google Cloud Storage path. Example: gs://bucket_name/dir1/dir2/",
    )
    parser.add_argument(
        "--stt_model",
        default=params["stt_model"],
        type=str,
        help="Speech to Text model. Choices: long|short|telephony",
    )
    parser.add_argument(
        "--stt_timeout",
        default=params["stt_timeout"],
        type=int,
        help="Speech to Text operation timeout in seconds.",
    )
    parser.add_argument(
        "--stt_alternative",
        default=params["stt_alternative"],
        type=int,
        help="Speech to Text results alternative index.",
    )
    parser.add_argument(
        "--input_audio_file_name",
        default=params["input_audio_file_name"],
        type=str,
        help="Input audio file name.",
    )
    parser.add_argument(
        "--input_audio_file_path",
        default=params["input_audio_file_path"],
        type=str,
        help="Input audio file path.",
    )
    parser.add_argument(
        "--output_audio_file_name",
        default=params["output_audio_file_name"],
        type=str,
        help="Output audio file name.",
    )
    parser.add_argument(
        "--source_language_code",
        default=params["source_language_code"],
        type=str,
        help="Source language code. Example: fi-FI.",
    )
    parser.add_argument(
        "--target_language",
        default=params["target_language"],
        type=str,
        help="Target language. Example: en",
    )
    parser.add_argument(
        "--target_language_code",
        default=params["target_language_code"],
        type=str,
        help="Target language code. Example: en-US",
    )
    parser.add_argument(
        "--target_voice",
        default=params["target_voice"],
        type=str,
        help="Target voice. Example: en-US-Wavenet-A",
    )
    parser.add_argument(
        "--target_voice_gender",
        default=params["target_voice_gender"],
        type=str,
        choices=["male", "female"],
        help="Target voice gender. Choices: female|male",
    )
    parser.add_argument(
        "--tts_timeout",
        default=params["tts_timeout"],
        type=int,
        help="Text to Speech operation timeout in seconds.",
    )
    parser.add_argument(
        "--log",
        default=params["log"],
        type=str,
        help="Logging level. Example --log debug",
    )
    # Pure flags: they have no config-file counterpart and default to False.
    parser.add_argument(
        "--list_translate_languages",
        action="store_true",
        help="List available translation languages.",
    )
    parser.add_argument(
        "--list_voices", action="store_true", help="List available TTS voices."
    )
    parser.add_argument(
        "--filename_prefix",
        default=params["filename_prefix"],
        type=str,
        choices=["none", "timestamp"],
        help="Prefix for output file names. Choices: none|timestamp",
    )
    parser.add_argument(
        "--output_interim_files",
        action="store_true",
        help=(
            "Output interim such as transcript "
            "and translation, and upload them to GCS."
        ),
    )
    return parser.parse_args()
def main():
    """Translates speech audio from one language to another.

    Reads the settings via parse_config_args(CONFIG_FILE), configures the root
    logger, and then chains the s2s_common helpers: upload the input audio to
    GCS, speech-to-text, translation, and text-to-speech. With
    --output_interim_files the transcript and the translation are uploaded to
    GCS as text files as well.

    With --list_voices or --list_translate_languages only the corresponding
    listing helper is called and the process exits with status 0. If the
    config file cannot be parsed, the process exits with status 1.

    Args: None
    """
    args = parse_config_args(CONFIG_FILE)
    if args is None:
        print("Failed to parse config file. Exiting.")
        # sys.exit rather than the builtin exit(): the latter is injected by
        # the site module for interactive use and is not always available.
        sys.exit(1)

    logger = logging.getLogger()
    logger.setLevel(args.log.upper())
    ch = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.info("Main started")
    logger.info("args: %s", args)

    # Listing modes short-circuit the translation pipeline; --list_voices
    # takes precedence when both flags are given.
    if args.list_voices:
        s2s.list_tts_voices(logger)
        sys.exit(0)
    if args.list_translate_languages:
        s2s.list_translate_languages(logger)
        sys.exit(0)

    gcs = s2s.parse_gcs_url(args.gcs_path)
    # Normalise the path component so that it always ends with a slash.
    if not gcs["path"].endswith("/"):
        gcs["path"] += "/"
    logger.info("GCS URL: %s", args.gcs_path)
    logger.debug("GCS bucket: %s", gcs["bucket"])
    logger.debug("GCS path: %s", gcs["path"])

    # Fall back to an empty string when the helper returns a falsy value, so
    # the prefix can be concatenated with file names below.
    prefix = s2s.generate_filename_prefix(args.filename_prefix) or ""
    logger.info("Filename prefix: %s", prefix)

    gcs_uri_input_audio = s2s.upload_file_to_gcs(
        args.project_id,
        gcs,
        args.input_audio_file_path,
        args.input_audio_file_name,
        logger,
    )
    stt_response = s2s.speech_to_text(
        args.project_id,
        args.location,
        args.source_language_code,
        args.stt_model,
        args.stt_timeout,
        gcs_uri_input_audio,
        logger,
    )
    transcript = s2s.parse_stt_response(
        stt_response, gcs_uri_input_audio, args.stt_alternative, logger
    )
    translate_result = s2s.translate_text(
        args.target_language, transcript, logger
    )
    logger.debug(
        "Text (%s): %s",
        translate_result["detectedSourceLanguage"],
        translate_result["input"],
    )
    logger.debug(
        "\nTranslation (%s): %s",
        args.target_language,
        translate_result["translatedText"],
    )
    s2s.text_to_speech(
        args.project_id,
        args.location,
        args.target_voice,
        args.target_voice_gender,
        args.target_language_code,
        translate_result["translatedText"],
        args.tts_timeout,
        gcs,
        args.output_audio_file_name,
        logger,
        prefix,
    )

    if args.output_interim_files:
        uri = s2s.upload_variable_to_gcs(
            args.project_id,
            gcs,
            prefix + "transcript.txt",
            transcript,
            "text/plain; charset=utf-8",
        )
        logger.info("Transcript written to: %s", uri)
        uri = s2s.upload_variable_to_gcs(
            args.project_id,
            gcs,
            prefix + "translation.txt",
            translate_result["translatedText"],
            "text/plain; charset=utf-8",
        )
        logger.info("Translation written to: %s", uri)
if __name__ == "__main__":
main()