# audio/speech/sample-apps/live-translator/app.py
import os
from google import genai
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
from google.genai.chats import Chat
from google.genai.types import GenerateContentConfig, Part
import streamlit as st
# Keep the chat transcript alive across Streamlit script reruns.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION")
# Only fall back to a default region when a project is actually configured.
if PROJECT_ID and not LOCATION:
    LOCATION = "us-central1"

MODEL_ID = "gemini-2.0-flash-lite"

# Output-language labels mapped to their Text-to-Speech voice settings.
LANGUAGE_MAP = {
    "Spanish (Español)": {
        "language_code": "es-US",
        "voice_name": "es-US-Chirp3-HD-Puck",
    },
    "English (Inglés)": {
        "language_code": "en-US",
        "voice_name": "en-US-Chirp3-HD-Fenrir",
    },
}
@st.cache_resource
def load_chat() -> Chat:
    """Create (and cache) a Gemini chat session configured for translation."""
    genai_client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)
    translation_config = GenerateContentConfig(
        system_instruction="You are an assistant tasked with translating between languages. Only respond with the translation.",
    )
    return genai_client.chats.create(model=MODEL_ID, config=translation_config)
@st.cache_resource
def load_tts_client() -> texttospeech.TextToSpeechClient:
    """Create (and cache) a Text-to-Speech client pinned to the US endpoint."""
    endpoint_options = ClientOptions(api_endpoint="us-texttospeech.googleapis.com")
    return texttospeech.TextToSpeechClient(client_options=endpoint_options)
# Cached singletons shared by every rerun of the script.
chat = load_chat()
tts_client = load_tts_client()
def play_audio(audio_bytes: bytes) -> None:
    """Render an autoplaying audio widget for the given MP3 byte stream.

    Args:
        audio_bytes: Encoded audio as returned by ``generate_audio`` (MP3);
            ``None`` is tolerated and simply skipped.
    """
    if audio_bytes is None:
        return
    try:
        # BUG FIX: generate_audio() produces MP3 (AudioEncoding.MP3), but the
        # bytes were previously labeled "audio/wav"; advertise the correct
        # MIME type so browsers decode them reliably.
        st.audio(audio_bytes, format="audio/mp3", autoplay=True)
    except Exception as e:  # pylint: disable=broad-except
        # Best-effort playback: surface the error in the UI instead of crashing.
        st.error(f"Error playing audio: {e}")
def generate_audio(text: str, voice_name: str, language_code: str) -> bytes:
    """Synthesize ``text`` into MP3 audio via Google Cloud Text-to-Speech.

    Args:
        text: The text to speak.
        voice_name: Full TTS voice name (e.g. "es-US-Chirp3-HD-Puck").
        language_code: BCP-47 language code matching the voice.

    Returns:
        The MP3-encoded audio bytes.
    """
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
    )
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
    response = tts_client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=mp3_config,
    )
    return response.audio_content
def main() -> None:
    """Run the Streamlit live-translator UI.

    Records a voice message, asks Gemini to translate the audio into the
    selected output language, shows the text, and speaks it back with TTS.
    """
    st.title("HCA Translator")

    # st.radio defaults to the first option; the `or` only guards against a
    # None return on the initial render.
    target_language = (
        st.radio(
            "**Output Language:**",
            ("Spanish (Español)", "English (Inglés)"),
        )
        or "Spanish (Español)"
    )

    audio_input = st.audio_input("Record a voice message")
    # Guard clause: nothing to do until the user records something.
    if not audio_input:
        return

    # st.audio_input yields WAV bytes; forward them with the prompt.
    user_input = Part.from_bytes(data=audio_input.getvalue(), mime_type="audio/wav")
    instruction = f"Translate the audio into {target_language}."
    assistant_response = chat.send_message(message=[instruction, user_input]).text

    with st.chat_message("assistant"):
        st.markdown(assistant_response)

    output_audio_bytes = generate_audio(
        assistant_response,
        voice_name=LANGUAGE_MAP[target_language]["voice_name"],
        language_code=LANGUAGE_MAP[target_language]["language_code"],
    )
    if output_audio_bytes:
        play_audio(output_audio_bytes)
    # NOTE: the original ended with `audio_input = None`, a dead local rebind
    # with no effect on the widget (Streamlit reruns the whole script); removed.
# Script entry point.
if __name__ == "__main__":
    main()