experiments/babel/app/pages/gemini_studio.py (184 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Gemini 2.0 Voices Studio Mesop Page """
import json
import logging
import socket
#from typing import List, TypedDict, Any, cast
import urllib
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import field

import google.auth
import google.auth.transport.requests as googlerequests
import google.oauth2.id_token
import mesop as me

from common.utility import get_uri_by_key_name
#from components.page_scaffold import page_scaffold, page_frame
from components.styles import BACKGROUND_COLOR, CONTENT_STYLE
from config.default import BabelMetadata, Default, gemini_voices, reference_voices
# Configure the root logger at import time so DEBUG and above are emitted.
logging.basicConfig(level=logging.DEBUG)
# Application configuration; this module reads GENMEDIA_BUCKET and
# BABEL_ENDPOINT from it.
config = Default()
# Base HTTPS URL of the media bucket. The page appends each result's
# "audio_path" to this to build the audio player source.
BUCKET_PATH = "https://storage.mtls.cloud.google.com/" + config.GENMEDIA_BUCKET
@me.stateclass
class PageState:
    """State backing the Gemini voices studio page."""

    # True while on_click_gemini is waiting for the synthesis request.
    is_loading: bool = False
    # Voice name chosen in the select box; sent as "voiceName" in the request.
    gemini_voice: str = "Zephyr"
    # Text captured from the textarea on blur; sent as "statement".
    gemini_statement: str = ""
    # Entries built from the response's "audio_metadata" list.
    gemini_output_metadata: list[BabelMetadata] = field(default_factory=list)  # pylint: disable=invalid-field-call
    # HTTPS URL of the selected voice's reference audio.
    gemini_reference_voice_uri: str = ""
    # HTTPS URL of the image shown next to the voice selector.
    gemini_reference_voice_image_uri: str = ""
    # Cleared at the start of every synthesis request; not otherwise written
    # in this module.
    audio_output_infos: list[str] = field(default_factory=list)  # pylint: disable=invalid-field-call
# Gemini voices
def gemini_studio_page(app_state: me.state):
    """Render the Gemini voices studio page.

    The page shows a voice selector (with the selected voice's image), the
    statement input, and either a progress spinner while a request is in
    flight or a two-column grid of the generated audio clips.

    Args:
        app_state: Application-level state passed in by the caller. It is not
            read by this function.
    """
    state = me.state(PageState)

    # Build the options from a sorted copy: sorting `gemini_voices` in place
    # would mutate the shared config list on every render.
    voice_options = [
        me.SelectOption(label=voice, value=voice)
        for voice in sorted(gemini_voices)
    ]

    with me.box(style=CONTENT_STYLE):
        me.text("Enter text to voice", type="headline-6")

        # Voice image and voice selector, side by side.
        with me.box(
            style=me.Style(
                display="flex",
                flex_direction="row",
                gap=3,
                align_items="flex-start",
                padding=me.Padding(bottom=16),
            )
        ):
            me.image(
                src=state.gemini_reference_voice_image_uri,
                style=me.Style(height=56),
            )
            me.select(
                label="Select a Gemini Voice",
                options=voice_options,
                on_selection_change=on_click_set_gemini_voice,
                value=state.gemini_voice,
            )

        subtle_chat_input_gemini()

        if state.is_loading:
            me.progress_spinner()
        elif state.gemini_output_metadata:
            with me.box(
                style=me.Style(display="grid", grid_template_columns="1fr 1fr")
            ):
                sorted_metadata = sorted(
                    state.gemini_output_metadata,
                    key=lambda voice: voice["language_code"],
                )
                for item in sorted_metadata:
                    audio_url = f"{BUCKET_PATH}/{item['audio_path']}"
                    with me.box(
                        style=me.Style(
                            display="flex",
                            flex_direction="column",
                            gap=5,
                            padding=me.Padding(
                                top=10, left=10, right=10, bottom=12
                            ),
                        )
                    ):
                        # Single quotes inside the f-string keep this valid
                        # on Python versions before 3.12.
                        me.text(
                            f"{item['voice_name']}",
                            style=me.Style(font_weight="bold"),
                        )
                        me.audio(src=audio_url)
                        me.text(item["text"])
def _gs_uri_to_https_url(gs_uri: str) -> str:
    """Rewrite a ``gs://`` URI into its storage.mtls.cloud.google.com URL."""
    return gs_uri.replace("gs://", "https://storage.mtls.cloud.google.com/")


def on_click_set_gemini_voice(e: me.ClickEvent):
    """Record the selected voice and look up its reference audio and icon.

    Stores ``e.value`` as the current voice, then resolves the voice's "uri"
    and "icon" entries through ``get_uri_by_key_name``. Each value that is
    found is converted from ``gs://`` to HTTPS form and saved on the page
    state; a value that is not found leaves the matching state field as is.

    Args:
        e: Event carrying the chosen voice name in ``e.value``.
            NOTE(review): this handler is wired to ``me.select``'s
            ``on_selection_change``, so the event is presumably a
            selection-change event rather than a click; confirm the
            annotation.
    """
    state = me.state(PageState)
    state.gemini_voice = e.value
    print(f"voice choice: {e.value}")

    uri = get_uri_by_key_name(e.value, "uri")
    if uri:
        state.gemini_reference_voice_uri = _gs_uri_to_https_url(uri)
    else:
        print("Couldn't find URI for voice")

    image = get_uri_by_key_name(e.value, "icon")
    if image:
        print(f"the image gsuri is: {image}")
        # Convert the icon URI itself. The previous code converted `uri`
        # here, which stored the audio URL as the image and raised
        # AttributeError whenever `uri` was missing.
        state.gemini_reference_voice_image_uri = _gs_uri_to_https_url(image)
    else:
        print("Couldn't find icon URI for voice")
@me.component
def subtle_chat_input_gemini():
    """Render the statement textarea with a send button next to it.

    Blurring the textarea calls ``on_blur_gemini_statement``; clicking the
    send icon calls ``on_click_gemini``.
    """
    container_style = me.Style(
        border_radius=16,
        padding=me.Padding.all(8),
        background=BACKGROUND_COLOR,
        display="flex",
        width="100%",
    )
    # Borderless textarea on the container's background colour.
    textarea_style = me.Style(
        padding=me.Padding(top=16, left=16),
        background=BACKGROUND_COLOR,
        outline="none",
        width="100%",
        overflow_y="auto",
        border=me.Border.all(
            me.BorderSide(style="none"),
        ),
        color=me.theme_var("on-surface"),
    )

    with me.box(style=container_style):
        # The growing box lets the textarea take all space left of the button.
        with me.box(style=me.Style(flex_grow=1)):
            me.native_textarea(
                autosize=True,
                min_rows=8,
                placeholder="Voicing instructions for Gemini",
                style=textarea_style,
                on_blur=on_blur_gemini_statement,
            )
        with me.content_button(type="icon", on_click=on_click_gemini):
            me.icon("send")
def on_blur_gemini_statement(e: me.InputBlurEvent):
    """Save the blurred input's value as the statement to synthesize."""
    me.state(PageState).gemini_statement = e.value
def _post_gemini_request(endpoint: str, payload: dict):
    """POST ``payload`` as JSON to ``endpoint`` and return the decoded reply.

    When ``endpoint`` does not contain "localhost", an ID token is fetched
    for the endpoint's origin and sent as a Bearer ``Authorization`` header.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: The endpoint could not be reached.
        OSError: A lower-level socket failure occurred.
        ValueError: The reply was not valid UTF-8 encoded JSON.
    """
    req = urllib.request.Request(endpoint)
    if "localhost" not in endpoint:
        urlinfo = urllib.parse.urlparse(endpoint)
        audience = f"{urlinfo.scheme}://{urlinfo.netloc}/"
        print(f"audience: {audience}")
        id_token = google.oauth2.id_token.fetch_id_token(
            googlerequests.Request(), audience
        )
        # The token is a credential: it is deliberately never printed.
        req.add_header("Authorization", f"Bearer {id_token}")

    req.add_header("Content-Type", "application/json; charset=utf-8")
    bindata = json.dumps(payload).encode("utf-8")
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req, bindata) as response:
        response_as_string = response.read().decode("utf-8")
    print(response_as_string)
    return json.loads(response_as_string)


def on_click_gemini(e: me.ClickEvent):
    """Synthesize the current statement with the selected Gemini voice.

    Does nothing if no statement has been captured. Otherwise it marks the
    page as loading, yields so the spinner is rendered, posts the statement
    and voice name to ``{config.BABEL_ENDPOINT}/gemini``, and stores the
    reply's "audio_metadata" entries on the page state. Handled request
    errors are printed and leave the previous results in place. The
    statement and loading flag are always reset before the final yield.

    Args:
        e: The click event from the send button; not read.

    Yields:
        None, once after entering the loading state and once when done.
    """
    state = me.state(PageState)

    # Check before touching `is_loading`: setting the flag first and then
    # returning early left the spinner showing indefinitely.
    if not state.gemini_statement:
        print("no statement provided. not synthesizing.")
        return

    state.is_loading = True
    state.audio_output_infos.clear()
    yield

    post_object = {
        "statement": state.gemini_statement,
        "voiceName": state.gemini_voice,
    }
    print(post_object)
    endpoint = f"{config.BABEL_ENDPOINT}/gemini"
    print(f"endpoint: {endpoint}")

    try:
        data = _post_gemini_request(endpoint, post_object)
        audio_metadata = data.get("audio_metadata")
        print(audio_metadata)
        # A reply without "audio_metadata" yields an empty result list
        # instead of raising TypeError while iterating None.
        state.gemini_output_metadata = [
            BabelMetadata(item) for item in audio_metadata or []
        ]
    except urllib.error.HTTPError as err:
        print(f"HTTP Error: {err.code} - {err.reason}")
    except urllib.error.URLError as err:
        print(f"URL Error: {err.reason}")
    except socket.error as err:
        print(f"Socket Error: {err}")
    except ValueError as err:
        # Covers a body that is not valid UTF-8 or not valid JSON.
        print(f"Invalid response: {err}")
    finally:
        # Reset even when an unexpected exception propagates, so the page
        # is never left in the loading state.
        state.gemini_statement = ""
        state.is_loading = False
    yield