api/simultaneousinterpreter/translate_speech.go (126 lines of code) (raw):
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package simultaneousinterpreter
import (
"context"
"net/http"
"strings"
"sync"
"cloud.google.com/go/translate"
"golang.org/x/sync/errgroup"
"golang.org/x/text/language"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
hd "github.com/GoogleCloudPlatform/appengine-cloud-demo-portal/api/pkg/handler"
)
// translateSpeechRequest is the JSON request body decoded by translateSpeech.
type translateSpeechRequest struct {
// Audio carries the recorded speech to recognize.
Audio struct {
// Content is handed to hd.Base64ToWave by speechToText, so it is
// presumably base64-encoded audio data (verify against Base64ToWave).
Content string `json:"content"`
} `json:"audio"`
// Config carries the recognition settings.
Config struct {
// LanguageCode is the language of the speech. translateSpeech rejects
// the request when it is empty. It is passed unchanged to the Speech
// API, and the part before the first "-" is used as the translation
// source language.
LanguageCode string `json:"language_code"`
} `json:"config"`
}
// translateSpeechResponse is the JSON body written by translateSpeech on
// success.
type translateSpeechResponse struct {
// LanguageCode echoes the language code sent in the request.
LanguageCode string `json:"language_code"`
// Translations maps a language name (the Name of each entry returned by
// getTranslateSupportedLanguages) to the text in that language. The entry
// for the source language holds the recognized text itself.
Translations map[string]string `json:"translations"`
}
// translateSpeech is the HTTP handler that turns a spoken utterance into
// translations. It decodes a translateSpeechRequest from the request body,
// transcribes the audio with speechToText, translates the transcript with
// translate, and replies with a translateSpeechResponse.
//
// A request without language_code is answered with 400 Bad Request. Errors
// from decoding, recognition or translation are reported through
// hd.RespondErrorJSON.
func (h *handler) translateSpeech(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	var body translateSpeechRequest
	if err := hd.DecodeJSONBody(r, &body); err != nil {
		hd.RespondErrorJSON(w, r, err)
		return
	}

	// The language code drives both recognition and translation, so it is
	// mandatory.
	sourceLang := body.Config.LanguageCode
	if len(sourceLang) == 0 {
		hd.RespondErrorMessage(w, r, http.StatusBadRequest, "language_code must be specified")
		return
	}

	transcript, err := h.speechToText(ctx, sourceLang, body.Audio.Content)
	if err != nil {
		hd.RespondErrorJSON(w, r, err)
		return
	}

	translated, err := h.translate(ctx, sourceLang, transcript)
	if err != nil {
		hd.RespondErrorJSON(w, r, err)
		return
	}

	hd.RespondJSON(w, r, http.StatusOK, &translateSpeechResponse{
		LanguageCode: sourceLang,
		Translations: translated,
	})
}
// translate translates text from sourceLang into every language returned by
// getTranslateSupportedLanguages and returns the results keyed by language
// name. The entry for the source language itself is the untranslated text.
//
// Only the part of sourceLang before the first "-" is parsed (e.g. "en-US"
// is treated as "en"). An unparsable code yields a 400 Bad Request error.
// One Translate call is issued per target language, concurrently; if any of
// them fails, the first error is returned and no translations are returned.
func (h *handler) translate(ctx context.Context, sourceLang, text string) (map[string]string, error) {
	sourceLangTag, err := language.Parse(strings.Split(sourceLang, "-")[0])
	if err != nil {
		return nil, hd.Errorf(ctx,
			http.StatusBadRequest,
			"invalid language code",
			"failed language.Parse: %w", err)
	}

	langs, err := h.getTranslateSupportedLanguages(ctx)
	if err != nil {
		return nil, hd.Wrapf("failed getTranslateSupportedLanguages: %w", err)
	}

	translations := make(map[string]string, len(langs))
	// mu guards translations, which is written both by this goroutine and
	// by the workers started below.
	var mu sync.Mutex
	var eg errgroup.Group

	for _, lang := range langs {
		lang := lang // per-iteration copy captured by the closure below

		if sourceLangTag == lang.Tag {
			// Workers started in earlier iterations may already be writing
			// to the map, so this write must hold the lock as well.
			mu.Lock()
			translations[lang.Name] = text
			mu.Unlock()
			continue
		}

		eg.Go(func() error {
			resp, err := h.Translate.Translate(ctx, []string{text}, lang.Tag, &translate.Options{
				Source: sourceLangTag,
				Format: translate.Text,
			})
			if err != nil {
				return hd.Errorf(ctx,
					http.StatusInternalServerError,
					http.StatusText(http.StatusInternalServerError),
					"failed Translate.Translate: %w", err)
			}
			// One input string should produce one translation; guard
			// against an empty response instead of panicking on resp[0].
			if len(resp) == 0 {
				return hd.Errorf(ctx,
					http.StatusInternalServerError,
					http.StatusText(http.StatusInternalServerError),
					"Translate.Translate returned no translations for %s", lang.Tag.String())
			}

			mu.Lock()
			translations[lang.Name] = resp[0].Text
			mu.Unlock()
			return nil
		})
	}

	if err := eg.Wait(); err != nil {
		return nil, hd.Wrapf("failed to translate: %w", err)
	}
	return translations, nil
}
// speechToText transcribes audio with the Speech API and returns the
// transcript of the first alternative of the first recognition result.
//
// audio is converted with hd.Base64ToWave before being sent; lang is passed
// to the API as the recognition language code. The request is configured
// for LINEAR16 audio sampled at 48000 Hz.
//
// A failed Recognize call yields a 500 Internal Server Error. When the API
// returns no usable result (no results, or a first result without
// alternatives) a 400 Bad Request error with the message
// "no text was recognized" is returned.
func (h *handler) speechToText(ctx context.Context, lang, audio string) (string, error) {
	wave, err := hd.Base64ToWave(ctx, audio)
	if err != nil {
		return "", hd.Wrapf("failed Base64ToWave: %w", err)
	}

	req := &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 48000,
			LanguageCode:    lang,
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: wave},
		},
	}

	res, err := h.Speech.Recognize(ctx, req)
	if err != nil {
		return "", hd.Errorf(ctx,
			http.StatusInternalServerError,
			http.StatusText(http.StatusInternalServerError),
			"failed Speech.Recognize: %w", err)
	}

	// The generated getters are nil-safe, so a nil response or a nil first
	// result falls through to the "nothing recognized" error instead of
	// panicking on an index or nil dereference.
	results := res.GetResults()
	if len(results) == 0 || len(results[0].GetAlternatives()) == 0 {
		return "", hd.Errorf(ctx,
			http.StatusBadRequest,
			"no text was recognized",
			"no text was recognized")
	}
	return results[0].GetAlternatives()[0].GetTranscript(), nil
}