speech/snippets/transcribe_streaming_v2_explicit_decoding.go (107 lines of code) (raw):
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package snippets contains a sample that streams a local audio file to the
// Google Speech API and outputs the transcript.
package snippets
// [START speech_transcribe_streaming_v2_explicit_decoding]
import (
"context"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
speech "cloud.google.com/go/speech/apiv2"
"cloud.google.com/go/speech/apiv2/speechpb"
)
// transcribeStreamingSpecificDecodingV2 streams the raw audio file at
// ../testdata/audio.raw to the Speech-to-Text v2 StreamingRecognize API using
// an explicit decoding config (LINEAR16, 16000 Hz, one channel) and writes
// every interim and final alternative it receives to w.
//
// The project is read from the GOLANG_SAMPLES_PROJECT_ID environment variable.
// An error is returned if that variable is unset, the audio file cannot be
// opened or read, the client or stream cannot be created, or the stream fails
// while sending audio or receiving results.
func transcribeStreamingSpecificDecodingV2(w io.Writer) error {
	projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID")
	if projectID == "" {
		// Fail early: an empty ID would yield a malformed recognizer name.
		return fmt.Errorf("GOLANG_SAMPLES_PROJECT_ID must be set")
	}
	path := "../testdata/audio.raw"
	const location = "global"

	audioFile, err := filepath.Abs(path)
	if err != nil {
		return fmt.Errorf("resolving %q: %w", path, err)
	}
	f, err := os.Open(audioFile)
	if err != nil {
		return err
	}
	defer f.Close()

	// Cancelling the context on return tears down the stream, which in turn
	// makes the sender goroutine's next Send fail so it cannot outlive us.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("creating speech client: %w", err)
	}
	defer client.Close()

	stream, err := client.StreamingRecognize(ctx)
	if err != nil {
		return fmt.Errorf("opening recognize stream: %w", err)
	}

	// Built once and reused for the config message and every audio message.
	recognizer := fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location)

	// Send the initial configuration message.
	err = stream.Send(&speechpb.StreamingRecognizeRequest{
		Recognizer: recognizer,
		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
			StreamingConfig: &speechpb.StreamingRecognitionConfig{
				Config: &speechpb.RecognitionConfig{
					// The audio is headerless raw data, so the encoding,
					// sample rate and channel count are specified explicitly.
					DecodingConfig: &speechpb.RecognitionConfig_ExplicitDecodingConfig{
						ExplicitDecodingConfig: &speechpb.ExplicitDecodingConfig{
							Encoding:          speechpb.ExplicitDecodingConfig_LINEAR16,
							SampleRateHertz:   16000,
							AudioChannelCount: 1,
						},
					},
					Model:         "long",
					LanguageCodes: []string{"en-US"},
					Features: &speechpb.RecognitionFeatures{
						MaxAlternatives: 2,
					},
				},
				StreamingFeatures: &speechpb.StreamingRecognitionFeatures{InterimResults: true},
			},
		},
	})
	if err != nil {
		return err
	}

	// sendAudio pipes the file to the stream in small chunks and half-closes
	// the stream once there is nothing more to send.
	sendAudio := func() error {
		buf := make([]byte, 1024)
		for {
			n, readErr := f.Read(buf)
			if n > 0 {
				sendErr := stream.Send(&speechpb.StreamingRecognizeRequest{
					Recognizer: recognizer,
					StreamingRequest: &speechpb.StreamingRecognizeRequest_Audio{
						Audio: buf[:n],
					},
				})
				if sendErr == io.EOF {
					// The stream already ended; Recv reports the real status.
					return nil
				}
				if sendErr != nil {
					return fmt.Errorf("could not send audio: %w", sendErr)
				}
			}
			if readErr == nil {
				continue
			}
			// Either end of file or a read failure: in both cases no more
			// audio will follow, so close the send side to let Recv finish
			// instead of retrying the read forever.
			closeErr := stream.CloseSend()
			if readErr != io.EOF {
				return fmt.Errorf("could not read from %s: %w", audioFile, readErr)
			}
			if closeErr != nil {
				return fmt.Errorf("could not close stream: %w", closeErr)
			}
			return nil
		}
	}

	// Buffered so the sender can always deliver its result and exit, even if
	// this function has already returned because of a receive error.
	sendDone := make(chan error, 1)
	go func() {
		sendDone <- sendAudio()
	}()

	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			return fmt.Errorf("cannot stream results: %w", err)
		}
		for i, result := range resp.Results {
			fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
			fmt.Fprintf(w, "Result %d\n", i+1)
			for j, alternative := range result.Alternatives {
				log.Printf("Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
				fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
			}
		}
	}

	// The stream is finished, so the sender can no longer block; surface any
	// error it hit (for example a failed file read) instead of dropping it.
	return <-sendDone
}
// [END speech_transcribe_streaming_v2_explicit_decoding]