# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.
Example usage:
python beta_snippets.py enhanced-model
python beta_snippets.py metadata
python beta_snippets.py punctuation
python beta_snippets.py diarization
python beta_snippets.py multi-channel
python beta_snippets.py multi-language
python beta_snippets.py word-level-conf
python beta_snippets.py spoken-punctuation-emojis
"""
import argparse

from google.cloud import speech_v1p1beta1 as speech


def transcribe_file_with_enhanced_model() -> speech.RecognizeResponse:
    """Transcribe the given audio file using an enhanced model."""
    # [START speech_transcribe_enhanced_model_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        use_enhanced=True,
        # A model must be specified to use an enhanced model.
        model="phone_call",
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")

    return response.results
    # [END speech_transcribe_enhanced_model_beta]
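

# Note: "phone_call" is one of the models that has an enhanced variant for
# certain languages; other models (for example, "video") may also have
# enhanced versions. Check the Speech-to-Text documentation for current
# model availability; the set above is not exhaustive.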


def transcribe_file_with_metadata() -> speech.RecognizeResponse:
    """Send a request that includes recognition metadata."""
    # [START speech_transcribe_recognition_metadata_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Here we construct a recognition metadata object.
    # Most metadata fields are specified as enums that can be found
    # on speech.RecognitionMetadata.
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
    )
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
    )

    # Some metadata fields are free-form strings.
    metadata.recording_device_name = "Pixel 2 XL"
    # And some are integers, for instance the 6-digit NAICS code
    # https://www.naics.com/search/
    metadata.industry_naics_code_of_audio = 519190

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Add this in the request to send metadata.
        metadata=metadata,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")

    return response.results
    # [END speech_transcribe_recognition_metadata_beta]
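

# Note: the metadata fields above are optional hints to the service and do
# not change the shape of the response. RecognitionMetadata has since been
# deprecated in the stable v1 API, so this pattern should be treated as
# specific to the v1p1beta1 surface.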


def transcribe_file_with_auto_punctuation() -> speech.RecognizeResponse:
    """Transcribe the given audio file with auto punctuation enabled."""
    # [START speech_transcribe_auto_punctuation_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable automatic punctuation
        enable_automatic_punctuation=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")

    return response.results
    # [END speech_transcribe_auto_punctuation_beta]
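

# Note: with enable_automatic_punctuation=True the returned transcript should
# include punctuation such as commas, periods, and question marks; without
# it, transcripts are generally returned unpunctuated.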


def transcribe_file_with_diarization() -> speech.RecognizeResponse:
    """Transcribe the given audio file synchronously with diarization."""
    # [START speech_transcribe_diarization_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        diarization_config=diarization_config,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")

    return result
    # [END speech_transcribe_diarization_beta]
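

# Note: speaker_tag values are small integers assigned by the service,
# starting at 1 for the first distinct speaker; a tag of 0 generally means
# the word has not yet been attributed to a speaker (this mainly occurs in
# interim streaming results, not in synchronous recognition like the above).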


def transcribe_file_with_multichannel() -> speech.RecognizeResponse:
    """Transcribe the given audio file synchronously with multi-channel
    recognition enabled."""
    # [START speech_transcribe_multichannel_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/Google_Gnome.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")
        print(f"Channel Tag: {result.channel_tag}")

    return response.results
    # [END speech_transcribe_multichannel_beta]
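

# Note: audio_channel_count should match the channel count of the input file;
# with enable_separate_recognition_per_channel=True the service transcribes
# each channel independently and labels each result with result.channel_tag.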


def transcribe_file_with_multilanguage() -> speech.RecognizeResponse:
    """Transcribe the given audio file synchronously with multiple
    candidate languages."""
    # [START speech_transcribe_multilanguage_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/multi.wav"
    first_lang = "en-US"
    second_lang = "es"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code=first_lang,
        alternative_language_codes=[second_lang],
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}: {alternative}")
        print(f"Transcript: {alternative.transcript}")

    return response.results
    # [END speech_transcribe_multilanguage_beta]
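

# Note: with alternative_language_codes the service transcribes each result
# in whichever of the configured languages it judges most likely; in
# v1p1beta1 each result also carries a language_code field reporting the
# detected language.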


def transcribe_file_with_word_level_confidence() -> speech.RecognizeResponse:
    """Transcribe the given audio file synchronously with word-level
    confidence enabled."""
    # [START speech_transcribe_word_level_confidence_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/Google_Gnome.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_word_confidence=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")
        print(
            "First Word and Confidence: "
            f"({alternative.words[0].word}, {alternative.words[0].confidence})"
        )

    return response.results
    # [END speech_transcribe_word_level_confidence_beta]
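

# Note: confidence values are floats in [0.0, 1.0] and word-level values are
# only populated when enable_word_confidence=True; iterate over
# alternative.words to inspect every word rather than just the first.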


def transcribe_file_with_spoken_punctuation_and_emojis() -> speech.RecognizeResponse:
    """Transcribe the given audio file with spoken punctuation and emojis enabled."""
    # [START speech_transcribe_spoken_punctuation_emojis_beta]
    from google.cloud import speech_v1p1beta1 as speech
    from google.protobuf import wrappers_pb2

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable spoken punctuation
        enable_spoken_punctuation=wrappers_pb2.BoolValue(value=True),
        # Enable spoken emojis
        enable_spoken_emojis=wrappers_pb2.BoolValue(value=True),
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")

    return response.results
    # [END speech_transcribe_spoken_punctuation_emojis_beta]
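

# Note: enable_spoken_punctuation and enable_spoken_emojis are
# google.protobuf.BoolValue wrappers rather than plain bools so the API can
# distinguish an explicit False from "not set" (in which case the service's
# default behavior applies).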
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument("command")
args = parser.parse_args()
if args.command == "enhanced-model":
transcribe_file_with_enhanced_model()
elif args.command == "metadata":
transcribe_file_with_metadata()
elif args.command == "punctuation":
transcribe_file_with_auto_punctuation()
elif args.command == "diarization":
transcribe_file_with_diarization()
elif args.command == "multi-channel":
transcribe_file_with_multichannel()
elif args.command == "multi-language":
transcribe_file_with_multilanguage()
elif args.command == "word-level-conf":
transcribe_file_with_word_level_confidence()
elif args.command == "spoken-punctuation-emojis":
transcribe_file_with_spoken_punctuation_end_emojis()
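    # Reject unrecognized commands with a usage error instead of silently
    # doing nothing.
    else:
        parser.error(f"unknown command: {args.command!r}")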