core/speech_to_text.py
# Copyright 2025 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Handles extracting speech from streaming audio content parts.
Uses Google Cloud Speech API to transcribe audio parts into text parts.
install google cloud speech client with:
```python
pip install --upgrade google-cloud-speech
```
See the `speech_to_text_cli.py` script for a usage example and how to test it
locally. It is recommended to test the quality of the transcription with
different models and recognizers.
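
A minimal usage sketch (assuming `audio_parts` is an async iterable of audio
`ProcessorPart`s in `audio/l16` format at the configured sample rate, and
'my-project' is a placeholder Google Cloud project ID):

```python
from genai_processors.core import speech_to_text

stt = speech_to_text.SpeechToText(project_id='my-project')

async def print_transcription(audio_parts):
  async for part in stt(audio_parts):
    if part.substream_name == speech_to_text.TRANSCRIPTION_SUBSTREAM_NAME:
      print(part.text)
```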
"""
import asyncio
from collections.abc import AsyncIterable
import dataclasses
import time
from absl import logging
import dataclasses_json
from genai_processors import content_api
from genai_processors import processor
from genai_processors import streams
from google.cloud import speech_v2
import grpc
_SILENT_AUDIO_DELAY_SECONDS = 1
RecognizeStream = grpc.aio.StreamStreamCall[
speech_v2.types.StreamingRecognizeRequest,
speech_v2.types.StreamingRecognizeResponse,
]
DEFAULT_SAMPLE_RATE_HZ = 24000
# The streaming_recognize RPC has a limit on its duration and has to be
# restarted periodically. Instead of waiting for the deadline, we try to
# restart it at moments when that won't cause hiccups.
STREAMING_HARD_LIMIT_SEC = (
240 # 4 minutes / restart stream even when user is speaking.
)
STREAMING_LIMIT_SEC = (
180 # 3 minutes / restart stream when user is not speaking.
)
ProcessorPart = content_api.ProcessorPart
TRANSCRIPTION_SUBSTREAM_NAME = 'input_transcription'
ENDPOINTING_SUBSTREAM_NAME = 'input_endpointing'
@dataclasses_json.dataclass_json
@dataclasses.dataclass(frozen=True)
class StartOfSpeech:
"""Start of speech event."""
pass
@dataclasses_json.dataclass_json
@dataclasses.dataclass(frozen=True)
class EndOfSpeech:
"""End of speech event."""
pass
class AddSilentPartMaybe(processor.Processor):
"""Adds silent audio parts if no activity after `silent_part_duration_sec`.
If the stream is empty after a few seconds, the Speech API will close the
connection. This processor adds silent audio parts to the output stream to
keep the connection alive.
"""
def __init__(
self,
silent_part_duration_sec: float = 1,
sample_rate: int = DEFAULT_SAMPLE_RATE_HZ,
):
self._silent_part_duration_sec = silent_part_duration_sec
self._sample_rate = sample_rate
async def call(
self, content: AsyncIterable[ProcessorPart]
) -> AsyncIterable[ProcessorPart]:
logging.info('Transcriber: _process_audio started.')
last_streamed_audio_time_sec = time.perf_counter()
async def _insert_silent_audio() -> AsyncIterable[ProcessorPart]:
"""Sends silent audio to the Speech API to keep the stream alive."""
nonlocal last_streamed_audio_time_sec
while True:
await asyncio.sleep(self._silent_part_duration_sec)
delta_time_sec = time.perf_counter() - last_streamed_audio_time_sec
if delta_time_sec > self._silent_part_duration_sec:
          yield ProcessorPart(
              # l16 audio is 16-bit PCM, i.e. 2 bytes per sample of silence.
              value=b'\0' * (2 * round(self._sample_rate * delta_time_sec)),
              mimetype=f'audio/l16; rate={self._sample_rate}',
          )
last_streamed_audio_time_sec = time.perf_counter()
audio_stream = streams.merge(
[content, _insert_silent_audio()], stop_on_first=True
)
async for part in audio_stream:
last_streamed_audio_time_sec = time.perf_counter()
yield part
logging.info('Transcriber: _process_audio finished.')
class _Transcriber(processor.Processor):
"""Transcribes streaming audio using the Cloud Speech API."""
def __init__(
self,
project_id: str,
recognition_config: speech_v2.types.RecognitionConfig,
with_endpointing: bool = True,
substream_endpointing: str = ENDPOINTING_SUBSTREAM_NAME,
strict_endpointing: bool = True,
with_interim_results: bool = True,
substream_transcription: str = TRANSCRIPTION_SUBSTREAM_NAME,
passthrough_audio: bool = False,
):
"""Transcribes audio parts using the Cloud Speech API.
Args:
project_id: The project ID to use for the Speech API.
recognition_config: The recognition config to use for the Speech API. Set
it up to adjust the sample rate, languages or the recognition model.
      with_endpointing: Whether to yield endpointing events. Endpointing events
        are `StartOfSpeech` and `EndOfSpeech` dataclass parts. They are yielded
        in the substream defined by `substream_endpointing`.
substream_endpointing: The substream name to use for the endpointing
events.
      strict_endpointing: Whether to send endpointing events only once interim
        results have been found. This avoids yielding endpointing events when
        the user speech is not recognized (e.g. no events are emitted for
        noise, laughter, coughing, etc.).
with_interim_results: Whether to yield interim results. If set to False,
the processor will only yield the final transcription.
substream_transcription: The substream name to use for the transcription.
      passthrough_audio: Whether to pass the audio parts through to the output
        stream unchanged. Their substream name is set to the default one: ''.
"""
self._config = speech_v2.types.StreamingRecognitionConfig(
config=recognition_config,
streaming_features=speech_v2.types.StreamingRecognitionFeatures(
interim_results=True,
enable_voice_activity_events=True,
),
)
self._sample_rate = (
self._config.config.explicit_decoding_config.sample_rate_hertz
or DEFAULT_SAMPLE_RATE_HZ
)
self._with_endpointing = with_endpointing
self._substream_endpointing = substream_endpointing
self._strict_endpointing = strict_endpointing
self._with_interim_results = with_interim_results
self._substream_transcription = substream_transcription
self._project_id = project_id
self._passthrough_audio = passthrough_audio
def _make_setup_request(self) -> speech_v2.types.StreamingRecognizeRequest:
return speech_v2.types.StreamingRecognizeRequest(
streaming_config=self._config,
recognizer=(
f'projects/{self._project_id}/locations/global/recognizers/_'
),
)
async def call(
self,
content: AsyncIterable[ProcessorPart],
) -> AsyncIterable[ProcessorPart]:
"""Transcribes streaming audio using the Cloud Speech API."""
    # The output queue collects everything yielded downstream: transcriptions,
    # endpointing events, non-audio parts, and the audio parts unchanged when
    # self._passthrough_audio is True.
output_queue = asyncio.Queue[ProcessorPart | None]()
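    # Shared mutable state between the request producer (request_stream) and
    # the response consumer (send_audio_to_speech_api): when the current RPC
    # started, whether it should be restarted, whether the user is currently
    # speaking, and whether the overall loop should keep running.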
stream_state: dict[str, bool | float] = {
'start_time_sec': time.perf_counter(),
'restart_stream': False,
'user_speaking': False,
'stream_is_on': True,
}
async def request_stream(
request_queue: asyncio.Queue[
speech_v2.types.StreamingRecognizeRequest | None
],
):
try:
request_queue.put_nowait(self._make_setup_request())
async for part in content:
if not content_api.is_audio(part.mimetype):
output_queue.put_nowait(part)
continue
if self._passthrough_audio:
output_queue.put_nowait(part)
if part.part.inline_data is None:
continue
if not part.mimetype.lower().startswith(
'audio/l16'
) or not part.mimetype.lower().endswith(f'rate={self._sample_rate}'):
raise ValueError(
f'Unsupported audio mimetype: {part.mimetype}. Expected'
f' audio/l16;[.*]rate={self._sample_rate}.'
)
request_queue.put_nowait(
speech_v2.types.StreamingRecognizeRequest(
audio=part.part.inline_data.data,
)
)
delta_time_sec = time.perf_counter() - stream_state['start_time_sec']
if (
(delta_time_sec > STREAMING_LIMIT_SEC)
and not stream_state['user_speaking']
) or (delta_time_sec > STREAMING_HARD_LIMIT_SEC):
stream_state['restart_stream'] = True
break
finally:
request_queue.put_nowait(None)
async def send_audio_to_speech_api():
# Instantiates a client.
try:
logging.debug('Transcriber: (re)creating client')
client = speech_v2.SpeechAsyncClient()
last_endpointing_event = None
while stream_state['stream_is_on']:
request_queue = asyncio.Queue[
speech_v2.types.StreamingRecognizeRequest | None
]()
populate_request_queue = processor.create_task(
request_stream(request_queue)
)
response_stream = await client.streaming_recognize(
requests=streams.dequeue(request_queue)
)
async for response in response_stream:
if response == grpc.aio.EOF:
break
if (
response.speech_event_type
== speech_v2.types.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
):
last_endpointing_event = StartOfSpeech()
stream_state['user_speaking'] = True
if self._with_endpointing and not self._strict_endpointing:
last_endpointing_event = 'SPEECH_ACTIVITY_BEGIN_SENT'
output_queue.put_nowait(
ProcessorPart.from_dataclass(
dataclass=StartOfSpeech(),
substream_name=self._substream_endpointing,
)
)
if response.results and response.results[0].alternatives:
if (
isinstance(last_endpointing_event, StartOfSpeech)
and self._strict_endpointing
):
# We have not sent the SPEECH_ACTIVITY_BEGIN event yet, we
# waited for the first transcript to appear.
last_endpointing_event = 'SPEECH_ACTIVITY_BEGIN_SENT'
output_queue.put_nowait(
ProcessorPart.from_dataclass(
dataclass=StartOfSpeech(),
substream_name=self._substream_endpointing,
)
)
if text := response.results[0].alternatives[0].transcript:
metadata = {
'is_final': response.results[0].is_final,
}
if self._with_interim_results or response.results[0].is_final:
output_queue.put_nowait(
ProcessorPart(
text,
role='user',
metadata=metadata,
substream_name=self._substream_transcription,
)
)
if (
response.speech_event_type
== speech_v2.types.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
):
stream_state['user_speaking'] = False
if (
self._with_endpointing
and last_endpointing_event == 'SPEECH_ACTIVITY_BEGIN_SENT'
):
output_queue.put_nowait(
ProcessorPart.from_dataclass(
dataclass=EndOfSpeech(),
substream_name=self._substream_endpointing,
)
)
last_endpointing_event = None
if stream_state['restart_stream']:
stream_state['restart_stream'] = False
stream_state['stream_is_on'] = True
stream_state['start_time_sec'] = time.perf_counter()
client = speech_v2.SpeechAsyncClient()
populate_request_queue.cancel()
else:
stream_state['stream_is_on'] = False
finally:
output_queue.put_nowait(None)
send_task = processor.create_task(send_audio_to_speech_api())
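    # Drain the output queue until the None sentinel signals that
    # send_audio_to_speech_api has finished.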
while part := await output_queue.get():
yield part
await send_task
class SpeechToText(processor.Processor):
"""Converts audio parts into text parts."""
def __init__(
self,
project_id: str,
recognition_config: speech_v2.types.RecognitionConfig | None = None,
audio_passthrough: bool = False,
with_endpointing: bool = True,
substream_endpointing: str = ENDPOINTING_SUBSTREAM_NAME,
strict_endpointing: bool = True,
with_interim_results: bool = True,
substream_transcription: str = TRANSCRIPTION_SUBSTREAM_NAME,
maintain_connection_active_with_silent_audio: bool = False,
):
"""Initializes the SpeechToText processor.
    The speech processor uses the Cloud Speech API to transcribe audio parts
    into text parts. It can inject silent audio parts to keep the stream
    alive when no audio is received (see
    maintain_connection_active_with_silent_audio) and it restarts the
    connection automatically after 3-4 minutes to avoid the stream being
    closed by the server.
The processor yields endpointing events when the user starts and stops
speaking. If with_endpointing is False, the endpointing events are not
yielded. The endpointing events are yielded in the substream defined by
substream_endpointing. When strict_endpointing is True, the endpointing
events are yielded only when interim results have been found. This avoids
yielding endpointing events when the user speech is not recognized (e.g.
short noise or sound).
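    A minimal sketch of a non-default configuration (using a placeholder
    project ID and assuming 16 kHz French audio as input):

    ```python
    recognition_config = speech_v2.types.RecognitionConfig(
        explicit_decoding_config=speech_v2.types.ExplicitDecodingConfig(
            sample_rate_hertz=16000,
            encoding=speech_v2.types.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
            audio_channel_count=1,
        ),
        language_codes=['fr-FR'],
        model='latest_long',
    )
    stt = SpeechToText(
        project_id='my-project',
        recognition_config=recognition_config,
        with_interim_results=False,
    )
    ```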
Args:
project_id: The project ID to use for the Speech API.
recognition_config: The recognition config to use for the Speech API. Set
it up to adjust the sample rate, languages or the recognition model.
      audio_passthrough: Whether to pass the audio parts through to the output
        stream unchanged. Their substream name is set to the default one: ''.
      with_endpointing: Whether to yield endpointing events. Endpointing events
        are `StartOfSpeech` and `EndOfSpeech` dataclass parts. They are yielded
        in the substream defined by `substream_endpointing`.
substream_endpointing: The substream name to use for the endpointing
events.
      strict_endpointing: Whether to send endpointing events only once interim
        results have been found. This avoids yielding endpointing events when
        the user speech is not recognized (e.g. no events are emitted for
        noise, laughter, coughing, etc.).
with_interim_results: Whether to yield interim results. If set to False,
the processor will only yield the final transcription.
substream_transcription: The substream name to use for the transcription.
      maintain_connection_active_with_silent_audio: Whether to keep the
        connection alive with silent audio. If set to True, the processor
        injects silent audio parts to keep the stream alive when it does not
        receive any audio part. This can be needed if the Speech API closes
        the stream when it does not receive any audio for a long time.
"""
recognition_config = recognition_config or speech_v2.types.RecognitionConfig(
explicit_decoding_config=speech_v2.types.ExplicitDecodingConfig(
sample_rate_hertz=DEFAULT_SAMPLE_RATE_HZ,
encoding=speech_v2.types.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
audio_channel_count=1,
),
language_codes=['en-US'],
model='latest_long',
)
self._processor = _Transcriber(
project_id=project_id,
recognition_config=recognition_config,
with_endpointing=with_endpointing,
substream_endpointing=substream_endpointing,
strict_endpointing=strict_endpointing,
with_interim_results=with_interim_results,
substream_transcription=substream_transcription,
passthrough_audio=audio_passthrough,
)
if maintain_connection_active_with_silent_audio:
sample_rate = (
recognition_config.explicit_decoding_config.sample_rate_hertz
or DEFAULT_SAMPLE_RATE_HZ
)
self._processor = (
AddSilentPartMaybe(
silent_part_duration_sec=_SILENT_AUDIO_DELAY_SECONDS,
sample_rate=sample_rate,
)
+ self._processor
)
async def call(
self,
content: AsyncIterable[ProcessorPart],
) -> AsyncIterable[ProcessorPart]:
async for part in self._processor(content):
yield part