# docker_images/speechbrain/app/pipelines/automatic_speech_recognition.py
from typing import Dict
import numpy as np
import torch
from app.common import ModelType, get_type
from app.pipelines import Pipeline
from speechbrain.inference import EncoderASR, EncoderDecoderASR, WhisperASR


class AutomaticSpeechRecognitionPipeline(Pipeline):
    def __init__(self, model_id: str):
        # Instantiate the matching SpeechBrain interface for this model type
        model_type = get_type(model_id)
        if model_type is ModelType.ENCODERASR:
            self.model = EncoderASR.from_hparams(source=model_id)
        elif model_type is ModelType.ENCODERDECODERASR:
            self.model = EncoderDecoderASR.from_hparams(source=model_id)
            # Greedy decoding (beam size 1) reduces latency
            self.model.mods.decoder.beam_size = 1
        elif model_type is ModelType.WHISPERASR:
            self.model = WhisperASR.from_hparams(source=model_id)
        else:
            raise ValueError(
                f"{model_type.value} is invalid for automatic-speech-recognition"
            )

        # `self.sampling_rate` lets the surrounding app read and resample
        # the input audio correctly for this pipeline.
        self.sampling_rate = self.model.hparams.sample_rate

    def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of the received audio, at 16 kHz by default.
                Check `app.validation` if a different sample rate is required
                or if it depends on the model.
        Return:
            A :obj:`dict` like {"text": "XXX"} containing the transcription
            of the input audio.
        """
        # SpeechBrain expects a batch dimension and, for each item, its
        # length relative to the longest item in the batch (here, 1.0).
        batch = torch.from_numpy(inputs).unsqueeze(0)
        rel_length = torch.tensor([1.0])
        predicted_words, _ = self.model.transcribe_batch(batch, rel_length)
        return {"text": predicted_words[0]}