docker_images/espnet/app/pipelines/automatic_speech_recognition.py (12 lines of code) (raw):
from typing import Dict
import numpy as np
from app.pipelines import Pipeline
from espnet2.bin.asr_inference import Speech2Text
class AutomaticSpeechRecognitionPipeline(Pipeline):
    """Automatic speech recognition pipeline backed by an ESPnet ``Speech2Text`` model."""

    def __init__(self, model_id: str):
        """Load a pretrained ESPnet ASR model.

        Args:
            model_id (:obj:`str`):
                Identifier of the pretrained ESPnet model to load.
        """
        # beam_size=1 means greedy decoding, which keeps CPU inference fast.
        self.model = Speech2Text.from_pretrained(model_id, device="cpu", beam_size=1)
        # Expected input sample rate; ESPnet ASR models here assume 16 kHz audio.
        self.sampling_rate = 16000

    def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of audio received. By default at 16KHz.
                Check `app.validation` if a different sample rate is required
                or if it depends on the model
        Return:
            A :obj:`dict` like ``{"text": "XXX"}`` containing the transcribed
            text from the input audio.
        """
        outputs = self.model(inputs)
        # Hypotheses are ranked best-first; each is a tuple whose first element
        # is the decoded text — keep only that from the top hypothesis.
        text, *_ = outputs[0]
        return {"text": text}