docker_images/k2/app/pipelines/automatic_speech_recognition.py:
from typing import Dict

import app.common as cx
import numpy as np
import torch
from app.pipelines import Pipeline

# Keep PyTorch single-threaded; the serving process presumably handles
# concurrency at the request level, so extra intra-op/inter-op threads would
# only contend for CPU.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
# See https://github.com/pytorch/pytorch/issues/38342
# and https://github.com/pytorch/pytorch/issues/33354
#
# If we don't do this, the delay increases whenever there is
# a new request that changes the actual batch size.
# If you use `py-spy dump --pid <server-pid> --native`, you will
# see that a lot of time is spent re-compiling the TorchScript model.
torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_profiling_mode(False)
torch._C._set_graph_executor_optimize(False)


class AutomaticSpeechRecognitionPipeline(Pipeline):
    def __init__(self, model_id: str):
        # Build the model from the repo's "hf_demo" config via the helpers in
        # app.common (which presumably fetch the config from the Hugging Face
        # Hub) and remember its expected sample rate.
        model_config = cx.get_hfconfig(model_id, "hf_demo")
        self.model = cx.model_from_hfconfig(hf_repo=model_id, hf_config=model_config)
        self.sampling_rate = self.model.sample_rate

    def __call__(self, inputs: np.array) -> Dict[str, str]:
"""
Args:
inputs (:obj:`np.array`):
The raw waveform of audio received. By default at self.sampling_rate, otherwise 16KHz.
Check `app.validation` if a different sample rate is required
or if it depends on the model
Return:
A :obj:`dict`:. The object return should be liked {"text": "XXX"} containing
the detected language from the input audio
"""
        # Wrap the waveform in a tensor and let the app.common helper run the
        # actual k2 transcription.
        batch = torch.from_numpy(inputs)
        words = cx.transcribe_batch_from_tensor(self.model, batch)
        return {"text": words}