docker_images/pyannote_audio/app/pipelines/automatic_speech_recognition.py

from typing import Dict

import numpy as np
import torch

from app.pipelines import Pipeline
from pyannote.audio import Pipeline as Pypeline


class AutomaticSpeechRecognitionPipeline(Pipeline):
    def __init__(self, model_id: str):
        # Preload everything needed at inference time (model, processors,
        # tokenizer, ...). This function is only called once, so all the
        # heavy I/O and processing should happen here.
        # `self.sampling_rate` must be defined so that the input audio is
        # read at the correct rate.
        self.sampling_rate = 16000
        self.model = Pypeline.from_pretrained(model_id)

    def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of the audio received, sampled at
                `self.sampling_rate` (16 kHz).
        Return:
            A :obj:`dict` of the form {"text": "XXX"}, where "XXX" is the
            stringified list of regions detected in the input audio.
        """
        # pyannote expects a (channel, time) waveform tensor plus its sample rate.
        wav = torch.from_numpy(inputs).unsqueeze(0)
        output = self.model({"waveform": wav, "sample_rate": self.sampling_rate})
        # Flatten the pyannote Annotation into a list of labeled regions.
        regions = [
            {"label": label, "start": segment.start, "stop": segment.end}
            for segment, _, label in output.itertracks(yield_label=True)
        ]
        return {"text": str(regions)}
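

# A minimal usage sketch, not part of the original file: it assumes a pyannote
# model id (the "pyannote/speaker-diarization" string below is an illustrative
# placeholder, and may require accepting the model's terms / an auth token) and
# that the audio has already been decoded to a mono float32 waveform at the
# pipeline's sampling rate. The one-second silent waveform is a stand-in for a
# real decoded audio file.
if __name__ == "__main__":
    pipeline = AutomaticSpeechRecognitionPipeline("pyannote/speaker-diarization")
    # One second of silence at self.sampling_rate, standing in for real audio.
    waveform = np.zeros(pipeline.sampling_rate, dtype=np.float32)
    # Prints something like {"text": "[{'label': ..., 'start': ..., 'stop': ...}, ...]"}
    print(pipeline(waveform))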