docker_images/nemo/app/pipelines/automatic_speech_recognition.py

import os
import tempfile
import uuid
from typing import Dict

import librosa
import nemo.collections.asr as nemo_asr
import numpy as np
import soundfile
from app.pipelines import Pipeline
from huggingface_hub import hf_hub_download
from huggingface_hub.hf_api import HfFolder


class AutomaticSpeechRecognitionPipeline(Pipeline):
    def __init__(self, model_id: str):
        # Preload all the elements needed at inference (model, processors,
        # tokenizer, ...). This function is only called once, so do all the
        # heavy processing and I/O here.

        # Precheck for API key
        is_token_available = HfFolder.get_token() is not None

        # Prepare file name from model_id
        filename = model_id.split("/")[-1] + ".nemo"
        path = hf_hub_download(
            repo_id=model_id, filename=filename, use_auth_token=is_token_available
        )

        # Load model
        self.model = nemo_asr.models.ASRModel.restore_from(path)
        self.model.freeze()

        # Pre-initialize RNNT decoding strategy
        if hasattr(self.model, "change_decoding_strategy"):
            self.model.change_decoding_strategy(None)

        # Sampling rate of this pipeline, so the input audio is read correctly
        self.sampling_rate = self.model.cfg.sample_rate

    def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of audio received. By default sampled at
                `self.sampling_rate`, otherwise 16kHz.
        Return:
            A :obj:`dict` like {"text": "XXX"} containing the transcription
            of the input audio.
        """
        inputs = self.process_audio_file(inputs)

        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, f"audio_{uuid.uuid4()}.wav")
            soundfile.write(audio_path, inputs, self.sampling_rate)

            transcriptions = self.model.transcribe([audio_path])

        # If transcriptions form a tuple (from RNNT), extract just the "best" hypothesis
        if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
            transcriptions = transcriptions[0]

        audio_transcription = transcriptions[0]
        return {"text": audio_transcription}

    def process_audio_file(self, data):
        # Downmix to a single (mono) channel
        data = librosa.to_mono(data)
        return data
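

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal way this pipeline might be exercised locally. The model id
# "nvidia/stt_en_conformer_ctc_small" and the file name "sample.wav" are
# assumptions for illustration: any NeMo ASR repo on the Hub that contains a
# `<repo-name>.nemo` checkpoint should work the same way.
if __name__ == "__main__":
    pipe = AutomaticSpeechRecognitionPipeline("nvidia/stt_en_conformer_ctc_small")
    # Load at the model's expected rate; librosa returns a mono float32
    # waveform by default, which process_audio_file accepts as-is.
    waveform, _ = librosa.load("sample.wav", sr=pipe.sampling_rate)
    print(pipe(waveform))  # e.g. {"text": "hello world"}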