# docker_images/nemo/app/pipelines/automatic_speech_recognition.py
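
# Relies on module-level imports assumed to be at the top of this file
# (not shown in this excerpt):
#   import os
#   import tempfile
#   import uuid
#   from typing import Dict
#   import numpy as np
#   import soundfile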
def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
    """
    Args:
        inputs (:obj:`np.ndarray`):
            The raw waveform of the received audio, sampled at
            self.sampling_rate (16kHz by default).
    Return:
        A :obj:`dict` like {"text": "XXX"} containing the text
        transcribed from the input audio.
    """
    inputs = self.process_audio_file(inputs)
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f"audio_{uuid.uuid4()}.wav")
        # Write the waveform to a temporary WAV file: NeMo's transcribe()
        # consumes audio file paths rather than in-memory arrays.
        soundfile.write(audio_path, inputs, self.sampling_rate)
        transcriptions = self.model.transcribe([audio_path])
    # If transcriptions form a tuple (RNNT models return both the "best"
    # hypotheses and the full hypothesis list), keep only the "best" ones.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]
    # A single file was transcribed, so return its first (only) result.
    audio_transcription = transcriptions[0]
    return {"text": audio_transcription}