def __call__()

in docker_images/nemo/app/pipelines/automatic_speech_recognition.py [0:0]


    def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
        """
        Transcribe a raw audio waveform with the NeMo ASR model.

        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of audio received. By default at self.sampling_rate, otherwise 16KHz.
        Return:
            A :obj:`dict` shaped like ``{"text": "XXX"}`` containing the
            transcription of the input audio.
        """
        inputs = self.process_audio_file(inputs)

        # NeMo's transcribe() consumes file paths, so the waveform is written to
        # a uniquely named temporary WAV file that is cleaned up automatically
        # when the context exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            audio_path = os.path.join(tmpdir, f"audio_{uuid.uuid4()}.wav")
            soundfile.write(audio_path, inputs, self.sampling_rate)

            transcriptions = self.model.transcribe([audio_path])

            # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
            if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
                transcriptions = transcriptions[0]

        # Exactly one file was submitted, so take the first (only) result.
        audio_transcription = transcriptions[0]

        return {"text": audio_transcription}