def __call__() — excerpt from docker_images/fairseq/app/pipelines/audio_to_audio.py


    def __call__(self, inputs: np.ndarray) -> Tuple[np.ndarray, int, List[str]]:
        """
        Run the two-stage audio-to-audio pipeline: speech decoding
        (speech-to-text or speech-to-speech) followed by waveform synthesis.

        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of audio received. By default sampled at `self.sampling_rate`.
                The shape of this array is `T`, where `T` is the time axis
        Return:
            A :obj:`tuple` containing:
              - :obj:`np.ndarray`:
                 The return shape of the array must be `C'`x`T'`
              - a :obj:`int`: the sampling rate as an int in Hz.
              - a :obj:`List[str]`: the annotation for each out channel.
                    This can be the name of the instruments for audio source separation
                    or some annotation for speech enhancement. The length must be `C'`.
        """
        # Fairseq hub interfaces expect a batched tensor of shape (1, T).
        _inputs = torch.from_numpy(inputs).unsqueeze(0)

        # Stage 1: decode the input speech into an intermediate representation
        # (text, or units for speech-to-speech).
        # NOTE(review): if `self.cfg.task._name` matches neither branch, `text`
        # stays None and is passed as-is to the TTS stage below — confirm every
        # supported task name is covered here.
        text = None
        if self.cfg.task._name in ["speech_to_text", "speech_to_text_sharded"]:
            sample = S2THubInterface.get_model_input(self.task, _inputs)
            text = S2THubInterface.get_prediction(
                self.task, self.model, self.generator, sample
            )
        elif self.cfg.task._name in ["speech_to_speech"]:
            # Unlike the S2T branch, `get_model_input` here is called on an
            # instance of S2SHubInterface built from this pipeline's config.
            s2s_hub_interface = S2SHubInterface(self.cfg, self.task, self.model)
            sample = s2s_hub_interface.get_model_input(self.task, _inputs)
            text = S2SHubInterface.get_prediction(
                self.task, self.model, self.generator, sample
            )

        # Stage 2: synthesize the output waveform. Exactly one branch runs, so
        # `wav` and `sr` are always assigned here (the previous dead fallback
        # `np.zeros((0,))` was removed: it could never be returned, and a numpy
        # array would have crashed on `.unsqueeze` below anyway).
        if self.unit_vocoder is not None:
            # Unit vocoder consumes the predicted units directly; there is no
            # meaningful transcript to report alongside the audio.
            tts_sample = self.tts_model.get_model_input(text)
            wav, sr = self.tts_model.get_prediction(tts_sample)
            text = ""
        else:
            tts_sample = TTSHubInterface.get_model_input(self.tts_task, text)
            wav, sr = TTSHubInterface.get_prediction(
                self.tts_task, self.tts_model, self.tts_generator, tts_sample
            )

        # `wav` is a 1-D torch tensor; add the channel axis (C' = 1) required
        # by the declared `C' x T'` return contract, then convert to numpy.
        return wav.unsqueeze(0).numpy(), sr, [text]