docker_images/speechbrain/app/pipelines/text_to_speech.py (36 lines of code) (raw):
from typing import Tuple
import numpy as np
from app.common import ModelType, get_type, get_vocoder_model_id
from app.pipelines import Pipeline
from speechbrain.inference import HIFIGAN, FastSpeech2, Tacotron2
class TextToSpeechPipeline(Pipeline):
    """Text-to-speech pipeline built from a SpeechBrain acoustic model and a vocoder.

    The acoustic model (Tacotron2 or FastSpeech2) encodes the input text and
    the HiFi-GAN vocoder decodes that output into a waveform.
    """

    def __init__(self, model_id: str):
        """Load the acoustic model and the vocoder associated with ``model_id``.

        Args:
            model_id (:obj:`str`):
                Identifier used to resolve the model type, passed as
                ``source`` to ``from_hparams``, and used to look up the
                vocoder model id.

        Raises:
            ValueError: If the resolved model type is neither Tacotron2 nor
                FastSpeech2, or if the resolved vocoder type is not HiFi-GAN.
        """
        model_type = get_type(model_id)
        if model_type is ModelType.TACOTRON2:
            self.model = Tacotron2.from_hparams(source=model_id)
            self.type = "tacotron2"
        elif model_type is ModelType.FASTSPEECH2:
            self.model = FastSpeech2.from_hparams(source=model_id)
            self.type = "fastspeech2"
        else:
            raise ValueError(f"{model_type.value} is invalid for text-to-speech")

        vocoder_type = get_type(model_id, "vocoder_interface")
        vocoder_model_id = get_vocoder_model_id(model_id)
        if vocoder_type is ModelType.HIFIGAN:
            self.vocoder_model = HIFIGAN.from_hparams(source=vocoder_model_id)
        else:
            raise ValueError(
                f"{vocoder_type.value} is invalid vocoder for text-to-speech"
            )

        # The reported sampling rate comes from the acoustic model's hparams.
        self.sampling_rate = self.model.hparams.sample_rate

    def __call__(self, inputs: str) -> Tuple[np.ndarray, int]:
        """
        Args:
            inputs (:obj:`str`):
                The text to generate audio from
        Return:
            A :obj:`np.ndarray` and a :obj:`int`: The raw waveform as a numpy array, and the sampling rate as an int.
        """
        # Input that is blank once NUL characters and surrounding whitespace
        # are removed is replaced with a placeholder sentence.
        if not inputs.replace("\0", "").strip():
            inputs = "Empty query"

        # The two acoustic models expose different ``encode_text`` signatures
        # and return tuples of different lengths; only the first element is
        # passed on to the vocoder.
        if self.type == "tacotron2":
            mel_output, _, _ = self.model.encode_text(inputs)
        elif self.type == "fastspeech2":
            mel_output, _, _, _ = self.model.encode_text(
                [inputs], pace=1.0, pitch_rate=1.0, energy_rate=1.0
            )

        # ``Tensor.numpy()`` raises for tensors that live on a non-CPU device
        # or that require grad, so detach and move to host memory first. Both
        # calls leave a plain CPU tensor unchanged.
        waveforms = self.vocoder_model.decode_batch(mel_output).detach().cpu().numpy()
        return waveforms, self.sampling_rate