docker_images/espnet/app/pipelines/text_to_speech.py (15 lines of code) (raw):
from typing import Tuple
import numpy as np
from app.pipelines import Pipeline
from espnet2.bin.tts_inference import Text2Speech
class TextToSpeechPipeline(Pipeline):
    """Text-to-speech pipeline backed by an ESPnet ``Text2Speech`` model."""

    # Sampling rate reported when the loaded model does not specify one.
    DEFAULT_SAMPLING_RATE = 16000

    def __init__(self, model_id: str):
        """
        Args:
            model_id (:obj:`str`):
                Identifier handed to :obj:`Text2Speech.from_pretrained` to load the model.
        """
        # The model is explicitly loaded on CPU.
        self.model = Text2Speech.from_pretrained(model_id, device="cpu")

        # A plain ``hasattr`` check is not enough: the attribute can exist and
        # still be unset (None), which would leak ``None`` out of ``__call__``
        # in place of an int. Treat both "missing" and "None" as "not specified".
        model_fs = getattr(self.model, "fs", None)
        if model_fs is None:
            self.sampling_rate = self.DEFAULT_SAMPLING_RATE
        else:
            self.sampling_rate = model_fs

    def __call__(self, inputs: str) -> Tuple[np.ndarray, int]:
        """
        Args:
            inputs (:obj:`str`):
                The text to generate audio from
        Return:
            A :obj:`np.ndarray` and a :obj:`int`: The raw waveform as a numpy array, and the sampling rate as an int.
        """
        outputs = self.model(inputs)
        # The synthesized waveform is read from the "wav" entry of the model output.
        speech = outputs["wav"]
        return speech.numpy(), self.sampling_rate