docker_images/fairseq/app/pipelines/text_to_speech.py (28 lines of code) (raw):

import os
from typing import Tuple

import numpy as np

from app.pipelines import Pipeline
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface


class TextToSpeechPipeline(Pipeline):
    def __init__(self, model_id: str):
        model, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            model_id,
            arg_overrides={"vocoder": "griffin_lim", "fp16": False},
            cache_dir=os.getenv("HUGGINGFACE_HUB_CACHE"),
        )
        self.model = model[0].cpu()
        self.model.eval()
        cfg["task"].cpu = True
        self.task = task
        TTSHubInterface.update_cfg_with_data_cfg(cfg, self.task.data_cfg)
        self.generator = self.task.build_generator(model, cfg)

    def __call__(self, inputs: str) -> Tuple[np.array, int]:
        """
        Args:
            inputs (:obj:`str`):
                The text to generate audio from
        Return:
            A :obj:`np.array` and a :obj:`int`: The raw waveform as a numpy array,
            and the sampling rate as an int.
        """
        inputs = inputs.strip("\x00")
        if len(inputs) == 0:
            return np.zeros((0,)), self.task.sr
        sample = TTSHubInterface.get_model_input(self.task, inputs)
        wav, sr = TTSHubInterface.get_prediction(
            self.task, self.model, self.generator, sample
        )
        return wav.numpy(), sr
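
A minimal usage sketch, not part of the file above: it assumes the docker image's app package is importable and uses "facebook/fastspeech2-en-ljspeech" as an example fairseq TTS checkpoint on the Hugging Face Hub (any compatible model id works). The soundfile dependency is assumed for writing the WAV output.

    import soundfile as sf  # assumed available; any WAV writer works

    from app.pipelines.text_to_speech import TextToSpeechPipeline

    # Example model id; substitute any fairseq text-to-speech checkpoint on the Hub.
    pipeline = TextToSpeechPipeline("facebook/fastspeech2-en-ljspeech")

    # Returns the raw waveform as a numpy array plus its sampling rate.
    waveform, sampling_rate = pipeline("Hello, this is a test run.")
    sf.write("output.wav", waveform, sampling_rate)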