recipes/sota/2019/lm_analysis/tts_forward.py

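# Usage (inferred from the argv handling at the bottom of this file):
#   python tts_forward.py <transcriptions.txt> <output_prefix>
# Reads one transcription per line and writes <output_prefix><index>.wav
# for each line, synthesized with Tacotron + WaveRNN.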
# https://github.com/mozilla/TTS/blob/master/notebooks/Benchmark.ipynb - original code which we adapted
import io
import os
import sys
import time
from collections import OrderedDict

import numpy as np
import torch
from localimport import localimport
from matplotlib import pylab as plt
from TTS.layers import *
from TTS.models.tacotron import Tacotron
from TTS.utils.audio import AudioProcessor
from TTS.utils.data import *
from TTS.utils.generic_utils import load_config, setup_model
from TTS.utils.synthesis import synthesis
from TTS.utils.text import text_to_sequence
from TTS.utils.text.symbols import phonemes, symbols

sys.path.append("TTS")
sys.path.append("WaveRNN")

tts_pretrained_model_config = "tts_models/config.json"
wavernn_pretrained_model_config = "wavernn_models/config.json"
wavernn_pretrained_model = "wavernn_models/checkpoint_433000.pth.tar"
tts_pretrained_model = "tts_models/checkpoint_261000.pth.tar"


def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=False):
    # `figures` is unused; it is accepted here so that run_tts below can keep
    # the original notebook's call signature (it passes figures=False).
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        truncated=True,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
    )
    if CONFIG.model == "Tacotron" and not use_gl:
        # Tacotron predicts a linear spectrogram; convert it to mel for the vocoder
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        # vocode with WaveRNN instead of Griffin-Lim
        waveform = wavernn.generate(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(),
            batched=batched_wavernn,
            target=11000,
            overlap=550,
        )
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, mel_postnet_spec, stop_tokens, waveform


use_cuda = True
batched_wavernn = True

# initialize TTS
CONFIG = load_config(tts_pretrained_model_config)
print(CONFIG)

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)

# load model state
if use_cuda:
    cp = torch.load(tts_pretrained_model)
else:
    cp = torch.load(tts_pretrained_model, map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp["model"])
if use_cuda:
    model.cuda()
model.eval()
print(cp["step"])
model.decoder.max_decoder_steps = 2000

# initialize WaveRNN
VOCODER_CONFIG = load_config(wavernn_pretrained_model_config)

with localimport("/content/WaveRNN") as _importer:
    from models.wavernn import Model

bits = 10
wavernn = Model(
    rnn_dims=512,
    fc_dims=512,
    mode="mold",
    pad=2,
    upsample_factors=VOCODER_CONFIG.upsample_factors,  # set this depending on dataset
    feat_dims=VOCODER_CONFIG.audio["num_mels"],
    compute_dims=128,
    res_out_dims=128,
    res_blocks=10,
    hop_length=ap.hop_length,
    sample_rate=ap.sample_rate,
).cuda()

check = torch.load(wavernn_pretrained_model)
wavernn.load_state_dict(check["model"])
if use_cuda:
    wavernn.cuda()
wavernn.eval()
print(check["step"])


def run_tts(transcription, sample_id, name):
    _, _, _, wav = tts(
        model,
        transcription,
        CONFIG,
        use_cuda,
        ap,
        speaker_id=0,
        use_gl=False,
        figures=False,
    )
    ap.save_wav(wav, name)


with open(sys.argv[1], "r") as f:
    transcriptions = [line.strip() for line in f]
sample_ids = np.arange(len(transcriptions))
names = [sys.argv[2] + str(sample_id) + ".wav" for sample_id in sample_ids]

for index in range(len(transcriptions)):
    run_tts(transcriptions[index], sample_ids[index], names[index])