# torchbenchmark/models/tacotron2/__init__.py
from .train_tacotron2 import load_model, prepare_dataloaders
import torch
from .loss_function import Tacotron2Loss
from argparse import Namespace
from .text import symbols
from pathlib import Path
from ...util.model import BenchmarkModel
from typing import Tuple
from torchbenchmark.tasks import SPEECH
class Model(BenchmarkModel):
    task = SPEECH.SYNTHESIS
    # Training batch size comes from the source code:
    # Source: https://github.com/NVIDIA/tacotron2/blob/bb6761349354ee914909a42208e4820929612069/hparams.py#L84
    DEFAULT_TRAIN_BSIZE = 64
    DEFAULT_EVAL_BSIZE = 64
    def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
        super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
        if device == 'cpu' or jit:
            # TODO - currently load_model assumes cuda
            raise NotImplementedError("Tacotron2 doesn't support CPU or JIT because load_model assumes CUDA")
        self.hparams = self.create_hparams(batch_size=self.batch_size)
        self.model = load_model(self.hparams).to(device=device)
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.hparams.learning_rate,
                                          weight_decay=self.hparams.weight_decay)
        self.criterion = Tacotron2Loss().to(device=device)
        loader, valset, collate_fn = prepare_dataloaders(self.hparams)
        self.example_inputs, self.target = self.model.parse_batch(next(iter(loader)), device=self.device)

    # Parameters were obtained from the source code.
    # Source: https://github.com/NVIDIA/tacotron2/blob/bb6761349354ee914909a42208e4820929612069/hparams.py#L5
    @staticmethod
    def create_hparams(hparams_string=None, verbose=False, batch_size=64):
        """Create model hyperparameters. hparams_string and verbose are kept for
        compatibility with the upstream signature and are unused here."""
        root = str(Path(__file__).parent.parent.parent)
        hparams = Namespace(**{
            ################################
            # Experiment Parameters        #
            ################################
            'epochs': 2,  # Reduced in TorchBench to shorten the number of train iterations.
            'iters_per_checkpoint': 1000,
            'dynamic_loss_scaling': True,
            'fp16_run': False,
            'distributed_run': False,
            'dist_backend': "nccl",
            'dist_url': "tcp://localhost:54321",
            'cudnn_enabled': True,
            'cudnn_benchmark': False,
            'ignore_layers': ['embedding.weight'],
            ################################
            # Data Parameters              #
            ################################
            'load_mel_from_disk': False,
            'training_files': f'{root}/data/.data/tacotron2-minimal/filelists/ljs_audio_text_train_filelist.txt',
            'validation_files': f'{root}/data/.data/tacotron2-minimal/filelists/ljs_audio_text_val_filelist.txt',
            'text_cleaners': ['english_cleaners'],
            ################################
            # Audio Parameters             #
            ################################
            'max_wav_value': 32768.0,
            'sampling_rate': 22050,
            'filter_length': 1024,
            'hop_length': 256,
            'win_length': 1024,
            'n_mel_channels': 80,
            'mel_fmin': 0.0,
            'mel_fmax': 8000.0,
            ################################
            # Model Parameters             #
            ################################
            'n_symbols': len(symbols),
            'symbols_embedding_dim': 512,
            # Encoder parameters
            'encoder_kernel_size': 5,
            'encoder_n_convolutions': 3,
            'encoder_embedding_dim': 512,
            # Decoder parameters
            'n_frames_per_step': 1,  # currently only 1 is supported
            'decoder_rnn_dim': 1024,
            'prenet_dim': 256,
            'max_decoder_steps': 1000,
            'gate_threshold': 0.5,
            'p_attention_dropout': 0.1,
            'p_decoder_dropout': 0.1,
            # Attention parameters
            'attention_rnn_dim': 1024,
            'attention_dim': 128,
            # Location Layer parameters
            'attention_location_n_filters': 32,
            'attention_location_kernel_size': 31,
            # Mel-post processing network parameters
            'postnet_embedding_dim': 512,
            'postnet_kernel_size': 5,
            'postnet_n_convolutions': 5,
            ################################
            # Optimization Hyperparameters #
            ################################
            'use_saved_learning_rate': False,
            'learning_rate': 1e-3,
            'weight_decay': 1e-6,
            'grad_clip_thresh': 1.0,
            'batch_size': batch_size,
            'mask_padding': True  # set the model's padded outputs to padded values
        })
        return hparams

    def get_module(self):
        return self.model, (self.example_inputs,)

    def train(self, niter=1):
        self.model.train()
        for _ in range(niter):
            # One step per iteration: forward on the cached batch, loss, backward, optimizer update.
            self.model.zero_grad()
            y_pred = self.model(self.example_inputs)
            loss = self.criterion(y_pred, self.target)
            loss.backward()
            self.optimizer.step()

    def eval(self, niter=1) -> Tuple[torch.Tensor]:
        self.model.eval()
        for _ in range(niter):
            # Forward pass only; the outputs of the last iteration are returned.
            out = self.model(self.example_inputs)
        return out
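
# A minimal usage sketch: it assumes a CUDA device and that the tacotron2-minimal
# dataset referenced by the hparams above has already been downloaded by the
# benchmark's install step. The "train" test name follows the BenchmarkModel
# convention; adjust as needed.
if __name__ == "__main__":
    m = Model(test="train", device="cuda")
    module, example_inputs = m.get_module()  # underlying Tacotron2 module plus the cached parsed batch
    m.train(niter=1)                         # one forward/backward/optimizer step on the cached batch
    outputs = m.eval(niter=1)                # forward pass only on the same cached batch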