in torchbenchmark/models/tacotron2/train_tacotron2.py [0:0]
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
rank, group_name, hparams):
"""Training and validation logging results to tensorboard and stdout
Params
------
output_directory (string): directory to save checkpoints
log_directory (string) directory to save tensorboard logs
checkpoint_path(string): checkpoint path
n_gpus (int): number of gpus
rank (int): rank of current gpu
hparams (object): comma separated list of "name=value" pairs.
"""
if hparams.distributed_run:
init_distributed(hparams, n_gpus, rank, group_name)
model = load_model(hparams)
learning_rate = hparams.learning_rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
weight_decay=hparams.weight_decay)
if hparams.fp16_run:
from apex import amp
model, optimizer = amp.initialize(
model, optimizer, opt_level='O2')
if hparams.distributed_run:
model = apply_gradient_allreduce(model)
criterion = Tacotron2Loss()
logger = prepare_directories_and_logger(
output_directory, log_directory, rank)
train_loader, valset, collate_fn = prepare_dataloaders(hparams)
# Load checkpoint if one exists
iteration = 0
epoch_offset = 0
if checkpoint_path is not None:
if warm_start:
model = warm_start_model(
checkpoint_path, model, hparams.ignore_layers)
else:
model, optimizer, _learning_rate, iteration = load_checkpoint(
checkpoint_path, model, optimizer)
if hparams.use_saved_learning_rate:
learning_rate = _learning_rate
iteration += 1 # next iteration is iteration + 1
epoch_offset = max(0, int(iteration / len(train_loader)))
model.train()
is_overflow = False
# ================ MAIN TRAINNIG LOOP! ===================
for epoch in range(epoch_offset, hparams.epochs):
print("Epoch: {}".format(epoch))
for i, batch in enumerate(train_loader):
start = time.perf_counter()
for param_group in optimizer.param_groups:
param_group['lr'] = learning_rate
model.zero_grad()
x, y = model.parse_batch(batch)
y_pred = model(x)
loss = criterion(y_pred, y)
if hparams.distributed_run:
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
else:
reduced_loss = loss.item()
if hparams.fp16_run:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if hparams.fp16_run:
grad_norm = torch.nn.utils.clip_grad_norm_(
amp.master_params(optimizer), hparams.grad_clip_thresh)
is_overflow = math.isnan(grad_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), hparams.grad_clip_thresh)
optimizer.step()
if not is_overflow and rank == 0:
duration = time.perf_counter() - start
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
iteration, reduced_loss, grad_norm, duration))
logger.log_training(
reduced_loss, grad_norm, learning_rate, duration, iteration)
if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
validate(model, criterion, valset, iteration,
hparams.batch_size, n_gpus, collate_fn, logger,
hparams.distributed_run, rank)
if rank == 0:
checkpoint_path = os.path.join(
output_directory, "checkpoint_{}".format(iteration))
save_checkpoint(model, optimizer, learning_rate, iteration,
checkpoint_path)
iteration += 1