in tools/train_net.py
# Module aliases below follow the upstream pycls layout that this file
# appears to use; treat the exact import paths as an assumption if this
# repo diverges from upstream pycls.
import pycls.core.optimizer as optim
import pycls.datasets.loader as loader
import pycls.utils.distributed as du
import pycls.utils.metrics as mu
from pycls.core.config import cfg


def train_epoch(
    train_loader, model, loss_fun, optimizer, train_meter, cur_epoch,
    writer_train=None, params=0, flops=0, is_master=False
):
    """Performs one epoch of training."""
    # Shuffle the data (re-seeds the sampler so each epoch sees a new order)
    loader.shuffle(train_loader, cur_epoch)
    # Update the learning rate according to the schedule for this epoch
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the top-1 and top-5 errors (see the sketch after this function)
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Average the stats across GPUs (see the sketch after this function)
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err = du.scaled_all_reduce(
                [loss, top1_err, top5_err]
            )
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = loss.item(), top1_err.item(), top5_err.item()
        train_meter.iter_toc()
        # Update and log stats; the last argument is the global mini-batch size
        train_meter.update_stats(
            top1_err, top5_err, loss, lr, inputs.size(0) * cfg.NUM_GPUS
        )
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(
        cur_epoch, writer_train, params, flops, is_master=is_master
    )
    train_meter.reset()
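

# ---------------------------------------------------------------------------
# For reference, a minimal self-contained sketch of what mu.topk_errors
# computes (assumption: it matches pycls's definition, i.e. the percentage
# of samples whose true label is not among the k highest-scoring classes).
# ---------------------------------------------------------------------------
def _topk_errors_sketch(preds, labels, ks):
    """Computes the top-k error (as a percentage) for each k in ks."""
    # Indices of the max(ks) highest-scoring classes per sample: (batch, max_k)
    _, top_idx = preds.topk(max(ks), dim=1, largest=True, sorted=True)
    # correct[i, j] is True if the j-th ranked prediction matches the label
    correct = top_idx == labels.view(-1, 1)
    # Top-k error = 100 * fraction of samples with no hit in the first k
    return [
        (1.0 - correct[:, :k].any(dim=1).float().mean().item()) * 100.0
        for k in ks
    ]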
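

# ---------------------------------------------------------------------------
# A minimal sketch of the averaging performed by du.scaled_all_reduce
# (assumption: as in upstream pycls, it sums each tensor across processes
# and scales by 1 / NUM_GPUS). Requires an initialized process group.
# ---------------------------------------------------------------------------
def _scaled_all_reduce_sketch(tensors, num_gpus):
    import torch.distributed as dist
    # Sum each tensor across all processes in place
    for t in tensors:
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
    # Scale the sums back down to per-GPU averages
    return [t / num_gpus for t in tensors]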
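

# ---------------------------------------------------------------------------
# Hedged usage sketch: how train_epoch is typically driven from the main
# training loop in pycls-style trainers. cfg.OPTIM.MAX_EPOCH and
# du.is_master_proc() exist in upstream pycls; start_epoch, writer, params,
# and flops are assumed to be constructed earlier in this file.
# ---------------------------------------------------------------------------
# for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
#     train_epoch(
#         train_loader, model, loss_fun, optimizer, train_meter, cur_epoch,
#         writer_train=writer, params=params, flops=flops,
#         is_master=du.is_master_proc(),
#     )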