in train.py [0:0]
def train_loop(H, data_train, data_valid, preprocess_fn, vae, ema_vae, logprint):
    """Train ``vae`` for epochs ``starting_epoch`` .. ``H.num_epochs - 1``.

    Optimizer, LR scheduler, starting step counter and starting epoch come
    from ``load_opt``. Each batch is passed through ``preprocess_fn`` and
    ``training_step``; the scheduler is stepped once per batch. Along the way
    the loop periodically logs accumulated training stats, writes sample
    images, saves a rolling ``latest`` checkpoint and numbered ``iter-N``
    checkpoints, and runs ``evaluate`` on ``ema_vae`` every
    ``H.epochs_per_eval`` epochs.

    Args:
        H: Hyperparameter/config object. Attributes read here: ``mpi_size``,
            ``rank``, ``num_images_visualize``, ``dataset``, ``ema_rate``,
            ``num_epochs``, ``n_batch``, ``iters_per_print``,
            ``iters_per_images``, ``iters_per_save``, ``iters_per_ckpt``,
            ``epochs_per_eval``, ``desc``, ``save_dir``.
        data_train: Training dataset, sharded across ranks with a
            ``DistributedSampler``.
        data_valid: Validation dataset, used for the visualization batch and
            for ``evaluate``.
        preprocess_fn: Callable mapping a raw batch to ``(data_input, target)``.
        vae: Model being trained.
        ema_vae: Companion model passed to ``training_step``; also the model
            used for sample images and evaluation.
        logprint: Logging callable; invoked both with keyword stats and with a
            single message string.

    Side effects:
        ``H.ema_rate`` is overwritten in place with a CUDA tensor, so a CUDA
        device is required. Images and checkpoints are written under
        ``H.save_dir``.
    """
    # The third element returned by load_opt is not used in this loop.
    optimizer, scheduler, _cur_eval_loss, iterate, starting_epoch = load_opt(H, vae, logprint)
    train_sampler = DistributedSampler(data_train, num_replicas=H.mpi_size, rank=H.rank)
    viz_batch_original, viz_batch_processed = get_sample_for_visualization(
        data_valid, preprocess_fn, H.num_images_visualize, H.dataset)

    # Extra logging/visualization points shortly after (re)start:
    # step 1, then 8, 16, ..., 8192 steps since this process began.
    early_evals = {1, *(2 ** exp for exp in range(3, 14))}
    stats = []
    iters_since_starting = 0
    H.ema_rate = torch.as_tensor(H.ema_rate).cuda()

    for epoch in range(starting_epoch, H.num_epochs):
        # Reseed the sampler so each epoch gets a different shuffle.
        train_sampler.set_epoch(epoch)
        for x in DataLoader(data_train, batch_size=H.n_batch, drop_last=True,
                            pin_memory=True, sampler=train_sampler):
            data_input, target = preprocess_fn(x)
            training_stats = training_step(H, data_input, target, vae, ema_vae, optimizer, iterate)
            stats.append(training_stats)
            scheduler.step()

            is_early_eval = iters_since_starting in early_evals

            if iterate % H.iters_per_print == 0 or is_early_eval:
                logprint(model=H.desc, type='train_loss', lr=scheduler.get_last_lr()[0],
                         epoch=epoch, step=iterate,
                         **accumulate_stats(stats, H.iters_per_print))

            # Early-eval images are skipped for 'ffhq_1024'. The rank check
            # must gate the WHOLE condition: `and` binds tighter than `or`,
            # so without the parentheses every rank would write the same
            # samples file on the periodic `iters_per_images` trigger.
            wants_images = (iterate % H.iters_per_images == 0
                            or (is_early_eval and H.dataset != 'ffhq_1024'))
            if wants_images and H.rank == 0:
                write_images(H, ema_vae, viz_batch_original, viz_batch_processed,
                             f'{H.save_dir}/samples-{iterate}.png', logprint)

            iterate += 1
            iters_since_starting += 1

            if iterate % H.iters_per_save == 0 and H.rank == 0:
                # Only refresh 'latest' when the most recent step's ELBO is
                # finite; a non-finite value skips the save entirely.
                if np.isfinite(stats[-1]['elbo']):
                    logprint(model=H.desc, type='train_loss', epoch=epoch, step=iterate,
                             **accumulate_stats(stats, H.iters_per_print))
                    fp = os.path.join(H.save_dir, 'latest')
                    logprint(f'Saving model@ {iterate} to {fp}')
                    save_model(fp, vae, ema_vae, optimizer, H)

            # Numbered checkpoints are written unconditionally (no finiteness check).
            if iterate % H.iters_per_ckpt == 0 and H.rank == 0:
                save_model(os.path.join(H.save_dir, f'iter-{iterate}'), vae, ema_vae, optimizer, H)

        if epoch % H.epochs_per_eval == 0:
            valid_stats = evaluate(H, ema_vae, data_valid, preprocess_fn)
            logprint(model=H.desc, type='eval_loss', epoch=epoch, step=iterate, **valid_stats)