in src/train.py [0:0]
import os
import time

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# NOTE: repo-local helpers used below (BatchedRandomSampler, set_decode_type,
# train_batch_sl, get_inner_model, validate) are assumed to be defined or
# imported elsewhere in src/train.py.


def train_epoch_sl(model, optimizer, lr_scheduler, epoch, train_dataset, val_datasets, problem, tb_logger, opts):
print("\nStart train epoch {}, lr={} for run {}".format(epoch, optimizer.param_groups[0]['lr'], opts.run_name))
step = epoch * (opts.epoch_size // opts.batch_size)
start_time = time.time()
if not opts.no_tensorboard:
tb_logger.log_value('learnrate_pg0', optimizer.param_groups[0]['lr'], step)
    # Create data loader with random sampling
    train_dataloader = DataLoader(train_dataset, batch_size=opts.batch_size, num_workers=opts.num_workers,
                                  sampler=BatchedRandomSampler(train_dataset, opts.batch_size))
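    # BatchedRandomSampler is a repo-local sampler. A minimal sketch of the
    # behavior assumed here (shuffled indices truncated to whole batches, so
    # every batch has exactly opts.batch_size items) could look like:
    #
    #     class BatchedRandomSampler(torch.utils.data.Sampler):
    #         def __init__(self, dataset, batch_size):
    #             self.num_samples = (len(dataset) // batch_size) * batch_size
    #         def __iter__(self):
    #             return iter(torch.randperm(self.num_samples).tolist())
    #         def __len__(self):
    #             return self.num_samples
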
    # Put model in train mode!
    model.train()
    optimizer.zero_grad()
    # Greedy decoding for supervised training (no sampling needed)
    set_decode_type(model, "greedy")

    for batch_id, batch in enumerate(tqdm(train_dataloader, disable=opts.no_progress_bar, ascii=True)):
        train_batch_sl(
            model,
            optimizer,
            epoch,
            batch_id,
            step,
            batch,
            tb_logger,
            opts
        )
        step += 1

    # Step the LR scheduler once per epoch (passing the epoch index is
    # deprecated in recent PyTorch, where a bare lr_scheduler.step() is preferred)
    lr_scheduler.step(epoch)

    epoch_duration = time.time() - start_time
    print("Finished epoch {}, took {}".format(epoch, time.strftime('%H:%M:%S', time.gmtime(epoch_duration))))

    # Periodically checkpoint the model, optimizer, and RNG states so that
    # training can be resumed reproducibly
    if (opts.checkpoint_epochs != 0 and epoch % opts.checkpoint_epochs == 0) or epoch == opts.n_epochs - 1:
        print('Saving model and state...')
        torch.save(
            {
                'model': get_inner_model(model).state_dict(),
                'optimizer': optimizer.state_dict(),
                'rng_state': torch.get_rng_state(),
                'cuda_rng_state': torch.cuda.get_rng_state_all()
            },
            os.path.join(opts.save_dir, 'epoch-{}.pt'.format(epoch))
        )
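        # To resume from such a checkpoint later (illustrative; the actual
        # resume logic lives elsewhere, and `path` is a placeholder):
        #
        #     ckpt = torch.load(path, map_location='cpu')
        #     get_inner_model(model).load_state_dict(ckpt['model'])
        #     optimizer.load_state_dict(ckpt['optimizer'])
        #     torch.set_rng_state(ckpt['rng_state'])
        #     torch.cuda.set_rng_state_all(ckpt['cuda_rng_state'])
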
    # Evaluate on every validation set and log the results
    for val_idx, val_dataset in enumerate(val_datasets):
        avg_reward, avg_opt_gap = validate(model, val_dataset, problem, opts)
        if not opts.no_tensorboard:
            tb_logger.log_value('val{}/avg_reward'.format(val_idx + 1), avg_reward, step)
            tb_logger.log_value('val{}/opt_gap'.format(val_idx + 1), avg_opt_gap, step)
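

# --- Illustrative usage (an assumption, not part of the original file) -------
# A minimal sketch showing how train_epoch_sl is typically driven once per
# epoch. The function name run_training_sl is hypothetical; the repo's real
# entry point, option parsing, and model/dataset construction live elsewhere.
def run_training_sl(model, optimizer, lr_scheduler, train_dataset,
                    val_datasets, problem, tb_logger, opts):
    for epoch in range(opts.n_epochs):
        train_epoch_sl(model, optimizer, lr_scheduler, epoch,
                       train_dataset, val_datasets, problem, tb_logger, opts)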