# experiment.py [0:0]
def _setup_environment(cfg):
    """Configure GPU visibility, output dir, RNG seeds and cudnn determinism.

    Side effects only: mutates ``os.environ``, creates ``cfg.exp_dir`` and
    dumps the resolved config there via ``dump_config``.
    """
    # pin CUDA device enumeration to PCI bus order so cfg.gpu_idx is stable
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.gpu_idx)
    if cfg.model_zoo is not None:
        os.environ["TORCH_MODEL_ZOO"] = cfg.model_zoo
    # make the exp dir (idempotent when resuming)
    os.makedirs(cfg.exp_dir, exist_ok=True)
    # seed both numpy and torch for reproducibility
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    # cudnn reproducibility mode: deterministic kernels, no autotuning
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # dump the exp config to the exp dir
    dump_config(cfg)


def _build_dataloaders(cfg):
    """Instantiate the dataset splits and wrap them in DataLoaders.

    Returns:
        (trainloader, valloader, testloader, eval_vars) where valloader /
        testloader are None when the corresponding split is absent, and
        eval_vars is None when there is no test split.
    """
    dset_train, dset_val, dset_test = dataset_zoo(**cfg.DATASET)

    def _make_loader(dset, shuffle):
        # common DataLoader settings; only the train split is shuffled
        return torch.utils.data.DataLoader(
            dset,
            num_workers=cfg.num_workers,
            pin_memory=True,
            batch_size=cfg.batch_size,
            shuffle=shuffle)

    trainloader = _make_loader(dset_train, True)
    valloader = _make_loader(dset_val, False) if dset_val is not None else None
    if dset_test is not None:
        testloader = _make_loader(dset_test, False)
        # eval_zoo returns a 3-tuple; only the eval log-variable names
        # (eval_vars) are needed here, to register them with the stats
        _, _, eval_vars = eval_zoo(cfg.DATASET.dataset_name)
    else:
        testloader = None
        eval_vars = None
    return trainloader, valloader, testloader, eval_vars


def run_training(cfg):
    """Run the full training / validation / evaluation loop driven by `cfg`.

    Sets up the environment and datasets, (re)initializes the model,
    optimizer and lr scheduler, then iterates epochs: train pass, optional
    validation pass, optional test-set evaluation, checkpoint purge/save.
    Finishes with a final test-set evaluation.

    Args:
        cfg: experiment config namespace/dict; must provide gpu_idx,
            model_zoo, exp_dir, seed, num_workers, batch_size, DATASET,
            SOLVER, store_checkpoints, store_checkpoints_purge.

    Returns:
        The final evaluation result when a test split exists, else None.
    """
    _setup_environment(cfg)
    trainloader, valloader, testloader, eval_vars = _build_dataloaders(cfg)
    # init the model; also restores stats and optimizer state when resuming
    model, stats, optimizer_state = init_model(cfg, add_log_vars=eval_vars)
    start_epoch = stats.epoch + 1
    # move model to gpu when one is available
    if torch.cuda.is_available():
        model.cuda()
    # init the optimizer (optionally from the restored state)
    optimizer, scheduler = init_optimizer(
        model, optimizer_state=optimizer_state, **cfg.SOLVER)
    # resume the lr schedule from the restored epoch
    scheduler.last_epoch = start_epoch
    for epoch in range(start_epoch, cfg.SOLVER.max_epochs):
        with stats:  # automatic new_epoch and plotting at every epoch start
            # NOTE(review): scheduler.get_lr() is deprecated in recent torch
            # in favour of get_last_lr(); kept as-is since the installed
            # torch version is not visible here -- confirm before changing
            print("scheduler lr = %1.2e" % float(scheduler.get_lr()[-1]))
            # train loop
            trainvalidate(model, stats, epoch, trainloader, optimizer, False,
                          visdom_env_root=get_visdom_env(cfg), **cfg)
            # val loop
            if valloader is not None:
                trainvalidate(model, stats, epoch, valloader, optimizer, True,
                              visdom_env_root=get_visdom_env(cfg), **cfg)
            # eval loop (optional)
            if testloader is not None:
                eval_result = run_eval(cfg, model, testloader, stats=stats)
                dump_eval_result(cfg, eval_result)
            assert stats.epoch == epoch, "inconsistent stats!"
            # delete checkpoints older than the retention window if required
            if cfg.store_checkpoints_purge > 0 and cfg.store_checkpoints:
                for prev_epoch in range(epoch - cfg.store_checkpoints_purge):
                    purge_epoch(cfg.exp_dir, prev_epoch)
            # save model
            if cfg.store_checkpoints:
                outfile = get_checkpoint(cfg.exp_dir, epoch)
                save_model(model, stats, outfile, optimizer=optimizer)
            scheduler.step()
    # the final eval on the test split (if any)
    if testloader is not None:
        eval_result = run_eval(cfg, model, testloader, stats=None)
        dump_eval_result(cfg, eval_result)
        return eval_result
    return None