in train_net.py [0:0]
def do_train(cfg, model, resume=False):
model.train()
if cfg.SOLVER.USE_CUSTOM_SOLVER:
optimizer = build_custom_optimizer(cfg, model)
else:
assert cfg.SOLVER.OPTIMIZER == 'SGD'
assert cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE != 'full_model'
assert cfg.SOLVER.BACKBONE_MULTIPLIER == 1.
optimizer = build_optimizer(cfg, model)
scheduler = build_lr_scheduler(cfg, optimizer)
checkpointer = DetectionCheckpointer(
model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
)
start_iter = checkpointer.resume_or_load(
cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
if not resume:
start_iter = 0
max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER
periodic_checkpointer = PeriodicCheckpointer(
checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
)
writers = (
[
CommonMetricPrinter(max_iter),
JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
TensorboardXWriter(cfg.OUTPUT_DIR),
]
if comm.is_main_process()
else []
)
use_custom_mapper = cfg.WITH_IMAGE_LABELS
MapperClass = CustomDatasetMapper if use_custom_mapper else DatasetMapper
mapper = MapperClass(cfg, True) if cfg.INPUT.CUSTOM_AUG == '' else \
DetrDatasetMapper(cfg, True) if cfg.INPUT.CUSTOM_AUG == 'DETR' else \
MapperClass(cfg, True, augmentations=build_custom_augmentation(cfg, True))
if cfg.DATALOADER.SAMPLER_TRAIN in ['TrainingSampler', 'RepeatFactorTrainingSampler']:
data_loader = build_detection_train_loader(cfg, mapper=mapper)
else:
data_loader = build_custom_train_loader(cfg, mapper=mapper)
if cfg.FP16:
scaler = GradScaler()
logger.info("Starting training from iteration {}".format(start_iter))
with EventStorage(start_iter) as storage:
step_timer = Timer()
data_timer = Timer()
start_time = time.perf_counter()
for data, iteration in zip(data_loader, range(start_iter, max_iter)):
data_time = data_timer.seconds()
storage.put_scalars(data_time=data_time)
step_timer.reset()
iteration = iteration + 1
storage.step()
loss_dict = model(data)
losses = sum(
loss for k, loss in loss_dict.items())
assert torch.isfinite(losses).all(), loss_dict
loss_dict_reduced = {k: v.item() \
for k, v in comm.reduce_dict(loss_dict).items()}
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
if comm.is_main_process():
storage.put_scalars(
total_loss=losses_reduced, **loss_dict_reduced)
optimizer.zero_grad()
if cfg.FP16:
scaler.scale(losses).backward()
scaler.step(optimizer)
scaler.update()
else:
losses.backward()
optimizer.step()
storage.put_scalar(
"lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
step_time = step_timer.seconds()
storage.put_scalars(time=step_time)
data_timer.reset()
scheduler.step()
if (cfg.TEST.EVAL_PERIOD > 0
and iteration % cfg.TEST.EVAL_PERIOD == 0
and iteration != max_iter):
do_test(cfg, model)
comm.synchronize()
if iteration - start_iter > 5 and \
(iteration % 20 == 0 or iteration == max_iter):
for writer in writers:
writer.write()
periodic_checkpointer.step(iteration)
total_time = time.perf_counter() - start_time
logger.info(
"Total training time: {}".format(
str(datetime.timedelta(seconds=int(total_time)))))