in training/trainer.py [0:0]
def run_train(self):
    """Train epoch by epoch until ``self.epoch`` reaches ``self.max_epochs``.

    Each pass through the loop:
      1. builds the loader for the current epoch and waits on ``barrier()``;
      2. runs ``self.train_epoch`` and forwards its result to ``self.logger``;
      3. on rank 0, appends that result as one JSON line to
         ``train_stats.json`` under ``self.logging_conf.log_dir``;
      4. saves a checkpoint labelled ``self.epoch + 1``;
      5. drops the loader and forces a garbage-collection pass;
      6. runs validation when ``self.is_intermediate_val_epoch`` says so;
      7. on rank 0, merges the current "train" trainer state into
         ``self.best_meter_values`` and appends it as one JSON line to
         ``best_stats.json``;
      8. advances ``self.epoch`` by one.

    After the loop ``self.epoch`` is moved back by one, because the
    validation step that follows training runs outside of this loop.
    """
    while self.epoch < self.max_epochs:
        loader = self.train_dataset.get_loader(epoch=int(self.epoch))
        barrier()
        epoch_stats = self.train_epoch(loader)
        # Logged only on rank 0
        self.logger.log_dict(epoch_stats, self.epoch)

        # Mirror the training stats into a text file (one JSON object per line).
        if self.distributed_rank == 0:
            train_stats_path = os.path.join(
                self.logging_conf.log_dir, "train_stats.json"
            )
            with g_pathmgr.open(train_stats_path, "a") as stats_file:
                stats_file.write(json.dumps(epoch_stats) + "\n")

        # The checkpoint is written before validation starts.
        self.save_checkpoint(self.epoch + 1)

        # Drop the loader reference and collect garbage before going on.
        del loader
        gc.collect()

        # Intermediate validation only; the last epoch is skipped here since
        # validation runs after the loop anyway.
        if self.is_intermediate_val_epoch(self.epoch):
            self.run_val()

        if self.distributed_rank == 0:
            self.best_meter_values.update(self._get_trainer_state("train"))
            best_stats_path = os.path.join(
                self.logging_conf.log_dir, "best_stats.json"
            )
            with g_pathmgr.open(best_stats_path, "a") as best_file:
                best_file.write(json.dumps(self.best_meter_values) + "\n")

        self.epoch += 1

    # epoch was incremented in the loop but the val step runs out of the loop
    self.epoch -= 1