in src/train.py [0:0]
def train(self):
    best_eval = float('inf')

    # Begin!
    for epoch in range(self.start_epoch, self.start_epoch + self.args.epochs):
        self.logger.info(f'Starting epoch, Rank {self.args.rank}, '
                         f'Dataset: {self.args.data[self.args.rank]}')
        self.train_epoch(epoch)
        self.evaluate_epoch(epoch)
        self.logger.info(f'Epoch {epoch} Rank {self.args.rank} - '
                         f'Train loss: ({self.train_losses()}), Test loss ({self.eval_losses()})')
        self.lr_manager.step()

        # Keep the best per-rank checkpoint according to the aggregated validation loss.
        val_loss = self.eval_total.summarize_epoch()
        if val_loss < best_eval:
            self.save_model(f'bestmodel_{self.args.rank}.pth')
            best_eval = val_loss

        # Always keep the latest checkpoint; if args.per_epoch is set, keep one per epoch instead.
        if not self.args.per_epoch:
            self.save_model(f'lastmodel_{self.args.rank}.pth')
        else:
            self.save_model(f'lastmodel_{epoch}_rank_{self.args.rank}.pth')

        # Only the master rank records the run arguments and the last finished epoch.
        if self.args.is_master:
            torch.save([self.args, epoch], f'{self.expPath}/args.pth')

        self.logger.debug('Ended epoch')
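
For reference, the artifacts written above can be read back with plain torch.load. The snippet below is a minimal sketch, not code from the repository: the experiment path, the assumption that save_model writes into that same directory, and the contents of the model checkpoint are assumptions; only the [args, epoch] layout of args.pth follows from the torch.save call above.

import torch

exp_path = 'path/to/experiment'   # assumed location; corresponds to self.expPath above
rank = 0                          # assumed rank whose checkpoint we inspect

# args.pth was written by the master rank as torch.save([args, epoch], ...).
args, last_epoch = torch.load(f'{exp_path}/args.pth')
print(f'Last finished epoch: {last_epoch}, configured epochs: {args.epochs}')

# Per-rank best checkpoint written by save_model(); its internal format depends on that
# (unshown) method, so only load it generically onto CPU here.
best_state = torch.load(f'{exp_path}/bestmodel_{rank}.pth', map_location='cpu')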