in train/model.py
def save_checkpoint(self, epoch, optimizer_state=None, suffix=''):
    # In distributed runs with single_checkpoint enabled, only rank 0 writes
    # the checkpoint; every other rank returns early.
    if self.single_checkpoint and torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
        logging.info("Checkpoint is saved by rank 0 only; skipping on this rank.")
        return
    save_path = self.get_checkpoint_path(epoch, suffix)
    save_folder = os.path.dirname(save_path)
    if not os.path.exists(save_folder):
        logging.debug("mkdir {}".format(save_folder))
        os.makedirs(save_folder)
    if optimizer_state is None:
        torch.save({'epoch': epoch,
                    'state_dict': self.net.state_dict()},
                   save_path)
        logging.info("Checkpoint (model only) saved to: {}".format(save_path))
    else:
        torch.save({'epoch': epoch,
                    'state_dict': self.net.state_dict(),
                    'optimizer': optimizer_state},
                   save_path)
        logging.info("Checkpoint (model & optimizer) saved to: {}".format(save_path))