in cm/train_util.py [0:0]
def save(self):
    """Persist the full training state for the current global step.

    Writes, in order: one checkpoint per EMA rate, the optimizer state,
    the target/teacher model states (when present), and finally the model
    parameters themselves. The model weights are deliberately written
    LAST so that a crash mid-save never leaves a model checkpoint for a
    step whose opt/ema state is missing. Only rank 0 touches the
    filesystem; every rank synchronizes on a barrier at the end.
    """
    import blobfile as bf

    step = self.global_step
    # Hoisted: every file below goes into the same log directory.
    blobdir = get_blob_logdir()

    def save_checkpoint(rate, params):
        # Convert master params (possibly fp32 copies of fp16 weights)
        # to a plain state dict; this runs on every rank, but only
        # rank 0 writes the file.
        state_dict = self.mp_trainer.master_params_to_state_dict(params)
        if dist.get_rank() == 0:
            logger.log(f"saving model {rate}...")
            # rate == 0 (falsy) means the raw model, not an EMA copy.
            if not rate:
                filename = f"model{step:06d}.pt"
            else:
                filename = f"ema_{rate}_{step:06d}.pt"
            with bf.BlobFile(bf.join(blobdir, filename), "wb") as f:
                th.save(state_dict, f)

    for rate, params in zip(self.ema_rate, self.ema_params):
        save_checkpoint(rate, params)

    if dist.get_rank() == 0:
        # Fix: this log call was previously outside the rank check and
        # fired once per rank; now gated like every other log here.
        logger.log("saving optimizer state...")
        with bf.BlobFile(
            bf.join(blobdir, f"opt{step:06d}.pt"),
            "wb",
        ) as f:
            th.save(self.opt.state_dict(), f)

        if self.target_model:
            logger.log("saving target model state")
            filename = f"target_model{step:06d}.pt"
            with bf.BlobFile(bf.join(blobdir, filename), "wb") as f:
                th.save(self.target_model.state_dict(), f)
        # Teacher weights are only needed to resume progressive
        # distillation runs.
        if self.teacher_model and self.training_mode == "progdist":
            logger.log("saving teacher model state")
            filename = f"teacher_model{step:06d}.pt"
            with bf.BlobFile(bf.join(blobdir, filename), "wb") as f:
                th.save(self.teacher_model.state_dict(), f)

    # Save model parameters last to prevent race conditions where a restart
    # loads model at step N, but opt/ema state isn't saved for step N.
    save_checkpoint(0, self.mp_trainer.master_params)
    dist.barrier()