in distributed_training/src_dir/dis_util.py [0:0]
import os
import shutil

import smdistributed.modelparallel.torch as smp

import util


def smp_savemodel(model, optimizer, is_best, args):
    """Save an SMP checkpoint locally, sync it to S3, and keep the best model."""
    filepath = '/opt/ml/local_checkpoints'
    filename = os.path.join(filepath, 'smp_full_checkpoint.pt')

    # Only rank 0 creates the local checkpoint directory.
    if args.rank == 0:
        if os.path.exists(filepath):
            print("-INFO- PATH EXISTS")
        else:
            os.makedirs(filepath)
            print("-INFO- PATH DOES NOT EXIST, CREATING IT")
    smp.barrier()

    # One rank per data-parallel group writes the checkpoint.
    if args.dp_rank == 0:
        if args.save_full_model:
            # Gather the full model/optimizer state and save it as a single file.
            model_dict = model.state_dict()
            opt_dict = optimizer.state_dict()
            smp.save(
                {
                    "model_state_dict": model_dict,
                    "optimizer_state_dict": opt_dict,
                },
                filename,
                partial=False,
            )
        else:
            # Save only this rank's partition of the model/optimizer state.
            model_dict = model.local_state_dict()
            opt_dict = optimizer.local_state_dict()
            smp.save(
                {
                    "model_state_dict": model_dict,
                    "optimizer_state_dict": opt_dict,
                },
                filename,
                partial=True,
            )
    smp.barrier()

    # Rank 0 syncs the local checkpoint directory to S3 and, if this is the
    # best model so far, copies the checkpoint into the SageMaker model dir.
    if args.rank == 0:
        print("Start syncing")
        base_s3_path = os.path.dirname(
            os.path.dirname(os.getenv('SM_MODULE_DIR', '')))
        curr_host = os.getenv('SM_CURRENT_HOST')
        full_s3_path = f'{base_s3_path}/checkpoints/{curr_host}/'
        util.sync_local_checkpoints_to_s3(local_path=filepath,
                                          s3_path=full_s3_path)
        print("Finished syncing")
        print("is_best : {}".format(is_best))
        if is_best:
            shutil.copyfile(filename,
                            os.path.join(args.model_dir, 'model_best.pth'))
    smp.barrier()
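
# A minimal usage sketch (not part of dis_util.py) of how smp_savemodel might
# be called from a training loop. The helpers train_one_epoch/evaluate and the
# args fields epochs, rank, dp_rank, save_full_model, and model_dir are
# assumptions based on the attributes the function reads, not confirmed API.
best_loss = float('inf')
for epoch in range(args.epochs):
    train_one_epoch(model, optimizer)   # hypothetical training step
    val_loss = evaluate(model)          # hypothetical validation step
    is_best = val_loss < best_loss
    best_loss = min(val_loss, best_loss)
    smp_savemodel(model, optimizer, is_best, args)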