in distributed_training/src_dir/dis_util.py [0:0]
import random
import warnings

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp

# `logger`, `smp` (SageMaker model parallel), and `_sdp_import` (returns the
# SageMaker data parallel module and its DDP wrapper) are defined elsewhere
# in this module.


def dist_init(fn, args):
    # Seed every RNG source for reproducibility. Forcing cuDNN into
    # deterministic mode trades training speed for repeatable results.
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Derive the training topology from the SageMaker job configuration.
    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = args.is_distributed or args.is_multigpus

    logger.debug("multigpus_distributed - {}".format(args.multigpus_distributed))
    logger.debug("Number of gpus available - {}".format(args.num_gpus))

    if args.multigpus_distributed and args.exp_cnt == 0:
        if args.apex:
            # Apex path: initialize the distributed environment by spawning
            # one worker process per local GPU.
            mp.spawn(fn, nprocs=args.num_gpus, args=(args,))
        else:
            if args.data_parallel:
                # SageMaker distributed data parallel (smdistributed.dataparallel).
                sdp, DDP = _sdp_import(args)
                if not sdp.is_initialized():
                    sdp.init_process_group()
            elif args.model_parallel:
                # SageMaker model parallel (smdistributed.modelparallel).
                smp.init()
            args.exp_cnt = fn(None, args)
    else:
        # Single-process fallback: run the worker directly on GPU 0.
        args.exp_cnt = fn(0, args)
    return args
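

# Usage sketch (illustrative only, not part of this module): `dist_init` expects a
# per-process worker `fn(gpu, args)` and an argparse-style namespace that already
# carries the fields read above (hosts, backend, num_gpus, seed, exp_cnt, apex,
# data_parallel, model_parallel). The worker name `main_worker` and the setup
# below are assumptions for illustration, not code from this repository.
#
#   def main_worker(gpu, args):
#       ...  # build the model, wrap it for DDP/SMP, run the training loop
#       return args.exp_cnt + 1
#
#   args = parser.parse_args()      # parser built in the training entry point
#   args.exp_cnt = 0                # ensures the spawn/init branch runs once
#   args = dist_init(main_worker, args)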