in utils/main_utils.py
import torch


def distribute_model_to_cuda(models, args, batch_size, num_workers, ngpus_per_node):
    """Move one model (or a list of models) to GPU, wrapping each in
    DistributedDataParallel or DataParallel as dictated by ``args``, and
    rescale ``batch_size`` / ``num_workers`` for per-process training."""
    # No GPUs available: leave everything on CPU and return the inputs unchanged.
    if ngpus_per_node == 0:
        return models, args, batch_size, num_workers

    # Accept either a single model or a list of models; remember whether to
    # unwrap the list again before returning.
    squeeze = False
    if not isinstance(models, list):
        models = [models]
        squeeze = True

    for i in range(len(models)):
        if args.distributed:
            # For multiprocessing distributed, the DistributedDataParallel
            # constructor should always set the single device scope; otherwise
            # DistributedDataParallel will use all available devices.
            if args.gpu is not None:
                torch.cuda.set_device(args.gpu)
                models[i].cuda(args.gpu)
                models[i] = torch.nn.parallel.DistributedDataParallel(
                    models[i], device_ids=[args.gpu])
            else:
                models[i].cuda()
                # DistributedDataParallel will divide and allocate batch_size
                # across all available GPUs if device_ids is not set.
                models[i] = torch.nn.parallel.DistributedDataParallel(models[i])
        elif args.gpu is not None:
            # Single-GPU, non-distributed training.
            torch.cuda.set_device(args.gpu)
            models[i] = models[i].cuda(args.gpu)
        else:
            # DataParallel will divide and allocate batch_size across all available GPUs.
            models[i] = torch.nn.DataParallel(models[i]).cuda()

    if squeeze:
        models = models[0]

    if args.distributed and args.gpu is not None:
        # When using a single GPU per process and per DistributedDataParallel
        # instance, divide the batch size (and, with ceiling division, the
        # number of data-loading workers) by the total number of GPUs.
        batch_size = int(batch_size / ngpus_per_node)
        num_workers = int((num_workers + ngpus_per_node - 1) / ngpus_per_node)

    return models, args, batch_size, num_workers
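

if __name__ == "__main__":
    # Usage sketch only: a minimal, hypothetical call from a training entry
    # point. The `args` fields below (`distributed`, `gpu`) and the toy model
    # are assumptions standing in for the argparse namespace and network the
    # real training script would construct; they are not defined in this file.
    from types import SimpleNamespace

    ngpus_per_node = torch.cuda.device_count()
    args = SimpleNamespace(distributed=False,
                           gpu=0 if ngpus_per_node > 0 else None)
    model = torch.nn.Linear(128, 10)

    # On a CPU-only machine this returns the inputs unchanged; with a GPU it
    # moves the model to `args.gpu`. batch_size / num_workers are only rescaled
    # in the distributed, single-GPU-per-process case.
    model, args, batch_size, num_workers = distribute_model_to_cuda(
        model, args, batch_size=256, num_workers=8,
        ngpus_per_node=ngpus_per_node)
    print(type(model).__name__, batch_size, num_workers)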