in main_eval.py
def main_worker(**kwargs):
    opts = get_eval_arguments()
    print(opts)

    # device set-up
    opts = device_setup(opts)

    # NOTE: options use dotted attribute names (e.g., "ddp.rank"), so they are
    # read and written with getattr/setattr rather than plain attribute access.
    node_rank = getattr(opts, "ddp.rank", 0)
    if node_rank < 0:
        logger.error("--rank should be >=0. Got {}".format(node_rank))

    is_master_node = is_master(opts)

    # create the directory for saving results
    save_dir = getattr(opts, "common.results_loc", "results")
    run_label = getattr(opts, "common.run_label", "run_1")
    exp_dir = "{}/{}".format(save_dir, run_label)
    setattr(opts, "common.exp_loc", exp_dir)
    create_directories(dir_path=exp_dir, is_master_node=is_master_node)

    world_size = getattr(opts, "ddp.world_size", 1)
    num_gpus = getattr(opts, "dev.num_gpus", 1)
    use_distributed = getattr(opts, "ddp.enable", False)
    # distributed evaluation only makes sense with more than one GPU
    if num_gpus <= 1:
        use_distributed = False
    setattr(opts, "ddp.use_distributed", use_distributed)

    # No. of data workers = no. of CPUs (if not specified or -1)
    n_cpus = multiprocessing.cpu_count()
    dataset_workers = getattr(opts, "dataset.workers", -1)

    if use_distributed:
        # world size defaults to (or is overridden by) the number of available GPUs
        if world_size == -1:
            logger.log("Setting --ddp.world-size the same as the number of available gpus")
            world_size = num_gpus
            setattr(opts, "ddp.world_size", world_size)
        elif world_size != num_gpus:
            logger.log(
                "--ddp.world-size does not match num. of available GPUs. Got {} != {}".format(
                    world_size, num_gpus
                )
            )
            logger.log("Setting --ddp.world-size={}".format(num_gpus))
            world_size = num_gpus
            setattr(opts, "ddp.world_size", world_size)

        # split the data workers evenly across the spawned processes
        if dataset_workers == -1 or dataset_workers is None:
            setattr(opts, "dataset.workers", n_cpus // world_size)

        # each spawned process derives its global rank from start_rank + local index
        start_rank = getattr(opts, "ddp.rank", 0)
        setattr(opts, "ddp.rank", None)
        kwargs["start_rank"] = start_rank
        torch.multiprocessing.spawn(
            fn=distributed_worker,
            args=(main, opts, kwargs),
            nprocs=num_gpus,
        )
    else:
        if dataset_workers == -1:
            setattr(opts, "dataset.workers", n_cpus)

        # adjust the batch size for non-distributed, multi-GPU (data-parallel) runs
        train_bsize = getattr(opts, "dataset.train_batch_size0", 32) * max(1, num_gpus)
        val_bsize = getattr(opts, "dataset.val_batch_size0", 32) * max(1, num_gpus)
        setattr(opts, "dataset.train_batch_size0", train_bsize)
        setattr(opts, "dataset.val_batch_size0", val_bsize)
        setattr(opts, "dev.device_id", None)

        main(opts=opts, **kwargs)
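
# For reference: the per-process callable handed to torch.multiprocessing.spawn
# above, distributed_worker, is defined elsewhere in the project. Below is a
# minimal sketch of what such a worker typically does, not this file's actual
# code; the distributed_init() helper and the option names it sets are
# assumptions made for illustration.
def distributed_worker(i, main, opts, kwargs):
    # i is the local process index assigned by torch.multiprocessing.spawn
    setattr(opts, "dev.device_id", i)
    if torch.cuda.is_available():
        torch.cuda.set_device(i)
        setattr(opts, "dev.device", torch.device("cuda:{}".format(i)))

    # global rank = node's start rank + local process index
    ddp_rank = getattr(opts, "ddp.rank", None)
    if ddp_rank is None:
        ddp_rank = kwargs.get("start_rank", 0) + i
        setattr(opts, "ddp.rank", ddp_rank)

    # distributed_init is assumed to join the process group
    # (torch.distributed.init_process_group) and return this process's rank
    node_rank = distributed_init(opts)
    setattr(opts, "ddp.rank", node_rank)

    main(opts=opts, **kwargs)

# Typical entry point (assumption): the script is expected to end with
#     if __name__ == "__main__":
#         main_worker()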