in dora/executor.py [0:0]
def start_ddp_workers(package, main, argv):
    """Launch one `dora run` subprocess per CUDA device and exit with their status.

    Each worker is started as `python -m dora -P <package> run -- <argv...>`
    with `RANK` and `WORLD_SIZE` set in its environment. The worker with rank 0
    inherits this process' stdin/stdout/stderr; every other worker gets
    `stdin` from `DEVNULL` and has stdout and stderr redirected to
    `worker_<rank>.log` inside the experiment folder.

    This function never returns normally: it always ends with `sys.exit`.

    Args:
        package: value forwarded to the `-P` option of the `dora` command line.
        main: object exposing `get_xp(argv)`, used to resolve the experiment
            (its `folder` and `rendezvous_file` are used here).
        argv: iterable of command line arguments forwarded after `--`.

    Raises:
        SystemExit: with code 1 when no CUDA device is found, otherwise with
            `int(manager.failed)` once the `ChildrenManager` context exits.
    """
    # Imported lazily so that torch is only required when DDP is actually used.
    import torch as th

    world_size = th.cuda.device_count()
    if not world_size:
        fatal(
            "DDP is only available on GPU. Make sure GPUs are properly configured with cuda.")
        sys.exit(1)

    xp = main.get_xp(argv)
    xp.folder.mkdir(exist_ok=True, parents=True)
    # Remove any rendezvous file left on disk before starting new workers.
    if xp.rendezvous_file.exists():
        xp.rendezvous_file.unlink()
    log(f"Starting {world_size} worker processes for DDP.")

    # The command line is identical for all workers; only the env differs.
    command = [sys.executable, "-m", "dora", "-P", package, "run", "--", *argv]

    # NOTE(review): ChildrenManager is presumably responsible for waiting on
    # the children and recording failure in `manager.failed` -- confirm.
    with ChildrenManager() as manager:
        for rank in range(world_size):
            env = dict(os.environ)
            env['RANK'] = str(rank)
            env['WORLD_SIZE'] = str(world_size)

            kwargs = {}
            log_file = None
            if rank > 0:
                log_file = open(xp.folder / f'worker_{rank}.log', 'w')
                kwargs['stdin'] = sp.DEVNULL
                kwargs['stdout'] = log_file
                kwargs['stderr'] = sp.STDOUT
            try:
                manager.add(sp.Popen(command, env=env, **kwargs))
            finally:
                # The child holds its own duplicate of the descriptor once
                # spawned, so the parent's handle can be closed right away
                # instead of leaking one open file per worker.
                if log_file is not None:
                    log_file.close()
    sys.exit(int(manager.failed))