in classy_vision/distributed/launch_ray.py [0:0]
# Note: os, subprocess, and sys are imported at the top of this module.
def run(self, master_addr, master_port, node_rank, dist_world_size, args):
    processes = []
    # Set the PyTorch distributed environment variables shared by all workers.
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = master_addr
    current_env["MASTER_PORT"] = str(master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)
    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        current_env["OMP_NUM_THREADS"] = str(1)
        print(
            "*****************************************\n"
            "Setting the OMP_NUM_THREADS environment variable to {} by "
            "default for each process, to avoid overloading your system. "
            "Please tune the variable further for optimal performance in "
            "your application as needed.\n"
            "*****************************************".format(
                current_env["OMP_NUM_THREADS"]
            )
        )
    # Set the init_method and rank of the process for distributed training.
    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)
        # Build the command for this worker process before spawning it.
        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if not args.use_env:
                raise ValueError(
                    "When using the '--no_python' flag, "
                    "you must also set the '--use_env' flag."
                )
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag "
                    "and the '--module' flag at the same time."
                )
        cmd.append(args.training_script)
        if not args.use_env:
            cmd.append("--local_rank={}".format(local_rank))
        cmd.extend(args.training_script_args)
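        # Example resulting command (script name and args are assumed):
        # [sys.executable, "-u", "train.py", "--local_rank=0", <script args>]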
        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)
    # Wait on all workers; surface the first non-zero exit code.
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(
                # Use process.args so the error reports the failing worker's
                # own command rather than the last command built in the loop.
                returncode=process.returncode, cmd=process.args
            )
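
For context, a minimal sketch of the consumer side: a hypothetical train.py (the script name, backend choice, and use of --use_env are assumptions, not part of this file) showing how a spawned worker would read the MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE, and LOCAL_RANK variables exported above via PyTorch's standard env:// rendezvous.

# Hypothetical train.py worker (assumes the launcher above ran with --use_env,
# so rank information arrives via environment variables, not --local_rank).
import os

import torch.distributed as dist


def main():
    # init_method="env://" reads MASTER_ADDR, MASTER_PORT, RANK, and
    # WORLD_SIZE from the environment prepared by run().
    dist.init_process_group(backend="gloo", init_method="env://")
    local_rank = int(os.environ["LOCAL_RANK"])  # set per process by run()
    print(
        "global rank {} of {} (local rank {})".format(
            dist.get_rank(), dist.get_world_size(), local_rank
        )
    )
    dist.destroy_process_group()


if __name__ == "__main__":
    main()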