in src/utils.py [0:0]
def init_distributed_mode(args):
    """
    Initialize distributed training state on ``args`` and set up the
    NCCL process group.

    Sets the following attributes on ``args``:
      - is_slurm_job: True when running under a SLURM allocation
      - rank: global rank of this process
      - world_size: total number of processes
      - gpu_to_work_on: local CUDA device index assigned to this process

    Side effects: calls ``dist.init_process_group`` (using ``args.dist_url``)
    and ``torch.cuda.set_device``.
    """
    args.is_slurm_job = "SLURM_JOB_ID" in os.environ
    if args.is_slurm_job:
        args.rank = int(os.environ["SLURM_PROCID"])
        # SLURM_TASKS_PER_NODE looks like "4" or "4(x2)"; take the integer
        # count before any "(xN)" suffix. The previous code indexed [0],
        # which reads only the first *character* and silently computes a
        # wrong world_size for 10+ tasks per node (e.g. "16" -> 1).
        tasks_per_node = int(os.environ["SLURM_TASKS_PER_NODE"].split("(")[0])
        args.world_size = int(os.environ["SLURM_NNODES"]) * tasks_per_node
    else:
        # multi-GPU job (local or multi-node) - jobs started with
        # torch.distributed.launch: read the environment variables it exports
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ["WORLD_SIZE"])
    # prepare distributed
    dist.init_process_group(
        backend="nccl",
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    # set cuda device: map global rank onto the node's local GPUs
    args.gpu_to_work_on = args.rank % torch.cuda.device_count()
    torch.cuda.set_device(args.gpu_to_work_on)
    return