in throughput/all_reduce_bench.py [0:0]
def init_processes(local_rank, fn, backend='nccl'):
    """Prepare this process for distributed work, then run ``fn``.

    Steps, in order:
      1. Make ``local_rank`` the current CUDA device.
      2. Call ``dist.init_process_group`` with the requested backend.
      3. Invoke ``fn(local_rank)``.

    Args:
        local_rank: Passed to ``torch.cuda.set_device`` and then handed
            to ``fn`` as its only argument.
        fn: Callable invoked as ``fn(local_rank)`` once the process
            group has been initialised. Its return value is discarded.
        backend: Backend name forwarded to ``dist.init_process_group``;
            defaults to ``'nccl'``.
    """
    # Select the device before initialising the process group, same
    # order as the original.
    torch.cuda.set_device(local_rank)

    # No init_method, rank, or world size is passed here, so the
    # library's defaults apply for those.
    dist.init_process_group(backend=backend)

    fn(local_rank)