launcher_scripts/custom_script/custom_allreduce.py:
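"""Minimal sanity check for multi-GPU communication: every rank creates a
random tensor and sums it across all ranks with torch.distributed.all_reduce."""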
import torch
import torch.distributed as dist

# Initialize the default process group over NCCL; rank and world size
# come from the launcher's environment.
print("init process group")
dist.init_process_group("nccl")
print("rank:", dist.get_rank())

# Map the global rank onto a locally visible GPU (one process per device).
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

# Each rank contributes a random tensor; all_reduce sums them in place,
# so every rank ends up holding the elementwise sum across all ranks.
tensor = torch.randn(4, 4, device="cuda")
print(f"[{dist.get_rank()}] tensor {tensor}")
dist.all_reduce(tensor)
print(f"[{dist.get_rank()}] tensor {tensor} after reduce")

dist.destroy_process_group()