in throughput/all_reduce_bench.py [0:0]
def timed_allreduce(mat, id):
    """Time a single all-reduce of ``mat`` and print the resulting figures.

    The wall-clock window starts just before ``dist.all_reduce`` and stops
    after one element of ``mat`` has been read back and printed, so the
    measured duration also covers that ``printflock`` call.

    Args:
        mat: Tensor handed to ``dist.all_reduce``; indexed as ``mat[0][0]``.
            The size math assumes it holds M x N fp32 values -- TODO confirm
            against the caller.
        id: Label printed at the top of the report.

    Returns:
        None. The report is emitted through ``printflock``.
    """
    started = time.perf_counter()
    dist.all_reduce(mat)
    # Reading an element back is required due to lazy evaluation; without it
    # the timer could stop before the collective has actually run.
    printflock(f"ignore me {int(mat[0][0])}")
    duration = time.perf_counter() - started

    # Payload in bytes: 4 bytes per element (fp32).
    size = M * N * 4
    # *2 accounts for send + receive, *8 converts bytes to bits.
    tput = ((size * 2) / duration) * 8

    n = dist.get_world_size()
    # Scale by 2*(n-1)/n for the number of participating ranks, again in bits.
    busbw = (size / duration) * (2 * (n - 1) / n) * 8

    header = f"{id}:\n"
    duration_line = f"duration: {duration:.4f} sec\n"
    tput_line = f"algo throughput: {tput:.4f} bps, {tput/1e9:.4f} Gbps\n"
    busbw_line = f"busbw: {busbw / 1e9:.4f} Gbps"
    printflock(header, duration_line, tput_line, busbw_line)