def timed_allreduce()

in throughput/all_reduce_bench.py [0:0]


def timed_allreduce(mat, id):
    """Time one ``dist.all_reduce`` over ``mat`` and print bandwidth figures.

    Args:
        mat: Object handed to ``dist.all_reduce``; its ``[0][0]`` element is
            read back afterwards.  NOTE(review): the byte count below is
            built from the module-level ``M`` and ``N`` and 4 bytes per
            element (fp32), so ``mat`` is presumably an M x N fp32 tensor --
            confirm at the call site.
        id: Label printed as the first line of the report.

    Returns:
        None. The report is emitted through ``printflock``.
    """
    start = time.perf_counter()
    dist.all_reduce(mat)
    # Reading an element back is required due to lazy evaluation, so the
    # read (and its print) deliberately stays inside the timed region.
    printflock(f"ignore me {int(mat[0][0])}")
    elapsed = time.perf_counter() - start

    payload_bytes = M * N * 4  # 4 is fp32
    # *2 is for send + receive, *8 converts bytes to bits.
    algo_bps = ((payload_bytes * 2) / elapsed) * 8

    world_size = dist.get_world_size()
    bus_factor = 2 * (world_size - 1) / world_size
    bus_bps = (payload_bytes / elapsed) * bus_factor * 8

    report = (
        f"{id}:\n",
        f"duration: {elapsed:.4f} sec\n",
        f"algo throughput: {algo_bps:.4f} bps, {algo_bps / 1e9:.4f} Gbps\n",
        f"busbw: {bus_bps / 1e9:.4f}  Gbps",
    )
    printflock(*report)