def bench_vanilla_spmm_cuda()

in benchmark/bench_vanilla_spmm.py
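
Sweeps CUDA launch configurations for a vanilla SpMM (sparse adjacency matrix times dense feature matrix) kernel: for each (num_cuda_blocks, num_threads_per_cuda_block) pair it builds the TVM module, runs it on a random float32 feature matrix, and prints the average kernel time over 5 runs.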


import numpy as np
import tvm
from tvm import te
from tvm.topi.utils import get_const_tuple  # import path assumes a recent TVM release

# VanillaSpMMcuda (the SpMM module wrapper) and exp_range (a geometric-step
# range helper) are assumed to be defined elsewhere in this project.


def bench_vanilla_spmm_cuda(adj_scipy_csr, feat_len):
    # adj_scipy_csr: scipy.sparse CSR adjacency of shape (num_rows, num_cols)
    num_rows, num_cols = adj_scipy_csr.shape

    def _bench_vanilla_spmm_cuda(num_cuda_blocks, num_threads_per_cuda_block):
        # Build the SpMM module for this launch configuration.
        vanilla_spmm_module = VanillaSpMMcuda(adj_scipy_csr)
        # Dense source feature matrix: (num_cols, feat_len), float32 by default.
        SrcFeat = te.placeholder((num_cols, feat_len))
        input_placeholders = [SrcFeat]
        compute_args = {}
        schedule_args = {'num_cuda_blocks': num_cuda_blocks,
                         'num_threads_per_cuda_block': num_threads_per_cuda_block}
        vanilla_spmm_module.build(input_placeholders, compute_args, schedule_args)
        # Time the kernel on random input and report the mean over num_runs.
        src_feat_np = np.random.random(get_const_tuple(SrcFeat.shape)).astype('float32')
        src_feat_tvm = tvm.nd.array(src_feat_np, vanilla_spmm_module.ctx)
        input_tvm_ndarrays = [src_feat_tvm]
        num_runs = 5
        tcost = vanilla_spmm_module.measure_average_time(input_tvm_ndarrays, num_runs)
        print("average time of {} runs: {} ms".format(num_runs, tcost * 1000))

    # Sweep the launch configuration: the number of CUDA blocks grows geometrically
    # (x4) from 64, bounded by num_rows; threads per block grow (x2) from
    # min(feat_len, 32), bounded by feat_len.
    for num_cuda_blocks in exp_range(64, num_rows, 4):
        for num_threads_per_cuda_block in exp_range(min(feat_len, 32), feat_len, 2):
            print()
            print("num_cuda_blocks:", num_cuda_blocks)
            print("num_threads_per_cuda_block:", num_threads_per_cuda_block)
            _bench_vanilla_spmm_cuda(num_cuda_blocks, num_threads_per_cuda_block)
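
exp_range is not defined in this excerpt; a minimal sketch consistent with how it is called above, assuming it yields a geometric progression from start up to and including stop:

def exp_range(start, stop, factor):
    # Assumed semantics: yield start, start*factor, start*factor**2, ...
    # while the value stays <= stop.
    val = start
    while val <= stop:
        yield val
        val *= factor

A hypothetical invocation, assuming TVM was built with CUDA support; the matrix shape and density here are illustrative only:

import scipy.sparse as sp

# Random 4096 x 4096 CSR adjacency with 1% nonzeros, 64-wide features.
adj = sp.random(4096, 4096, density=0.01, format='csr', dtype='float32')
bench_vanilla_spmm_cuda(adj, feat_len=64)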