in bench/BenchUtils.h [198:500]
void performance_test(
int num_instances,
bool flush,
int repetitions,
bool is_mkl) {
#ifdef USE_MKL
mkl_set_xerbla((XerblaEntry)test_xerbla);
#endif
(void)is_mkl; // Suppress unused variable warning
float alpha = 1.f, beta = 1.f;
matrix_op_t btran = matrix_op_t::Transpose;
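// Shape selection is a compile-time choice via the `dataset` macro:
// 1 -> sweep m over [1, 120) with fixed n = k = 512,
// 2 -> shapes taken from shapes_dataset.h,
// otherwise -> 1000 random {m, n, k} triples (cache flushing disabled,
// single measured iteration).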
#if dataset == 1
const int NITER = (flush) ? 10 : 100;
std::vector<std::vector<int>> shapes;
for (auto m = 1; m < 120; m++) {
// shapes.push_back({m, 128, 512});
shapes.push_back({m, 512, 512});
}
#elif dataset == 2
const int NITER = (flush) ? 10 : 100;
#include "shapes_dataset.h"
#else
flush = false;
constexpr int NITER = 1;
std::vector<std::vector<int>> shapes;
std::random_device r;
std::default_random_engine generator(r());
std::uniform_int_distribution<int> dm(1, 100);
std::uniform_int_distribution<int> dnk(1, 1024);
for (int i = 0; i < 1000; i++) {
int m = dm(generator);
int n = dnk(generator);
int k = dnk(generator);
shapes.push_back({m, n, k});
}
#endif
std::string type;
double gflops, gbs, ttot;
for (auto s : shapes) {
int m = s[0];
int n = s[1];
int k = s[2];
// initialize with small numbers
aligned_vector<int> Aint(m * k);
randFill(Aint, 0, 4);
std::vector<aligned_vector<float>> A;
for (int i = 0; i < num_instances; ++i) {
A.push_back(aligned_vector<float>(Aint.begin(), Aint.end()));
}
aligned_vector<int> Bint(k * n);
randFill(Bint, 0, 4);
aligned_vector<float> B(Bint.begin(), Bint.end());
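// One prepacked B per instance: PackedGemmMatrixB converts B to btype and
// takes alpha at construction, mirroring the alpha-scaled reference copy
// Bt built below (alpha is 1.f here, so both paths see the same B).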
std::vector<std::unique_ptr<PackedGemmMatrixB<btype>>> Bp;
for (int i = 0; i < num_instances; ++i) {
Bp.emplace_back(std::unique_ptr<PackedGemmMatrixB<btype>>(
new PackedGemmMatrixB<btype>(btran, k, n, alpha, B.data())));
}
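// Leading dimensions of the unpacked reference copy of B are padded up to a
// 64-byte (cache-line) boundary, expressed in floats. Bt[i] holds an
// alpha-scaled copy of B, pre-transposed when btran requests it, so the
// reference sgemm below can always run with no-transpose arguments.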
auto kAligned = ((k * sizeof(float) + 64) & ~63) / sizeof(float);
auto nAligned = ((n * sizeof(float) + 64) & ~63) / sizeof(float);
std::vector<aligned_vector<float>> Bt(num_instances);
auto& Bt_ref = Bt[0];
if (btran == matrix_op_t::Transpose) {
Bt_ref.resize(k * nAligned);
for (auto row = 0; row < k; ++row) {
for (auto col = 0; col < n; ++col) {
Bt_ref[row * nAligned + col] = alpha * B[col * k + row];
}
}
} else {
Bt_ref.resize(kAligned * n);
for (auto row = 0; row < k; ++row) {
for (auto col = 0; col < n; ++col) {
Bt_ref[col * kAligned + row] = alpha * B[col * k + row];
}
}
}
for (auto i = 1; i < num_instances; ++i) {
Bt[i] = Bt_ref;
}
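// Output buffers: with beta != 0 both paths start from identical random C so
// the accumulation is comparable; with beta == 0 the fbgemm output is seeded
// with NaN so any element the kernel fails to write shows up in the check.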
std::vector<aligned_vector<float>> C_ref;
std::vector<aligned_vector<float>> C_fb;
if (beta != 0.0f) {
aligned_vector<int> Cint(m * n);
randFill(Cint, 0, 4);
for (int i = 0; i < num_instances; ++i) {
C_ref.push_back(aligned_vector<float>(Cint.begin(), Cint.end()));
C_fb.push_back(aligned_vector<float>(Cint.begin(), Cint.end()));
}
} else {
for (int i = 0; i < num_instances; ++i) {
C_ref.push_back(aligned_vector<float>(m * n, 1.f));
C_fb.push_back(aligned_vector<float>(m * n, NAN));
}
}
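// Arithmetic and traffic per GEMM: 2*m*n*k flops (multiply + add), and one
// pass over A (fp32), B (btype) and C (fp32).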
double nflops = 2.0 * m * n * k;
double nbytes = 4.0 * m * k + sizeof(btype) * 1.0 * k * n + 4.0 * m * n;
// warm up MKL and fbgemm
// check correctness at the same time
for (auto w = 0; w < 3; w++) {
#if defined(USE_MKL) || defined(USE_BLAS)
cblas_sgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans, // B was pre-transposed above, if the operation requires it
m,
n,
k,
1.0, // Multiplication by alpha is done during the transpose of B
A[0].data(),
k,
Bt[0].data(),
btran == matrix_op_t::NoTranspose ? kAligned : nAligned,
beta,
C_ref[0].data(),
n);
#else
cblas_sgemm_ref(
matrix_op_t::NoTranspose,
matrix_op_t::NoTranspose,
m,
n,
k,
1.0,
A[0].data(),
k,
Bt[0].data(),
(btran == matrix_op_t::NoTranspose) ? kAligned : nAligned,
beta,
C_ref[0].data(),
n);
#endif
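// fbgemm warm-up: with a single instance, all OpenMP threads cooperate on
// one GEMM; with multiple instances the warm-up and correctness check run
// single-threaded on instance 0 only.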
#ifdef _OPENMP
#pragma omp parallel if (num_instances == 1)
#endif
{
int num_threads = num_instances == 1 ? fbgemm_get_num_threads() : 1;
int tid = num_instances == 1 ? fbgemm_get_thread_num() : 0;
cblas_gemm_compute(
matrix_op_t::NoTranspose,
m,
A[0].data(),
*Bp[0],
beta,
C_fb[0].data(),
tid,
num_threads);
}
#if defined(USE_MKL) || defined(USE_BLAS)
// Compare results
for (size_t i = 0; i < C_ref[0].size(); i++) {
if (std::abs(C_ref[0][i] - C_fb[0][i]) > 1e-3) {
fprintf(
stderr,
"Error: too high diff between fp32 ref %f and fp16 %f at %ld\n",
C_ref[0][i],
C_fb[0][i],
i);
return;
}
}
#endif
}
#ifdef USE_MKL
if (is_mkl) {
// Golden reference via MKL sgemm
type = "MKL_FP32";
#elif defined(USE_BLAS)
type = "BLAS_FP32";
#else
type = "REF_FP32";
#endif
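// Time the reference path: 3 warm-up runs, NITER measured runs, an optional
// cache-eviction callback when `flush` is set, and OpenMP only when more
// than one independent instance is benchmarked.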
ttot = measureWithWarmup(
[&]() {
int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
for (int i = 0; i < repetitions; ++i) {
#if defined(USE_MKL) || defined(USE_BLAS)
cblas_sgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
m,
n,
k,
1.0,
A[copy].data(),
k,
Bt[copy].data(),
btran == matrix_op_t::NoTranspose ? kAligned : nAligned,
beta,
C_ref[copy].data(),
n);
#else
cblas_sgemm_ref(
matrix_op_t::NoTranspose,
matrix_op_t::NoTranspose,
m,
n,
k,
1.0,
A[copy].data(),
k,
Bt[copy].data(),
(btran == matrix_op_t::NoTranspose) ? kAligned : nAligned,
beta,
C_ref[copy].data(),
n);
#endif
}
},
3,
NITER,
[&]() {
if (flush) {
int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
cache_evict(A[copy]);
cache_evict(Bt[copy]);
cache_evict(C_ref[copy]);
}
},
// Use OpenMP if num instances > 1
num_instances > 1);
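// Each timed call above runs `repetitions` GEMMs back to back, hence the
// scaling by `repetitions` when the rates are printed.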
gflops = nflops / ttot / 1e9;
gbs = nbytes / ttot / 1e9;
printf(
"\n%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n",
type.c_str(),
m,
n,
k,
gflops * repetitions,
gbs * repetitions);
#ifdef USE_MKL
}
#endif
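// FBGEMM path; the FBP_ label appends typeid(btype).name(), which is
// implementation-defined (typically a compiler-mangled type name).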
type = "FBP_" + std::string(typeid(btype).name());
ttot = measureWithWarmup(
[&]() {
// In data decomposition (single-instance) mode, different threads
// work on different regions of the same matrices, so the copy to use
// is always 0 and the thread count is the number of threads in the
// parallel region.
// In functional decomposition (multi-instance) mode, each instance
// uses its own matrices: the copy to use is selected by the thread id
// (thread_num), and each instance is computed by a single thread.
int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
int num_threads = num_instances == 1 ? fbgemm_get_num_threads() : 1;
int tid = num_instances == 1 ? fbgemm_get_thread_num() : 0;
for (int i = 0; i < repetitions; ++i) {
cblas_gemm_compute(
matrix_op_t::NoTranspose,
m,
A[copy].data(),
*Bp[copy],
beta,
C_fb[copy].data(),
tid,
num_threads);
}
},
3,
NITER,
[&]() {
if (flush) {
int copy = num_instances == 1 ? 0 : fbgemm_get_thread_num();
cache_evict(A[copy]);
cache_evict(*Bp[copy]);
cache_evict(C_fb[copy]);
}
},
true /*useOpenMP*/);
gflops = nflops / ttot / 1e9;
gbs = nbytes / ttot / 1e9;
printf(
"%30s m = %5d n = %5d k = %5d Gflops = %8.4lf GBytes = %8.4lf\n",
type.c_str(),
m,
n,
k,
gflops * repetitions,
gbs * repetitions);
}
}
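// The timing above goes through measureWithWarmup() from this file. As a
// rough illustration only -- not FBGEMM's implementation -- the contract the
// benchmark relies on looks roughly like the sketch below: run a few untimed
// warm-up iterations, then time `iters` iterations (presumably invoking the
// flush callback before each one so the caches are cold) and return seconds
// per iteration. The name measure_with_warmup_sketch and all details of this
// helper are hypothetical.
#include <chrono> // needed only by the sketch below
template <typename Fn, typename FlushFn>
double measure_with_warmup_sketch(
Fn&& fn,
int warmups,
int iters,
FlushFn&& flushFn) {
// untimed warm-up iterations
for (int i = 0; i < warmups; ++i) {
flushFn();
fn();
}
// timed iterations, flushing caches before each one
double total = 0.0;
for (int i = 0; i < iters; ++i) {
flushFn();
auto t0 = std::chrono::high_resolution_clock::now();
fn();
auto t1 = std::chrono::high_resolution_clock::now();
total += std::chrono::duration<double>(t1 - t0).count();
}
return total / iters; // seconds per iteration
}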