in bench/PackedRequantizeAcc16Benchmark.cc [39:456]
void performance_test() {
// clang-format off
vector<vector<int>> shapes = {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
// m, n, k
{64, 68, 17},
{60, 128, 64},
{25088, 256, 64},
{25088, 64, 64},
{25088, 64, 576},
{25088, 64, 256},
{6272, 512, 256},
{6272, 128, 256},
{6272, 128, 1152},
{6272, 512, 128},
{6272, 128, 512},
{1568, 1024, 512},
{1568, 256, 512},
{1568, 256, 2304},
{1568, 1024, 256},
{1568, 256, 1024},
{392, 2048, 1024},
{392, 512, 1024},
{392, 512, 4608},
{392, 2048, 512},
{392, 512, 2048},
};
// clang-format on
bool flush = true;
std::vector<char> llc;
if (flush) {
llc.resize(128 * 1024 * 1024, 1.0);
}
constexpr int NWARMUP = 4;
constexpr int NITER = 10;
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
cout << "WARNING: the timer may be inaccurate when used by multiple threads."
<< endl;
cout << "M, "
<< "N, "
<< "K, "
<< "Output Processing, "
<< "Packing (ms), "
<< "Kernel (ms), "
<< "Postprocessing (ms), "
<< "Total (ms), "
<< "GOPS" << endl;
#else
cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(32)
<< "Output Processing, " << setw(18) << "Type, " << setw(5) << "GOPS"
<< endl;
#endif
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto shape : shapes) {
int m = shape[0];
int n = shape[1];
int k = shape[2];
aligned_vector<uint8_t> Aint8(m * k);
aligned_vector<int8_t> Bint8(k * n);
aligned_vector<float> Cfp32_mkl(m * n);
// just used for result comparisons
aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
// requantize results
aligned_vector<uint8_t> Cint8_mkl(Cfp32_mkl.size());
aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
// A matrix
randFill<uint8_t>(Aint8, 0, 50);
int32_t Aint8_zero_point = 43;
aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
randFill<int8_t>(Bint8, -8, 8);
aligned_vector<int8_t> Bint8_copy(Bint8);
aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
double nops = 2.0 * m * n * k;
double ttot = 0.0;
string runType;
#ifdef USE_MKL
const float alpha = 1.0f;
const float beta = 0.0f;
ttot = 0.0;
runType = "MKL_fp32";
cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k
<< ", ";
cout << setw(30) << "NA";
cout << ", ";
ttot = measureWithWarmup(
[&]() {
cblas_sgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
m,
n,
k,
alpha,
Afp32.data(),
k,
Bfp32.data(),
n,
beta,
Cfp32_mkl.data(),
n);
},
NWARMUP,
NITER,
[&]() {
if (flush) {
llc_flush(llc);
}
});
ttot *= 1e9; // convert to ns
if (flush) {
((volatile char*)(llc.data()))[0] += 1;
}
cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
<< nops / ttot << endl;
Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end());
#endif
for (BenchmarkType bench_type :
{BenchmarkType::BARE_BONE,
BenchmarkType::REQUANTIZATION,
BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION,
BenchmarkType::EVERYTHING}) {
// When we don't compute row_offset in fbgemm, we set B_zero_point to 0
// to get the same result as the reference.
int32_t Bint8_zero_point = (bench_type == BenchmarkType::BARE_BONE ||
bench_type == BenchmarkType::REQUANTIZATION)
? 0
: -30;
// computing column offset
vector<int32_t> col_offsets(n);
Bint8 = Bint8_copy;
col_offsets_with_zero_pt_s8acc32_ref(
k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
vector<int32_t> row_offsets(m);
row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data());
float C_multiplier =
(bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f;
int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5;
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
// "A unpacked");
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n,
// "B unpacked");
// packedB.printPackedMatrix("B Packed");
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
double total_packing_time = 0.0;
double total_computing_time = 0.0;
double total_kernel_time = 0.0;
double total_postprocessing_time = 0.0;
double total_run_time = 0.0;
#endif
cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k
<< ", ";
switch (bench_type) {
case BenchmarkType::BARE_BONE:
cout << setw(30) << "bare_bone";
break;
case BenchmarkType::REQUANTIZATION:
cout << setw(30) << "requantization";
break;
case BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION:
cout << setw(30) << "row_offset_and_requantization";
break;
case BenchmarkType::EVERYTHING:
cout << setw(30) << "everything";
break;
};
cout << ", ";
requantize_u8acc32_ref(
m,
n,
n,
Cint32_mkl.data(),
Cint8_mkl.data(),
&C_multiplier,
C_zero_pt,
Aint8_zero_point,
&Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
nullptr, // bias
n); // ncols per quant group
CompressedSparseColumn B_csc(k, n);
float density = 0.001f;
// deterministic random number
default_random_engine eng;
binomial_distribution<> per_col_nnz_dist(k, density);
if (bench_type == BenchmarkType::EVERYTHING) {
vector<int> row_indices(k);
int total_nnz = 0;
for (int j = 0; j < n; ++j) {
B_csc.ColPtr()[j] = total_nnz;
int nnz_of_j = per_col_nnz_dist(eng);
total_nnz += nnz_of_j;
iota(row_indices.begin(), row_indices.end(), 0);
shuffle(row_indices.begin(), row_indices.end(), eng);
sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
B_csc.RowIdx().push_back(row_indices[kidx]);
// put the current B value
B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
// make current B value zero
Bint8[row_indices[kidx] * n + j] = 0;
// std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
// endl;
}
}
B_csc.ColPtr()[n] = total_nnz;
}
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_mkl.data(), m, n, n, "C mkl");
ttot = 0;
runType = "FBGEMM_i8_acc16";
for (auto i = 0; i < NWARMUP + NITER; ++i) {
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
packing_time = 0.0;
computing_time = 0.0;
kernel_time = 0.0;
postprocessing_time = 0.0;
run_time = 0.0;
#endif
llc_flush(llc);
begin = chrono::high_resolution_clock::now();
#ifdef _OPENMP
#pragma omp parallel
#endif
{
vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAMatrix<uint8_t, int16_t> packA(
matrix_op_t::NoTranspose, m, k, Aint8.data(), k, nullptr, 1);
PackAWithRowOffset<uint8_t, int16_t> packAWithRowOffset(
matrix_op_t::NoTranspose,
m,
k,
Aint8.data(),
k,
nullptr,
1,
row_offset_buf.data());
// no-op output process objects
DoNothing<int32_t, int32_t> doNothing32BitObj;
memCopy<> memcopyObj(doNothing32BitObj);
// spmdm -> requantization -> nothing
// construct an output processing pipeline in reverse order
// i.e. last output operation first
// Last operation should always be DoNothing with
// correct input and output type.
DoNothing<> doNothingObj{};
// Requantization back to int8
ReQuantizeOutput<false> reqObj(
doNothingObj,
&C_multiplier,
C_zero_pt,
Aint8_zero_point,
&Bint8_zero_point,
bench_type == BenchmarkType::REQUANTIZATION
? nullptr
: packAWithRowOffset.getRowOffsetBuffer(),
col_offsets.data(),
nullptr,
n);
// the top most (first) operation in the output processing
// pipeline is spmdm
// outType = final output type after fullly processing through
// pipeline; inType = initial input type at the first call to the
// whole pipeline
DoSpmdmOnInpBuffer<
ReQuantizeOutput<false>::outType,
int32_t,
ReQuantizeOutput<false>>
spmdmObj(reqObj, Aint8.data(), k, B_csc);
int num_threads = fbgemm_get_num_threads();
int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
switch (bench_type) {
case BenchmarkType::BARE_BONE:
fbgemmPacked(
packA,
packedB,
Cint32_fb.data(),
Cint32_fb.data(),
n,
memcopyObj,
tid,
num_threads);
break;
case BenchmarkType::REQUANTIZATION:
fbgemmPacked(
packA,
packedB,
Cint8_fb.data(),
Cint32_fb.data(),
n,
reqObj,
tid,
num_threads);
break;
case BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION:
fbgemmPacked(
packAWithRowOffset,
packedB,
Cint8_fb.data(),
Cint32_fb.data(),
n,
reqObj,
tid,
num_threads);
break;
case BenchmarkType::EVERYTHING:
fbgemmPacked(
packAWithRowOffset,
packedB,
Cint8_fb.data(),
Cint32_fb.data(),
n,
spmdmObj,
tid,
num_threads);
break;
};
}
end = chrono::high_resolution_clock::now();
if (i >= NWARMUP) {
auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
ttot += dur.count();
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
total_packing_time += packing_time;
total_computing_time += computing_time;
total_kernel_time += kernel_time;
total_postprocessing_time += postprocessing_time;
total_run_time += run_time;
#endif
}
}
if (flush) {
((volatile char*)(llc.data()))[0] += 1;
}
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
// "A unpacked");
// printMatrix(matrix_op_t::NoTranspose, Cint8_local.data(),
// m, n, n, "C requantized after");
// printMatrix(matrix_op_t::NoTranspose,
// Cint8_fb.data(), m, n, n, "C fb");
// printMatrix(matrix_op_t::NoTranspose,
// col_offsets.data(), 1, n, n, "col offsets after");
// compare_buffers(row_offsets.data(), row_offset_buf.data(),
// row_offsets.size(), 5);
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
cout << fixed << total_packing_time / (double)NITER / 1e6 << ", "
<< total_kernel_time / (double)NITER / 1e6 << ", "
<< total_postprocessing_time / (double)NITER / 1e6 << ", "
<< total_run_time / (double)NITER / 1e6 << ", ";
#endif
cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
<< NITER * nops / ttot << endl;
#ifdef USE_MKL
if (bench_type == BenchmarkType::BARE_BONE) {
compare_buffers(Cint32_mkl.data(), Cint32_fb.data(), m, n, n, 5);
} else {
compare_buffers(Cint8_mkl.data(), Cint8_fb.data(), m, n, n, 5);
}
#endif
} // test_outlier
cout << endl;
} // shapes
}