bench/RequantizeBenchmark.cc (123 lines of code) (raw):

/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include <chrono> #include <initializer_list> #include <iomanip> #include <iostream> #ifdef _OPENMP #include <omp.h> #endif #include "./BenchUtils.h" #include "fbgemm/Fbgemm.h" using namespace std; using namespace fbgemm; enum class BenchmarkType { BARE_BONE, BIAS, A_ASYMMETRIC, B_ASYMMETRIC, PER_CHANNEL, }; void performance_test() { constexpr int NWARMUP = 4; constexpr int NITER = 256; cout << setw(4) << "len" << ", " << setw(10) << "Type" << ", B_elements_per_sec" << endl; for (int len : {1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256}) { aligned_vector<float> C_multiplier(len); randFill<float>(C_multiplier, -8, 8); aligned_vector<int32_t> Bint8_zero_point(len), row_offset_buf(len), col_offsets(len), bias_vector(len), input(len); randFill<int32_t>(Bint8_zero_point, -8, 8); randFill<int32_t>(row_offset_buf, -8, 8); randFill<int32_t>(col_offsets, -8, 8); randFill<int32_t>(bias_vector, -8, 8); randFill<int32_t>(input, -8, 8); int32_t C_zero_point = -3; block_type_t block{0, 1, 0, len}; aligned_vector<uint8_t> output(len); for (BenchmarkType bench_type : {BenchmarkType::BARE_BONE, BenchmarkType::BIAS, BenchmarkType::A_ASYMMETRIC, BenchmarkType::B_ASYMMETRIC, BenchmarkType::PER_CHANNEL}) { int32_t Aint8_zero_point = bench_type < BenchmarkType::A_ASYMMETRIC ? 0 : -3; if (bench_type < BenchmarkType::B_ASYMMETRIC) { Bint8_zero_point[0] = 0; } const int32_t* bias = bench_type == BenchmarkType::BARE_BONE ? nullptr : bias_vector.data(); double duration = 0.0; DoNothing<> doNothingObj{}; if (bench_type == BenchmarkType::PER_CHANNEL) { ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj( doNothingObj, C_multiplier.data(), C_zero_point, Aint8_zero_point, Bint8_zero_point.data(), row_offset_buf.data(), col_offsets.data(), bias, len); duration = measureWithWarmup( [&]() { reqObj.f<inst_set_t::avx2>( output.data(), input.data(), block, len, len); }, NWARMUP, NITER); } else { ReQuantizeOutput<false> reqObj( doNothingObj, C_multiplier.data(), C_zero_point, Aint8_zero_point, Bint8_zero_point.data(), row_offset_buf.data(), col_offsets.data(), bias, len); duration = measureWithWarmup( [&]() { reqObj.f<inst_set_t::avx2>( output.data(), input.data(), block, len, len); }, NWARMUP, NITER); } duration *= 1e9; // convert to ns cout << setw(4) << len << ", "; switch (bench_type) { case BenchmarkType::BARE_BONE: cout << setw(10) << "bare_bone"; break; case BenchmarkType::BIAS: cout << setw(10) << "bias"; break; case BenchmarkType::A_ASYMMETRIC: cout << setw(10) << "a_asymmetric"; break; case BenchmarkType::B_ASYMMETRIC: cout << setw(10) << "b_asymmetric"; break; case BenchmarkType::PER_CHANNEL: cout << setw(10) << "per_channel"; break; } cout << ", " << setw(10) << setprecision(3) << len / duration << endl; } // for each bench_type } // for each length } // performance_test int main() { #ifdef _OPENMP // Use 1 thread unless OMP_NUM_THREADS is explicit set. const char* val = getenv("OMP_NUM_THREADS"); if (val == nullptr || !*val) { omp_set_num_threads(1); } #endif performance_test(); return 0; }