bench/Depthwise3DBenchmark.cc (188 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "./AlignedVec.h"
#include "./BenchUtils.h"
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
#include "fbgemm/Utils.h"
#include "src/RefImplementations.h"
using namespace std;
using namespace fbgemm;
int main() {
#ifdef _OPENMP
// Use 1 thread unless OMP_NUM_THREADS is explicit set.
const char* val = getenv("OMP_NUM_THREADS");
if (val == nullptr || !*val) {
omp_set_num_threads(1);
}
#endif
// From ResNeXt-3D-101
// clang-format off
vector<vector<int>> shapes_3d = {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
// N, K, T_in, H_in, W_in, stride
{ 1, 64, 32, 56, 56, 1, },
{ 1, 128, 16, 28, 28, 1, },
{ 1, 256, 8, 14, 14, 1, },
{ 1, 512, 4, 7, 7, 1, },
{ 1, 128, 32, 56, 56, 2, },
{ 1, 256, 16, 28, 28, 2, },
{ 1, 512, 8, 14, 14, 2, },
{ 5, 64, 32, 56, 56, 1, },
{ 5, 128, 16, 28, 28, 1, },
{ 5, 256, 8, 14, 14, 1, },
{ 5, 512, 4, 7, 7, 1, },
{ 5, 128, 32, 56, 56, 2, },
{ 5, 256, 16, 28, 28, 2, },
{ 5, 512, 8, 14, 14, 2, },
{ 32, 24, 4, 56, 56, 1, },
{ 32, 24, 2, 28, 28, 1, },
{ 32, 48, 4, 56, 56, 1, },
{ 32, 48, 2, 28, 28, 1, },
{ 32, 48, 1, 14, 14, 1, },
{ 1, 8, 4, 4, 4, 1, },
};
// clang-format on
// Depthwise is memory BW bound so we want to flush LLC.
bool flush = true;
std::vector<char> llc;
if (flush) {
llc.resize(128 * 1024 * 1024, 1.0);
}
constexpr int NWARMUP = 4;
constexpr int NITER = 16;
for (auto shape : shapes_3d) {
int N = shape[0];
int K = shape[1];
int T = shape[2];
int H = shape[3];
int W = shape[4];
int stride_t = shape[5];
int stride_h = stride_t;
int stride_w = stride_t;
constexpr int K_T = 3, K_H = 3, K_W = 3;
constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1,
PAD_R = 1;
conv_param_t<3> conv_p(
N,
K,
K,
{T, H, W},
K,
{K_T, K_H, K_W},
{stride_t, stride_h, stride_w},
{PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
int T_OUT = conv_p.OUT_DIM[0];
int H_OUT = conv_p.OUT_DIM[1];
int W_OUT = conv_p.OUT_DIM[2];
int MDim = N * T_OUT * H_OUT * W_OUT;
int KDim = K_T * K_H * K_W * K;
int KDimPerGroup = KDim / conv_p.G;
aligned_vector<uint8_t> A(N * T * H * W * K);
aligned_vector<int8_t> B(KDim);
aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size());
aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
aligned_vector<float> C_multiplier(1);
randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_point = 5;
vector<int32_t> row_offsets(MDim);
// im2col to compute row offset later
vector<uint8_t> A_im2col(MDim * KDim);
im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
aligned_vector<int32_t> col_offsets(K);
aligned_vector<int32_t> bias(K);
randFill(col_offsets, -100, 100);
randFill(bias, -40, 40);
conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data());
for (int g = 0; g < conv_p.G; ++g) {
// Compute row offset
row_offsets_u8acc32_ref(
MDim,
KDimPerGroup,
KDim,
A_im2col.data() + g * KDimPerGroup,
row_offsets.data());
// Requantization
requantize_u8acc32_ref(
MDim,
1,
conv_p.G,
C_ref.data() + g,
C_uint8_ref.data() + g,
C_multiplier.data(),
C_zero_point,
A_zero_point,
&B_zero_point,
row_offsets.data(),
col_offsets.data() + g,
bias.data() + g,
K);
}
PackedDepthWiseConvMatrix Bp(K, 3 * 3 * 3, B.data());
double bytes =
(K *
(N * (2.0 * sizeof(int32_t) * T_OUT * H_OUT * W_OUT + T * H * W) +
K_T * K_H * K_W));
double ops = 2.0 * N * T_OUT * H_OUT * W_OUT * K * K_T * K_H * K_W;
double ttot = measureWithWarmup(
[&]() {
int num_threads = fbgemm_get_num_threads();
int tid = fbgemm_get_thread_num();
depthwise_3d_same_pad<QuantizationGranularity::TENSOR>(
conv_p,
A_zero_point,
A.data(),
&B_zero_point,
Bp,
C_multiplier.data(),
C_zero_point,
C_uint8.data(),
col_offsets.data(),
bias.data(),
false, /* fuse_relu */
nullptr, /* act_scale * w_scale */
tid,
num_threads);
},
NWARMUP,
NITER,
[&]() {
if (flush) {
llc_flush(llc);
}
},
true /*useOpenMP*/);
// correctness check
for (int n = 0; n < N; ++n) {
for (int t = 0; t < T_OUT; ++t) {
for (int h = 0; h < H_OUT; ++h) {
for (int w = 0; w < W_OUT; ++w) {
for (int g = 0; g < K; ++g) {
uint8_t expected = C_uint8_ref
[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + g];
uint8_t actual =
C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + g];
if (expected != actual) {
cerr << "Depthwise 3x3x3 results differ at (" << n << ", " << t
<< ", " << h << ", " << w << ", " << g << "). expected "
<< (int)expected << " actual " << (int)actual << endl;
return -1;
}
assert(expected == actual);
}
} // w
} // h
} // t
} // n
// Report performance
printf(
"N = %d K = %d T = %d H = %d W = %d stride = %d with requantization "
"fused\n",
N,
K,
T,
H,
W,
stride_h);
printf("GB/s = %f Gops/s = %f\n", bytes / ttot / 1e9, ops / ttot / 1e9);
} // for each shape
return 0;
}