// src/PackWeightsForDirectConv.cc
/*
* Copyright (c) Facebook, Inc. and its affiliates.
* All rights reserved.
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#define FBGEMM_EXPORTS
#include "fbgemm/FbgemmI8DirectconvAvx2.h"
#include <immintrin.h>
#include <cassert>
#include "./DirectConv.h"
#include "./ExecuteKernel.h"
#include "./MaskAvx2.h"
#include "fbgemm/ConvUtils.h"
#include "fbgemm/Fbgemm.h"
#include "fbgemm/FbgemmBuild.h"
#include "fbgemm/UtilsAvx2.h"
#include "./CodeGenHelpers.h"
#include "./OptimizedKernelsAvx2.h"
#include "./RefImplementations.h"
#include "./TransposeUtils.h"
#include "fbgemm/QuantUtilsAvx512.h"
namespace fbgemm {
PackedDirectConvMatrix::PackedDirectConvMatrix(
int IC_per_G,
int OC_per_G,
int filter_prod,
const int8_t* smat) {
// Allocate packed arrays
int kernel_prod_aligned = (filter_prod + 1) / 2 * 2;
pmat_ = static_cast<int8_t*>(fbgemmAlignedAlloc(
64,
((OC_per_G + 31) / 32 * 32) * kernel_prod_aligned * IC_per_G *
sizeof(int8_t)));
// the transposed weight layout: W[oc/8][r][s][ic/4][8][4]
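// Worked example of the mapping below (illustrative only; sizes chosen
// arbitrarily): with IC_per_G = 8, OC_per_G = 16 and a 3x3 filter
// (filter_prod = 9), the weight for k = 9, f = 4, c = 5 has ocB = 1, ocb = 1,
// icB = 1, icb = 1 and therefore lands at packed offset
// (((1 * 9 + 4) * 2 + 1) * 8 + 1) * 4 + 1 = 869, while its source offset in
// smat is ((9 * 9 + 4) * 8 + 5) = 685.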
for (int g = 0; g < /* G */ 1; ++g) {
for (int k = 0; k < OC_per_G; ++k) {
for (int f = 0; f < filter_prod; ++f) {
for (int c = 0; c < IC_per_G; ++c) {
int ocB = k / 8;
int ocb = k % 8;
int icB = c / 4;
int icb = c % 4;
pmat_
[((((g * (OC_per_G / 8) + ocB) * filter_prod + f) *
(IC_per_G / 4) +
icB) *
8 +
ocb) *
4 +
icb] =
smat[((g * OC_per_G + k) * filter_prod + f) * IC_per_G + c];
}
}
}
}
}
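// Minimal usage sketch (illustrative; shapes are arbitrary and `weights` is a
// hypothetical int8 buffer laid out output-channel-major as
// [OC_per_G][filter_prod][IC_per_G], matching the smat indexing above):
//
//   // 3x3 filter, 8 input channels and 16 output channels per group
//   PackedDirectConvMatrix packedB(
//       /*IC_per_G=*/8, /*OC_per_G=*/16, /*filter_prod=*/3 * 3, weights);
//   auto* pmat = packedB.PackedMat(); // packed as W[oc/8][r][s][ic/4][8][4]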
PackedDirectConvMatrix::~PackedDirectConvMatrix() {
fbgemmAlignedFree(pmat_);
}
template <int kSpatialDim>
void PackedDirectConvMatrix::col_offsets_with_zero_pt_s8acc32_DirectConvT(
const fbgemm::conv_param_t<kSpatialDim>& conv_p,
std::int32_t* B_zero_point,
std::vector<int32_t>& col_offsets,
int ncols_per_quant_group) {
// When the direct convolution implementation is used, the col_offsets of the
// weight matrix are computed on the first inference call.
// Computing col_offsets for direct convolution requires the shape of the
// output matrix, so it cannot be done inside the weight packing function at
// initialization time as in the other quantized conv implementations.
// Instead, the computation is invoked during the forward pass, and only the
// first pass populates col_offsets.
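// Concretely, the loops below compute, for every output position (oh, ow)
// and output channel oc of the transposed convolution:
//
//   col_offsets[(oh * OUT_DIM1 + ow) * OC + oc] =
//       sum over { (ih, iw, kh, kw, ic) : ih * stride0 + kh == oh,
//                                         iw * stride1 + kw == ow }
//           of W[oc][kh][kw][ic]
//     - B_zero_point[oc / ncols_per_quant_group] * (number of summed terms)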
if (first_call == false) {
return;
}
int IC = conv_p.IC;
int OC = conv_p.OC;
int IN_DIM0 = conv_p.IN_DIM[0];
int IN_DIM1 = conv_p.IN_DIM[1];
int OUT_DIM0 = conv_p.OUT_DIM[0];
int OUT_DIM1 = conv_p.OUT_DIM[1];
int K0 = conv_p.K[0];
int K1 = conv_p.K[1];
int stride0 = conv_p.stride[0];
int stride1 = conv_p.stride[1];
int MDim = conv_p.MB * OUT_DIM0 * OUT_DIM1;
int NDim = conv_p.OC / conv_p.G;
col_offsets.resize(MDim * NDim, 0);
std::fill(col_offsets.begin(), col_offsets.end(), 0);
std::vector<int> count(MDim * NDim, 0);
for (int oc = 0; oc < OC; oc++) {
for (int ih = 0; ih < IN_DIM0; ih++) {
for (int iw = 0; iw < IN_DIM1; iw++) {
for (int kh = 0; kh < K0; kh++) {
for (int kw = 0; kw < K1; kw++) {
for (int ic = 0; ic < IC; ic++) {
int oh = ih * stride0 + kh;
int ow = iw * stride1 + kw;
col_offsets[(oh * OUT_DIM1 + ow) * OC + oc] += pmat_
[(((((oc / 8) * K0 + kh) * K1 + kw) * (IC / 4) + ic / 4) * 8 +
(oc % 8)) *
4 +
(ic % 4)];
count[(oh * OUT_DIM1 + ow) * OC + oc]++;
}
}
}
}
}
}
for (int oc = 0; oc < OC; oc++) {
for (int oh = 0; oh < OUT_DIM0; oh++) {
for (int ow = 0; ow < OUT_DIM1; ow++) {
col_offsets[(oh * OUT_DIM1 + ow) * OC + oc] -=
B_zero_point[oc / ncols_per_quant_group] *
count[(oh * OUT_DIM1 + ow) * OC + oc];
}
}
}
first_call = false;
}
template FBGEMM_API void
PackedDirectConvMatrix::col_offsets_with_zero_pt_s8acc32_DirectConvT<1>(
const fbgemm::conv_param_t<1>& conv_p,
std::int32_t* B_zero_point,
std::vector<int32_t>& col_offsets,
int ncols_per_quant_group);
template FBGEMM_API void
PackedDirectConvMatrix::col_offsets_with_zero_pt_s8acc32_DirectConvT<2>(
const fbgemm::conv_param_t<2>& conv_p,
std::int32_t* B_zero_point,
std::vector<int32_t>& col_offsets,
int ncols_per_quant_group);
template FBGEMM_API void
PackedDirectConvMatrix::col_offsets_with_zero_pt_s8acc32_DirectConvT<3>(
const fbgemm::conv_param_t<3>& conv_p,
std::int32_t* B_zero_point,
std::vector<int32_t>& col_offsets,
int ncols_per_quant_group);
template <int SPATIAL_DIM>
void directConvRowSum(
const conv_param_t<SPATIAL_DIM>& conv_p,
const uint8_t* A,
int32_t* inSum,
int32_t* rowSum) {
int IN0 = conv_p.IN_DIM[0];
int IN1 = conv_p.IN_DIM[1];
int IC = conv_p.IC;
int K0 = conv_p.K[0];
int K1 = conv_p.K[1];
int OUT0 = conv_p.OUT_DIM[0];
int OUT1 = conv_p.OUT_DIM[1];
int stride = conv_p.stride[1];
memset(rowSum, 0, sizeof(int32_t) * OUT0 * OUT1);
for (int ih = 0; ih < IN0; ++ih) {
for (int iw = 0; iw < IN1; ++iw) {
inSum[ih * IN1 + iw] = reduceAvx2(A + ih * IN1 * IC + iw * IC, IC);
}
}
for (int ih = 0; ih < IN0; ++ih) {
for (int iw = 0; iw < IN1; iw++) {
for (int r = 0; r < K0; ++r) {
for (int s = 0; s < K1; ++s) {
rowSum[(ih + r) * OUT1 + iw * stride + s] += inSum[ih * IN1 + iw];
}
}
}
}
}
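// In effect, directConvRowSum first reduces each input pixel to a single
// per-pixel sum over its IC channels (inSum), then scatters those sums onto
// the K0 x K1 output footprint of the transposed convolution:
//
//   rowSum[oh * OUT1 + ow] =
//       sum over { (ih, iw, r, s) : ih + r == oh, iw * stride + s == ow }
//           of sum_c A[ih][iw][c]
//
// This buffer is later passed to requantization in place of the usual
// row-offset buffer (see rowSum -> rowOffsetBuf in fbgemmDirectConv below).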
template void directConvRowSum<1>(
const conv_param_t<1>& conv_p,
const uint8_t* A,
int32_t* inSum,
int32_t* rowSum);
template void directConvRowSum<2>(
const conv_param_t<2>& conv_p,
const uint8_t* A,
int32_t* inSum,
int32_t* rowSum);
template void directConvRowSum<3>(
const conv_param_t<3>& conv_p,
const uint8_t* A,
int32_t* inSum,
int32_t* rowSum);
template <
int SPATIAL_DIM,
QuantizationGranularity Q_GRAN,
bool FUSE_RELU,
typename BIAS_TYPE>
void fbgemmDirectConv(
const conv_param_t<SPATIAL_DIM>& conv_p,
const uint8_t* Aint8,
PackedDirectConvMatrix& Bint8_tr,
uint8_t* C,
int32_t* C_buffer,
const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
const BIAS_TYPE* bias,
int thread_id,
int num_threads) {
// Only single-threaded execution is supported for now; multithreading will be
// enabled later, so all threads other than thread 0 return immediately.
if (thread_id > 0 || thread_id >= num_threads) {
return;
}
if (SPATIAL_DIM != 2) {
assert(false && "1d/3d direct conv not supported");
} else {
if (conv_p.transposed) {
DirectConvCodeGenBase<uint8_t, int8_t, int32_t, int32_t>::
jit_micro_kernel_fp_convT fn;
DirectConvCodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
fn = codeObj.getOrCreateDirectConvTrans<inst_set_t::avx2>(
true, conv_p.stride[1], conv_p.K[1]);
int32_t* inSum = static_cast<int32_t*>(fbgemmAlignedAlloc(
64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * sizeof(int32_t)));
int32_t* rowSum = static_cast<int32_t*>(fbgemmAlignedAlloc(
64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * sizeof(int32_t)));
directConvRowSum(conv_p, Aint8, inSum, rowSum);
int kernel_dim = conv_p.K[0] * conv_p.K[1];
std::memset(
C_buffer,
0,
sizeof(int32_t) * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
std::memset(
C,
0,
sizeof(int8_t) * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
// Run the JIT'ed transposed-conv micro-kernel over blocks of 8 output
// channels and over input rows, accumulating into C_buffer.
for (int i = 0; i < conv_p.OC; i += 8) {
for (int j = 0; j < conv_p.IN_DIM[0]; j++) {
fn(Aint8 + j * conv_p.IC * conv_p.IN_DIM[1],
Bint8_tr.PackedMat() + i * kernel_dim * conv_p.IC,
C_buffer + j * conv_p.OUT_DIM[1] * conv_p.OC + i,
conv_p.IC,
conv_p.OC,
(conv_p.OC * conv_p.OUT_DIM[1] - conv_p.OC * conv_p.K[1]) * 4,
conv_p.IN_DIM[1]);
}
}
int32_t A_zero_point = outProcess.getAZeroPoint();
const int32_t* B_zero_point = outProcess.getBZeroPoint();
const int32_t* col_offsets = outProcess.getColOffsets();
requantizationParams_t<BIAS_TYPE> reqObj = {
outProcess.getAZeroPoint(),
outProcess.getBZeroPoint(),
outProcess.getCZeroPoint(),
outProcess.getCMultiplier(),
rowSum, // rowOffsetBuf,
outProcess.getColOffsets(),
(outProcess.getBias()),
static_cast<std::uint32_t>(conv_p.OC), // outProcess.getNCols(),
1, // groups
outProcess.getActWScale()};
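// The calls below dispatch on three runtime conditions that map to
// compile-time flags of requantizeOutputProcessingAvx2:
//   A_SYMMETRIC : A_zero_point == 0 or no col_offsets were supplied,
//   B_SYMMETRIC : tensor-granularity quantization with B_zero_point[0] == 0,
//   HAS_BIAS    : bias != nullptr.
// Q_GRAN, FUSE_RELU and BIAS_TYPE are forwarded unchanged; the trailing
// `true` template argument appears to select the direct-convolution variant
// of the requantization kernel (an assumption based on how it is used here).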
// Dispatch HAS_BIAS
if (bias == nullptr) {
// Dispatch A_SYMMETRIC and B_SYMMETRIC
if (A_zero_point == 0 || col_offsets == nullptr) {
if (Q_GRAN == QuantizationGranularity::TENSOR &&
B_zero_point[0] == 0) {
requantizeOutputProcessingAvx2<
true,
true,
QuantizationGranularity::TENSOR,
false, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
} else {
requantizeOutputProcessingAvx2<
true,
false,
Q_GRAN,
false, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
}
} else {
if (Q_GRAN == QuantizationGranularity::TENSOR &&
B_zero_point[0] == 0) {
requantizeOutputProcessingAvx2<
false,
true,
QuantizationGranularity::TENSOR,
false, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
} else {
requantizeOutputProcessingAvx2<
false,
false,
Q_GRAN,
false, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
}
}
} else { // has_bias == true
// dispatch A_SYMMETRIC and B_SYMMETRIC
if (A_zero_point == 0 || col_offsets == nullptr) {
if (Q_GRAN == QuantizationGranularity::TENSOR &&
B_zero_point[0] == 0) {
requantizeOutputProcessingAvx2<
true,
true,
QuantizationGranularity::TENSOR,
true, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
} else {
requantizeOutputProcessingAvx2<
true,
false,
Q_GRAN,
true, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
}
} else {
if (Q_GRAN == QuantizationGranularity::TENSOR &&
B_zero_point[0] == 0) {
requantizeOutputProcessingAvx2<
false,
true,
QuantizationGranularity::TENSOR,
true, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
} else {
requantizeOutputProcessingAvx2<
false,
false,
Q_GRAN,
true, // HAS_BIAS,
FUSE_RELU,
BIAS_TYPE,
true>(
C,
C_buffer,
{0, conv_p.OUT_DIM[1] * conv_p.OUT_DIM[0], 0, conv_p.OC},
conv_p.OC,
conv_p.OC,
reqObj);
}
}
}
fbgemmAlignedFree(inSum);
fbgemmAlignedFree(rowSum);
} // transposed conv
else { // non-transposed conv
assert(false && "non-transposed direct conv not integrated yet.");
}
} // else SPATIAL_DIM
}
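// End-to-end usage sketch (illustrative; error handling, threading and the
// ReQuantizeOutput construction details are omitted):
//
//   // 1. Pack the transposed-conv weights once.
//   PackedDirectConvMatrix packedB(IC_per_G, OC_per_G, K0 * K1, Bint8);
//
//   // 2. On the first forward pass, fill the per-output-position column
//   //    offsets (a no-op on subsequent passes).
//   packedB.col_offsets_with_zero_pt_s8acc32_DirectConvT(
//       conv_p, Bq_zero_point, col_offsets, ncols_per_quant_group);
//
//   // 3. Build a ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE> that
//   //    references col_offsets, then run the direct convolution.
//   fbgemmDirectConv<2, Q_GRAN, FUSE_RELU, BIAS_TYPE>(
//       conv_p, Aint8, packedB, Cint8, C_buffer, outProcess, bias,
//       /*thread_id=*/0, /*num_threads=*/1);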
#define INSTANTIATE_REQUANTIZE_SPATIAL_DIM( \
SPATIAL_DIM, Q_GRAN, RELU, BIAS_TYPE) \
template void FBGEMM_API \
fbgemmDirectConv<SPATIAL_DIM, Q_GRAN, RELU, BIAS_TYPE>( \
const conv_param_t<SPATIAL_DIM>& conv_p, \
const uint8_t* Aint8, \
PackedDirectConvMatrix& Bint8_tr, \
uint8_t* C, \
int32_t* C_buffer, \
const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \
const BIAS_TYPE* bias, \
int thread_id, \
int num_threads);
#define INSTANTIATE_REQUANTIZE_BIAS_TYPE(Q_GRAN, RELU, BIAS_TYPE) \
INSTANTIATE_REQUANTIZE_SPATIAL_DIM(1, Q_GRAN, RELU, BIAS_TYPE) \
INSTANTIATE_REQUANTIZE_SPATIAL_DIM(2, Q_GRAN, RELU, BIAS_TYPE) \
INSTANTIATE_REQUANTIZE_SPATIAL_DIM(3, Q_GRAN, RELU, BIAS_TYPE)
#define INSTANTIATE_REQUANTIZE(Q_GRAN, RELU) \
INSTANTIATE_REQUANTIZE_BIAS_TYPE(Q_GRAN, RELU, float) \
INSTANTIATE_REQUANTIZE_BIAS_TYPE(Q_GRAN, RELU, int32_t)
#define INSTANTIATE_Q_GRANS(RELU) \
INSTANTIATE_REQUANTIZE(QuantizationGranularity::TENSOR, RELU) \
INSTANTIATE_REQUANTIZE(QuantizationGranularity::GROUP, RELU) \
INSTANTIATE_REQUANTIZE(QuantizationGranularity::OUT_CHANNEL, RELU)
INSTANTIATE_Q_GRANS(true)
INSTANTIATE_Q_GRANS(false)
#undef INSTANTIATE_REQUANTIZE_SPATIAL_DIM
#undef INSTANTIATE_REQUANTIZE_BIAS_TYPE
#undef INSTANTIATE_REQUANTIZE
#undef INSTANTIATE_Q_GRANS
} // namespace fbgemm