in source/backend/cpu/compute/ConvInt8TiledExecutor.cpp [615:926]
ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
// Initialize.
mUseBatchQuan = false;
mIm2ColBasedInt8 = true;
auto option = static_cast<CPUBackend*>(backend())->getRuntime()->hint().dynamicQuantOption;
int batch = inputs[0]->batch();
int inC = inputs[0]->channel();
auto output = outputs[0];
int inputPlane = batch * inputs[0]->width() * inputs[0]->height();
auto planeSize = output->width() * output->height() * output->batch();
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
auto gcore = static_cast<CPUBackend*>(backend())->functions();
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
int kernelCount = mCommon->kernelY() * mCommon->kernelX();
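// Fast path: a 1x1 kernel with stride 1 and unchanged spatial size lets the whole input be quantized to int8 before im2col.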
bool fastway = (kernelCount == 1) && (output->width() == inputs[0]->width()) && (output->height() == inputs[0]->height()) && (mCommon->strideX() * mCommon->strideY()) == 1;
if (inputPlane > 1) {
mUseBatchQuan = true;
}
if (!fastway) { // general conv
mIm2ColBasedInt8 = false;
if (planeSize > 1) {
mUseBatchQuan = true;
}
if (option == 1) { // option 1 (lowest level): fall back to im2col-based int8 with single input quant.
mIm2ColBasedInt8 = true;
mUseBatchQuan = false;
}
}
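// 4-bit weights are packed two per byte, so each weight costs half a byte.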
float weightBytes = mResourceInt8->mActBits == 4 ? 0.5 : 1;
mBlockNum = mResourceInt8->mBlockNum;
#ifdef MNN_KLEIDIAI_ENABLED
KleidiAI& kai = KleidiAI::getInstance();
if(mResourceInt8->mDynamicQuant && mResourceInt8->mActBits == 4 && kai.canAccelerate(mAccelType)) {
MNN_ASSERT(kai.isLoaded(mAccelType));
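// KleidiAI path treats the convolution as a GEMM over m input rows, n output channels and k input channels.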
const size_t m = inputs[0]->batch();    // LHS vector count.
const size_t n = outputs[0]->channel(); // RHS vector count.
const size_t k = inputs[0]->channel();  // Vector length.
const size_t blkSize = mBlockNum == 1 ? 0 : k / mBlockNum;
int packedSize = kai.getLhsQuantedPackedSize(mAccelType, m, k, blkSize);
int elementSize = kai.isHalf() ? sizeof(__fp16) : sizeof(float);
if(m > 1 && !kai.isLinear()) {
int srcSize = m * k * elementSize;
int dstSize = m * n * elementSize;
int extraSize = srcSize > dstSize ? srcSize : dstSize;
packedSize += extraSize;
}
// Split mTempIm2ColBuffer into two parts for the linear/tile transfer:
// Part 0: packed LHS.
// Part 1: LHS/Dst staging before the transfer.
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({packedSize}));
bool success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
if (!success) {
MNN_ERROR("Out of dynamic memory!\n");
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
return NO_ERROR;
}
#endif
CPUConvolution::onResize(inputs, outputs);
if (mResourceInt8->mDynamicQuant == false) {
mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
if (!mMutableResource->mResource->mUseConvQuan) {
// In some older quantized models the input scale is already fused into the weight and output scales,
// so the input scale does not need to be read separately here.
mBatchQuantInfo.reset(Tensor::createDevice<int8_t>({1, DST_XUNIT * QUANT_INFO_BYTES}));
auto success = backend()->onAcquireBuffer(mBatchQuantInfo.get(), Backend::DYNAMIC);
if (!success) {
return OUT_OF_MEMORY;
}
}
mBlockNum = 1;
mIm2ColBasedInt8 = true;
mUseBatchQuan = false;
}
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core);
// input scale buffer
const int threads = static_cast<CPUBackend*>(backend())->threadNumber();
// Im2col info
int im2colBytes = 1;
const int L2Size = 2048;
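// Bound the number of im2col columns per tile so a tile (kernelCountUnit * SRC_UNIT bytes per column) roughly fits in L2Size.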
int tileLimitByC = UP_DIV(L2Size, mIm2ColParamter.kernelCountUnit * SRC_UNIT);
if (mIm2ColBasedInt8 == false) {
im2colBytes = gcore->bytes;
tileLimitByC = 1;
}
int ic = inputs[0]->channel();
int tileLimit = 0;
int outC = output->channel();
int outC4 = UP_DIV(outC, gcore->pack);
auto kernelCountUnit = mIm2ColParamter.kernelCountUnit;
mSplitByOc = true;
// Estimate compute (flop) and memory traffic (ios); the flop/ios ratio guides computeDivideSizes below.
float flop = gcore->bytes * planeSize * (ROUND_UP(output->channel(), gcore->pack) * kernelCountUnit * SRC_UNIT / 1024.0 / 1024.0 / 1024.0);
float ios = (((CPUBackend*)backend())->getTensorSize(outputs[0], true) + ((CPUBackend*)backend())->getTensorSize(inputs[0], true) + ((CPUBackend*)backend())->getTensorSize(mResourceInt8->mWeightInt8.get()) * weightBytes) / (1024.0 * 1024.0 * 1024.0);
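// Threading strategy: split by output tiles (NHW) when there are more tiles than threads, otherwise split by output channel.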
if (threads < planeSize) { // Thread split by output nhw.
tileLimit = ALIMIN(tileLimitByC, UP_DIV(planeSize, threads));
mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT);
auto DynamicDestUnit = DST_XUNIT * mIm2ColCount;
mTileCount = UP_DIV(planeSize, DynamicDestUnit);
if (mTileCount > threads) {
mSplitByOc = false;
}
}
if (mSplitByOc) {
tileLimit = ALIMIN(tileLimitByC, planeSize);
mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT);
auto DynamicDestUnit = DST_XUNIT * mIm2ColCount;
mTileCount = UP_DIV(planeSize, DynamicDestUnit);
auto ocPerThread = UP_DIV(outC4, threads);
auto threadNeed = UP_DIV(outC4, ocPerThread);
int totalWork = outC4;
int part = 1;
if (UNIT > gcore->pack) { // e.g. AVX512: UNIT=64, pack=16
MNN_ASSERT(UNIT % gcore->pack == 0);
int ocDivUnit = UP_DIV(outC4 * gcore->pack, UNIT);
ocPerThread = UP_DIV(ocDivUnit, threads);
threadNeed = UP_DIV(ocDivUnit, ocPerThread);
totalWork = ocDivUnit;
part = UNIT / gcore->pack;
}
mThreadNums = ALIMIN(threads, threadNeed);
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1, flop / ios);
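// computeDivideSizes fills cumulative per-thread boundaries; multiply by part to convert UNIT-granularity work back to pack granularity.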
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= part;
}
}
if (!mSplitByOc) {
mThreadNums = ALIMIN(threads, mTileCount);
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1, flop / ios);
}
int ocUp4 = ROUND_UP(outC, gcore->pack);
int k = mThreadNums;
int workPT = DST_XUNIT * mIm2ColCount;
if (mSplitByOc) {
k = 1; // Use one thread to finish im2col.
workPT = mTileCount * DST_XUNIT * mIm2ColCount;
}
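// Blit info records the source regions im2col copies for each output position.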
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(workPT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, k);
mBlitInfoStride = blitInfoSize.second;
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
const int unitColBufferSize = kernelCountUnit * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
const int colBufferSize = unitColBufferSize * mIm2ColCount;
if (!mSplitByOc) {
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({threads, colBufferSize * im2colBytes}));
mTempSrcSum = bufferAlloc->alloc(threads * mBlockNum * DST_XUNIT * mIm2ColCount * QUANT_INFO_BYTES);
} else {
mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mTileCount, colBufferSize * im2colBytes}));
mTempSrcSum = bufferAlloc->alloc(mTileCount * mBlockNum * DST_XUNIT * mIm2ColCount * QUANT_INFO_BYTES);
}
auto success = backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
if (!success || mBlitInfo.invalid() || mTempSrcSum.invalid()) {
return OUT_OF_MEMORY;
}
if (false == mResourceInt8->mDynamicQuant) {
bufferAlloc->free(mBlitInfo);
bufferAlloc->free(mTempSrcSum);
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
if (mBatchQuantInfo.get()) {
backend()->onReleaseBuffer(mBatchQuantInfo.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
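// Dynamic quantization path: the buffers below are used to quantize the input at runtime (built only with MNN_LOW_MEMORY).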
#ifdef MNN_LOW_MEMORY
{ // Dynamic Quant kernels
mGemmKernel = core->Int8GemmKernel;
if (mResourceInt8->mActBits == 4) {
mGemmKernel = core->Int8GemmKernel_W4;
}
mQuantFunc = core->MNNFloat2Int8;
if (gcore->bytes == 2 && gcore->pack == 8) {
mGemmKernel = core->MNNGemmInt8AddBiasScale_Unit_FP16;
if (mResourceInt8->mActBits == 4) {
mGemmKernel = core->MNNGemmInt8AddBiasScale_w4_Unit_FP16;
}
mQuantFunc = core->DynamicQuanInput_ARM82;
mQuantAndReorderFunc = core->DynamicQuanInputAndReorder_ARM82;
}
// Kernel that sums matrix A along the L (reduction) axis.
mSumByAxisLFunc = gcore->MNNSumByAxisLForMatmul_A;
}
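// option 2 enables block-wise quantization of the input; batch quant without it is symmetric (scales only).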
mInputBlockNum = (option == 2) ? mBlockNum : 1;
bool symmetricQuant = (option != 2 && mUseBatchQuan);
int size = 0;
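// Each quantized group stores a scale and a zero point (2 * QUANT_INFO_BYTES); the size is halved below for symmetric quant.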
if (!mUseBatchQuan) { // single quant
if (mSplitByOc) {
size = 2 * mInputBlockNum * ALIMIN(DST_XUNIT, planeSize) * QUANT_INFO_BYTES;
} else {
size = 2 * mInputBlockNum * mIm2ColCount * DST_XUNIT * QUANT_INFO_BYTES;
}
}
if (mUseBatchQuan) {
if (mIm2ColBasedInt8) {
size = 2 * mInputBlockNum * inputPlane * QUANT_INFO_BYTES;
} else if (!mSplitByOc){ // only threads buffer needed by this case
size = 2 * mInputBlockNum * mIm2ColCount * DST_XUNIT * QUANT_INFO_BYTES;
} else {
size = 2 * mInputBlockNum * planeSize * QUANT_INFO_BYTES;
}
}
if (symmetricQuant) { // symmetric quant stores scales only (no zero points), so half the size
size /= 2;
}
if (!mIm2ColBasedInt8 && !mSplitByOc) {
mBatchQuantInfo.reset(Tensor::createDevice<int8_t>({threads, size}));
} else {
mBatchQuantInfo.reset(Tensor::createDevice<int8_t>({1, size})); // keep dimensions=2!
}
success &= backend()->onAcquireBuffer(mBatchQuantInfo.get(), Backend::DYNAMIC);
// Dynamic quant.
// set im2col tensor info
if (mIm2ColBasedInt8) {
mQuantInput.reset(Tensor::createDevice<int8_t>({batch, mIm2ColParamter.ih, mIm2ColParamter.iw, ROUND_UP(inC, gcore->pack)}));
} else if (!mSplitByOc) {
mQuantInput.reset(Tensor::createDevice<int8_t>({threads, colBufferSize * 1}));
} else {
mQuantInput.reset(Tensor::createDevice<int8_t>({mTileCount, colBufferSize * 1}));
}
success &= backend()->onAcquireBuffer(mQuantInput.get(), Backend::DYNAMIC);
// set compute buffers for dynamic quant: per-thread min/max values and per-group scale/zero-point
int tempSize = threads * 2 * mInputBlockNum * inputPlane;
if (!mIm2ColBasedInt8) {
if (!mSplitByOc) {
tempSize = threads * 2 * mInputBlockNum * DST_XUNIT * mIm2ColCount;
} else {
tempSize = threads * 2 * mInputBlockNum * ROUND_UP(planeSize, DST_XUNIT);
}
}
if (symmetricQuant) { // symmetric batch quant.
tempSize /= 2;
}
mSizeInputBlockQuant = tempSize / threads;
mTempMaxMinValueBuffer = bufferAlloc->alloc(tempSize * gcore->bytes);
mQScaleZero = bufferAlloc->alloc(tempSize * QUANT_INFO_BYTES);
if (mQScaleZero.invalid()) {
return OUT_OF_MEMORY;
}
mToFuseInputbias2Bias = (!mUseBatchQuan && option != 2);
if (mToFuseInputbias2Bias) { // the whole input shares one scale/zero point, so the input zero point can be fused into the bias
if (mIm2ColBasedInt8) {
mBiasBufferFusedInputzero = bufferAlloc->alloc(ocUp4 * QUANT_INFO_BYTES);
} else {
mBiasBufferFusedInputzero = bufferAlloc->alloc(threads * ocUp4 * QUANT_INFO_BYTES);
}
if (mBiasBufferFusedInputzero.invalid()) {
return OUT_OF_MEMORY;
}
}
mAccumBuffer.reset(Tensor::createDevice<int32_t>({threads, DST_XUNIT * ALIMAX(UNIT, gcore->pack)}));
success &= backend()->onAcquireBuffer(mAccumBuffer.get(), Backend::DYNAMIC);
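// Block-wise quant combined with a spatial kernel needs a scratch buffer to reorder the im2col tile per block.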
if (mBlockNum > 1 && kernelCount > 1) {
if (mSplitByOc) {
mReorderBuffer = bufferAlloc->alloc(UP_DIV(planeSize, DST_XUNIT) * unitColBufferSize);
} else {
mReorderBuffer = bufferAlloc->alloc(threads * colBufferSize);
}
if (mReorderBuffer.invalid()) {
return OUT_OF_MEMORY;
}
}
if (!success || mTempMaxMinValueBuffer.invalid()) {
return OUT_OF_MEMORY;
}
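// The scratch buffers are only used inside onExecute; release them here so the dynamic allocator can reuse the memory.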
bufferAlloc->free(mBlitInfo);
bufferAlloc->free(mTempSrcSum);
bufferAlloc->free(mTempMaxMinValueBuffer);
bufferAlloc->free(mQScaleZero);
if (mBlockNum > 1 && kernelCount > 1) {
bufferAlloc->free(mReorderBuffer);
}
if (mToFuseInputbias2Bias) {
bufferAlloc->free(mBiasBufferFusedInputzero);
}
backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mBatchQuantInfo.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mQuantInput.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mAccumBuffer.get(), Backend::DYNAMIC);
#endif
return NO_ERROR;
}