source/backend/cpu/compute/Convolution1x1Strassen.cpp (198 lines of code) (raw):

// // Convolution1x1Strassen.cpp // MNN // // Created by MNN on 2019/02/12. // Copyright © 2018, Alibaba Group Holding Limited // #include "Convolution1x1Strassen.hpp" #include "DenseConvolutionTiledExecutor.hpp" #include <string.h> #include "core/BufferAllocator.hpp" #include "backend/cpu/CPUBackend.hpp" #include "core/Concurrency.h" #include "ConvOpt.h" #include "core/Macro.h" #include "CommonOptFunction.h" #include "core/TensorUtils.hpp" namespace MNN { Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight, size_t originWeightSize, const float *bias, size_t biasSize) : CPUConvolution(common, b) { auto outputCount = (int)biasSize; int ePack, lPack, hPack; auto core = static_cast<CPUBackend*>(b)->functions(); core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); mResource.reset(new CPUConvolution::Resource); mResource->backend = b; auto mSrcCount = (int)originWeightSize / outputCount; if (!mResource->copyBiasAlign(bias, (int)biasSize)) { MNN_ERROR("Not Enough Memory\n"); mValid = false; return; } // Use Float Weight. mResource->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputCount, hPack), UP_DIV(mSrcCount, lPack) * lPack, hPack})); mValid = b->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); if (!mValid) { MNN_ERROR("Not Enough Memory\n"); return; } if (b->getRuntime()->hint().useCachedMmap > 1) { return; } if (core->bytes < 4) { AutoRelease<Tensor> tempTensor(Tensor::createDevice<float>({outputCount * mSrcCount})); mValid = b->onAcquireBuffer(tempTensor.get(), Backend::STATIC); if (!mValid) { MNN_ERROR("Not Enough Memory\n"); return; } core->MNNFp32ToLowp(originWeight, tempTensor->host<int16_t>(), outputCount * mSrcCount); core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), tempTensor->host<float>(), outputCount, mSrcCount, true); b->onReleaseBuffer(tempTensor.get(), Backend::STATIC); } else { core->MNNPackForMatMul_B(mResource->mWeight->host<float>(), originWeight, outputCount, mSrcCount, true); } } Convolution1x1Strassen::Convolution1x1Strassen(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon *common, Backend* b) : CPUConvolution(common, b) { mResource = resource; } Convolution1x1Strassen::~Convolution1x1Strassen() { // Do nothing } bool Convolution1x1Strassen::onClone(Backend* bn, const Op* op, Execution** dst) { if (!mValid) { return false; } if (nullptr == dst) { return true; } *dst = new Convolution1x1Strassen(mResource, op->main_as_Convolution2D()->common(), bn); return true; } ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { CPUConvolution::onResize(inputs, outputs); auto core = static_cast<CPUBackend*>(backend())->functions(); int ePack, lPack, hPack; core->MNNGetMatMulPackMode(&ePack, &lPack, &hPack); int bytes = core->bytes; auto CONVOLUTION_TILED_NUMBER = ePack; auto input = inputs[0]; auto output = outputs[0]; const int numberThread = ((CPUBackend *)backend())->threadNumber(); auto ic = input->channel(); auto oc = output->channel(); auto icC4 = UP_DIV(ic, core->pack); auto ocC4 = UP_DIV(oc, core->pack); auto batch = input->batch(); auto matrixSizeE = output->height() * output->width() * input->batch(); auto outputPlane = output->height() * output->width(); mUnits.clear(); std::shared_ptr<char> __autoFunction; auto padY = mPadY; auto padX = mPadX; auto strideX = mCommon->strideX(); auto strideY = mCommon->strideY(); auto postParameters = getPostParameters(); auto memoryPool = ((CPUBackend *)backend())->getBufferAllocator(); memoryPool->barrierBegin(); std::shared_ptr<void> __a(nullptr, [memoryPool](void *) { memoryPool->barrierEnd(); }); int maxDepth = 5; auto icAlign = UP_DIV(ic, lPack) * lPack; auto weightTensor = mResource->mWeight.get(); uint8_t* dequantAlpha = nullptr; uint8_t* dequantBias = nullptr; int dequantBits = bytes * 8; // fp16:16, fp32:32 mWeightBytes = bytes; if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) { std::vector<int> divides(numberThread+1); divides[0] = 0; static_cast<CPUBackend *>(backend())->computeDivideSizes(matrixSizeE, divides.data()+1); mUnits.resize(numberThread); for (int i = 0; i < numberThread; ++i) { int planeStart = divides[i]; int planeEnd = divides[i+1]; int planeSize = planeEnd - planeStart; Unit &unit = mUnits[i]; if (planeSize <= 0) { unit.mValid = false; continue; } unit.offset[1] = 0; unit.offset[2] = 0; unit.offset[0] = core->pack * planeStart * bytes; unit.offset[3] = core->pack * planeStart * bytes; unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth)); int e = planeSize; int l = ic; int h = oc; uint8_t* aPtr = nullptr; auto bPtr = TensorUtils::getDescribeOrigin(weightTensor)->mem->chunk();; uint8_t* cPtr = nullptr; auto biasPtr = TensorUtils::getDescribeOrigin(mResource->mBias.get())->mem->chunk(); memoryPool->beginGroup(); auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters); if (NO_ERROR != code) { memoryPool->endGroup(); return code; } memoryPool->endGroup(); } } else { // Divide in ocC4 auto hDiv = 1; if (hPack > core->pack) { hDiv = hPack / core->pack; } auto ocDiv = UP_DIV(ocC4, hDiv); std::vector<int> divides(numberThread+1); divides[0] = 0; static_cast<CPUBackend *>(backend())->computeDivideSizes(ocDiv, divides.data()+1); mUnits.resize(numberThread); for (int i = 0; i < numberThread; ++i) { int ocStart = divides[i] * hDiv; int ocEnd = divides[i+1] * hDiv; if (ocEnd >= ocC4) { ocEnd = ocC4; } int ocSize = ocEnd - ocStart; Unit &unit = mUnits[i]; if (ocSize <= 0) { unit.mValid = false; continue; } auto ocStartWeight = (ocStart * core->pack) / hPack; auto ocWeightSize = std::min(UP_DIV((ocSize * core->pack), hPack), mResource->mWeight->length(0) - ocStartWeight); unit.offset[1] = hPack * icAlign * ocStartWeight * mWeightBytes; unit.offset[2] = core->pack * ocStart * bytes; unit.offset[0] = 0; unit.offset[3] = core->pack * matrixSizeE * ocStart * bytes; unit.mStracssenComputor.reset(new StrassenMatrixComputor(backend(), false, maxDepth)); int e = matrixSizeE; int l = ic; int h = std::min(ocSize * core->pack, ocWeightSize * hPack); uint8_t* aPtr = nullptr; auto bPtr = TensorUtils::getDescribeOrigin(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * mWeightBytes; uint8_t* cPtr = nullptr; auto biasPtr = TensorUtils::getDescribeOrigin(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes; memoryPool->beginGroup(); auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters); if (NO_ERROR != code) { memoryPool->endGroup(); return code; } memoryPool->endGroup(); } } return NO_ERROR; } ErrorCode Convolution1x1Strassen::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { auto size = mUnits.size(); auto input = inputs[0]; auto output = outputs[0]; auto core = static_cast<CPUBackend*>(backend())->functions(); auto inputPtr = input->host<uint8_t>(); auto outputPtr = output->host<uint8_t>(); auto weightPtr = mResource->mWeight->host<uint8_t>(); auto biasPtr = mResource->mBias->host<uint8_t>(); MNN_CONCURRENCY_BEGIN(tId, size) { auto &unit = mUnits[tId]; if (unit.mValid) { unit.mStracssenComputor->onExecute(inputPtr + unit.offset[0], weightPtr + unit.offset[1], biasPtr + unit.offset[2], outputPtr + unit.offset[3]); } } MNN_CONCURRENCY_END(); return NO_ERROR; } } // namespace MNN