source/backend/cpu/CPUDeconvolution.cpp (337 lines of code) (raw):
//
// CPUDeconvolution.cpp
// MNN
//
// Created by MNN on 2018/07/20.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "CPUDeconvolution.hpp"
#include "core/BufferAllocator.hpp"
#include "CPUBackend.hpp"
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "core/OpCommonUtils.hpp"
#include "core/AutoStorage.h"
#include "math/Matrix.hpp"
#include "core/TensorUtils.hpp"
#include "core/ConvolutionCommon.hpp"
#include "compute/CommonOptFunction.h"
#include "compute/ConvOpt.h"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
namespace MNN {
CPUDeconvolutionBasic::CPUDeconvolutionBasic(int inputChannel, const Op* convOp, Backend* b)
: CPUConvolution(convOp->main_as_Convolution2D()->common(), b) {
mSrcCount = inputChannel;
mPostParameters = getPostParameters();
}
ErrorCode CPUDeconvolutionBasic::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input = inputs[0];
auto output = outputs[0];
auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mCommon);
mPadY = pad.second;
mPadX = pad.first;
return NO_ERROR;
}
// Float Weight.
static void _transformWeight(const uint8_t* tempWeight, uint8_t* dest, int outputCount, int srcCount, int fh, int fw,
uint8_t* cache, const CoreFunctions* core) {
auto outputC4 = UP_DIV(outputCount, core->pack);
int offset[] = {
(int)(fw * fh),
(int)(fw * fh),
};
// c, n, h, w-> c, n/4 * 4, h, w
for (int c=0; c<srcCount; ++c) {
auto dst = cache + c * outputC4 * fw * fh * core->pack * core->bytes;
auto src = tempWeight + c * outputCount * fw * fh * core->bytes;
core->MNNPackCUnit((float*)dst, (const float*)src, fw*fh, outputCount, offset);
}
//printf("%d - %d - %d - %d\n", outputCount, srcCount, fh, fw);
core->MNNPackForMatMul_B((float*)dest, (const float*)cache, outputC4 * fw * fh * core->pack, srcCount, false);
}
std::shared_ptr<DeconvolutionResource> CPUDeconvolution::makeResource(int srcCount, const Op *convOp, Backend* backend, bool dynamic) {
auto core = static_cast<CPUBackend*>(backend)->functions();
auto coreInt8 = static_cast<CPUBackend*>(backend)->int8Functions();
int eP, lP, hP;
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
auto conv2d = convOp->main_as_Convolution2D();
auto layer = conv2d->common();
int outputCount = layer->outputCount();
const auto outputChannleUp4 = UP_DIV(outputCount, hP) * hP;
int fw = layer->kernelX();
int fh = layer->kernelY();
std::shared_ptr<DeconvolutionResource> res(new DeconvolutionResource);
res->mParam.fh = fh;
res->mParam.fw = fw;
res->mParam.srcCount = srcCount;
res->mParam.outputCount = outputCount;
if (dynamic) {
return res;
}
auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * core->pack * fw * fh;
const float* tempWeight = nullptr;
int tempWeightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize);
AutoStorage<uint8_t> lowpWeight;
if (core->bytes < 4) {
lowpWeight.reset(outputCount * srcCount * fh * fw * core->bytes);
if (lowpWeight.get() == nullptr) {
return nullptr;
}
core->MNNFp32ToLowp(tempWeight, (int16_t*)lowpWeight.get(), outputCount * srcCount * fh * fw);
tempWeight = (float*)lowpWeight.get();
quanCommon.reset();
}
res->mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
res->mBias.reset(Tensor::createDevice<float>({UP_DIV(outputCount, core->pack) * core->pack}));
bool success = backend->onAcquireBuffer(res->mWeight.get(), Backend::STATIC) && backend->onAcquireBuffer(res->mBias.get(), Backend::STATIC);
AutoStorage<float> cache(outputAlign * srcCount);
if (!success || cache.get() == nullptr) {
MNN_ERROR("Alloc memory error for deconvolution\n");
return nullptr;
}
CPUConvolution::Resource::copyBias(res->mBias->host<float>(), convOp->main_as_Convolution2D()->bias()->data(), outputCount, backend);
_transformWeight((uint8_t*)tempWeight, res->mWeight->host<uint8_t>(), outputCount, srcCount, fh, fw, (uint8_t*)cache.get(), core);
return res;
}
bool CPUDeconvolution::onClone(Backend* bn, const Op* op, Execution** dst) {
if (mDynamicWeight) {
return false;
}
if (nullptr == dst) {
return true;
}
auto exe = new CPUDeconvolution(mSrcCount, op, bn, mDynamicWeight, mResource);
*dst = exe;
return true;
}
CPUDeconvolution::CPUDeconvolution(int srcCount, const Op* convOp, Backend* backend, bool dynamicWeight, std::shared_ptr<DeconvolutionResource> resource) : MNN::CPUDeconvolutionBasic(srcCount, convOp, backend) {
mDynamicWeight = dynamicWeight;
mResource = resource;
if (dynamicWeight) {
auto core = static_cast<CPUBackend*>(backend)->functions();
auto coreInt8 = static_cast<CPUBackend*>(backend)->int8Functions();
int eP, lP, hP;
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
auto conv2d = convOp->main_as_Convolution2D();
auto layer = conv2d->common();
int outputCount = layer->outputCount();
const auto outputChannleUp4 = UP_DIV(outputCount, hP) * hP;
int fw = layer->kernelX();
int fh = layer->kernelY();
auto outputAlign = UP_DIV(layer->outputCount(), core->pack) * core->pack * fw * fh;
mWeight.reset(Tensor::createDevice<float>(std::vector<int>{UP_DIV(outputAlign, hP), UP_DIV(srcCount, lP) * lP, hP}));
mBias.reset(Tensor::createDevice<float>({UP_DIV(outputCount, core->pack) * core->pack}));
mOrigin.reset(new CPUDeconvolutionOrigin(srcCount, convOp, backend));
mWeightTransformCache.reset(Tensor::createDevice<float>({outputAlign * srcCount}));
return;
} else {
mWeight = mResource->mWeight;
mBias = mResource->mBias;
}
mOrigin.reset(new CPUDeconvolutionOrigin(srcCount, convOp, backend));
}
CPUDeconvolution::~CPUDeconvolution() {
// Do nothing
}
ErrorCode CPUDeconvolution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mDynamicWeight) {
auto core = static_cast<CPUBackend*>(backend())->functions();
_transformWeight(inputs[1]->host<uint8_t>(), mWeight->host<uint8_t>(), mResource->mParam.outputCount, mResource->mParam.srcCount, mResource->mParam.fh, mResource->mParam.fw, mWeightTransformCache->host<uint8_t>(), core);
::memset(mBias->host<uint8_t>(), 0, mBias->length(0) * core->bytes);
if (inputs.size() >= 3) {
::memcpy(mBias->host<uint8_t>(), inputs[2]->host<uint8_t>(), TensorUtils::getRawSize(inputs[2]) * core->bytes);
}
}
return mOrigin->onExecute(mTempInputs, outputs);
}
ErrorCode CPUDeconvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mDynamicWeight) {
bool res = backend()->onAcquireBuffer(mWeight.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
res = backend()->onAcquireBuffer(mWeightTransformCache.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
res = backend()->onAcquireBuffer(mBias.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
mTempInputs = {inputs[0], mWeight.get(), mBias.get()};
auto code = mOrigin->onResize(mTempInputs, outputs);
if (NO_ERROR != code) {
return code;
}
if (mDynamicWeight) {
backend()->onReleaseBuffer(mWeight.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mWeightTransformCache.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mBias.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
CPUDeconvolutionOrigin::CPUDeconvolutionOrigin(int inputChannel, const Op *convOp, Backend *b) : CPUDeconvolutionBasic(inputChannel, convOp, b) {
// Do nothing
}
ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
CPUDeconvolutionBasic::onResize(inputs, outputs);
auto core = static_cast<CPUBackend*>(backend())->functions();
int bytes = core->bytes;
auto input = inputs[0];
auto output = outputs[0];
auto oc = output->channel();
if (UP_DIV(oc, core->pack) * core->pack != inputs[2]->length(0)) {
return INPUT_DATA_ERROR;
}
int eP, lP, hP;
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
auto ocC4 = UP_DIV(output->channel(), core->pack);
auto icC4 = UP_DIV(input->channel(), core->pack);
auto kw = mCommon->kernelX();
auto kh = mCommon->kernelY();
auto dilateX = mCommon->dilateX();
auto dilateY = mCommon->dilateY();
auto strideX = mCommon->strideX();
auto strideY = mCommon->strideY();
auto padX = mPadX;
auto padY = mPadY;
auto width = input->width();
auto height = input->height();
auto src_height = output->height();
auto src_width = output->width();
auto batch = output->batch();
auto weightTensor = inputs[1];
auto biasTensor = inputs[2];
auto kernelCount = ocC4 * mCommon->kernelX() * mCommon->kernelY();
auto plane = width * height * batch;
auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
auto tileCount = UP_DIV(plane, eP);
threadNumber = ALIMIN(tileCount, threadNumber);
auto memMode = static_cast<CPUBackend*>(backend())->memoryMode();
if (memMode != BackendConfig::Memory_High) {
// Limit threadNumber to avoid too large memory
threadNumber = ALIMIN(threadNumber, 4);
}
auto im2colOutputStride = input->channel() * eP * core->bytes;
mGemmInput = allocator->alloc(threadNumber * im2colOutputStride);
auto gemmOutputStride = kernelCount * core->pack * eP * core->bytes;
mGemmOutput = allocator->alloc(threadNumber * gemmOutputStride);
auto outputSize = batch*src_width*src_height*ocC4*core->pack*core->bytes;
if (threadNumber > 1) {
mExtraOutput = allocator->alloc((threadNumber-1)*outputSize);
}
allocator->free(mGemmInput);
allocator->free(mGemmOutput);
if (threadNumber > 1) {
allocator->free(mExtraOutput);
}
auto first = std::make_pair([=](uint8_t* outputPtr, int tId) {
auto gemmInputBufferPtr = mGemmInput.ptr() + tId * im2colOutputStride;
auto colBufferPtr = mGemmOutput.ptr() + tId * gemmOutputStride;
auto inputPtr = input->host<uint8_t>();
auto unitBytes = core->pack * core->bytes;
auto tempOutPtr = outputPtr;
if (tId > 0) {
tempOutPtr = mExtraOutput.ptr() + (tId-1) * outputSize;
}
::memset(tempOutPtr, 0, outputSize);
int l = mSrcCount;
int h = kernelCount * core->pack;
auto weightPtr = weightTensor->host<uint8_t>();
for (int index=tId; index < tileCount; index+=threadNumber) {
int xStart = index * eP;
int xEnd = ALIMIN(xStart + eP, plane);
int xCount = xEnd-xStart;
if (xCount <= 0) {
continue;
}
size_t parameters[7];
parameters[0] = xCount * core->bytes;
parameters[1] = l;
parameters[2] = h;
parameters[3] = xCount * core->bytes * core->pack;
parameters[4] = 0;
parameters[5] = 0;
parameters[6] = 0;
const float* postParametersPtr = nullptr;
int32_t info[4];
int32_t stride[4];
stride[0] = xCount;
stride[1] = (int32_t)parameters[1];
stride[2] = 0;
stride[3] = 0;
info[0] = 1;
info[1] = plane;
info[2] = xCount;
info[3] = 1;
auto aStart = inputPtr + xStart * unitBytes;
core->MNNPackC4ForMatMul_A((float*)(gemmInputBufferPtr), (const float**)(&aStart), info, stride);
if (xCount == eP) {
core->MNNPackedMatMul((float*)(colBufferPtr), (float*)gemmInputBufferPtr, (float*)weightPtr, parameters, postParametersPtr, nullptr, nullptr, nullptr);
} else {
core->MNNPackedMatMulRemain((float*)(colBufferPtr), (float*)gemmInputBufferPtr, (float*)weightPtr, xCount, parameters, postParametersPtr, nullptr, nullptr, nullptr);
}
// Col2Im
for (int z = 0; z < ocC4; ++z) {
auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
auto srcZ = colBufferPtr + kw * kh * xCount * z * unitBytes;
for (int x=0; x<xCount; ++x) {
auto index = xStart + x;
int b = index / (width * height);
index = index % (width * height);
int oy = index / width;
int ox = index % width;
int srcStartX = ox * strideX - padX;
int srcStartY = oy * strideY - padY;
int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
int efy = ALIMIN(kh, UP_DIV(src_height - srcStartY, dilateY));
int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
int efx = ALIMIN(kw, UP_DIV(src_width - srcStartX, dilateX));
auto dstStart = dstZ + b * src_width * src_height * unitBytes + srcStartX * unitBytes + srcStartY * src_width * unitBytes;
auto srcStart = srcZ + x * unitBytes;
if (sfy >= efy || sfx >= efx) {
continue;
}
for (int fy = sfy; fy < efy; ++fy) {
auto dstY = dstStart + fy * unitBytes * dilateY * src_width;
auto srcY = srcStart + fy * kw * xCount * unitBytes;
core->MNNAddC4WithStride((const float*)(srcY + sfx * xCount * unitBytes), (float*)(dstY + sfx * dilateX * unitBytes), xCount * core->pack, dilateX * core->pack, efx - sfx);
}
}
}
}
}, threadNumber);
auto second = std::make_pair([ocC4, src_height, src_width, threadNumber, batch, biasTensor, this, outputSize, core](uint8_t* outputPtr, int tId) {
auto unitBytes = core->pack * core->bytes;
auto biasPtr = biasTensor->host<uint8_t>();
for (int z = tId; z < ocC4; z+=threadNumber) {
auto dstZ = outputPtr + z * src_height * src_width * batch * unitBytes;
if (threadNumber > 1) {
for (int index=0; index<threadNumber-1; ++index) {
auto src = mExtraOutput.ptr() + index * outputSize + z * src_height * src_width * batch * unitBytes;
core->MNNMatrixAdd((float*)(dstZ), (float*)(src), (float*)(dstZ), src_height * src_width * batch, 0, 0, 0, 1);
}
}
core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr + unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data());
}
}, threadNumber);
mExecuteFuntion = {first, second};
return NO_ERROR;
}
ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto inputPtr = inputs[0]->host<uint8_t>();
auto outputPtr = outputs[0]->host<uint8_t>();
for (auto& unit : mExecuteFuntion) {
MNN_CONCURRENCY_BEGIN(tId, unit.second) {
unit.first(outputPtr, (int)tId);
}
MNN_CONCURRENCY_END();
}
return NO_ERROR;
}
class CPUDeconvolutionCreator : public CPUBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const {
auto convOp = op->main_as_Convolution2D();
auto common = convOp->common();
auto res = CPUDeconvolution::makeResource(inputs[0]->channel(), op, backend, inputs.size() > 1);
if (nullptr == res) {
MNN_ERROR("CPUDeconvolution makeResource error\n");
return nullptr;
}
return new CPUDeconvolution(inputs[0]->channel(), op, backend, inputs.size() > 1, res);
}
};
REGISTER_CPU_OP_CREATOR(CPUDeconvolutionCreator, OpType_Deconvolution);
} // namespace MNN