in source/backend/cpu/CPURaster.cpp [50:269]
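// onResize builds the blit plan for this raster op. Three cases are handled below:
//  1. fast path: output and every source are NC4HW4 and each region blits pack-aligned data,
//  2. single convert: exactly one region that is a pure layout conversion,
//  3. general path: NC4HW4 inputs are converted to NCHW temporaries (or the transpose is fused
//     into the region), and a lone large region may be split across threads.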
ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(outputs.size() == 1);
auto output = outputs[0];
OpCommonUtils::rasterInputReset(____inputs, outputs[0]);
auto outputDes = TensorUtils::getDescribe(output);
auto des = outputDes; // same describe: the regions and the output format both live here
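// if the regions do not fully cover the output, the output buffer has to be cleared before blitting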
mNeedZero = !TensorUtils::regionIsFull(output);
mZeroPoint = 0;
mUseThreads = false;
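// quantized int8 outputs are cleared to the quantized zero point rather than 0;
// the x86/SSE int8 kernels store data as uint8, hence the extra +128 offset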
if (outputDes->quantAttr != nullptr && outputDes->type == DataType_DT_INT8) {
#ifdef MNN_USE_SSE
mZeroPoint = (int)outputDes->quantAttr->zero + 128;
#else
mZeroPoint = (int)outputDes->quantAttr->zero;
#endif
}
mTempInput.clear();
mFastBlit.clear();
mCacheRegions.clear();
mTempOutput = nullptr;
auto midFormat = MNN_DATA_FORMAT_NCHW;
mTempInputCopy.clear();
mFast = false;
auto core = static_cast<CPUBackend*>(backend())->functions();
mSingleConvert.type = 0;
// Fast path: all srcFormat == dstFormat == NC4HW4, so blits can run directly on packed data
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
mFast = true;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mFast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output, core->pack, true)) {
mFast = false;
break;
}
}
if (mFast) {
mUseThreads = des->regions.size() > 16;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (slice.origin == nullptr) {
continue;
}
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
}
return NO_ERROR;
}
}
// Single-convert path: srcNum == 1 && srcFormat != dstFormat, the whole op is one layout conversion
if (des->regions.size() == 1) {
OpCommonUtils::turnRegion2Convert(des->regions[0], output, mSingleConvert);
if (mSingleConvert.type > 0) {
mUseThreads = mSingleConvert.batch * mSingleConvert.channel * mSingleConvert.area > LAUNCH_MULTI_THREADS_WORKLOAD;
return NO_ERROR;
}
}
// Acquire Buffer for temp output
// TODO: optimize it
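// an NC4HW4 output is blitted into an NCHW temp tensor and converted back to the real output during execution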
if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
mTempOutput.reset(new Tensor);
TensorUtils::setupTensorInfo(output, mTempOutput.get(), midFormat);
}
if (nullptr != mTempOutput) {
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
// inputs that are NC4HW4 need a convert (or a fused transpose) before blitting
std::vector<Tensor*> forRelease;
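// FuseWrap can fold an NC4HW4 -> NCHW transpose into a region, avoiding an intermediate copy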
TensorUtils::FuseWrap fuseUtils;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (nullptr == origin /*|| nullptr == origin->host<void>()*/) {
continue;
}
// tensors that are not NC4HW4 need no conversion and can be blitted directly
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
if (slice.size[0] * slice.size[1] * slice.size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
mUseThreads = true;
}
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if batch == 1 and channel % pack == 0, the NC4HW4 -> NCHW convert is a pure transpose that may be fused into the region
if (origin->batch() == 1 && origin->channel() % core->pack == 0) {
int channel = origin->channel();
int area = 1;
// conv3d/pool3d tensors have 5 dims, so area = depth * height * width; otherwise area = height * width
for (int d = 2; d < origin->dimensions(); d++) {
area *= origin->length(d);
}
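// regionTmp expresses the NC4HW4 -> NCHW unpack as a raster region over [channel / pack, pack, area]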
Tensor::InsideDescribe::Region regionTmp;
regionTmp.src.offset = 0;
regionTmp.src.stride[0] = area * core->pack;
regionTmp.src.stride[1] = 1;
regionTmp.src.stride[2] = core->pack;
regionTmp.dst.offset = 0;
regionTmp.dst.stride[0] = area * core->pack;
regionTmp.dst.stride[1] = area;
regionTmp.dst.stride[2] = 1;
regionTmp.size[0] = channel / core->pack;
regionTmp.size[1] = core->pack;
regionTmp.size[2] = area;
regionTmp.origin = slice.origin;
bool merge = fuseUtils.match(regionTmp, slice);
if (merge) {
std::shared_ptr<Tensor::InsideDescribe::Region> newSlice(new Tensor::InsideDescribe::Region);
*newSlice = slice;
fuseUtils.apply(regionTmp, *newSlice);
// record the fused region; mCacheRegions keeps it alive
if (newSlice->size[0] * newSlice->size[1] * newSlice->size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
mUseThreads = true;
}
mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get()));
mCacheRegions.emplace_back(newSlice);
continue;
}
}
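// fallback: materialize an NCHW copy of the NC4HW4 input; the backend cache lets multiple regions reuse the same converted tensor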
auto cache = static_cast<CPUBackend*>(backend())->getCache();
auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) {
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
tempTensor = newTensor.get();
TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount;
cache->pushCacheTensor(newTensor, origin, midFormat);
}
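// once the last region referencing this temp tensor has been recorded, its buffer can be returned to the allocator (see forRelease below)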
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
if (slice.size[0] * slice.size[1] * slice.size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
mUseThreads = true;
}
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
for (auto t : forRelease) {
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
}
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
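// regions with reduce semantics (overlapping writes into dst) must not be split across threads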
mHasReduce = false;
ReduceInfo reduceInfo;
for (auto& iter : mTempInputCopy) {
if (reduceInfo.compute(*iter.second)) {
mHasReduce = true;
break;
}
}
if (mTempInputCopy.size() == 1 && threadNumber > 1 && (!mHasReduce)) {
// Split to multi region
auto region = mTempInputCopy[0].second;
auto workload = region->size[0] * region->size[1] * region->size[2];
if (workload < LAUNCH_MULTI_THREADS_WORKLOAD) {
mUseThreads = false;
return NO_ERROR;
}
if (workload > LAUNCH_MULTI_THREADS_WORKLOAD) {
mUseThreads = true;
}
auto tensorPtr = mTempInputCopy[0].first;
int pos = -1;
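// find the outermost axis with size > 1; that axis is the one split across threads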
for (int i=0; i<3; ++i) {
if (region->size[i] > 1) {
pos = i;
break;
}
}
if (-1 == pos) {
// Don't need divide
return NO_ERROR;
}
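// split the region along axis pos into at most threadNumber sub-regions,
// e.g. size[pos] == 100 with 4 threads gives chunks of UP_DIV(100, 4) == 25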
mTempInputCopy.clear();
int divSize = UP_DIV(region->size[pos], threadNumber);
for (int i=0; i<threadNumber; ++i) {
std::shared_ptr<Tensor::InsideDescribe::Region> cacheRegPtr(new Tensor::InsideDescribe::Region);
auto& cacheReg = *cacheRegPtr;
int sta = i * divSize;
int fin = sta + divSize;
fin = std::min(fin, region->size[pos]);
if (fin <= sta) {
break;
}
for (int v=0; v<3; ++v) {
cacheReg.src.stride[v] = region->src.stride[v];
cacheReg.dst.stride[v] = region->dst.stride[v];
}
int curSize = fin - sta;
for (int v=0; v<pos; ++v) {
cacheReg.size[v] = region->size[v];
}
cacheReg.size[pos] = curSize;
cacheReg.src.offset = region->src.offset + sta * region->src.stride[pos];
cacheReg.dst.offset = region->dst.offset + sta * region->dst.stride[pos];
for (int v=pos+1; v<3; ++v) {
cacheReg.size[v] = region->size[v];
}
mTempInputCopy.emplace_back(std::make_pair(tensorPtr, cacheRegPtr.get()));
mCacheRegions.emplace_back(cacheRegPtr);
}
}
return NO_ERROR;
}