ErrorCode CPURaster::onResize()

in source/backend/cpu/CPURaster.cpp [50:269]


ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(outputs.size() == 1);
    auto output = outputs[0];
    OpCommonUtils::rasterInputReset(____inputs, outputs[0]);
    auto des = TensorUtils::getDescribe(output);
    auto outputDes = TensorUtils::getDescribe(output);
    mNeedZero = !TensorUtils::regionIsFull(output);
    mZeroPoint = 0;
    mUseThreads = false;
    if (outputDes->quantAttr != nullptr && outputDes->type == DataType_DT_INT8) {
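        // On SSE builds the int8 path appears to work in an unsigned 0..255 domain,
        // hence the +128 shift of the quantization zero point (inferred from the
        // #ifdef below; not stated elsewhere in this listing).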
#ifdef MNN_USE_SSE
        mZeroPoint = (int)outputDes->quantAttr->zero + 128;
#else
        mZeroPoint = (int)outputDes->quantAttr->zero;
#endif
    }
    mTempInput.clear();
    mFastBlit.clear();
    mCacheRegions.clear();
    mTempOutput = nullptr;
    auto midFormat = MNN_DATA_FORMAT_NCHW;
    mTempInputCopy.clear();
    mFast = false;
    auto core = static_cast<CPUBackend*>(backend())->functions();
    mSingleConvert.type = 0;
    // Fast path: all source formats and the destination format are NC4HW4
    if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
        mFast = true;
        for (int i=0; i< des->regions.size(); ++i) {
            auto& slice = des->regions[i];
            if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
                mFast = false;
                break;
            }
            if (!OpCommonUtils::canBlitFast(slice, output, core->pack, true)) {
                mFast = false;
                break;
            }
        }
        if (mFast) {
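            // Every region passed the canBlitFast check, so it can be rewritten to
            // operate directly on the packed (NC4HW4) layout; onExecute then copies
            // the data without any format conversion.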
            mUseThreads = des->regions.size() > 16;
            for (int i=0; i< des->regions.size(); ++i) {
                auto& slice = des->regions[i];
                if (slice.origin == nullptr) {
                    continue;
                }
                Tensor::InsideDescribe::Region newRegion;
                OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
                mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
            }
            return NO_ERROR;
        }
    }
    // Single-convert path: exactly one source region whose format differs from the output's
    if (des->regions.size() == 1) {
        OpCommonUtils::turnRegion2Convert(des->regions[0], output, mSingleConvert);
        if (mSingleConvert.type > 0) {
            mUseThreads = mSingleConvert.batch * mSingleConvert.channel * mSingleConvert.area > LAUNCH_MULTI_THREADS_WORKLOAD;
            return NO_ERROR;
        }
    }
    // Acquire Buffer for temp output
    // TODO: optimize it
    if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
        mTempOutput.reset(new Tensor);
        TensorUtils::setupTensorInfo(output, mTempOutput.get(), midFormat);
    }
    if (nullptr != mTempOutput) {
        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
        if (!res) {
            return OUT_OF_MEMORY;
        }
    }
    // NC4HW4 inputs need an extra layout conversion before the blit
    std::vector<Tensor*> forRelease;
    TensorUtils::FuseWrap fuseUtils;
    for (int i=0; i< des->regions.size(); ++i) {
        auto& slice = des->regions[i];
        auto origin = slice.origin;
        if (nullptr == origin /*|| nullptr == origin->host<void>()*/) {
            continue;
        }
        // If the tensor is not NC4HW4 (or has already been merged), no conversion is needed
        if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
            if (slice.size[0] * slice.size[1] * slice.size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
                mUseThreads = true;
            }
            mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
            continue;
        }
        // If the NC4HW4 tensor has batch == 1 and C % pack == 0, express the convert as a transpose region and fuse it
        if (origin->batch() == 1 && origin->channel() % core->pack == 0) {
            int channel = origin->channel();
            int area = 1;
            // conv3d/pool3d tensors have 5 dims, so area = depth * height * width; otherwise area = height * width
            for (int d = 2; d < origin->dimensions(); d++) {
                area *= origin->length(d);
            }
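            // The region below expresses the NC4HW4 -> NCHW transpose as a 3D copy:
            //   size = { C/pack, pack, area }
            //   src (packed): element (c, p, a) sits at  c*area*pack + a*pack + p
            //   dst (planar): the same element goes to   c*area*pack + p*area + a
            // Fusing this region with `slice` removes the separate convert pass.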
            Tensor::InsideDescribe::Region regionTmp;
            regionTmp.src.offset = 0;
            regionTmp.src.stride[0] = area * core->pack;
            regionTmp.src.stride[1] = 1;
            regionTmp.src.stride[2] = core->pack;
            regionTmp.dst.offset = 0;
            regionTmp.dst.stride[0] = area * core->pack;
            regionTmp.dst.stride[1] = area;
            regionTmp.dst.stride[2] = 1;
            regionTmp.size[0] = channel / core->pack;
            regionTmp.size[1] = core->pack;
            regionTmp.size[2] = area;
            regionTmp.origin = slice.origin;
            bool merge = fuseUtils.match(regionTmp, slice);
            if (merge) {
                std::shared_ptr<Tensor::InsideDescribe::Region> newSlice(new Tensor::InsideDescribe::Region);
                *newSlice = slice;
                fuseUtils.apply(regionTmp, *newSlice);
                // keep the merged region alive via mCacheRegions
                if (newSlice->size[0] * newSlice->size[1] * newSlice->size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
                    mUseThreads = true;
                }
                mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get()));
                mCacheRegions.emplace_back(newSlice);
                continue;
            }
        }
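        // C % pack != 0 or batch > 1: a real NC4HW4 -> NCHW conversion is needed.
        // Converted copies are cached on the backend per (origin, format), so
        // several regions reading the same tensor share one temp buffer; its
        // useCount tracks when it can be handed back to the allocator below.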
        auto cache = static_cast<CPUBackend*>(backend())->getCache();
        auto tempTensor = cache->findCacheTensor(origin, midFormat);
        //MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
        if (nullptr == tempTensor) {
            std::shared_ptr<Tensor> newTensor(new Tensor);
            TensorUtils::copyShape(origin, newTensor.get());
            TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
            TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
            newTensor->buffer().type = origin->getType();
            TensorUtils::setLinearLayout(newTensor.get());
            mTempInput.insert(std::make_pair(origin, newTensor.get()));
            auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
            if (!res) {
                return OUT_OF_MEMORY;
            }
            tempTensor = newTensor.get();
            TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount;
            cache->pushCacheTensor(newTensor, origin, midFormat);
        }
        if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
            forRelease.emplace_back(tempTensor);
        }
        if (slice.size[0] * slice.size[1] * slice.size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
            mUseThreads = true;
        }
        mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
    }
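    // Hand the temporary buffers back to the dynamic allocator now that the copy
    // list is built; in MNN's resize-time allocation scheme this presumably marks
    // them reusable for later ops while they stay addressable when this op executes
    // (assumption based on the acquire/release-at-onResize idiom).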
    for (auto t : forRelease) {
        backend()->onReleaseBuffer(t, Backend::DYNAMIC);
    }
    if (nullptr != mTempOutput) {
        backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
    }
    auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
    mHasReduce = false;
    ReduceInfo reduceInfo;
    for (auto& iter : mTempInputCopy) {
        if (reduceInfo.compute(*iter.second)) {
            mHasReduce = true;
            break;
        }
    }
    if (mTempInputCopy.size() == 1 && threadNumber > 1 && (!mHasReduce)) {
        // Only one copy region, no reduce, and more than one thread: split it into one sub-region per thread
        auto region = mTempInputCopy[0].second;
        if (region->size[0] * region->size[1] * region->size[2] < LAUNCH_MULTI_THREADS_WORKLOAD) {
            mUseThreads = false;
            return NO_ERROR;
        }
        if (region->size[0] * region->size[1] * region->size[2] > LAUNCH_MULTI_THREADS_WORKLOAD) {
            mUseThreads = true;
        }
        auto tensorPtr = mTempInputCopy[0].first;
        int pos = -1;
        for (int i=0; i<3; ++i) {
            if (region->size[i] > 1) {
                pos = i;
                break;
            }
        }
        if (-1 == pos) {
            // Every size is 1, nothing to divide
            return NO_ERROR;
        }
        mTempInputCopy.clear();
        int divSize = UP_DIV(region->size[pos], threadNumber);
        for (int i=0; i<threadNumber; ++i) {
            std::shared_ptr<Tensor::InsideDescribe::Region> cacheRegPtr(new Tensor::InsideDescribe::Region);
            auto& cacheReg = *cacheRegPtr;
            int sta = i * divSize;
            int fin = sta + divSize;
            fin = std::min(fin, region->size[pos]);
            if (fin <= sta) {
                break;
            }
            for (int v=0; v<3; ++v) {
                cacheReg.src.stride[v] = region->src.stride[v];
                cacheReg.dst.stride[v] = region->dst.stride[v];
            }
            int curSize = fin - sta;
            for (int v=0; v<pos; ++v) {
                cacheReg.size[v] = region->size[v];
            }
            cacheReg.size[pos] = curSize;
            cacheReg.src.offset = region->src.offset + sta * region->src.stride[pos];
            cacheReg.dst.offset = region->dst.offset + sta * region->dst.stride[pos];
            for (int v=pos+1; v<3; ++v) {
                cacheReg.size[v] = region->size[v];
            }
            mTempInputCopy.emplace_back(std::make_pair(tensorPtr, cacheRegPtr.get()));
            mCacheRegions.emplace_back(cacheRegPtr);
        }
    }
    return NO_ERROR;
}
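
The per-thread split at the end of onResize is plain arithmetic on one region axis: the first axis with size > 1 is partitioned with UP_DIV, and only the offsets move while the strides are copied unchanged. The standalone sketch below (hypothetical simplified structs and a splitRegion helper, not MNN API) reproduces that arithmetic so the resulting sizes and offsets can be checked in isolation.

#include <algorithm>
#include <cstdio>
#include <vector>

// Minimal stand-ins for Tensor::InsideDescribe::Region (illustration only).
struct View { int offset = 0; int stride[3] = {0, 0, 0}; };
struct Region { View src, dst; int size[3] = {1, 1, 1}; };

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

// Split `region` along the first axis whose size > 1, one piece per thread,
// mirroring the loop at the end of CPURaster::onResize.
std::vector<Region> splitRegion(const Region& region, int threadNumber) {
    std::vector<Region> pieces;
    int pos = -1;
    for (int i = 0; i < 3; ++i) {
        if (region.size[i] > 1) { pos = i; break; }
    }
    if (pos < 0) {                     // nothing to divide
        pieces.push_back(region);
        return pieces;
    }
    int divSize = UP_DIV(region.size[pos], threadNumber);
    for (int i = 0; i < threadNumber; ++i) {
        int sta = i * divSize;
        int fin = std::min(sta + divSize, region.size[pos]);
        if (fin <= sta) break;         // fewer pieces than threads
        Region r = region;
        r.size[pos] = fin - sta;
        // Only the offsets change; the strides stay identical, so each piece
        // addresses a disjoint slice of the same source and destination.
        r.src.offset = region.src.offset + sta * region.src.stride[pos];
        r.dst.offset = region.dst.offset + sta * region.dst.stride[pos];
        pieces.push_back(r);
    }
    return pieces;
}

int main() {
    Region r;
    r.size[0] = 1; r.size[1] = 10; r.size[2] = 64;      // split axis will be 1
    r.src.stride[0] = 640; r.src.stride[1] = 64; r.src.stride[2] = 1;
    r.dst = r.src;
    for (auto& p : splitRegion(r, 4)) {
        std::printf("size[1]=%d srcOffset=%d dstOffset=%d\n",
                    p.size[1], p.src.offset, p.dst.offset);
    }
    // 10 rows over 4 threads -> pieces of 3, 3, 3, 1 starting at offsets 0, 192, 384, 576
    return 0;
}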