source/core/OpCommonUtils.cpp

//
//  OpCommonUtils.cpp
//  MNN
//
//  Created by MNN on 2020/03/08.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "OpCommonUtils.hpp"
#include "core/Execution.hpp"
#include "MNN_generated.h"
#include "Macro.h"
#include <random>

namespace MNN {

Tensor::DimensionType OpCommonUtils::convertDimType(MNN_DATA_FORMAT dimensionFormat) {
    auto dimType = Tensor::CAFFE;
    switch (dimensionFormat) {
        case MNN_DATA_FORMAT_NCHW: break;
        case MNN_DATA_FORMAT_NC4HW4: dimType = Tensor::CAFFE_C4; break;
        case MNN_DATA_FORMAT_NHWC: dimType = Tensor::TENSORFLOW; break;
        default: break;
    }
    return dimType;
}

void OpCommonUtils::loadBlobData(FileLoader* loader, const Op* op, char* ptr, int size) {
    if (OpParameter_Blob != op->main_type()) {
        return;
    }
    auto b = op->main_as_Blob();
    if (USE_EXTERNAL_DATA(b)) {
        if (op->externalPath() != nullptr) {
            loader = new FileLoader(op->externalPath()->c_str());
        }
        loadExternalDatas(loader, { ptr }, b->external()->data());
        if (op->externalPath() != nullptr) {
            delete loader;
        }
        return;
    }
    void* result = nullptr;
    switch (b->dataType()) {
        case DataType_DT_FLOAT: result = (void*)b->float32s()->Data(); break;
        case DataType_DT_BFLOAT16: result = (void*)b->uint8s()->Data(); break;
        case DataType_DT_INT32: result = (void*)b->int32s()->Data(); break;
        case DataType_DT_QUINT8:
        case DataType_DT_UINT8: result = (void*)b->uint8s()->Data(); break;
        case DataType_DT_INT8: result = (void*)b->int8s()->Data(); break;
        default: MNN_ASSERT(false); break;
    }
    ::memcpy(ptr, result, size);
}

static std::tuple<int, int, int> _split(int offset, int axisL, int area) {
    int inside = offset % area;
    int temp = offset / area;
    int axis = temp % axisL;
    int outside = temp / axisL;
    return std::make_tuple(inside, axis, outside);
}

static std::tuple<bool, bool, bool> _computeAxisFused(const std::tuple<int, int, int>& dstTup) {
    bool ncFused = std::get<1>(dstTup) > 0 && std::get<2>(dstTup) > 0;
    bool nwFused = std::get<0>(dstTup) > 0 && std::get<2>(dstTup) > 0;
    bool cwFused = std::get<1>(dstTup) > 0 && std::get<0>(dstTup) > 0;
    return std::make_tuple(ncFused, cwFused, nwFused);
}

static std::tuple<int, int, int> _computeStride(const std::tuple<int, int, int>& srcTup, const std::tuple<int, int, int>& srcSplit, int step, bool swapnc, int stride) {
    int inside = std::get<0>(srcTup) / step;
    int axis = std::get<1>(srcTup) / step;
    int outside = std::get<2>(srcTup) / step;
    auto fuse = _computeAxisFused(srcTup);
    if (std::get<0>(fuse)) {
        // nc fused
        if (swapnc) {
            axis = 0;
            outside = stride / std::get<0>(srcSplit);
        } else {
            outside = 0;
            axis = stride / std::get<0>(srcSplit);
        }
    } else if (std::get<2>(fuse)) {
        // nw fused
        outside = 0;
        inside = stride;
    } else if (std::get<1>(fuse)) {
        // cw fused
        axis = 0;
        inside = stride;
    }
    return std::make_tuple(inside, axis, outside);
}

static bool _checkFuseValid(const OpCommonUtils::SPLITS& srcTup, const OpCommonUtils::SPLITS& srcSplits, bool swapnc, bool swapcw, const std::tuple<bool, bool, bool>& valid) {
    auto srcFused = _computeAxisFused(srcTup);
    if (swapnc) {
        // cw can't be fused if n > 1, because layout is c, n, w
        if (std::get<1>(srcFused) && std::get<2>(valid)) { return false; }
        if (std::get<0>(srcFused)) {
            // nc fuse but n is not full, don't support fuse
            if (std::get<2>(srcTup) + 1 != std::get<2>(srcSplits)) { return false; }
        }
    } else if (swapcw) {
        // nc can't be fused if w > 1
        if (std::get<0>(srcFused) && std::get<0>(valid)) { return false; }
        if (std::get<1>(srcFused)) {
            // cw fuse but c is not full, don't support fuse
            if (std::get<1>(srcTup) + 1 != std::get<1>(srcSplits)) { return false; }
        }
    } else {
        // nw can't be fused if c > 1
        if (std::get<2>(srcFused) && std::get<1>(valid)) { return false; }
        // nc can't be fused if w > 1
        if (std::get<0>(srcFused) && std::get<0>(valid)) { return false; }
        // cw can't be fused if n > 1, because layout is c, n, w
        if (std::get<1>(srcFused) && std::get<2>(valid)) { return false; }
    }
    return true;
}

bool OpCommonUtils::canBlitFast(const Tensor::InsideDescribe::Region& region, const SPLITS& srcSplits, const SPLITS& dstSplits, int pack, bool swapnc, bool swapcw) {
    int srcCOffset = (region.src.offset / std::get<0>(srcSplits)) % std::get<1>(srcSplits);
    if (srcCOffset % pack != 0) { return false; }
    int dstCOffset = (region.dst.offset / std::get<0>(dstSplits)) % std::get<1>(dstSplits);
    if (dstCOffset % pack != 0) { return false; }
    auto wValid = std::get<0>(srcSplits) > 1 || std::get<0>(dstSplits) > 1;
    auto cValid = std::get<1>(srcSplits) > 1 || std::get<1>(dstSplits) > 1;
    auto nValid = std::get<2>(srcSplits) > 1 || std::get<2>(dstSplits) > 1;
    auto valid = std::make_tuple(wValid, cValid, nValid);
    // Check Dst stride
    for (int i = 0; i < 3; ++i) {
        if (region.size[i] <= 1) { continue; }
        int dstStride = (region.size[i] - 1) * region.dst.stride[i];
        auto srcStride = region.src.stride[i] * (region.size[i] - 1);
        auto dstTup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits));
        auto srcTup = _split(srcStride, std::get<1>(srcSplits), std::get<0>(srcSplits));
        if (std::get<1>(dstTup) != std::get<1>(srcTup)) { return false; }
        if (!_checkFuseValid(srcTup, srcSplits, swapnc, swapcw, valid)) { return false; }
        if (!_checkFuseValid(dstTup, dstSplits, swapnc, swapcw, valid)) { return false; }
    }
    return true;
}

void OpCommonUtils::turnToPackRegion(const Tensor::InsideDescribe::Region& region, Tensor::InsideDescribe::Region& c4Region, const SPLITS& srcSplits, const SPLITS& dstSplits, int pack, bool swapnc) {
    int srcAxisC4 = UP_DIV(std::get<1>(srcSplits), pack);
    auto dstAxisC4 = UP_DIV(std::get<1>(dstSplits), pack);
    auto fuseSrc = std::get<0>(srcSplits) * std::get<2>(srcSplits);
    auto fuseDst = std::get<0>(dstSplits) * std::get<2>(dstSplits);
    for (int i = 0; i < 3; ++i) {
        int dstStride = (region.size[i] - 1) * region.dst.stride[i];
        // Get the last point's inside, axis, outside position
        auto tup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits));
        if (std::get<1>(tup) > 0) {
            // The size covers the axis; divide out the axis extent and multiply by axisC4 instead
            auto midC4 = UP_DIV(std::get<1>(tup) + 1, pack);
            c4Region.size[i] = region.size[i] / (std::get<1>(tup) + 1) * midC4;
        }
    }
    for (int i = 0; i < 3; ++i) {
        if (region.size[i] <= 1) {
            // No need to compute stride
            c4Region.src.stride[i] = 0;
            c4Region.dst.stride[i] = 0;
            continue;
        }
        int step = region.size[i] - 1;
        int dstStride = region.dst.stride[i] * step;
        auto srcStride = region.src.stride[i] * step;
        auto dstTup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits));
        auto srcTup = _split(srcStride, std::get<1>(srcSplits), std::get<0>(srcSplits));
        {
            auto tup = _computeStride(srcTup, srcSplits, step, swapnc, region.src.stride[i]);
            int inside = std::get<0>(tup);
            int axis = std::get<1>(tup);
            int outside = std::get<2>(tup);
            if (swapnc) {
                c4Region.src.stride[i] = outside * std::get<0>(srcSplits) + axis * std::get<0>(srcSplits) * std::get<2>(srcSplits) + inside;
            } else {
                c4Region.src.stride[i] = outside * srcAxisC4 * std::get<0>(srcSplits) + axis * std::get<0>(srcSplits) + inside;
            }
        }
        {
            auto tup = _computeStride(dstTup, dstSplits, step, swapnc, region.dst.stride[i]);
            int inside = std::get<0>(tup);
            int axis = std::get<1>(tup);
            int outside = std::get<2>(tup);
            if (swapnc) {
                c4Region.dst.stride[i] = outside * std::get<0>(dstSplits) + axis * std::get<0>(dstSplits) * std::get<2>(dstSplits) + inside;
            } else {
                c4Region.dst.stride[i] = outside * dstAxisC4 * std::get<0>(dstSplits) + axis * std::get<0>(dstSplits) + inside;
            }
        }
    }
    {
        // Origin offset is computed as NCHW
        auto offsetTup = _split(region.src.offset, std::get<1>(srcSplits), std::get<0>(srcSplits));
        if (swapnc) {
            // New offset is computed as C4NHW
            c4Region.src.offset = std::get<2>(offsetTup) * pack * std::get<0>(srcSplits) + std::get<1>(offsetTup) * std::get<0>(srcSplits) * std::get<2>(srcSplits) + std::get<0>(offsetTup) * pack;
        } else {
            c4Region.src.offset = std::get<2>(offsetTup) * srcAxisC4 * pack * std::get<0>(srcSplits) + std::get<1>(offsetTup) * std::get<0>(srcSplits) + std::get<0>(offsetTup) * pack;
        }
    }
    {
        // Origin offset is computed as NCHW
        auto offsetTup = _split(region.dst.offset, std::get<1>(dstSplits), std::get<0>(dstSplits));
        if (swapnc) {
            // New offset is computed as C4NHW
            c4Region.dst.offset = std::get<2>(offsetTup) * pack * std::get<0>(dstSplits) + std::get<1>(offsetTup) * std::get<0>(dstSplits) * std::get<2>(dstSplits) + std::get<0>(offsetTup) * pack;
        } else {
            c4Region.dst.offset = std::get<2>(offsetTup) * dstAxisC4 * pack * std::get<0>(dstSplits) + std::get<1>(offsetTup) * std::get<0>(dstSplits) + std::get<0>(offsetTup) * pack;
        }
    }
#ifdef MNN_DEBUG_BLIT
    MNN_PRINT("Src WCN: %d-%d-%d, Dst WCN:%d-%d-%d\n", std::get<0>(srcSplits), std::get<1>(srcSplits),
              std::get<2>(srcSplits), std::get<0>(dstSplits), std::get<1>(dstSplits), std::get<2>(dstSplits));
    MNN_PRINT("Origin:%d, %d, %d, %d, src: %d - %d, %d, %d, dst: %d - %d, %d, %d\n", pack, region.size[0],
              region.size[1], region.size[2], region.src.offset, region.src.stride[0], region.src.stride[1],
              region.src.stride[2], region.dst.offset, region.dst.stride[0], region.dst.stride[1], region.dst.stride[2]);
    MNN_PRINT("Pack:%d, %d, %d, %d, src: %d - %d, %d, %d, dst: %d - %d, %d, %d\n", pack, c4Region.size[0],
              c4Region.size[1], c4Region.size[2], c4Region.src.offset, c4Region.src.stride[0], c4Region.src.stride[1],
              c4Region.src.stride[2], c4Region.dst.offset, c4Region.dst.stride[0], c4Region.dst.stride[1], c4Region.dst.stride[2]);
#endif
}

bool OpCommonUtils::canBlitFast(const Tensor::InsideDescribe::Region& region, const Tensor* dest, int pack, bool swapnc, bool swapcw) {
    auto src = region.origin;
    int srcArea = 1;
    // FIXME: Support dimensions = 1
    if (src->dimensions() == 1 || dest->dimensions() == 1) { return false; }
    for (int i = 2; i < src->dimensions(); ++i) { srcArea *= src->length(i); }
    int dstArea = 1;
    for (int i = 2; i < dest->dimensions(); ++i) { dstArea *= dest->length(i); }
    int inputBatch = 1;
    int inputChannel = 1;
    if (src->dimensions() > 0) { inputBatch = src->length(0); }
    if (src->dimensions() > 1) { inputChannel = src->length(1); }
    int dstBatch = 1;
    int dstChannel = 1;
    if (dest->dimensions() > 0) { dstBatch = dest->length(0); }
    if (dest->dimensions() > 1) { dstChannel = dest->length(1); }
    return canBlitFast(region, std::make_tuple(srcArea, inputChannel, inputBatch), std::make_tuple(dstArea, dstChannel, dstBatch), pack, swapnc, swapcw);
}

void OpCommonUtils::turnToPackRegion(const Tensor::InsideDescribe::Region& region, Tensor::InsideDescribe::Region& c4Region, const Tensor* dest, int pack, bool swapnc) {
    c4Region = region;
    auto src = region.origin;
    int srcArea = 1;
    for (int i = 2; i < src->dimensions(); ++i) { srcArea *= src->length(i); }
    int dstArea = 1;
    for (int i = 2; i < dest->dimensions(); ++i) { dstArea *= dest->length(i); }
    int inputBatch = 1;
    int inputChannel = 1;
    if (src->dimensions() > 0) { inputBatch = src->length(0); }
    if (src->dimensions() > 1) { inputChannel = src->length(1); }
    int dstBatch = 1;
    int dstChannel = 1;
    if (dest->dimensions() > 0) { dstBatch = dest->length(0); }
    if (dest->dimensions() > 1) { dstChannel = dest->length(1); }
    turnToPackRegion(region, c4Region, std::make_tuple(srcArea, inputChannel, inputBatch), std::make_tuple(dstArea, dstChannel, dstBatch), pack, swapnc);
}

void OpCommonUtils::broastCastComputeDim(int* dims, int* stride, int* iStride0, int* iStride1, const Tensor* input0, const Tensor* input1, const Tensor* output) {
    for (int i = MNN_MAX_TENSOR_DIM - 1; i >= 0; --i) {
        dims[i] = 1;
        stride[i] = 0;
        iStride0[i] = 0;
        iStride1[i] = 0;
        int input0I = i - (output->dimensions() - input0->dimensions());
        int input1I = i - (output->dimensions() - input1->dimensions());
        if (i < output->dimensions()) {
            dims[i] = output->length(i);
            stride[i] = output->stride(i);
        }
        if (input0I >= 0 && input0->length(input0I) != 1) { iStride0[i] = input0->stride(input0I); }
        if (input1I >= 0 && input1->length(input1I) != 1) { iStride1[i] = input1->stride(input1I); }
    }
}

std::vector<std::tuple<int, int, int>> OpCommonUtils::computeReduceDims(const std::vector<Tensor*>& inputs, const Op* op) {
    // Compute axes
    std::vector<int> axises;
    if (inputs.size() >= 2) {
        auto size = inputs[1]->elementSize();
        auto dims = inputs[1]->host<int32_t>();
        for (int i = 0; i < size; ++i) {
            axises.emplace_back(dims[i]);
        }
    } else {
        auto reduct = op->main_as_ReductionParam();
        if (nullptr != reduct->dim()) {
            for (int i = 0; i < reduct->dim()->size(); ++i) {
                axises.emplace_back(reduct->dim()->data()[i]);
            }
        }
    }
    auto totalSize = TensorUtils::getRawSize(inputs[0]);
    if (axises.empty()) {
        return {std::make_tuple(1, totalSize, 1)};
    }
    for (int i = 0; i < axises.size(); ++i) {
        if (axises[i] < 0) {
            axises[i] = inputs[0]->dimensions() + axises[i];
            if (axises[i] < 0) {
                return {std::make_tuple(1, totalSize, 1)};
            }
        }
    }
    // Cache for input's dims
    std::vector<int> lengths(inputs[0]->dimensions());
    for (int i = 0; i < lengths.size(); ++i) {
        lengths[i] = inputs[0]->length(i);
    }
    std::vector<std::pair<int, int>> groupAxises;
    {
        // Merge adjacent axes
        std::sort(axises.begin(), axises.end());
        int lastAxis = axises[0];
        int length = 1;
        int start = axises[0];
        for (int i = 1; i < axises.size(); ++i) {
            // MNN_PRINT("%d - %d\n", axises[i], lastAxis);
            if (axises[i] - lastAxis == 1) {
                length++;
            } else {
                groupAxises.emplace_back(std::make_pair(start, length));
                length = 1;
                start = axises[i];
            }
            lastAxis = axises[i];
        }
        groupAxises.emplace_back(std::make_pair(start, length));
    }
    // Compute inside-outside-axis
    std::vector<std::tuple<int, int, int>> result;
    for (int i = 0; i < groupAxises.size(); ++i) {
        int outsideSize = 1;
        int insideSize = 1;
        int axisSize = 1;
        auto start = groupAxises[i].first;
        auto length = groupAxises[i].second;
        if (start >= (int)lengths.size()) { break; }
        for (int j = 0; j < start; ++j) {
            outsideSize *= lengths[j];
        }
        for (int j = start; j < start + length; ++j) {
            if (j >= (int)lengths.size()) { break; }
            axisSize *= lengths[j];
            lengths[j] = 1;
        }
        for (int j = start + length; j < lengths.size(); ++j) {
            insideSize *= lengths[j];
        }
        if (1 == axisSize) { continue; }
        result.emplace_back(std::make_tuple(outsideSize, axisSize, insideSize));
    }
    // FUNC_PRINT(result.size());
    if (result.empty()) {
        result.emplace_back(std::make_tuple(1, 1, totalSize));
    }
    return result;
}

void OpCommonUtils::unravelIndexHelper(int32_t* coordinate, const int32_t* mod, int size, int indice) {
    int value = indice;
    for (int i = 0; i < size; ++i) {
        coordinate[i] = value / mod[i];
        value = value % mod[i];
    }
}

int OpCommonUtils::computeStride(int32_t* strides, const int* shape, int length) {
    if (length <= 0) { return 1; }
    int stride = 1;
    for (int i = length - 1; i >= 0; --i) {
        strides[i] = stride;
        stride *= shape[i];
    }
    return stride;
}

bool OpCommonUtils::opNeedContent(const MNN::Op* op, int index) {
    int type = op->type();
    switch (type) {
        case OpType_ZerosLike:
        case OpType_ZeroGrad:
        case OpType_Shape:
        case OpType_Rank:
        case OpType_Const:
        case OpType_Size:
        case OpType_PriorBox:
            return false;
        case OpType_Interp:
        case OpType_Crop:
        case OpType_Reshape:
        case OpType_Reduction:
        case OpType_Resize:
            if (1 == index) { return false; }
            break;
        case OpType_GridSample:
            if (2 == index) { return false; }
            break;
#ifdef MNN_SUPPORT_RENDER
        case OpType_RasterAndInterpolate: {
            if (0 == index) {
                int type = 4;
                if (op->main_type() == OpParameter_Extra) {
                    auto extra = op->main_as_Extra();
                    if (nullptr != extra->attr()) {
                        for (int i = 0; i < extra->attr()->size(); ++i) {
                            auto attr = extra->attr()->GetAs<Attribute>(i);
                            if (attr->key()->str() == "primitiveType") {
                                type = attr->i();
                                break;
                            }
                        }
                    }
                }
                if (type <= 4) { return false; }
            }
            break;
        }
#endif
        default:
            break;
    }
    return true;
}

bool OpCommonUtils::opCompabilityForLowp(const Op* op, int bytes) {
    switch (op->type()) {
        case OpType_While: {
            if (bytes == 4) { return true; }
            if (op->main_type() != OpParameter_LoopParam) { return false; }
            // Check fuse
            auto loop = op->main_as_LoopParam();
            if (nullptr != loop->initCommand()) {
                for (int i = 0; i < loop->initCommand()->size(); ++i) {
                    auto cmd = loop->initCommand()->GetAs<RegionCommand>(i);
                    if (cmd->fuse() >= 0) { return false; }
                }
            }
            if (nullptr != loop->commands()) {
                for (int i = 0; i < loop->commands()->size(); ++i) {
                    auto cmd = loop->commands()->GetAs<RegionCommand>(i);
                    if (cmd->fuse() >= 0) { return false; }
                }
            }
            return true;
        }
        case OpType_Scale:
        case OpType_Convolution:
        case OpType_ConvolutionDepthwise:
        case OpType_Deconvolution:
        case OpType_DeconvolutionDepthwise:
        case OpType_MatMul:
        case OpType_BatchMatMul:
        case OpType_BinaryOp:
        case OpType_Eltwise:
        case OpType_UnaryOp:
        case OpType_Pooling:
        case OpType_Raster:
        case OpType_ReLU:
        case OpType_ReLU6:
        case OpType_Select:
        case OpType_PReLU:
        case OpType_GridSample:
        case OpType_ROIPooling:
        case OpType_ROIAlign:
        case OpType_RNNSequenceGRU:
        case OpType_DynamicQuant:
        case OpType_Attention:
        case OpType_LayerNorm:
        case OpType_Softmax:
            return true;
        default:
            break;
    }
    return false;
}

void OpCommonUtils::rasterInputReset(const std::vector<Tensor*>& inputs, Tensor* output) {
    auto outputDes = TensorUtils::getDescribe(output);
    // For output with copy, the region's size may not be the same as input's size
    outputDes->regions.resize(inputs.size());
    for (int i = 0; i < outputDes->regions.size(); ++i) {
        outputDes->regions[i].origin = inputs[i];
    }
}

static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flatbuffers::FlatBufferBuilder& builder) {
    if (nullptr == external) {
        MNN_ERROR("Can't rebuild external op because external is nullptr\n");
        return false;
    }
    builder.Clear();
    bool externalTmp = false;
    if (nullptr != origin->externalPath()) {
        external = new FileLoader(origin->externalPath()->c_str());
        externalTmp = true;
    }
    std::shared_ptr<MNN::OpT> op(origin->UnPack());
    switch (op->main.type) {
        case OpParameter_Scale: {
            auto scale = op->main.AsScale();
            int outputCount = static_cast<int>(scale->external[1] / sizeof(float));
            scale->scaleData.resize(outputCount);
            external->offset(scale->external[0]);
            external->read((char*)scale->scaleData.data(), scale->external[1]);
            if (scale->external.size() > 2) {
                scale->biasData.resize(outputCount);
                external->read((char*)scale->biasData.data(), scale->external[2]);
            }
            break;
        }
        case OpParameter_LayerNorm: {
            auto layer_norm_param = op->main.AsLayerNorm();
            int32_t size = static_cast<int32_t>(layer_norm_param->external[1]);
            layer_norm_param->gamma.resize(size / sizeof(float));
            layer_norm_param->beta.resize(size / sizeof(float));
            external->offset(layer_norm_param->external[0]);
            external->read((char*)layer_norm_param->gamma.data(), layer_norm_param->external[1]);
            external->read((char*)layer_norm_param->beta.data(), layer_norm_param->external[2]);
            break;
        }
        case OpParameter_Convolution2D: {
            auto param = op->main.AsConvolution2D();
            if (param->quanParameter) {
                bool isSparse = param->sparseParameter.get() != nullptr;
                bool isPTQ = param->quanParameter->scaleIn != 0;
                if (isSparse || isPTQ) {
                    external->offset(param->external[0]);
                    if (0 != param->external[1]) {
                        param->quanParameter->buffer.resize(param->external[1]);
                        external->read((char*)param->quanParameter->buffer.data(), param->external[1]);
                    }
                    param->quanParameter->alpha.resize(param->external[2] / sizeof(float));
                    external->read((char*)param->quanParameter->alpha.data(), param->external[2]);
                } else {
                    // skip weight and dequant alpha for load speed
                    op->externalPath = external->path();
                    external->offset(param->external[0] + param->external[1] + param->external[2]);
                }
                if (param->bias.empty() && param->external.size() > 3) {
                    if (param->external[3] > 0) {
                        param->bias.resize(param->external[3] / sizeof(float));
                        external->read((char*)param->bias.data(), param->external[3]);
                    } else {
                        param->bias.resize(param->common->outputCount);
                    }
                }
                if (param->quanParameter->index.empty() && param->external.size() > 4) {
                    if (param->external[4] > 0) {
                        param->quanParameter->index.resize(param->external[4] / sizeof(uint32_t));
                        external->read((char*)param->quanParameter->index.data(), param->external[4]);
                    }
                }
            } else {
                // Create quanParameter, will load external weight in ConvolutionCommon::load
                param->quanParameter.reset(new IDSTQuanT);
                param->quanParameter->type = 8;
                op->externalPath = external->path();
                param->bias.resize(param->external[2] / sizeof(float));
                external->offset(param->external[0] + param->external[1]);
                external->read((char*)param->bias.data(), param->external[2]);
            }
            break;
        }
        default:
            break;
    }
    if (externalTmp) {
        delete external;
    }
    builder.Finish(Op::Pack(builder, op.get()));
    return true;
}

Execution* OpCommonUtils::createExecutionWithExternal(Backend* backend, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, FileLoader* externalFile, std::shared_ptr<BufferStorage>& tmpstore) {
#ifdef MNN_BUILD_MINI
    return backend->onCreate(inputs, outputs, op);
#else
    bool hasExternal = false;
    switch (op->main_type()) {
        case OpParameter_Convolution2D: hasExternal = USE_EXTERNAL_DATA(op->main_as_Convolution2D()); break;
        case OpParameter_Scale: hasExternal = USE_EXTERNAL_DATA(op->main_as_Scale()); break;
        case OpParameter_LayerNorm: hasExternal = USE_EXTERNAL_DATA(op->main_as_LayerNorm()); break;
        default: break;
    }
    if (!hasExternal) {
        return backend->onCreate(inputs, outputs, op);
    }
    flatbuffers::FlatBufferBuilder builder;
    bool usemmap = false;
    if (backend && backend->getRuntime()) {
        usemmap = (backend->getRuntime()->hint().useCachedMmap > 1);
    }
    Execution* execution;
    if (!usemmap) {
        bool res = _RebuildExternalOp(externalFile, op, builder);
        if (!res) {
            MNN_ERROR("Rebuild External Op failed\n");
            return nullptr;
        }
        auto newOp = flatbuffers::GetRoot<MNN::Op>(builder.GetBufferPointer());
        execution = backend->onCreate(inputs, outputs, newOp);
    } else {
        execution = backend->onCreate(inputs, outputs, op);
    }
    if (nullptr == execution) {
        return execution;
    }
    if (op->main_type() == OpParameter_Convolution2D) {
        // For convolution / deconvolution, the execution will need op info after creation, so try to clone it and remove newOp
        Execution* copyExe = nullptr;
        execution->onClone(backend, op, &copyExe);
        if (nullptr != copyExe) {
            delete execution;
            return copyExe;
        } else {
#ifdef DEBUG
            MNN_ERROR("Clone error for convolution/deconvolution, will Increase memory\n");
#endif
            tmpstore.reset(new BufferStorage);
            tmpstore->storage = builder.ReleaseRaw(tmpstore->allocated_size, tmpstore->offset);
        }
    }
    return execution;
#endif
}

void OpCommonUtils::loadExternalDatas(FileLoader* fileloader, std::vector<char*> addrs, const int64_t* external) {
    fileloader->offset(external[0]);
    for (int i = 0; i < addrs.size(); i++) {
        fileloader->read(addrs[i], external[i + 1]);
    }
}

static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) {
    if (t->dimensions() == 0) {
        batch = 1;
        channel = 1;
        area = 1;
        return;
    }
    if (t->dimensions() == 1) {
        batch = t->length(0);
        channel = 1;
        area = 1;
        return;
    }
    batch = t->length(0);
    channel = t->length(1);
    area = 1;
    for (int i = 2; i < t->dimensions(); ++i) {
        area *= t->length(i);
    }
}

bool OpCommonUtils::isTranspose(const Tensor::InsideDescribe::Region& region, int& srcOne, int& dstOne) {
    srcOne = -1;
    dstOne = -1;
    for (int i = 0; i < 3; i++) {
        if (region.size[i] == 1) { continue; }
        if (region.src.stride[i] == 1) {
            if (srcOne >= 0) { return false; }
            srcOne = i;
        }
        if (region.dst.stride[i] == 1) {
            if (dstOne >= 0) { return false; }
            dstOne = i;
        }
    }
    return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne;
}

bool OpCommonUtils::supportDynamicInputMemory(MNNForwardType type) {
    if (type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_VULKAN) {
        return false;
    }
    return true;
}

void OpCommonUtils::turnRegion2Convert(const Tensor::InsideDescribe::Region& region, const Tensor* dest, OpCommonUtils::TensorConvertParameter& info) {
    auto origin = region.origin;
    auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat;
    auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat;
    info.type = 0;
    if (srcFormat == dstFormat) {
        return;
    }
    if (srcFormat != MNN_DATA_FORMAT_NC4HW4 && dstFormat != MNN_DATA_FORMAT_NC4HW4) {
        return;
    }
    const Tensor* nc4hw4Tensor = origin;
    const Tensor* originTensor = dest;
    if (dstFormat == MNN_DATA_FORMAT_NC4HW4) {
        nc4hw4Tensor = dest;
        originTensor = origin;
    }
    getBatchChannelArea(nc4hw4Tensor, info.batch, info.channel, info.area);
    if (0 != region.src.offset || 0 != region.dst.offset) {
        return;
    }
    if (TensorUtils::isCopyRegion(region)) {
        if (info.batch * info.channel * info.area == region.size[0] * region.size[1] * region.size[2]) {
            info.type = 1;
            return;
        }
        return;
    }
    int srcOne, dstOne;
    if (isTranspose(region, srcOne, dstOne)) {
        int keepDim = -1;
        for (int i = 0; i < 3; i++) {
            if (i != srcOne && i != dstOne) {
                keepDim = i;
                break;
            }
        }
        if (info.batch == region.size[keepDim]) {
            if ((info.channel == region.size[srcOne] && info.area == region.size[dstOne])       // NCHW
                || (info.area == region.size[srcOne] && info.channel == region.size[dstOne])) { // NHWC
                auto srcSize = TensorUtils::getRawSize(originTensor);
                auto dstSize = TensorUtils::getRawSize(nc4hw4Tensor);
                auto regionSize = region.size[0] * region.size[1] * region.size[2];
                if (srcSize != dstSize || regionSize != srcSize) {
                    return;
                }
                info.type = 2;
                return;
            }
            return;
        }
    }
    return;
}

bool OpCommonUtils::computeMatMulSize(bool transposeA, bool transposeB, const Tensor* A, const Tensor* B, int& e, int& l, int& h) {
    auto i0Dim = A->dimensions();
    auto i1Dim = B->dimensions();
    if (i0Dim < 1 || i1Dim < 1) {
        return false;
    }
    int w0, h0;
    int w1, h1;
    if (i0Dim == 1) {
        w0 = A->length(0);
        h0 = 1;
        transposeA = false;
    } else {
        w0 = A->length(i0Dim - 1);
        h0 = A->length(i0Dim - 2);
    }
    if (i1Dim == 1) {
        w1 = 1;
        h1 = B->length(0);
        transposeB = false;
    } else {
        w1 = B->length(i1Dim - 1);
        h1 = B->length(i1Dim - 2);
    }
    if (transposeA) {
        auto t = w0;
        w0 = h0;
        h0 = t;
    }
    if (transposeB) {
        auto t = w1;
        w1 = h1;
        h1 = t;
    }
    if (w0 != h1) {
        return false;
    }
    e = h0;
    l = w0;
    h = w1;
    return true;
}

DataType OpCommonUtils::convertDataType(halide_type_t type) {
    if (type.code == halide_type_float) {
        return DataType_DT_FLOAT;
    }
    if (type.code == halide_type_uint && type.bits == 8) {
        return DataType_DT_UINT8;
    }
    if (type.code == halide_type_int && type.bits == 8) {
        return DataType_DT_INT8;
    }
    if (type.code == halide_type_int && type.bits == 32) {
        return DataType_DT_INT32;
    }
    return DataType_DT_INVALID;
}

} // namespace MNN