source/backend/metal/MetalConvolution.mm

// // MetalConvolution.mm // MNN // // Created by MNN on 2019/01/30. // Copyright © 2018, Alibaba Group Holding Limited // #import "backend/metal/MetalConvolution.hpp" #import "core/Macro.h" #import "backend/metal/MetalBackend.hpp" #import "backend/metal/MetalConvolution1x1.hpp" #import "backend/metal/MetalConvolutionWinograd.hpp" #include <string> #if MNN_METAL_ENABLED namespace MNN { MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op, nullptr) { loadWeight(op); } MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias) : MetalConvolutionCommon(backend, op, bias) { mWeight = weight; } bool MetalConvolution::onClone(Backend* bn, const Op* op, Execution** dst) { if (!mValid) { return false; } if (nullptr == dst) { return true; } *dst = new MetalConvolution(bn, op, mWeight, mBias); return true; } ErrorCode MetalConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { // prepare auto backend = static_cast<MetalBackend *>(this->backend()); auto mtbn = backend; auto context = (__bridge MNNMetalContext *)backend->context(); auto input = inputs[0]; auto output = outputs[0]; auto iw = input->width(); auto ih = input->height(); auto ic_4 = UP_DIV(input->channel(), 4); auto ow = output->width(); auto oh = output->height(); auto oc_4 = UP_DIV(output->channel(), 4); auto ob = output->batch(); auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common()); auto padX = pads.first; auto padY = pads.second; int stepSlices = ic_4; // create const buffer int constants[] = {iw, ih, iw * ih, ic_4, ow, oh, ow * oh, oc_4, ob, oc_4 * ob, stepSlices, mKernelX, mKernelY, mKernelX * mKernelY, mStrideX, mStrideY, padX, padY, mDilateX, mDilateY, mActivationType}; mConstBuffer = backend->getConstBuffer(sizeof(constants)); ::memcpy(mConstBuffer.contents, constants, sizeof(constants)); mParam = "_ic" + std::to_string(ic_4) + "oc" + std::to_string(oc_4) + "k" + std::to_string(mKernelX) + "x" + std::to_string(mKernelY) + "s" + std::to_string(mStrideX) + "x" + std::to_string(mStrideY) + "d" + std::to_string(mDilateX) + "x" + std::to_string(mDilateY); MetalRuntime* rt = (MetalRuntime *)backend->runtime(); bool isS1D1 = (mStrideX==1 && mStrideY==1 && mDilateX==1 && mDilateY==1); bool isS1D1P0 = isS1D1 && (padX==0 && padY==0 && mKernelX>1 && mKernelX%2==1); bool is3x3s1Conv = (mKernelX==3 && mKernelY==3 && mStrideX==1 && mStrideY==1 && padX==1 && padY==1 && mDilateX==1 && mDilateY==1); // printf("isS1D1P0: %d, c:%d %d, K:%d %d, s:%d %d, p:%d %d, iwh:%d %d, owh:%d %d\n", isS1D1P0, ic_4, oc_4, mKernelX, mKernelY, mStrideX, mStrideY, padX, padY, iw, ih, ow, oh); if(rt->getTuneLevel() == Never) { int packW = 1; int packC = 2; NSString* kernelName = @"conv_z2"; if(isS1D1P0) { packW = 2; packC = 1; kernelName = @"conv_s1d1p0_w2"; } NSUInteger gid_x = UP_DIV(ow, packW); NSUInteger gid_y = oh; NSUInteger gid_z = UP_DIV(oc_4, packC) * ob; mPipeline = [context pipelineWithName:kernelName fp16:backend->useFp16InsteadFp32()]; NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, ((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId())->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; const Tensor* weight = mWeight.get(); const Tensor* bias = mBias.get(); int buffer_offset[] = {TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, 0, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset}; std::string name = [kernelName UTF8String] + mParam; auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets:buffer_offset queue:backend->queue()]; mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret)); } else { const int total_kernel = 5; NSString* shaderName[total_kernel] = {@"conv", @"conv_z4", @"conv_z2", @"conv_s1d1p0_w2", @"conv_s1d1p0_w4"}; int itemW[total_kernel] = {1, 1, 1, 2, 4}; int itemH[total_kernel] = {1, 1, 1, 1, 1}; int itemC[total_kernel] = {1, 4, 2, 1, 1}; int actual_kernel = 3; if(isS1D1P0) { actual_kernel = 4; if(mKernelX == 3) { actual_kernel = 5; } } else if(is3x3s1Conv) { actual_kernel = 4; shaderName[3] = @"convk3s1d1p1_w2z4"; itemW[3] = 2; itemH[3] = 1; itemC[3] = 4; } else { actual_kernel = 3; } std::pair<NSUInteger, int> min_cost(INT_MAX, 0);//(min_time, min_index) NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; const Tensor* weight = mWeight.get(); const Tensor* bias = mBias.get(); int buffer_offset[] = { TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, 0, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset }; for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) { id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx] fp16:mtbn->useFp16InsteadFp32()]; NSUInteger gid_x = UP_DIV(ow, itemW[knl_idx]); NSUInteger gid_y = UP_DIV(oh, itemH[knl_idx]); NSUInteger gid_z = UP_DIV(oc_4, itemC[knl_idx]) * ob; std::string name = [shaderName[knl_idx] UTF8String] + mParam; auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets: buffer_offset queue:backend->queue()]; if(min_cost.first > std::get<2>(ret)) { min_cost.first = std::get<2>(ret); min_cost.second = knl_idx; mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret)); } // printf("conv idx:%d %s, global:%d %d %d, local:%d %d %d, min_cost: %d -> %d\n", knl_idx, name.c_str(), (int)std::get<0>(ret).width, (int)std::get<0>(ret).height, (int)std::get<0>(ret).depth, (int)std::get<1>(ret).width, (int)std::get<1>(ret).height, (int)std::get<1>(ret).depth, std::get<2>(ret), (int)min_cost.first); } // printf("conv idx:%d, min_cost:%d\n", (int)min_cost.second, (int)min_cost.first); // std::string tmp = [shaderName[min_cost.second] UTF8String]; // printf("!!~ %s\n", tmp.c_str()); mPipeline = [context pipelineWithName:shaderName[min_cost.second] fp16:mtbn->useFp16InsteadFp32()]; } return NO_ERROR; } void MetalConvolution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) { auto input = inputs[0]; auto output = outputs[0]; [encoder setComputePipelineState:mPipeline]; MetalBackend::setTensor(input, encoder, 0); MetalBackend::setTensor(output, encoder, 1); [encoder setBuffer:mConstBuffer offset:0 atIndex:2]; MetalBackend::setTensor(mWeight.get(), encoder, 3); MetalBackend::setTensor(mBias.get(), encoder, 4); [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second]; } class MetalConvolutionCreator : public MetalBackend::Creator { public: virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const { auto param = op->main_as_Convolution2D(); if (param->quanParameter() != nullptr) { if (param->quanParameter()->has_scaleInt()) { return nullptr; } } if (inputs.size() > 1) { return nullptr; } auto conv = op->main_as_Convolution2D(); if (conv->common()->group() > 1) { return nullptr; } if (op->type() == OpType_Convolution) { auto input = inputs[0]; if (MetalConvolutionWinograd::isValid(backend, conv, inputs[0], outputs[0])) { return new MetalConvolutionWinograd(backend, op); } if (MetalConvolution1x1::isValid(conv, input)) { return new MetalConvolution1x1(backend, op); } } return new MetalConvolution(backend, op); } }; REGISTER_METAL_OP_CREATOR(MetalConvolutionCreator, OpType_Convolution); } // namespace MNN #endif /* MNN_METAL_ENABLED */

source/backend/metal/MetalConvolution.mm (187 lines of code) (raw):