ErrorCode ConvBufExecution::onResize()

in source/backend/opencl/execution/buffer/ConvBufExecution.cpp [251:668]


ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("Start ConvExecution onResize !\n");
#endif
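    // Most branches launch a single kernel; the general convolution path below may split the work across several kernels.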
    mKernel.resize(1);
    auto input  = inputs[0];
    auto output = outputs[0];
    if (inputs.size() > 1) {
        // Multiple inputs: the weights arrive as inputs[1] and must be rearranged into the filter layout before use.
        _generateFilterConvertRegion(mResource->mFilter.get(), inputs[1]);
        bool res = backend()->onAcquireBuffer(mResource->mFilter.get(), Backend::DYNAMIC);
        if (!res) {
            return OUT_OF_MEMORY;
        }
        mResource->mRasterExe->onResize({}, {mResource->mFilter.get()});
    }
    mOpenCLBackend->startRecord(mRecording);
    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);
    const int batch              = outputShape.at(0);
    const int height             = outputShape.at(1);
    const int width              = outputShape.at(2);
    const int outChannel         = outputShape.at(3);

    const int inputHeight   = inputShape.at(1);
    const int inputWidth    = inputShape.at(2);
    const int inputChannels = inputShape.at(3);

    const int inputChannelBlocks = UP_DIV(inputChannels, 4);
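    // UP_DIV(x, y) is assumed to be ceiling division and ROUND_UP(x, y) to round x up to a multiple of y (MNN helper macros defined elsewhere).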
    auto padding = ConvolutionCommon::convolutionPad(input, output, mResource->mConv2dCommonParams);
    mPaddings[0] = padding.second; // padY
    mPaddings[1] = padding.first;  // padX

    // printf("nchw %d %d %d %d, cohw %d %d %d, khw %d %d  gemm:%d \n", inputs[0]->batch(), inputs[0]->channel(), inputs[0]->height(), inputs[0]->width(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width(), mResource->mKernelWidth, mResource->mKernelHeight, mResource->mConvGemmOptLevel);

    std::string info = std::to_string(inputChannels) + "_" + std::to_string(outChannel) + "_" + std::to_string(mResource->mKernelHeight) + "_" + std::to_string(mResource->mKernelWidth) + "_" + std::to_string(mResource->mStrides[0]) + "_" + std::to_string(mResource->mStrides[1]) + "_" + std::to_string(mResource->mDilations[0]) + "_" + std::to_string(mResource->mDilations[1]);

    if (mResource->mConvGemmOptLevel > 0) {
        int area = height * width;
        int M = outputShape.at(0) * area;
        int N = outputShape.at(3);
        int K = inputShape.at(3);
        
        // Not enough work to amortize the GEMM path (too few output pixels or too few total MACs); fall back to the direct kernels.
        if(M < 128 || 1.0 * M / 512 * N / 512 * K / 256 < 1.0) {
            mResource->mConvGemmOptLevel = 0;
        }
    }
    
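    // GEMM optimization level 1: lower the convolution to one matrix multiply C[M, N] = A[M, K] * B[K, N],
    // with M = batch * height * width, K = input channels, N = output channels, each padded up to its alignment.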
    if (mResource->mConvGemmOptLevel == 1) {
        int area = height * width;
        int M = outputShape.at(0) * area;
        int N = outputShape.at(3);
        int K = inputShape.at(3);
        // Choose the M alignment (tile granularity) from the problem size; larger GEMMs use a coarser mAlignM.
        float ratio = 1.0 * M / 1024.0 * N / 1024.0 * K / 1024.0;
        if(M > 1024 && ratio >= 1.0) {
            mAlignM = 128;
        } else if(M > 512 && ratio >= 0.1) {
            mAlignM = 64;
        } else if(M > 96){
            mAlignM = 32;
        } else {
            mAlignM = 16;
        }

        int alignM = ROUND_UP(M, mAlignM);
        int alignN = ROUND_UP(N, mResource->mAlignN);
        int alignK = ROUND_UP(K, mResource->mAlignK);

        // Allocate the aligned GEMM buffers: A is alignM x alignK (rearranged input), C is alignM x alignN (GEMM output).
        mConvGemmInpTensor.reset(Tensor::createDevice<float>({alignK * alignM}));
        mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC);
        mConvGemmOutTensor.reset(Tensor::createDevice<float>({alignN * alignM}));
        mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC);
        
        {
            std::set<std::string> buildOptions;
            
            int m_pack = 4;
            mPreKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", "transpose_pad", buildOptions, mOpenCLBackend->getPrecision());
            uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mPreKernel));
            mPreGlobalWorkSize = {static_cast<uint32_t>(alignM/m_pack), static_cast<uint32_t>(alignK/4)};

            int offset = 0;
            int idx            = 0;
            cl_int ret = CL_SUCCESS;
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(mPreGlobalWorkSize[0]));
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(mPreGlobalWorkSize[1]));
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(alignM));
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(alignK));
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(M));
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(K));
            ret |= mPreKernel->get().setArg(idx++, static_cast<int>(area));
            ret |= mPreKernel->get().setArg(idx++, openCLBuffer(input));
            ret |= mPreKernel->get().setArg(idx++, openCLBuffer(mConvGemmInpTensor.get()));
            MNN_CHECK_CL_SUCCESS(ret, "setArg mConvgemmOptLevel==1 PreKernel");
            mPreLocalWorkSize = localWS2DDefault(mPreGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "transpose_pad", mPreKernel, mOpenCLBackend->getCLTuneLevel()).first;

            mOpenCLBackend->recordKernel2d(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize);
            mPreGlobalWorkSize[0] = ROUND_UP(mPreGlobalWorkSize[0], std::max((uint32_t)1, mPreLocalWorkSize[0]));
            mPreGlobalWorkSize[1] = ROUND_UP(mPreGlobalWorkSize[1], std::max((uint32_t)1, mPreLocalWorkSize[1]));
        }
        
        // Encode the tiled GEMM with the Strassen-based computor: C[alignM, alignN] = A[alignM, alignK] * B[alignK, alignN].
        {
            mStrassenComputor.reset(new StrassenMatrixComputor(backend(), 3));
            mStrassenComputor->onEncode(alignM, alignK, alignN, alignM, alignN, alignN, openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(mConvGemmOutTensor.get()),
                                         false, openCLBuffer(mResource->mBias.get()));
        }
        
        // Post-kernel: transpose the GEMM result back to the output tensor layout and add the bias.
        {
            std::set<std::string> buildOptions = mResource->mBuildOptions;
            int pack_m = 1;
            if(M % 8 == 0) {
                pack_m = 8;
            } else if(M % 4 == 0) {
                pack_m = 4;
            }
            buildOptions.emplace("-DM_VEC=" + std::to_string(pack_m));
            mPostKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", "transpose_bias", buildOptions, mOpenCLBackend->getPrecision());
            uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mPostKernel));

            mPostGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(M, pack_m)), static_cast<uint32_t>(UP_DIV(N, 4))};

            int offset = 0;
            int idx            = 0;
            cl_int ret = CL_SUCCESS;
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(mPostGlobalWorkSize[0]));
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(mPostGlobalWorkSize[1]));
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(alignM));
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(alignN));
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(M));
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(N));
            ret |= mPostKernel->get().setArg(idx++, static_cast<int>(area));
            ret |= mPostKernel->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get()));
            ret |= mPostKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
            ret |= mPostKernel->get().setArg(idx++, openCLBuffer(output));

            MNN_CHECK_CL_SUCCESS(ret, "setArg mConvgemmOptLevel==1 PostKernel");
            mPostLocalWorkSize = localWS2DDefault(mPostGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "transpose_bias", mPostKernel, mOpenCLBackend->getCLTuneLevel()).first;
            mOpenCLBackend->recordKernel2d(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize);
            mPostGlobalWorkSize[0] = ROUND_UP(mPostGlobalWorkSize[0], std::max((uint32_t)1, mPostLocalWorkSize[0]));
            mPostGlobalWorkSize[1] = ROUND_UP(mPostGlobalWorkSize[1], std::max((uint32_t)1, mPostLocalWorkSize[1]));

            mOpenCLBackend->endRecord(mRecording);
        }
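        // The aligned GEMM buffers are only needed by this op; release them so the dynamic allocator can reuse the memory.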
        mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC);
        mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC);
        
        return NO_ERROR;
    } else if (mResource->mConv1x1Opt) {
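        // 1x1 convolution: for a tiny output with many input channels, reduce over channels inside one workgroup;
        // otherwise auto-tune among the tiled conv_2d_1x1_* kernels.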
        if(inputChannels >= 128 && outputShape[0] * outChannel * width * height <= 64){
            mResource->mConv1x1Local = true;
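            // Workgroup size: the largest power of two that fits both the 256 cap and the number of input channel blocks.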
            int local_size = 1;
            while(local_size * 2 <= 256 && local_size * 2 <= inputChannelBlocks){
                local_size *= 2;
            }
            mGlobalWorkSize = {static_cast<uint32_t>(local_size), static_cast<uint32_t>(UP_DIV(outChannel, 4) * width), static_cast<uint32_t>(outputShape[0] * height)};
            mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
            
            std::set<std::string> buildOption = mResource->mBuildOptions;
            buildOption.emplace("-DCONV_LOCAL_SIZE=" + std::to_string(local_size));
            mKernel[0]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", "conv_2d_1x1_local", buildOption, mOpenCLBackend->getPrecision());
            uint32_t idx = 0;
            cl_int ret = CL_SUCCESS;

            ret |= mKernel[0]->get().setArg(idx++, UP_DIV(width, 1));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(input));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get()));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(output));
            ret |= mKernel[0]->get().setArg(idx++, static_cast<int>(inputChannelBlocks));
            ret |= mKernel[0]->get().setArg(idx++, batch);
            ret |= mKernel[0]->get().setArg(idx++, height);
            ret |= mKernel[0]->get().setArg(idx++, width);
            ret |= mKernel[0]->get().setArg(idx++, UP_DIV(outChannel, 4));
            ret |= mKernel[0]->get().setArg(idx++, ROUND_UP(outChannel, mResource->mAlignN));
            MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf");
        } else {
            mResource->mConv1x1Local = false;
            // Candidate kernels: conv_2d_1x1_c4h1w4, conv_2d_1x1_c4h1w2, conv_2d_1x1_c4h1w1; when the c8 path is enabled, only conv_2d_1x1_c8h1w4 and conv_2d_1x1_c8h1w2 are tried.
            const int total_kernel = 3;
            std::string kernelName[total_kernel] = {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1"};
            int itemC[total_kernel] = {4, 4, 4};
            int itemW[total_kernel] = {4, 2, 1};

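            // Prefer the c8 kernels (8 output channels per work item) once the problem is large enough.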
            int M = outputShape.at(0) * outputShape.at(1) * outputShape.at(2);
            mResource->mConv1x1C8Opt = (mResource->mOutputChannel >= 16 && M >= 16 && M * mResource->mOutputChannel >= 65536);
            
            int actual_kernel = total_kernel;
            if(mResource->mConv1x1C8Opt) {
                actual_kernel = 2;
                kernelName[0] = "conv_2d_1x1_c8h1w4";
                itemC[0]      = 8;
                itemW[0]      = 4;

                kernelName[1] = "conv_2d_1x1_c8h1w2";
                itemC[1]      = 8;
                itemW[1]      = 2;
            }

            std::shared_ptr<KernelWrap> kernel[total_kernel];
            std::vector<uint32_t> globalWorkSize[total_kernel];
            std::vector<uint32_t> localWorkSize[total_kernel];
            std::pair<int, int> min_cost(INT_MAX, 0); // (min_time, min_index)
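            // Build each candidate kernel, auto-tune its local work size, and keep the cheapest one.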
            for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
                std::set<std::string> buildOption = mResource->mBuildOptions;
                if(itemC[knl_idx] == 8 && outputShape.at(3) % itemC[knl_idx] > 0 && outputShape.at(3) % itemC[knl_idx] <= 4){
                    buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
                }
                if((outputShape.at(2) % itemW[knl_idx]) != 0){
                    buildOption.emplace("-DBLOCK_LEAVE");
                }
                kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption, mOpenCLBackend->getPrecision());
                uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
                
                uint32_t idx            = 0;
                cl_int ret = CL_SUCCESS;
                globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};

                ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]);
                ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][1]);
                ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(width, itemW[knl_idx]));
                ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(input));
                ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get()));
                ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
                ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(output));
                ret |= kernel[knl_idx]->get().setArg(idx++, static_cast<int>(inputChannelBlocks));
                ret |= kernel[knl_idx]->get().setArg(idx++, height);
                ret |= kernel[knl_idx]->get().setArg(idx++, width);
                ret |= kernel[knl_idx]->get().setArg(idx++, batch);
                ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(outChannel, 4));
                ret |= kernel[knl_idx]->get().setArg(idx++, ROUND_UP(outChannel, mResource->mAlignN));

                MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf Kernel Select");

                std::pair<std::vector<uint32_t>, int> retTune;
                retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx], mOpenCLBackend->getCLTuneLevel());
                if(min_cost.first > retTune.second) {
                    min_cost.first = retTune.second;
                    min_cost.second = knl_idx;
                    mLocalWorkSize = {retTune.first[0], retTune.first[1]};
                }
            }

            int min_index  = min_cost.second;
            mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};

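            // Rebuild the winning kernel with the same build options and bind its final arguments.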
            std::set<std::string> buildOption = mResource->mBuildOptions;
            if(itemC[min_index] == 8 && outputShape.at(3) % itemC[min_index] > 0 && outputShape.at(3) % itemC[min_index] <= 4){
                buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
            }
            if((outputShape.at(2) % itemW[min_index]) != 0){
                buildOption.emplace("-DBLOCK_LEAVE");
            }
            mKernel[0]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[min_index], buildOption, mOpenCLBackend->getPrecision());
            uint32_t idx = 0;
            cl_int ret = CL_SUCCESS;

            ret |= mKernel[0]->get().setArg(idx++, mGlobalWorkSize[0]);
            ret |= mKernel[0]->get().setArg(idx++, mGlobalWorkSize[1]);
            ret |= mKernel[0]->get().setArg(idx++, UP_DIV(width, itemW[min_index]));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(input));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get()));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
            ret |= mKernel[0]->get().setArg(idx++, openCLBuffer(output));
            ret |= mKernel[0]->get().setArg(idx++, static_cast<int>(inputChannelBlocks));
            ret |= mKernel[0]->get().setArg(idx++, height);
            ret |= mKernel[0]->get().setArg(idx++, width);
            ret |= mKernel[0]->get().setArg(idx++, batch);
            ret |= mKernel[0]->get().setArg(idx++, UP_DIV(outChannel, 4));
            ret |= mKernel[0]->get().setArg(idx++, ROUND_UP(outChannel, mResource->mAlignN));
            MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf");
        }
    } else {
        int inputImageShape[2]  = {inputHeight, inputWidth};
        int outputImageShape[2] = {height, width};
        int kernelShape[2]      = {mResource->mKernelHeight, mResource->mKernelWidth};
        int strideShape[2]      = {mResource->mStrides[0],mResource->mStrides[1]};
        int paddingShape[2]     = {mPaddings[0], mPaddings[1]};
        int dilationShape[2]    = {mResource->mDilations[0], mResource->mDilations[1]};

        // Candidate kernels: conv_2d_c4h1w1, conv_2d_c4h1w2, conv_2d_c4h4w1, conv_2d_c4h1w4, conv_2d_c8h2w1, conv_2d_c8h4w1, conv_2d_c8h1w4
        const int total_kernel = 7;
        std::string kernelName[total_kernel] = {"conv_2d_c4h1w1", "conv_2d_c4h1w2", "conv_2d_c4h4w1",  "conv_2d_c4h1w4", "conv_2d_c8h2w1", "conv_2d_c8h4w1", "conv_2d_c8h1w4"};
        int itemC[total_kernel] = {4, 4, 4, 4, 8, 8, 8};
        int itemH[total_kernel] = {1, 1, 4, 1, 2, 4, 1};
        int itemW[total_kernel] = {1, 2, 1, 4, 1, 1, 4};

        int actual_kernel = total_kernel;
        int outChannelBlocks = UP_DIV(outChannel, 4);
        int conv_block_num = 1;
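        // Rough MAC count in units of 2^30; large convolutions split their output channels across conv_block_num launches.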
        auto magic_ratio = 1.0 * outputShape.at(0) * outputShape.at(1) * outputShape.at(2) / 1024.0 *
                            inputChannels * kernelShape[0] * kernelShape[1] / 1024.0 *
                            outChannel / 1024.0;
        if(magic_ratio >= 16.0 && outChannelBlocks >= 64) {
            conv_block_num = 8;
        } else if(magic_ratio >= 8.0 && outChannelBlocks >= 32) {
            conv_block_num = 4;
        } else if(magic_ratio >= 4.0 && outChannelBlocks >= 16) {
            conv_block_num = 2;
        } else {
            conv_block_num = 1;
        }

        mKernel.resize(conv_block_num);
        
        std::shared_ptr<KernelWrap> kernel[total_kernel];
        std::vector<uint32_t> globalWorkSize[total_kernel];
        std::vector<uint32_t> localWorkSize[total_kernel];
        std::pair<int, int> min_cost(INT_MAX, 0); // (min_time, min_index)
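        // Tune every candidate tiling (itemC x itemH x itemW) and record the one with the lowest measured cost.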
        for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
            std::set<std::string> buildOption = mResource->mBuildOptions;
            if(outputShape.at(3) % itemC[knl_idx] != 0){
                buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
            }
            if((outputShape.at(2) % itemW[knl_idx]) != 0 || (outputShape.at(1) % itemH[knl_idx]) != 0){
                buildOption.emplace("-DBLOCK_LEAVE");
            }
            kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption, mOpenCLBackend->getPrecision());
            uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));

            int each_oc = (UP_DIV(outputShape.at(3), itemC[knl_idx]) + conv_block_num - 1) / conv_block_num;

            globalWorkSize[knl_idx] = {static_cast<uint32_t>(each_oc * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};
            uint32_t idx            = 0;
            cl_int ret = CL_SUCCESS;
            ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]);
            ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][1]);
            ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(input));
            ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get()));
            ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
            ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(output));
            ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(inputImageShape), inputImageShape);
            ret |= kernel[knl_idx]->get().setArg(idx++, inputChannels);
            ret |= kernel[knl_idx]->get().setArg(idx++, inputChannelBlocks);
            ret |= kernel[knl_idx]->get().setArg(idx++, batch);
            ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(outputImageShape), outputImageShape);
            ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(kernelShape), kernelShape);
            ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(strideShape), strideShape);
            ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(paddingShape), paddingShape);
            ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(dilationShape), dilationShape);
            ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(width, itemW[knl_idx]));
            ret |= kernel[knl_idx]->get().setArg(idx++, outChannelBlocks);
            ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(height, itemH[knl_idx]));
            int outChannelBase = 0;
            ret |= kernel[knl_idx]->get().setArg(idx++, outChannelBase);
            MNN_CHECK_CL_SUCCESS(ret, "setArg ConvBuf Kernel Select");

            std::pair<std::vector<uint32_t>, int> retTune;
            retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx], mOpenCLBackend->getCLTuneLevel());

            if(min_cost.first > retTune.second) {
                min_cost.first = retTune.second;
                min_cost.second = knl_idx;
                mLocalWorkSize = {retTune.first[0], retTune.first[1]};
            }
        }
        int min_index  = min_cost.second;
        mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};

        std::set<std::string> buildOption = mResource->mBuildOptions;
        if(outputShape.at(3) % itemC[min_index] != 0){
            buildOption.emplace("-DCHANNEL_BOUNDARY_PROTECT");
        }
        if((outputShape.at(2) % itemW[min_index]) != 0 || (outputShape.at(1) % itemH[min_index]) != 0){
            buildOption.emplace("-DBLOCK_LEAVE");
        }
        
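        // Create conv_block_num copies of the winning kernel; each handles a slice of the output channel blocks selected by outChannelBase.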
        for(int kernel_idx = 0; kernel_idx < conv_block_num; kernel_idx++) {
            mKernel[kernel_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[min_index], buildOption, mOpenCLBackend->getPrecision());
            
            uint32_t idx            = 0;
            cl_int ret = CL_SUCCESS;
            
            ret |= mKernel[kernel_idx]->get().setArg(idx++, mGlobalWorkSize[0]);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, mGlobalWorkSize[1]);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, openCLBuffer(input));
            ret |= mKernel[kernel_idx]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get()));
            ret |= mKernel[kernel_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
            ret |= mKernel[kernel_idx]->get().setArg(idx++, openCLBuffer(output));
            ret |= mKernel[kernel_idx]->get().setArg(idx++, sizeof(inputImageShape), inputImageShape);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, inputChannels);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, inputChannelBlocks);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, batch);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, sizeof(outputImageShape), outputImageShape);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, sizeof(kernelShape), kernelShape);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, sizeof(strideShape), strideShape);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, sizeof(paddingShape), paddingShape);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, sizeof(dilationShape), dilationShape);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, UP_DIV(width, itemW[min_index]));
            ret |= mKernel[kernel_idx]->get().setArg(idx++, outChannelBlocks);
            ret |= mKernel[kernel_idx]->get().setArg(idx++, UP_DIV(height, itemH[min_index]));
            int outChannelBase = mGlobalWorkSize[0] / UP_DIV(width, itemW[min_index]) * kernel_idx;
            ret |= mKernel[kernel_idx]->get().setArg(idx++, outChannelBase);
            MNN_CHECK_CL_SUCCESS(ret, "setArg ConvBuf");
        }
    }
    if (inputs.size() > 1) {
        backend()->onReleaseBuffer(mResource->mFilter.get(), Backend::DYNAMIC);
    }
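    // The 1x1 workgroup-reduction kernel is recorded as a 3D launch; all other kernels are 2D, with the global size rounded up to a multiple of the local size.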
    if (mResource->mConv1x1Opt && mResource->mConv1x1Local){
        mOpenCLBackend->recordKernel3d(mKernel[0], mGlobalWorkSize, mLocalWorkSize);
    }else{
        for(int i = 0; i < mKernel.size(); i++) {
            mOpenCLBackend->recordKernel2d(mKernel[i], mGlobalWorkSize, mLocalWorkSize);
        }
        mGlobalWorkSize[0] = ROUND_UP(mGlobalWorkSize[0], std::max((uint32_t)1, mLocalWorkSize[0]));
        mGlobalWorkSize[1] = ROUND_UP(mGlobalWorkSize[1], std::max((uint32_t)1, mLocalWorkSize[1]));
    }
    mOpenCLBackend->endRecord(mRecording);
#ifdef LOG_VERBOSE
    MNN_PRINT("end ConvExecution onResize !\n");
#endif
    return NO_ERROR;
}