//
// MetalConvolutionDepthwise.mm
// MNN
//
// Created by MNN on 2019/02/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#import "backend/metal/MetalConvolutionDepthwise.hpp"
#import "core/Macro.h"
#import "backend/metal/MetalBackend.hpp"
#if MNN_METAL_ENABLED
namespace MNN {
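// Weights and bias are transformed and uploaded to the GPU once, at construction time.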
MetalConvolutionDepthwise::MetalConvolutionDepthwise(Backend *backend, const MNN::Op *op)
    : MetalConvolutionCommon(backend, op, nullptr) {
    loadWeight(op);
}
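// Rebuild the constant buffer, pipeline state and dispatch size whenever tensor shapes change.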
ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MetalConvolutionCommon::onResize(inputs, outputs);
    auto backend = static_cast<MetalBackend *>(this->backend());

    // prepare
    auto input = inputs[0], output = outputs[0];
    auto iw   = input->width();
    auto ih   = input->height();
    auto ic_4 = UP_DIV(input->channel(), 4);
    auto ow   = output->width();
    auto oh   = output->height();
    auto ob   = output->batch();
    auto oc_4 = UP_DIV(output->channel(), 4);
    auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
    auto padX = pads.first;
    auto padY = pads.second;
    // create const buffer packing the geometry parameters consumed by the conv_depthwise shader
    int constants[] = {iw,                  // input width
                       ih,                  // input height
                       iw * ih,             // input plane size
                       ow,                  // output width
                       oh,                  // output height
                       ow * oh,             // output plane size
                       ic_4,                // channel blocks (UP_DIV(channels, 4))
                       ob,                  // batch
                       mKernelX,            // kernel width
                       mKernelY,            // kernel height
                       mKernelX * mKernelY, // kernel size
                       mStrideX,            // stride x
                       mStrideY,            // stride y
                       padX,                // pad x
                       padY,                // pad y
                       mDilateX,            // dilation x
                       mDilateY,            // dilation y
                       mActivationType};    // fused activation type
    mConstBuffer = backend->getConstBuffer(sizeof(constants));
    ::memcpy(mConstBuffer.contents, constants, sizeof(constants));
    auto context = (__bridge MNNMetalContext *)backend->context();
    mPipeline = [context pipelineWithName:@"conv_depthwise" fp16:backend->useFp16InsteadFp32()];
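    // one thread per output pixel; the z dimension covers channel blocks across the whole batch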
    NSUInteger gid_x = ow;
    NSUInteger gid_y = oh;
    NSUInteger gid_z = oc_4 * ob;
    NSArray *arr = [NSArray arrayWithObjects:((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                    ((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer(),
                    mConstBuffer,
                    ((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId())->getBuffer(),
                    ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(),
                    nil];
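    // byte offsets of each bound buffer; the const buffer (index 2) starts at offset 0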
    const Tensor* weight = mWeight.get();
    const Tensor* bias   = mBias.get();
    int buffer_offset[] = {
        TensorUtils::getDescribe(input)->extra.offset,
        TensorUtils::getDescribe(output)->extra.offset,
        0,
        TensorUtils::getDescribe(weight)->extra.offset,
        TensorUtils::getDescribe(bias)->extra.offset
    };
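    // let the context pick grid/threadgroup sizes, auto-tuning by timing trial dispatches when enabled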
    std::string name = "conv_depthwise";
    MetalRuntime *rt = (MetalRuntime *)backend->runtime();
    auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets:buffer_offset queue:backend->queue()];
    mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
    return NO_ERROR;
}
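// Bind the input, output, constants, weight and bias in the argument order expected
// by the conv_depthwise kernel, then dispatch with the tuned threadgroup size.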
void MetalConvolutionDepthwise::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
    [encoder setComputePipelineState:mPipeline];
    MetalBackend::setTensor(inputs[0], encoder, 0);
    MetalBackend::setTensor(outputs[0], encoder, 1);
    [encoder setBuffer:mConstBuffer offset:0 atIndex:2];
    MetalBackend::setTensor(mWeight.get(), encoder, 3);
    MetalBackend::setTensor(mBias.get(), encoder, 4);
    [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second];
}
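// Repack depthwise weights from [group][kh][kw] into the channel-blocked
// [group/4][kh][kw][4] layout read by the Metal kernel, converting FType to TType.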
template <typename FType, typename TType>
static void weightInBlock(int group, int kh, int kw, const FType *src, uint8_t* dstOrigin) {
    auto dst = (TType *)dstOrigin;
    for (int g = 0; g < group; g++) {
        auto z = g / 4, r = g % 4;
        auto z_dst = dst + z * kh * kw * 4 + r;
        for (int h = 0; h < kh; h++) {
            for (int w = 0; w < kw; w++) {
                // from [g][h][w] to [g/4][h][w][4]:
                // dst[(z * kh * kw + h * kw + w) * 4 + r] = src[g * kh * kw + h * kw + w]
                z_dst[(h * kw + w) * 4] = *src++;
            }
        }
    }
}
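// Allocate a device tensor for the channel-blocked weights and fill it via weightInBlock,
// converting to fp16 when the backend runs in fp16 mode.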
std::shared_ptr<MNN::Tensor> MetalConvolutionDepthwise::weightTransform(int group, int oc, int ic, int kh, int kw, const float *src, bool int8Weight, bool int4Weight, id<MTLBuffer> srcGpuBuffer) {
    auto backend = static_cast<MetalBackend *>(this->backend());
    auto length  = UP_DIV(group, 4) * 4 * kw * kh;
    std::shared_ptr<MNN::Tensor> t(MNN::Tensor::createDevice<float>({length}));
    auto res = backend->onAcquireBuffer(t.get(), Backend::STATIC);
    if (!res) {
        MNN_ERROR("Failed to allocate GPU memory in MetalConvolutionDepthwise\n");
        return nullptr;
    }
    auto buffer  = MetalBackend::getBuffer(t.get());
    auto content = (uint8_t*)[buffer.first contents] + buffer.second;
    if (backend->useFp16InsteadFp32()) {
        weightInBlock<float, __fp16>(group, kh, kw, src, content);
    } else {
        weightInBlock<float, float>(group, kh, kw, src, content);
    }
    return t;
}
class MetalConvolutionDepthwiseCreator : public MetalBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {
        // depthwise convolution with runtime-supplied weights (multi-input) is not supported;
        // return nullptr so the framework can fall back to another backend
        if (inputs.size() > 1) {
            return nullptr;
        }
        return new MetalConvolutionDepthwise(backend, op);
    }
};
REGISTER_METAL_OP_CREATOR(MetalConvolutionDepthwiseCreator, OpType_ConvolutionDepthwise);
} // namespace MNN
#endif /* MNN_METAL_ENABLED */