source/backend/metal/MetalBackend.mm
//
// MetalBackend.mm
// MNN
//
// Created by MNN on 2019/01/30.
// Copyright © 2018, Alibaba Group Holding Limited
//
#import "backend/metal/MetalBackend.hpp"
#define MNN_METAL
#import <MNN/MNNSharedContext.h>
#define METAL_CONST_BUFFER_LIMIT 128
#define METAL_SEPERATE_MAX_COUNT 2
#if MNN_METAL_ENABLED
#include <mutex>
#import "backend/metal/MNNMetalContext.h"
#import "core/Macro.h"
#import "core/TensorUtils.hpp"
#include "MetalCache_generated.h"
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
if (nullptr == content || nullptr == tensor) {
return 0;
}
auto t = (MNN::Tensor*)tensor;
auto des = MNN::TensorUtils::getDescribe(t);
content->buffer = ((MNN::MetalRuntimeAllocator::MetalBufferAlloc*)t->deviceId())->getBuffer();
content->texture = nil;
content->offset = des->extra.offset;
return 0;
}
namespace MNN {
static void _MetalApplyTensor(uint8_t* host, size_t offset, Tensor* t) {
// host is the pointer of the MetalBufferAlloc backing this tensor
t->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(t);
des->extra.offset = offset;
}
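// Create a dynamic buffer allocator for the given buffer index. On the second
// resize, when the Allocator_Defer hint is set, a DeferBufferAllocator is used so
// chunk reuse is planned before memory is applied (_MetalApplyTensor binds the
// resulting chunk to each tensor); otherwise an EagerBufferAllocator backed by the
// static pool (or its mmap cache) is returned.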
BufferAllocator* MetalRuntime::createDynamicAllocator(int index, bool secondResize) const {
if (hint().memoryAllocatorType == Runtime::Allocator_Defer && secondResize) {
return new DeferBufferAllocator(buffer(index), 1024, _MetalApplyTensor);
}
if (mStaticCache.get() != nullptr) {
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticCache.get()), 1024);
}
return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStatic.get()), 1024);
}
struct TunedInfo {
std::vector<std::unique_ptr<MetalCache::OpInfoT>> mInfos;
};
void registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
extern void registerMetalRenderOps();
#endif
static inline std::map<OpType, MetalBackend::Creator *> *getCreatorMap() {
static std::once_flag of;
static std::map<OpType, MetalBackend::Creator *> *ret = nullptr;
std::call_once(of, [&]() { ret = new std::map<OpType, MetalBackend::Creator *>; });
return ret;
}
void MetalBackend::addCreator(OpType t, Creator *c) {
auto map = getCreatorMap();
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type has be added\n", t);
}
map->insert(std::make_pair(t, c));
}
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime, bool usefp16AsFp32, BackendConfig::MemoryMode mode) : Backend(MNN_FORWARD_METAL),
mEmptyMem(nil)
{
mRuntime = runtime;
auto ctx = (__bridge MNNMetalContext *)runtime->context();
mBufferPool.reset(runtime->createDynamicAllocator(0, false));
mCurrentAllocator = mBufferPool.get();
mStaticBufferPool = staticMem;
mUseFloatAsFp16 = usefp16AsFp32;
mMemoryMode = mode;
mIsIphone = ctx.isIphone;
if (runtime->getCommandQueue() == nil) {
// one command queue can only create a limited number of command buffers, so let each backend own its own command queue
_commandQueue = [[ctx device] newCommandQueue];
mSupportDeferEncode = true;
} else {
// otherwise forbid defer encode optimize
_commandQueue = runtime->getCommandQueue();
mSupportDeferEncode = false;
}
_commandBuffer = nil;
_commandBuffer_net = nil;
_waiting = nil;
}
MetalBackend::~MetalBackend() {
flushEncoder();
}
id<MTLComputeCommandEncoder> MetalBackend::encoder_net() const {
id<MTLComputeCommandEncoder> result = [getCommandBufferForNet() computeCommandEncoder];
#if MNN_METAL_DEBUG || MNN_METAL_BENCHMARK
result.label = nil;
#endif
return result;
}
void *MetalBackend::context() const {
return mRuntime->context();
}
class MetalMemRelease : public Backend::MemObj {
public:
MetalMemRelease(MemChunk buffer, BufferAllocator* allocator) {
mBuffer = buffer;
mAllocator = allocator;
}
virtual ~ MetalMemRelease() {
mAllocator->free(mBuffer);
}
MemChunk chunk() override {
return mBuffer;
}
private:
MemChunk mBuffer;
BufferAllocator* mAllocator;
};
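// Compute the number of bytes needed to store a tensor on the GPU.
// NC4HW4 tensors round the channel up to a multiple of 4 and reserve extra
// padding so vectorized kernels may safely read past the logical end; other
// formats round the element count up to a multiple of 4. When fp16 mode is
// enabled, 32-bit float tensors are stored as half (2 bytes per element).
// The final size is aligned to 16 bytes.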
size_t MetalBackend::getTensorSizeInBytes(const Tensor* tensor) const {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
size_t size;
if (MNN_DATA_FORMAT_NC4HW4 == format && tensor->dimensions() >= 2) {
int width = 1;
int height = 1;
int batch = tensor->length(0);
int channel = tensor->length(1);
if (tensor->dimensions() >= 3) {
height = tensor->length(2);
}
for (int i=3; i<tensor->dimensions(); ++i) {
width *= tensor->length(i);
}
int alignC = ROUND_UP(channel, 4);
int hR = ROUND_UP(height, 4) - height;
// width is processed 4 at a time, so up to 3 extra elements may be touched
int wR = ROUND_UP(width + 3, 4) - width;
int bhw = batch * width * height;
int bhwR = UP_DIV(bhw, 16) * 16 - bhw;
int extraPadding = ALIMAX(bhwR, (hR * width + wR));
size = batch * alignC * width * height;
size = size + extraPadding * 4;
} else {
size = 1;
for (int i=0; i<tensor->dimensions(); ++i) {
size *= tensor->length(i);
}
size = ROUND_UP(size, 4);
}
if (0 == size) {
return 0;
}
// when fp16 mode is enabled, 32-bit float tensors are stored as half (2 bytes per element)
if (halide_type_float == tensor->buffer().type.code && tensor->buffer().type.bits == 32 && mUseFloatAsFp16) {
size *= 2;
} else {
size *= tensor->getType().bytes();
}
size_t align = 4 * sizeof(int);
size = ROUND_UP(size, align);
return size;
}
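// Allocate device memory for a tensor. STATIC tensors come from the shared
// static pool, DYNAMIC / DYNAMIC_SEPERATE tensors come from the current dynamic
// allocator. The MetalBufferAlloc pointer and offset are written into the
// tensor's deviceId / describe so kernels can bind the buffer later; the
// returned MemObj frees the chunk back to its allocator on destruction.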
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
auto tensor = const_cast<Tensor *>(_tensor);
size_t size = getTensorSizeInBytes(_tensor);
if (0 == size) {
return nullptr;
}
// reuse if possible
MemChunk buffer;
BufferAllocator* allocator = nullptr;
switch (storageType) {
case Backend::STATIC: {
buffer = mStaticBufferPool->alloc(size, false);
allocator = mStaticBufferPool.get();
} break;
case Backend::DYNAMIC: {
buffer = mCurrentAllocator->alloc(size, false);
allocator = mCurrentAllocator;
} break;
case Backend::DYNAMIC_SEPERATE: {
buffer = mCurrentAllocator->alloc(size, true);
allocator = mCurrentAllocator;
} break;
default:{
break;
}
}
if (storageType == Backend::STATIC) {
if(nullptr == buffer.first) {
MNN_ERROR("onAcquireBuffer error!\n");
return nullptr;
}
} else {
buffer.attach(tensor);
}
if (nullptr == buffer.first) {
_MetalApplyTensor((uint8_t*)(&mEmptyMem), 0, (Tensor*)_tensor);
} else {
_MetalApplyTensor((uint8_t*)buffer.first, buffer.second, (Tensor*)_tensor);
}
return new MetalMemRelease(buffer, allocator);
}
bool MetalBackend::onClearBuffer() {
mCurrentAllocator->release(true);
if (nullptr != mRuntime->mStaticCache.get()) {
mStaticBufferPool = mRuntime->mStaticCache;
}
return true;
}
Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const Op *op) {
auto map = getCreatorMap();
auto iter = map->find(op->type());
if (iter == map->end()) {
mSupportDeferEncode = false;
if (nullptr != op->name()) {
MNN_PRINT("Don't support type [%s], %s\n", EnumNameOpType(op->type()), op->name()->c_str());
} else {
MNN_PRINT("Don't support type [%s]\n", EnumNameOpType(op->type()));
}
return NULL;
}
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
auto exe = iter->second->onCreate(inputs, op, this, outputs);
if (NULL == exe) {
mSupportDeferEncode = false;
MNN_PRINT("The Creator Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name() ? op->name()->c_str() : "");
return NULL;
}
return exe;
}
void MetalBackend::flushEncoder() const {
if (nil != mComputeEncoder) {
[mComputeEncoder endEncoding];
mComputeEncoder = nil;
}
}
void MetalBackend::_resetDynamicMemory() const {
mRuntime->pCurrentStatus = mCurrentAllocator->apply();
if (NO_ERROR != mRuntime->pCurrentStatus) {
return;
}
if (nullptr != mBufferPoolShapeImmutable.get()) {
mRuntime->pCurrentStatus = mBufferPoolShapeImmutable->apply();
}
}
void MetalBackend::onExecuteBegin() const {
_resetDynamicMemory();
mEncoderCount = 0;
}
void MetalBackend::onExecuteEnd() const {
flushEncoder();
commit_net();
}
BufferAllocator* MetalBackend::getBufferPool() const {
return mCurrentAllocator;
}
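// Switch between the two dynamic allocators: index 1 selects the
// shape-immutable pool (both pools are created lazily when maxIndex == 2),
// any other index selects the default pool. More than two dynamic allocators
// are not supported.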
bool MetalBackend::onSelectDynamicAllocator(int index, int maxIndex) {
if (maxIndex > 2) {
return false;
}
if (maxIndex == 2 && mBufferPoolShapeImmutable.get() == nullptr) {
mBufferPoolShapeImmutable.reset(mRuntime->createDynamicAllocator(1, true));
mBufferPool.reset(mRuntime->createDynamicAllocator(0, true));
}
if (1 == index) {
mCurrentAllocator = mBufferPoolShapeImmutable.get();
} else {
mCurrentAllocator = mBufferPool.get();
}
return true;
}
bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
if (nullptr == dstInfo) {
return true;
}
auto dst = (MNNMetalTensorContent*)dstInfo;
dst->type.code = halide_type_float;
if (mUseFloatAsFp16) {
dst->type.bits = 16;
} else {
dst->type.bits = 32;
}
MNNMetalGetTensorContent(dst, (void*)tensor);
return true;
}
bool MetalBackend::isCmdBufferCommit() {
auto ctx = (__bridge MNNMetalContext *)context();
// commit periodically; the threshold comes from the runtime hint encorderNumForCommit
const int magicNum = mRuntime->hint().encorderNumForCommit;
mEncoderCount++;
if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
return true;
}
return false;
}
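// Host/const buffer helpers: getHostBuffer keeps one shared staging buffer and
// grows it on demand, while getConstBuffer / returnConstBuffer maintain a small
// pool of METAL_CONST_BUFFER_LIMIT-sized (128-byte) constant buffers that ops
// can borrow and give back; larger requests are always freshly allocated.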
id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
// reuse
if (nullptr != mHostBuffer && mHostBuffer.length >= size) {
return mHostBuffer;
}
// create larger
auto context = (__bridge MNNMetalContext *)this->context();
mHostBuffer = [context newDeviceBuffer:size access:CPUReadWrite];
return mHostBuffer;
}
id<MTLBuffer> MetalBackend::getConstBuffer(size_t size) const {
if (size < METAL_CONST_BUFFER_LIMIT) {
if (!mHoldBuffers.empty()) {
auto res = mHoldBuffers.front();
mHoldBuffers.pop();
return res;
}
size = METAL_CONST_BUFFER_LIMIT;
}
auto context = (__bridge MNNMetalContext *)this->context();
auto buffer = [context newDeviceBuffer:size access:CPUReadWrite];
return buffer;
}
void MetalBackend::returnConstBuffer(id<MTLBuffer> buffer) const {
mHoldBuffers.push(buffer);
}
static inline void _getNCPlane(const Tensor* tensor, int& s, int& c, int& b) {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
s = 1, c = 1, b = 1;
b = tensor->length(0);
if (format == MNN_DATA_FORMAT_NHWC) {
c = tensor->length(tensor->dimensions()-1);
for (int i=1; i<tensor->dimensions()-1; ++i) {
s *= tensor->length(i);
}
} else {
c = tensor->length(1);
for (int i=2; i<tensor->dimensions(); ++i) {
s *= tensor->length(i);
}
}
}
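// Fill the shape constant buffer with {batch, channel, plane, 1} plus the
// per-dimension strides for the tensor's layout, and return the total thread
// count (batch * plane * UP_DIV(channel, 4)) used by the conversion kernels.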
MTLSize getTensorShape(id<MTLBuffer> shape, const Tensor *tensor) {
auto format = TensorUtils::getDescribe(tensor)->dimensionFormat;
int s, b, c;
_getNCPlane(tensor, s, c, b);
int z = UP_DIV(c, 4);
// shape
((int *)shape.contents)[0] = b;
((int *)shape.contents)[1] = c;
((int *)shape.contents)[2] = s;
((int *)shape.contents)[3] = 1;
// stride
if (format == MNN_DATA_FORMAT_NHWC) {
((int *)shape.contents)[4] = s * c;
((int *)shape.contents)[5] = 1;
((int *)shape.contents)[6] = c;
((int *)shape.contents)[7] = 1;
} else {
((int *)shape.contents)[4] = s * c;
((int *)shape.contents)[5] = s;
((int *)shape.contents)[6] = 1;
((int *)shape.contents)[7] = 1;
}
// threads
MTLSize threads = {(NSUInteger)s * b * z, 1, 1};
return threads;
}
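// Metal source for the layout-conversion kernels. IType / OType are injected
// as preprocessor macros when each pipeline is compiled.
// gTranspose: NCHW <-> NHWC transpose over {batch, channel, plane}.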
static const char* gTranspose = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
uint4 size; // n, c, plane, 1
uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
int channel = uConstant.size.y;
if (gid < channel * uConstant.size.x * uConstant.size.z) {
int tmp = gid % (channel * uConstant.size.x);
int x = gid / (channel * uConstant.size.x);
int b = tmp / channel;
int c = tmp % channel;
int outPos = b * uConstant.size.y * uConstant.size.z + c * uConstant.size.z + x;
int inPos = b * uConstant.size.y * uConstant.size.z + c + x * uConstant.size.y;
out[outPos] = (OType)(in[inPos]);
}
})metal";
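// gNC4HW4Convert: converts between NC4HW4 and planar layouts. With
// MNN_OUTPUT_C4 defined the planar input is packed into 4-channel vectors;
// otherwise a packed input is scattered back to planar storage. The last
// channel group is handled specially when channel % 4 != 0.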
static const char* gNC4HW4Convert = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct tensor_shape {
uint4 size; // n, c, plane, 1
uint4 stride;
};
kernel void main0(const device IType* in [[buffer(0)]], device OType* out [[buffer(1)]], constant tensor_shape &uConstant [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
int channelC4 = (uConstant.size.y + 3) / 4;
if (gid < channelC4 * uConstant.size.x * uConstant.size.z)
{
int3 pos;
pos.z = gid % (channelC4 * uConstant.size.x);
pos.y = gid / (channelC4 * uConstant.size.x);
pos.x = 0;
int batchIndex = pos.z / channelC4;
int zDiv4 = pos.z % channelC4;
int lastZ = uConstant.size.y / 4;
int cIndex = uConstant.size.y % 4;
int z = zDiv4*4;
int basicOffset = 0
+ batchIndex*uConstant.stride.x
+ z * uConstant.stride.y
+ pos.y * uConstant.stride.z
;
#ifdef MNN_OUTPUT_C4
OType color = OType(0);
if(zDiv4 == lastZ)
{
if(cIndex == 1)
{
color.r = in[basicOffset+0];
color.g = 0.0;
color.b = 0.0;
color.a = 0.0;
}
else if(cIndex == 2)
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = 0.0;
color.a = 0.0;
}
else
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = in[basicOffset+2*uConstant.stride.y];
color.a = 0.0;
}
}
else
{
color.r = in[basicOffset+0];
color.g = in[basicOffset+1*uConstant.stride.y];
color.b = in[basicOffset+2*uConstant.stride.y];
color.a = in[basicOffset+3*uConstant.stride.y];
}
out[0
+ pos.y
+ uConstant.size.x * uConstant.size.z*zDiv4
+ batchIndex*uConstant.size.z
] = color;
#else
IType color = in[0
+ pos.y
+ uConstant.size.x * uConstant.size.z*zDiv4
+ batchIndex*uConstant.size.z
];
if(zDiv4 == lastZ)
{
if(cIndex == 1)
{
out[basicOffset+0*uConstant.stride.y] = color.r;
}
else if(cIndex == 2)
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
}
else
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
out[basicOffset+2*uConstant.stride.y] = color.b;
}
}
else
{
out[basicOffset+0*uConstant.stride.y] = color.r;
out[basicOffset+1*uConstant.stride.y] = color.g;
out[basicOffset+2*uConstant.stride.y] = color.b;
out[basicOffset+3*uConstant.stride.y] = color.a;
}
#endif
}
}
)metal";
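// gCopy: element-wise copy with optional type cast; limit.x holds the number
// of (possibly 4-wide) elements to copy.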
static const char* gCopy = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buffer(1)]], constant uint4& limit [[buffer(2)]], uint gid [[thread_position_in_grid]]) {
if (gid < limit.x) {
out[int(gid)] = (OType)in[int(gid)];
}
})metal";
void MetalBackend::onResizeBegin() {
// Abort last inference task if needed
flushEncoder();
_commandBuffer_net = nil;
_commandBuffer = nil;
wait();
mCurrentAllocator->reset();
}
ErrorCode MetalBackend::onResizeEnd() {
auto ctx = (__bridge MNNMetalContext *)context();
return mCurrentAllocator->compute();
}
static std::string _getType(const halide_type_t& type, MNN_DATA_FORMAT format, bool useFp16AsFp32) {
std::string res;
if (type.code == halide_type_float) {
if (useFp16AsFp32) {
res = "half";
} else {
res = "float";
}
} else {
switch (type.bytes()) {
case 1:
res = "char";
break;
case 2:
res = "short";
break;
case 4:
res = "int";
break;
default:
MNN_ASSERT(false);
break;
}
}
if (format == MNN_DATA_FORMAT_NC4HW4) {
return res + "4";
}
return res;
}
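// Build the compute pipeline, shape buffer and dispatch sizes used to copy or
// convert between two tensors. Three cases are handled: same layout (plain
// 4-wide copy with optional cast), NC4HW4 <-> planar conversion, and
// NCHW <-> NHWC transpose. Compiled pipelines are cached in the runtime by
// their key (kernel name plus element types).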
MetalBackend::CopyPipeline MetalBackend::_makeCopyInfo(const Tensor *src, const Tensor *dst, id<MTLBuffer> shape, int castType) const {
auto ctx = (__bridge MNNMetalContext *)context();
MetalBackend::CopyPipeline res;
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
if (shape == nil) {
shape = getConstBuffer(8 * sizeof(int));
}
res.shape = shape;
if (sfmt == dfmt || src->dimensions() <= 1) {
auto srcType = _getType(src->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 1);
auto dstType = _getType(dst->getType(), MNN_DATA_FORMAT_NC4HW4, mUseFloatAsFp16 && castType != 2);
auto size = dst->elementSize();
size = UP_DIV(size, 4);
std::vector<std::string> keys = {
"copyC4",
srcType,
dstType
};
((uint32_t*)[shape contents])[0] = size;
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gCopy, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
res.localSize = MTLSizeMake(256, 1, 1);
res.pipeline = pipeline;
return res;
}
auto srcType = _getType(src->getType(), sfmt, mUseFloatAsFp16 && castType != 1);
auto dstType = _getType(dst->getType(), dfmt, mUseFloatAsFp16 && castType != 2);
if (sfmt == MNN_DATA_FORMAT_NC4HW4 || dfmt == MNN_DATA_FORMAT_NC4HW4) {
auto normalTensor = dst;
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
normalTensor = src;
}
// convert C4 / NCHW
std::vector<std::string> keys = {
"c4convert",
srcType,
dstType
};
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
keys.emplace_back("outputc4");
}
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
if (dfmt == MNN_DATA_FORMAT_NC4HW4) {
[dic setValue:@"1" forKey:@"MNN_OUTPUT_C4"];
}
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gNC4HW4Convert, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.pipeline = pipeline;
auto size = getTensorShape(shape, normalTensor);
auto gl = [ctx computeBestGroupAndLocal:pipeline threads:size];
res.groupSize = gl.first;
res.localSize = gl.second;
return res;
}
// NCHW <-> NHWC
std::vector<std::string> keys = {
"transpose",
srcType,
dstType
};
id<MTLComputePipelineState> pipeline = mRuntime->findPipeline(keys);
if (nil == pipeline) {
MTLCompileOptions *option = [[MTLCompileOptions alloc] init];
auto dic = [NSMutableDictionary dictionaryWithCapacity:0];
[dic setValue:@(keys[1].c_str()) forKey:@"IType"];
[dic setValue:@(keys[2].c_str()) forKey:@"OType"];
option.preprocessorMacros = dic;
pipeline = makeComputePipelineWithSourceOption(gTranspose, "main0", option);
mRuntime->insertPipeline(keys, pipeline);
}
res.pipeline = pipeline;
int n, c, plane;
_getNCPlane(dst, plane, c, n);
auto shapePtr = (uint32_t*)shape.contents;
shapePtr[0] = n;
shapePtr[3] = 1;
if (MNN_DATA_FORMAT_NHWC == dfmt) {
shapePtr[1] = plane;
shapePtr[2] = c;
} else {
shapePtr[1] = c;
shapePtr[2] = plane;
}
auto size = plane * n * c;
res.localSize = MTLSizeMake(256, 1, 1);
res.groupSize = MTLSizeMake(UP_DIV(size, 256), 1, 1);
return res;
}
static void _execute(id<MTLComputeCommandEncoder> encoder, const MetalBackend::CopyPipeline& info, std::pair<id<MTLBuffer>, int> src, std::pair<id<MTLBuffer>, int> dst) {
[encoder setComputePipelineState:info.pipeline];
[encoder setBuffer:src.first offset:src.second atIndex:0];
[encoder setBuffer:dst.first offset:dst.second atIndex:1];
[encoder setBuffer:info.shape offset:0 atIndex:2];
[encoder dispatchThreadgroups:info.groupSize threadsPerThreadgroup:info.localSize];
}
void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape, int castType) const {
auto ctx = (__bridge MNNMetalContext *)context();
auto info = _makeCopyInfo(src, dst, shape, castType);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, MetalBackend::getBuffer(src), MetalBackend::getBuffer(dst));
if (standalone) {
[encoder endEncoding];
MNN_PRINT_ENCODER(ctx, encoder);
}
}
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
commit_net();
_resetDynamicMemory();
onCopyBuffer(src, dst, nil, nil);
}
id<MTLComputeCommandEncoder> MetalBackend::encoder_for_net() const {
if (nil == mComputeEncoder) {
mComputeEncoder = encoder_net(); // TODO: decide which command buffer to use here
}
return mComputeEncoder;
}
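// Copy between tensors. Device -> device copies are encoded directly; copies
// involving a host tensor go through the shared staging buffer when a layout
// or fp16 conversion is needed, otherwise memcpy directly into / out of the
// shared-storage MTLBuffer contents. Host -> host copies are not handled here.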
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const {
MNN_ASSERT(src->buffer().dimensions == dst->buffer().dimensions);
if (!src->buffer().host && !dst->buffer().host) {
onCopyDeviceToDevice(src, dst, encoder, shape);
return;
}
auto sfmt = TensorUtils::getDescribe(src)->dimensionFormat;
auto dfmt = TensorUtils::getDescribe(dst)->dimensionFormat;
bool formatDiff = sfmt != dfmt && src->dimensions() > 1;
auto floats = src->getType().code == halide_type_float;
bool dataTypeDiff = floats && mUseFloatAsFp16;
bool needConvert = formatDiff || dataTypeDiff;
if (!src->buffer().host && dst->buffer().host) {
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)src->deviceId())->getBuffer();
auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(src)->extra.offset;
if (needConvert) {
auto tDst = const_cast<Tensor*>(dst);
auto tmpBuffer = getHostBuffer(dst->usize());
auto info = _makeCopyInfo(src, dst, shape, 2);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, MetalBackend::getBuffer(src), std::make_pair(tmpBuffer, 0));
if (standalone) {
[encoder endEncoding];
}
commit();
devicePtr = (uint8_t*)tmpBuffer.contents;
}
wait();
::memcpy(dst->host<void>(), devicePtr, dst->usize());
return;
}
if (src->buffer().host && !dst->buffer().host) {
// For a user-provided command queue, the user must ensure the previous frame's GPU work has finished
bool needWait = !mRuntime->userSync();
if (needWait) {
wait();
}
auto srcSize = src->usize();
if (needConvert) {
auto tmpBuffer = getHostBuffer(srcSize);
::memcpy(tmpBuffer.contents, src->host<void>(), srcSize);
auto info = _makeCopyInfo(src, dst, shape, 1);
auto standalone = encoder == nil;
encoder = encoder ?: [getCommandBufferForBufferCopy() computeCommandEncoder];
_execute(encoder, info, std::make_pair(tmpBuffer, 0), MetalBackend::getBuffer(dst));
if (standalone) {
[encoder endEncoding];
}
commit();
} else {
auto device = (id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)dst->deviceId())->getBuffer();
auto devicePtr = (uint8_t*)device.contents + TensorUtils::getDescribe(dst)->extra.offset;
::memcpy(devicePtr, src->host<void>(), srcSize);
}
return;
}
MNN_ASSERT(false); // should not be handled here
}
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
commit_net();
if (toCpu) {
wait();
}
return 0;
}
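// Two lazily created command buffers are kept: one for buffer copies and one
// for the network's compute encoders. When defer encoding is not supported
// (the command queue is shared with the runtime/user), they are forced to be
// the same object so work is committed in submission order.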
id<MTLCommandBuffer> MetalBackend::getCommandBufferForBufferCopy() const {
if (nil == _commandBuffer) {
_commandBuffer = [_commandQueue commandBuffer];
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer_net = _commandBuffer;
}
}
return _commandBuffer;
}
id<MTLCommandBuffer> MetalBackend::getCommandBufferForNet() const {
if (nil == _commandBuffer_net) {
_commandBuffer_net = [_commandQueue commandBuffer];
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer = _commandBuffer_net;
}
}
return _commandBuffer_net;
}
void MetalBackend::setTensor(const MNN::Tensor* tensor, id<MTLComputeCommandEncoder> encoder, int index) {
[encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer() offset:TensorUtils::getDescribe(tensor)->extra.offset atIndex:index];
}
void MetalBackend::setMem(const MemChunk& chunk, id<MTLComputeCommandEncoder> encoder, int index) {
[encoder setBuffer:((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer() offset:chunk.second atIndex:index];
}
uint8_t* MetalBackend::getMemPtr(const MemChunk& chunk) {
return (uint8_t*)((MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first)->getBuffer().contents + chunk.second;
}
std::pair<id<MTLBuffer>, int> MetalBackend::getBuffer(const MNN::Tensor* tensor) {
return std::make_pair(((MetalRuntimeAllocator::MetalBufferAlloc *)tensor->deviceId())->getBuffer(), TensorUtils::getDescribe(tensor)->extra.offset);
}
void MetalBackend::commit() const {
if (nil != _commandBuffer && _commandBuffer.status < MTLCommandBufferStatusCommitted) {
[_commandBuffer commit];
_waiting = _commandBuffer;
_commandBuffer = nil;
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer_net = nil;
}
}
}
void MetalBackend::commit_net() const {
if (nil != _commandBuffer_net && _commandBuffer_net.status < MTLCommandBufferStatusCommitted) {
[_commandBuffer_net commit];
_waiting = _commandBuffer_net;
_commandBuffer_net = nil;
if (!mSupportDeferEncode) {
// In this case _commandBuffer should be the same as _commandBuffer_net
_commandBuffer = nil;
}
}
}
void MetalBackend::wait() const {
if (nil != _waiting) {
auto buffer = _waiting;
if (buffer.status >= MTLCommandBufferStatusCompleted) {
_waiting = nil;
return;
}
#if MNN_METAL_BENCHMARK
NSTimeInterval begin = [NSDate timeIntervalSinceReferenceDate];
[buffer waitUntilCompleted];
NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
if (@available(iOS 10.3, *)) {
printf("[METAL] commit costs: %.3fms\t(kernel: %.3fms, GPU: %.3fms)\n", (end - begin) * 1000.f,
(buffer.kernelEndTime - buffer.kernelStartTime) * 1000.f,
(buffer.GPUEndTime - buffer.GPUStartTime) * 1000.f);
} else {
printf("[METAL] commit costs: %.3fms\n", (end - begin) * 1000.f);
}
#else
[buffer waitUntilCompleted];
#endif
#if MNN_METAL_DEBUG
if (buffer.error) {
printf("[METAL] %s\n", buffer.error.localizedDescription.UTF8String);
}
#endif
}
_waiting = nil;
}
id<MTLComputePipelineState> MetalBackend::makeComputePipelineWithSourceOption(const char* csource, const char* cname, MTLCompileOptions *options) const{
auto ctx = (__bridge MNNMetalContext *)context();
auto source = [[NSString alloc] initWithUTF8String:csource];
auto name = [[NSString alloc] initWithUTF8String:cname];
auto pipeline = [ctx pipelineWithSourceOption:source name:name options:options];
if (nil == pipeline) {
mRuntime->pCurrentStatus = NOT_SUPPORT;
}
return pipeline;
}
void MetalRuntime::setCommandQueue(id<MTLCommandQueue> queue, bool userSync) {
mQueue = queue;
mUserSync = userSync;
}
id<MTLComputePipelineState> MetalRuntime::findPipeline(const std::vector<std::string>& keys) const {
auto iter = mCachePipeine.find(keys);
if (iter == mCachePipeine.end()) {
return nil;
}
return iter->second;
}
void MetalRuntime::insertPipeline(const std::vector<std::string>& keys, id<MTLComputePipelineState> pipeline) const {
if (nil != pipeline) {
mCachePipeine.insert(std::make_pair(keys, pipeline));
}
}
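// Parse the gpu mode bitmask: BUFFER / IMAGE memory flags are ignored on
// Metal (a warning is printed), and exactly one tuning level
// (NONE / FAST / NORMAL / HEAVY / WIDE) is expected.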
void MetalRuntime::setGpuMode(const int mode_num) {
int totalSet = 0;
bool isSet = (mode_num & MNN_GPU_MEMORY_BUFFER);
if(isSet) {
totalSet++;
}
isSet = (mode_num & MNN_GPU_MEMORY_IMAGE);
if(isSet) {
totalSet++;
}
if(totalSet > 0) {
MNN_PRINT("warning: set BUFFER and IMAGE mode is not useful for metal, it doesn't matter, cl_mode:%x!\n", mode_num);
}
totalSet = 0;
isSet = (mode_num & MNN_GPU_TUNING_NONE);
if(isSet) {
mTuneLevel = Never;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_FAST);
if(isSet) {
mTuneLevel = Fast;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_NORMAL);
if(isSet) {
mTuneLevel = Normal;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_HEAVY);
if(isSet) {
mTuneLevel = Heavy;
totalSet++;
}
isSet = (mode_num & MNN_GPU_TUNING_WIDE);
if(isSet) {
mTuneLevel = Wide;
totalSet++;
}
if(totalSet != 1) {
MNN_PRINT("set multi tuning mode is not permitted, please check cl_mode:%x!\n", mode_num);
}
}
struct MetalContext {
std::mutex pLock;
MNNMetalContext* pContext;
id<MTLDevice> pDevice;
};
static MetalContext* gContext = nullptr;
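// Create the Metal runtime. A user-provided MNNMetalSharedContext can supply
// the MTLDevice and MTLCommandQueue; otherwise the system default device is
// used and, unless MNN_GPU_RECORD_BATCH is requested, a runtime-owned command
// queue is created and shared by its backends. The MNNMetalContext is kept
// globally and rebuilt only when the device changes.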
MetalRuntime* MetalRuntime::create(const Backend::Info& info) {
std::unique_lock<std::mutex> _l(gContext->pLock);
MNNMetalSharedContext sharedContext;
sharedContext.device = nil;
sharedContext.queue = nil;
if (info.user != nullptr) {
if (info.user->sharedContext != nullptr) {
sharedContext.device = ((MNNMetalSharedContext*)info.user->sharedContext)->device;
sharedContext.queue = ((MNNMetalSharedContext*)info.user->sharedContext)->queue;
}
}
if (nil == sharedContext.device) {
sharedContext.device = MTLCreateSystemDefaultDevice();
}
if (nil == gContext->pContext || gContext->pDevice != sharedContext.device) {
gContext->pContext = [[MNNMetalContext alloc] init];
gContext->pDevice = sharedContext.device;
BOOL res = [gContext->pContext initWithSharedContext:&sharedContext dev:sharedContext.device];
if (!res) {
gContext->pContext = nil;
return nullptr;
}
}
auto mContext = (__bridge_retained void *)(gContext->pContext);
auto rt = new MetalRuntime(mContext);
rt->setGpuMode(info.gpuMode);
if (nil != sharedContext.queue) {
rt->setCommandQueue(sharedContext.queue, true);
}
bool supportDefer = info.numThread & MNN_GPU_RECORD_BATCH;
if ((!supportDefer) && nil == sharedContext.queue) {
id<MTLCommandQueue> queue = [sharedContext.device newCommandQueue];
rt->setCommandQueue(queue, false);
}
if (nullptr != info.user) {
rt->mDefaultConfig = *info.user;
}
return rt;
}
MetalRuntime::MetalRuntime(void* context) {
mContext = context;
auto ctx = (__bridge MNNMetalContext *)mContext;
std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
mSimdGroupReduce = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
mSimdGroupReduce |= [[ctx device] supportsFamily:MTLGPUFamilyMetal3];
mSimdGroupMatrix = [[ctx device] supportsFamily:MTLGPUFamilyApple7];
mStatic.reset(new EagerBufferAllocator(allocator));
mDynamic.resize(METAL_SEPERATE_MAX_COUNT);
for (auto& buf : mDynamic) {
buf.root = allocator;
}
mTunedInfo = new TunedInfo;
}
MetalRuntime::~ MetalRuntime() {
if(mContext) {
CFRelease(mContext);
}
delete mTunedInfo;
}
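// Load a serialized tuning cache (MetalCache flatbuffer): each Autotuning
// entry maps a kernel key and thread size to the tuned group number, group
// size and measured cost. Returns false when the cache is empty or invalid.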
bool MetalRuntime::setCache(std::pair<const void*, size_t> cache) {//Get Cache
auto buffer = cache.first;
auto size = cache.second;
if (nullptr == buffer) {
mCacheOutside = nullptr;
mCacheOutsideSize = 0;
mBuffer.clear();
return false;//actually get nothing
}
mCacheOutsideSize = size;
mCacheOutside = buffer;
auto cacheBuffer = GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)cache.first, cache.second);
if (false == VerifyCacheBuffer(verify)) {
return false;
}
if (nullptr == cacheBuffer->tunings()) {
return false;
}
// Load Auto Tuning Info
if (nullptr != cacheBuffer->tunings()) {
auto tuningInfo = cacheBuffer->tunings();
for (int i=0; i<tuningInfo->size(); ++i) {
auto tun = tuningInfo->GetAs<Autotuning>(i);
if (nullptr == tun->threadSize() || nullptr == tun->groupSize() || nullptr == tun->key()) {
MNN_ERROR("Error tunning info\n");
continue;
}
std::vector<uint32_t> glo(tun->threadSize()->size());
for (int v=0; v<glo.size(); ++v) {
glo[v] = tun->threadSize()->data()[v];
}
std::vector<uint32_t> grop(tun->groupNum()->size());
for (int v=0; v<grop.size(); ++v) {
grop[v] = tun->groupNum()->data()[v];
}
std::vector<uint32_t> loc(tun->groupSize()->size());
for (int v=0; v<loc.size(); ++v) {
loc[v] = tun->groupSize()->data()[v];
}
uint32_t cost = tun->timeCost();
mTunedThreadGroup.insert(std::make_pair(std::make_pair(tun->key()->str(), glo), std::make_tuple(grop, loc, cost)));
mTunedThreadGroupVec[tun->key()->str()].emplace_back(std::make_pair(glo, std::make_tuple(grop, loc, cost)));
}
}
return true;
}
std::pair<const void*, size_t> MetalRuntime::makeCache(TunedInfo* info) {//make Cache
std::unique_ptr<CacheT> cache(new CacheT);
// Get All Autotuning cache
for (auto& iter : mTunedThreadGroup) {
std::unique_ptr<AutotuningT> tuning(new AutotuningT);
tuning->key = iter.first.first;
tuning->threadSize = iter.first.second;
tuning->groupNum = std::get<0>(iter.second);
tuning->groupSize = std::get<1>(iter.second);
tuning->timeCost = std::get<2>(iter.second);
cache->tunings.emplace_back(std::move(tuning));
}
cache->tuned = std::move(info->mInfos);
flatbuffers::FlatBufferBuilder builder;
auto lastOffset = Cache::Pack(builder, cache.get());
builder.Finish(lastOffset);
mBuffer.resize(builder.GetSize());
::memcpy(mBuffer.data(), builder.GetBufferPointer(), builder.GetSize());
return std::make_pair(mBuffer.data(), mBuffer.size());
}
float MetalRuntime::onGetMemoryInMB() {
auto staticMemoryInMB = mStatic->totalSize() / 1024.0f / 1024.0f;
float dynamicMemoryInMB = 0.0f;
for (auto& buf : mDynamic) {
dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f;
}
return staticMemoryInMB + dynamicMemoryInMB;
}
void MetalRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
if (nullptr != op->name()) {
auto dstInfo = mTunedInfo;
std::unique_ptr<MetalCache::OpInfoT> opInfo(new MetalCache::OpInfoT);
opInfo->type = op->type();
opInfo->name = op->name()->str();
opInfo->inputs.resize(inputs.size());
for (int v=0; v<opInfo->inputs.size(); ++v) {
opInfo->inputs[v].reset(new MetalCache::TensorInfoT);
opInfo->inputs[v]->shape.resize(inputs[v]->dimensions());
for (int u=0; u<opInfo->inputs[v]->shape.size(); ++u) {
opInfo->inputs[v]->shape[u] = inputs[v]->length(u);
}
}
opInfo->outputs.resize(outputs.size());
for (int v=0; v<opInfo->outputs.size(); ++v) {
opInfo->outputs[v].reset(new MetalCache::TensorInfoT);
opInfo->outputs[v]->shape.resize(outputs[v]->dimensions());
for (int u=0; u<opInfo->outputs[v]->shape.size(); ++u) {
opInfo->outputs[v]->shape[u] = outputs[v]->length(u);
}
}
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
static bool _checkTensorInfo(const MetalCache::TensorInfoT* dst, const Tensor* src) {
if (dst->shape.size() != src->dimensions()) {
return false;
}
for (int j=0; j<dst->shape.size(); ++j) {
if (dst->shape[j] != src->length(j)) {
return false;
}
}
return true;
}
bool MetalRuntime::onMeasure(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Runtime::OpInfo& dstInfo) const {
dstInfo.initCostLong = true;
if (nullptr == op->name()) {
dstInfo.initCostLong = false;
return true;
}
for(auto& info : mTunedInfo->mInfos) {
if (info->type != op->type()) {
continue;
}
if (info->name != op->name()->str()) {
continue;
}
if (info->inputs.size() != inputs.size() || info->outputs.size() != outputs.size()) {
continue;
}
bool match = true;
for (int i=0; i<inputs.size(); ++i) {
auto& dst = info->inputs[i];
auto src = inputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (!match) {
continue;
}
for (int i=0; i<outputs.size(); ++i) {
auto& dst = info->outputs[i];
auto src = outputs[i];
if (!_checkTensorInfo(dst.get(), src)) {
match = false;
break;
}
}
if (match) {
// All info matches
dstInfo.initCostLong = false;
break;
}
}
return true;
}
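// Wraps a CPU-side allocator (e.g. the mmap-backed weight allocator used in
// onCreate) so each allocation is exposed as a no-copy MTLBuffer sharing the
// same memory.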
class MetalWrapAllocator : public BufferAllocator::Allocator {
private:
std::shared_ptr<BufferAllocator::Allocator> mOrigin;
id<MTLDevice> mDevice;
public:
MetalWrapAllocator(std::shared_ptr<BufferAllocator::Allocator> origin, id<MTLDevice> device) : mOrigin(origin), mDevice(device) {}
virtual ~ MetalWrapAllocator() {
// Do nothing
}
virtual MemChunk onAlloc(size_t size, size_t align) override {
auto mem = mOrigin->onAlloc(size, align);
MNN_ASSERT(mem.second == 0);
id<MTLBuffer> buffer = [mDevice newBufferWithBytesNoCopy:mem.first length:size options:MTLResourceStorageModeShared deallocator:nil];
auto wrap = new MetalRuntimeAllocator::MetalBufferAlloc(buffer);
return MemChunk((void *)wrap, 0);
}
virtual void onRelease(MemChunk chunk) override {
auto mem = (MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first;
mOrigin->onRelease(MemChunk(mem->getBuffer().contents));
delete mem;
}
};
Backend* MetalRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
if (hint().weightMemoryPath.size() > 0 && mStaticCache.get() == nullptr) {
auto ctx = (__bridge MNNMetalContext *)mContext;
auto mmap = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "", "metal.weight");
std::shared_ptr<BufferAllocator::Allocator> mmapMem(new MetalWrapAllocator(mmap, [ctx device]));
mStaticCache = mStatic;
mStatic.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024));
}
BackendConfig::PrecisionMode precision = mDefaultConfig.precision;
BackendConfig::MemoryMode memory = mDefaultConfig.memory;
if (nullptr != config) {
precision = config->precision;
memory = config->memory;
}
bool useFp16AsFp32 = precision != BackendConfig::Precision_High;
return new MetalBackend(mStatic, this, useFp16AsFp32, memory);
}
void MetalRuntime::onGabageCollect(int level) {
mStatic->release(false);
if (level >= 100) {
for (auto& buf : mDynamic) {
buf.release();
}
}
}
std::pair<const void*, size_t> MetalRuntime::onGetCache() {//make Cache
return makeCache(mTunedInfo);
}
bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
if (nullptr == buffer) {
return false;
}
auto cacheBuffer = MetalCache::GetCache(buffer);
flatbuffers::Verifier verify((const uint8_t*)buffer, size);
if (false == VerifyCacheBuffer(verify)) {
return false;
}
if(nullptr != cacheBuffer->tuned()) {
for (int i=0; i<cacheBuffer->tuned()->size(); ++i) {
auto srcInfo = cacheBuffer->tuned()->GetAs<MetalCache::OpInfo>(i);
std::unique_ptr<MetalCache::OpInfoT> dst(srcInfo->UnPack());
mTunedInfo->mInfos.emplace_back(std::move(dst));
}
}
return setCache(std::make_pair(buffer, size));
}
MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
return MemChunk((void *)mMetalBufferAlloc, 0);
}
void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
delete (MetalBufferAlloc *)ptr.first;
}
class MetalRuntimeCreator : public RuntimeCreator {
public:
MetalRuntimeCreator() {
// Do nothing
}
virtual ~ MetalRuntimeCreator() {
// Do nothing
}
virtual Runtime *onCreate(const Backend::Info &info) const {
auto rt = MetalRuntime::create(info);
return rt;
}
private:
id<MTLDevice> mDevice;
};
void registerMetalRuntimeCreator() {
// According to
// https://developer.apple.com/library/archive/documentation/DeviceInformation/Reference/iOSDeviceCompatibility/HardwareGPUInformation/HardwareGPUInformation.html
// not all devices running iOS 8+ support Metal.
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
if (nil != device) {
gContext = new MetalContext;
gContext->pContext = nil;
gContext->pDevice = nil;
registerMetalOps();
#ifdef MNN_SUPPORT_RENDER
registerMetalRenderOps();
#endif
MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator, false);
} else {
MNN_ERROR("Init Metal Error\n");
}
}
} // namespace MNN
#else
namespace MNN {
void registerMetalRuntimeCreator() {
}
};
int MNNMetalGetTensorContent(MNNMetalTensorContent* content, void* tensor) {
return -1;
}
#endif