in src/runtime/contrib/clml/clml_runtime.cc [762:969]
void BuildEngine() {
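// Build the CLML execution engine for this subgraph: create tensors and layers for
// every graph node, plan and allocate tensor memory, populate the descriptor set,
// and optionally run tuning or pre-record the op queue.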
size_t nid;
// Pre-create tensors for the operators that require a layout other than
// CL_TENSOR_LAYOUT_OPTIMAL_QCOM.
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, node, nid);
if ("nn.batch_matmul" == node.GetOpName()) CreateBatchMatmulLayerTensor(&layer_, node, nid);
if ("nn.softmax" == node.GetOpName() || PatternMatch(node.GetOpName(), "nn.softmax"))
CreateSoftmaxLayerTensor(&layer_, node, nid);
}
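// Instantiate a CLML layer for each kernel node, dispatching on the op name.
// Input nodes are handled lazily and const nodes are skipped here.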
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if (node.GetOpType() == "input") {
// Layers may request a different layout, so defer the input tensor allocation.
} else if (node.GetOpType() == "kernel") {
auto op_name = node.GetOpName();
if (PatternMatch(op_name, "nn.conv2d") || PatternMatch(op_name, "nn.pad_conv2d"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
else if (PatternMatch(op_name, "nn.depthwise_conv2d"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
else if (PatternMatch(op_name, "nn.conv2d_transpose"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
else if ("nn.relu6" == op_name || PatternMatch(op_name, "nn.relu6"))
CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
else if (PatternMatch(op_name, "nn.relu"))
CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
else if (PatternMatch(op_name, "nn.batch_norm"))
CreateBatchNormLayer(&layer_, node, nid);
else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
"nn.l2_pool2d" == op_name || PatternMatch(op_name, "nn.max_pool2d") ||
PatternMatch(op_name, "nn.avg_pool2d"))
CreatePoolingLayer(&layer_, node, nid);
else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name ||
PatternMatch(op_name, "nn.global_avg_pool2d") ||
PatternMatch(op_name, "nn.global_max_pool2d"))
CreateGlobalPoolingLayer(&layer_, node, nid);
else if ("reshape" == op_name || PatternMatch(op_name, "reshape"))
CreateReshapeLayer(&layer_, node, nid);
else if ("concatenate" == op_name)
CreateConcatLayer(&layer_, node, nid);
else if ("nn.dense" == op_name)
CreateDenseLayer(&layer_, node, nid);
else if ("nn.softmax" == op_name || PatternMatch(op_name, "nn.softmax"))
CreateSoftMaxLayer(&layer_, node, nid);
else if ("nn.pad" == op_name)
CreatePadLayer(&layer_, node, nid);
else if ("nn.batch_flatten" == op_name)
CreateBatchFlattenLayer(&layer_, node, nid);
else if ("clip" == op_name)
CreateClipLayer(&layer_, node, nid);
else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
"minimum" == op_name || "maximum" == op_name || "divide" == op_name ||
PatternMatch(op_name, "relax.add") || PatternMatch(op_name, "relax.subtract") ||
PatternMatch(op_name, "relax.multiply") ||
PatternMatch(op_name, "relax.minimum") || PatternMatch(op_name, "relax.maximum") ||
PatternMatch(op_name, "relax.divide"))
CreateBinaryLayer(&layer_, node, nid);
else if ("nn.depth_to_space" == op_name)
CreateDepthToSpaceLayer(&layer_, node, nid);
else if ("nn.upsampling" == op_name)
CreateResizeLayer(&layer_, node, nid);
else if ("nn.batch_matmul" == op_name)
CreateBatchMatmulLayer(&layer_, node, nid);
else
LOG(FATAL) << "Unsupported op: " << op_name;
this->layer_.layer_names.push_back(op_name);
// Keep a map from CLML function to (nid, node) for use in profiling.
this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)});
} else if (node.GetOpType() != "const") {
LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
}
}
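// Record the output tensor descriptors and create NCHW placeholder tensors for the
// graph outputs, honoring any customized output shapes.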
for (size_t i = 0; i < outputs_.size(); ++i) {
nid = outputs_[i].id_;
DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
this->layer_.outputs.push_back(this->layer_.storage_map[nid].tensor_desc);
if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
// Use the customized output shape recorded for this node.
this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype, nullptr,
this->layer_.out_shapes[nid]));
} else {
this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype));
}
}
// Plan memory utilization
PlanMemory();
// Allocate device memory and initialize the parameters, if any.
cl_int result = 0;
size_t alloc_on_chip = 0;
size_t alloc_ddr = 0;
size_t alloc_ddr_reuse = 0;
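// Bind every tensor in the storage map to memory according to the plan:
// on-chip (GMEM) allocations, reused DDR allocations, or fresh DDR allocations
// for constants. Constant data is copied into its tensor once memory is bound.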
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
auto tensor_desc = it->second.tensor_desc;
uint32_t mem_size = 0;
result = CL_OUT_OF_HOST_MEMORY;
CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, &mem_size);
JSONGraphNode node = it->second.node;
void* node_data = nullptr;
size_t on_chip_mem_offset = -1;
if (layer_.on_chip_alloc_plan.find(it->first) != layer_.on_chip_alloc_plan.end()) {
LOG_MEM << "Found GMEM Alloc:" << it->first
<< " Size:" << layer_.on_chip_alloc_plan[it->first].first
<< " Offset:" << layer_.on_chip_alloc_plan[it->first].second;
on_chip_mem_offset = layer_.on_chip_alloc_plan[it->first].second;
alloc_on_chip += mem_size;
tensor_desc->memory = AllocateOnChipTensorMemory(mem_size, on_chip_mem_offset);
} else if (layer_.ddr_alloc_plan.find(it->first) != layer_.ddr_alloc_plan.end()) {
LOG_MEM << "DDR Alloc for nid:" << it->first << " Type:" << node.GetOpType();
tensor_desc->memory = layer_.ddr_alloc_plan[it->first];
alloc_ddr_reuse += mem_size;
} else if (node.GetOpType() == "const") {
LOG_MEM << "DDR Alloc for Const/Input/Output";
tensor_desc->memory = AllocateDDRTensorMemory(mem_size);
alloc_ddr += mem_size;
} else {
LOG(FATAL) << "Mem allocation not found on DDR as well as On-Chip nid: " << it->first
<< " Type:" << node.GetOpType();
}
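// Copy constant (parameter) data from the TVM data entry into the device tensor.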
if (node.GetOpType() == "const") {
node_data = data_entry_[EntryID(it->first, 0)]->data;
if (node_data != nullptr) {
CopyDataToCLMLTensor(tensor_desc, node_data);
}
}
this->layer_.tensorMemDescs.push_back(*tensor_desc);
}
LOG_STATS << "Total On-Chip Allocation :" << alloc_on_chip;
LOG_STATS << "Total DDR Reuse Allocation:" << alloc_ddr_reuse;
LOG_STATS << "Total DDR fixed allocation:" << alloc_ddr;
size_t ddr_global_pool = 0;
size_t ddr_local_pool = 0;
auto cws = CLMLWorkspace::Global();
for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
ddr_global_pool += it->second.first;
}
LOG_STATS << "Total Global Pool:" << ddr_global_pool;
for (auto it = this->layer_.ddr_storage_ref_map.begin();
it != this->layer_.ddr_storage_ref_map.end(); it++) {
LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
ddr_local_pool += it->second.first;
}
LOG_STATS << "Total Local Pool:" << ddr_local_pool;
// Set up the tensor memory descriptor set covering all tensors.
CLML_CALL(clCreateMLTensorMemoryDescriptorSetQCOM, &this->layer_.descriptorSet);
CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, this->layer_.descriptorSet,
static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
this->layer_.tensorMemDescs.data());
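// Tuning run: profile each CLML op with the queue in profiling mode, then
// serialize the tuning cache (magic header, reserved word, symbol name, cache
// blob) and append it to the tuning file.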
if (cws->is_tuning_run) {
LOG_CLML << "CLML Tunning In Progress:";
// Recreate the command queue with profiling enabled for the tuning run.
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i];
CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
this->layer_.tuning_cache, nullptr);
}
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false);
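// Query the tuning cache size first, then fetch the serialized cache blob.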
size_t cache_len_bytes = 0;
size_t len_ret = 0;
CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, 0, nullptr, &cache_len_bytes);
std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, saved_cache.size(),
saved_cache.data(), &len_ret);
std::string tune_str;
dmlc::MemoryStringStream mstrm(&tune_str);
dmlc::Stream* strm = &mstrm;
uint64_t header = kTVMCLMLTuningCacheMagic;
uint64_t reserved = 0x0;
strm->Write(header);
strm->Write(reserved);
strm->Write(clml_symbol);
strm->Write(saved_cache);
std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary);
ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file;
fs.write(&tune_str[0], tune_str.length());
LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size"
<< tune_str.length() << " with tuning blob len " << saved_cache.size();
}
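// With a recordable queue, enqueue every op once into the recording and finalize
// it; presumably Run() replays the recorded queue instead of re-enqueuing each op.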
if (cws->is_recordable_queue) {
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
CLML_CALL(clEnqueueMLOpQCOM, this->layer_.recordable_queue, this->layer_.function[i],
this->layer_.descriptorSet, 0, nullptr, nullptr);
}
result = clEndRecordingQCOM(this->layer_.recording);
ICHECK(result == CL_SUCCESS) << "clEndRecordingQCOM:" << result;
}
}