void BuildEngine()

in src/runtime/contrib/clml/clml_runtime.cc [762:969]
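BuildEngine constructs the CLML execution plan for the JSON graph: a pre-pass creates tensors for operators that need a non-default layout (nn.dense, nn.batch_matmul, nn.softmax), a dispatch loop instantiates one CLML layer per kernel node, output placeholders are created, and PlanMemory drives allocation across on-chip (GMEM) and DDR pools. On a tuning run each op is additionally tuned and the tuning cache is serialized to disk; with a recordable queue all op enqueues are pre-recorded.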


  void BuildEngine() {
    size_t nid;
    // Create tensors up front for operators that require a layout other
    // than CL_TENSOR_LAYOUT_OPTIMAL_QCOM.
    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, node, nid);
      if ("nn.batch_matmul" == node.GetOpName()) CreateBatchMatmulLayerTensor(&layer_, node, nid);
      if ("nn.softmax" == node.GetOpName() || PatternMatch(node.GetOpName(), "nn.softmax"))
        CreateSoftmaxLayerTensor(&layer_, node, nid);
    }

    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if (node.GetOpType() == "input") {
        // Layers may request a different layout; defer the input allocation.
      } else if (node.GetOpType() == "kernel") {
        auto op_name = node.GetOpName();
        if (PatternMatch(op_name, "nn.conv2d") || PatternMatch(op_name, "nn.pad_conv2d"))
          CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
        else if (PatternMatch(op_name, "nn.depthwise_conv2d"))
          CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
        else if (PatternMatch(op_name, "nn.conv2d_transpose"))
          CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
        else if ("nn.relu6" == op_name || PatternMatch(op_name, "nn.relu6"))
          CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
        else if (PatternMatch(op_name, "nn.relu"))
          CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
        else if (PatternMatch(op_name, "nn.batch_norm"))
          CreateBatchNormLayer(&layer_, node, nid);
        else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
                 "nn.l2_pool2d" == op_name || PatternMatch(op_name, "nn.max_pool2d") ||
                 PatternMatch(op_name, "nn.avg_pool2d"))
          CreatePoolingLayer(&layer_, node, nid);
        else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name ||
                 PatternMatch(op_name, "nn.global_avg_pool2d") ||
                 PatternMatch(op_name, "nn.global_max_pool2d"))
          CreateGlobalPoolingLayer(&layer_, node, nid);
        else if ("reshape" == op_name || PatternMatch(op_name, "reshape"))
          CreateReshapeLayer(&layer_, node, nid);
        else if ("concatenate" == op_name)
          CreateConcatLayer(&layer_, node, nid);
        else if ("nn.dense" == op_name)
          CreateDenseLayer(&layer_, node, nid);
        else if ("nn.softmax" == op_name || PatternMatch(op_name, "nn.softmax"))
          CreateSoftMaxLayer(&layer_, node, nid);
        else if ("nn.pad" == op_name)
          CreatePadLayer(&layer_, node, nid);
        else if ("nn.batch_flatten" == op_name)
          CreateBatchFlattenLayer(&layer_, node, nid);
        else if ("clip" == op_name)
          CreateClipLayer(&layer_, node, nid);
        else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
                 "minimum" == op_name || "maximum" == op_name || "divide" == op_name ||
                 PatternMatch(op_name, "relax.add") || PatternMatch(op_name, "relax.subtract") ||
                 PatternMatch(op_name, "relax.multiply") ||
                 PatternMatch(op_name, "relax.minimum") || PatternMatch(op_name, "relax.maximum") ||
                 PatternMatch(op_name, "relax.divide"))
          CreateBinaryLayer(&layer_, node, nid);
        else if ("nn.depth_to_space" == op_name)
          CreateDepthToSpaceLayer(&layer_, node, nid);
        else if ("nn.upsampling" == op_name)
          CreateResizeLayer(&layer_, node, nid);
        else if ("nn.batch_matmul" == op_name)
          CreateBatchMatmulLayer(&layer_, node, nid);
        else
          LOG(FATAL) << "Unsupported op: " << op_name;
        this->layer_.layer_names.push_back(op_name);
        // Keep a map from function to node for use in profiling.
        this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)});
      } else if (node.GetOpType() != "const") {
        LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
      }
    }

    for (size_t i = 0; i < outputs_.size(); ++i) {
      nid = outputs_[i].id_;
      DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
      this->layer_.outputs.push_back(this->layer_.storage_map[nid].tensor_desc);
      if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
        // This output has a customized shape recorded in out_shapes.
        this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
            nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype, nullptr,
            this->layer_.out_shapes[nid]));
      } else {
        this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
            nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype));
      }
    }

    // Plan memory utilization
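    // PlanMemory classifies each tensor into one of the pools consumed by the
    // allocation loop below: on-chip (GMEM) blocks at fixed offsets, reusable
    // DDR blocks, and fixed DDR blocks for constants.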
    PlanMemory();

    // Allocate device memory and initialize the params, if any.
    cl_int result = 0;
    size_t alloc_on_chip = 0;
    size_t alloc_ddr = 0;
    size_t alloc_ddr_reuse = 0;
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      auto tensor_desc = it->second.tensor_desc;
      uint32_t mem_size = 0;
      result = CL_OUT_OF_HOST_MEMORY;
      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, &mem_size);

      JSONGraphNode node = it->second.node;
      void* node_data = nullptr;
      size_t on_chip_mem_offset = -1;
      if (layer_.on_chip_alloc_plan.find(it->first) != layer_.on_chip_alloc_plan.end()) {
        LOG_MEM << "Found GMEM Alloc:" << it->first
                << " Size:" << layer_.on_chip_alloc_plan[it->first].first
                << " Offset:" << layer_.on_chip_alloc_plan[it->first].second;
        on_chip_mem_offset = layer_.on_chip_alloc_plan[it->first].second;
        alloc_on_chip += mem_size;
        tensor_desc->memory = AllocateOnChipTensorMemory(mem_size, on_chip_mem_offset);
      } else if (layer_.ddr_alloc_plan.find(it->first) != layer_.ddr_alloc_plan.end()) {
        LOG_MEM << "DDR Alloc for nid:" << it->first << " Type:" << node.GetOpType();
        tensor_desc->memory = layer_.ddr_alloc_plan[it->first];
        alloc_ddr_reuse += mem_size;
      } else if (node.GetOpType() == "const") {
        LOG_MEM << "DDR Alloc for Const/Input/Output";
        tensor_desc->memory = AllocateDDRTensorMemory(mem_size);
        alloc_ddr += mem_size;
      } else {
        LOG(FATAL) << "Mem allocation not found on DDR as well as On-Chip nid: " << it->first
                   << " Type:" << node.GetOpType();
      }

      if (node.GetOpType() == "const") {
        node_data = data_entry_[EntryID(it->first, 0)]->data;
        if (node_data != nullptr) {
          CopyDataToCLMLTensor(tensor_desc, node_data);
        }
      }
      this->layer_.tensorMemDescs.push_back(*tensor_desc);
    }
    LOG_STATS << "Total On-Chip Allocation  :" << alloc_on_chip;
    LOG_STATS << "Total DDR Reuse Allocation:" << alloc_ddr_reuse;
    LOG_STATS << "Total DDR fixed allocation:" << alloc_ddr;
    size_t ddr_global_pool = 0;
    size_t ddr_local_pool = 0;
    auto cws = CLMLWorkspace::Global();
    for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
      LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
      ddr_global_pool += it->second.first;
    }
    LOG_STATS << "Total Global Pool:" << ddr_global_pool;
    for (auto it = this->layer_.ddr_storage_ref_map.begin();
         it != this->layer_.ddr_storage_ref_map.end(); it++) {
      LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
      ddr_local_pool += it->second.first;
    }
    LOG_STATS << "Total Local Pool:" << ddr_local_pool;

    // Set up the tensor memory descriptor set used for tuning and op enqueues.
    CLML_CALL(clCreateMLTensorMemoryDescriptorSetQCOM, &this->layer_.descriptorSet);

    CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, this->layer_.descriptorSet,
              static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
              this->layer_.tensorMemDescs.data());

    if (cws->is_tuning_run) {
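      // Tune each op against the shared descriptor set, then serialize the
      // tuning cache (magic header, module symbol, raw cache blob) to the
      // tuning file.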
      LOG_CLML << "CLML Tunning In Progress:";
      // Let the command queue recreated in profiling mode.
      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i];
        CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
                  this->layer_.tuning_cache, nullptr);
      }
      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false);

      size_t cache_len_bytes = 0;
      size_t len_ret = 0;
      CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, 0, nullptr, &cache_len_bytes);

      std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
      CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, saved_cache.size(),
                saved_cache.data(), &len_ret);

      std::string tune_str;
      dmlc::MemoryStringStream mstrm(&tune_str);
      dmlc::Stream* strm = &mstrm;
      uint64_t header = kTVMCLMLTuningCacheMagic;
      uint64_t reserved = 0x0;
      strm->Write(header);
      strm->Write(reserved);
      strm->Write(clml_symbol);
      strm->Write(saved_cache);

      std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary);
      ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file;
      fs.write(&tune_str[0], tune_str.length());
      LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size"
               << tune_str.length() << " with tuning blob len " << saved_cache.size();
    }
    if (cws->is_recordable_queue) {
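      // Pre-record all op enqueues and close the recording so execution can
      // replay the recording later instead of enqueueing each op.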
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        CLML_CALL(clEnqueueMLOpQCOM, this->layer_.recordable_queue, this->layer_.function[i],
                  this->layer_.descriptorSet, 0, nullptr, nullptr);
      }

      result = clEndRecordingQCOM(this->layer_.recording);
      ICHECK(result == CL_SUCCESS) << "clEndRecordingQCOM:" << result;
    }
  }
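
For reference, each record appended to the tuning file can be parsed back with the inverse sequence of dmlc::Stream reads. The sketch below is a minimal illustration assuming a single record in the buffer and that kTVMCLMLTuningCacheMagic from the runtime is visible; the helper name ParseTuningRecord is hypothetical, and the runtime's actual cache loader may differ.

  #include <dmlc/memory_io.h>

  #include <cstdint>
  #include <string>
  #include <vector>

  // Hypothetical helper: parse one tuning-cache record. The writer above emits
  // the magic header, a reserved word, the module symbol, and the raw blob
  // returned by clSaveMLTuningCacheQCOM, in that order.
  bool ParseTuningRecord(std::string* blob, std::string* symbol,
                         std::vector<unsigned char>* cache) {
    dmlc::MemoryStringStream mstrm(blob);
    dmlc::Stream* strm = &mstrm;
    uint64_t header = 0, reserved = 0;
    if (!strm->Read(&header) || header != kTVMCLMLTuningCacheMagic) return false;
    if (!strm->Read(&reserved)) return false;
    if (!strm->Read(symbol)) return false;  // function the cache belongs to
    if (!strm->Read(cache)) return false;   // blob to hand back to CLML
    return true;
  }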