void CreateConvolution2DLayer()

in src/runtime/contrib/clml/clml_runtime.cc [981:1118]
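
Builds a single CLML convolution op from a JSON graph node: a plain or depthwise 2D convolution, optionally fused with a bias add, batch normalization, and a ReLU/ReLU6 activation, depending on the node's inputs and attributes. The finished op is appended to layer->function for later execution.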


  void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
                                cl_convolution_mode_qcom mode, size_t nid) {
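    // Convolution attributes arrive as string vectors on the JSON node.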
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
    // Default to zero padding when the attribute is absent; check before reading it.
    std::vector<cl_uint> clml_padding(4, 0);
    if (node.HasAttr("padding")) {
      clml_padding = GetVectorValues(node.GetAttr<std::vector<std::string>>("padding"));
    }

    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);

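    // CLML takes padding as before/after pairs: the first two values pad the
    // leading (top, left) edges, the last two the trailing (bottom, right) edges.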
    cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[0], clml_padding[1]};
    cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[2], clml_padding[3]};
    std::vector<cl_uint> v_strides = GetVectorValues(strides);
    std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
    cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0], v_strides[1]};
    cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_dilation[0], v_dilation[1]};

    cl_uint groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
    if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
      ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
    } else {
      groups = 1;  // Grouping is implicit for depthwise; CLML does not need the groups value.
    }

    bool has_act = false;
    std::string activation_type;
    cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
    if (node.HasAttr("activation_type")) {
      activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
      ICHECK(activation_type == "relu" || activation_type == "relu6")
          << "Unknown activation type: " << activation_type;
      if (activation_type == "relu") {
        clml_act_type = CL_ACTIVATION_RELU;
      } else {
        clml_act_type = CL_ACTIVATION_RELU6;
      }
      has_act = true;
    }
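    // Activation descriptor; only the fused op variants below consume it.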
    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
                                              cl_arithmetic_mode};

    // Collect inputs and outputs, handling nn.conv2d.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    size_t num_inputs = inputs.size();
    // The input count encodes the fused pattern: 2 = data + weight, 3 = + bias,
    // 6 = + batchnorm (scale, bias, mean, variance), 7 = + bias + batchnorm.
    ICHECK(num_inputs >= 2 && num_inputs <= 7)
        << "Fused convolution expects between 2 and 7 inputs, got " << num_inputs;
    bool has_bias = (num_inputs == 3) || (num_inputs == 7);
    bool has_bn = (num_inputs == 6) || (num_inputs == 7);
    // Input
    auto input =
        MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Weight
    auto weight =
        MakeCLMLTensorFromJSONEntry(inputs[1].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Bias
    auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    if (has_bias) {
      bias =
          MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    } else {
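      // No bias input: bind a placeholder tensor flagged as unused so the op
      // signature is still satisfied.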
      cl_ml_tensor_desc_qcom desc = {};
      desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
      CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
                                     &layer_.unusedTensor);
      ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
      bias->tensor = layer_.unusedTensor;
    }
    // Output
    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    cl_ml_op_convolution_desc_qcom conv_desc{mode,
                                             groups,
                                             4,
                                             {clml_padding_b[0], clml_padding_b[1]},
                                             {clml_padding_a[0], clml_padding_a[1]},
                                             {clml_strides[0], clml_strides[1]},
                                             {clml_dilation[0], clml_dilation[1]},
                                             0,
                                             cl_arithmetic_mode};

    cl_ml_op_qcom op = nullptr;
    if (!has_bn) {
      if (!has_act) {
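        // Plain convolution. Note this path passes nullptr where the fused
        // variants pass layer_.tuning_cache.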
        CLML_CALL(clCreateMLOpConvolutionForwardQCOM, CLML_CTX, nullptr, &conv_desc, input->tensor,
                  weight->tensor, bias->tensor, output->tensor, &op, nullptr);
      } else {
        CLML_CALL(clCreateMLOpFusedConvolutionActivationForwardQCOM, CLML_CTX, nullptr, &conv_desc,
                  &act_desc, input->tensor, weight->tensor, bias->tensor, nullptr, output->tensor,
                  &op, layer_.tuning_cache);
      }
      layer->function.push_back(op);
    } else {
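      // Batchnorm tensors follow data, weight, and the optional bias in the input list.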
      int bn_index = has_bias ? 3 : 2;
      int axis = std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
      auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]);
      float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("batchnorm")[1]);

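      // Pass epsilon bit-for-bit as an op property; the list must be terminated
      // with CL_ML_OP_PROPERTY_LIST_END_QCOM.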
      std::vector<cl_ml_op_properties_qcom> opProperties;
      opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
      opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
      opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
      // Batchnorm parameters are one dimensional; broadcast them along the given axis.
      std::vector<size_t> bn_shape = {1, 1, 1, 1};
      bn_shape[axis] = bn_dims.n;
      auto bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index].id_, bn_shape,
                                                  CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      auto bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1].id_, bn_shape,
                                                 CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      auto bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2].id_, bn_shape,
                                                 CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      auto bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3].id_, bn_shape,
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
      if (!has_act) {
        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, opProperties.data(),
                  &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
                  bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
                  layer_.tuning_cache);
      } else {
        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, CLML_CTX,
                  opProperties.data(), &conv_desc, &bn_desc, &act_desc, input->tensor,
                  weight->tensor, bias->tensor, output->tensor, nullptr, bn_mean->tensor,
                  bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache);
      }
      layer->function.push_back(op);
    }
  }
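
For reference, GetVectorValues is defined elsewhere in clml_runtime.cc; a minimal sketch of its presumed behavior (string attribute values parsed to unsigned integers), not the actual implementation, is:

  std::vector<cl_uint> GetVectorValues(const std::vector<std::string>& vals) {
    std::vector<cl_uint> result;
    for (const std::string& v : vals) {
      // Attribute values are serialized as decimal strings in the JSON graph.
      result.push_back(static_cast<cl_uint>(std::stoul(v)));
    }
    return result;
  }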