in src/runtime/contrib/clml/clml_runtime.cc [981:1118]
/*!
 * \brief Create a CLML convolution (or depthwise-convolution) operator and
 *        append it to the cached layer's function list.
 *
 * Handles plain convolution as well as the fused variants produced by the
 * BYOC pattern matcher: conv+bias, conv+activation, conv+batchnorm and
 * conv+batchnorm+activation (fusion is inferred from the input count).
 *
 * \param layer Cached layer the created cl_ml_op_qcom is appended to.
 * \param node  JSON graph node carrying conv attributes and input refs.
 * \param mode  CL_CONVOLUTION_MODE_CONVOLUTION_QCOM or the depthwise mode.
 * \param nid   Node id used to materialize the output tensor.
 */
void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
                              cl_convolution_mode_qcom mode, size_t nid) {
  // Padding: JSONGraphNode::GetAttr ICHECK-fails on a missing attribute, so
  // the presence check must happen *before* the read (the previous code read
  // first, which made the zero-padding fallback unreachable). Default is
  // zero padding on all four sides.
  std::vector<cl_uint> clml_padding(4, 0);
  if (node.HasAttr("padding")) {
    clml_padding = GetVectorValues(node.GetAttr<std::vector<std::string>>("padding"));
  }
  std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
  std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
  DLDataType tvm_dtype = node.GetOpDataType()[0];
  cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
  cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
  // CLML splits padding into "before" (top/left) and "after" (bottom/right).
  cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[0], clml_padding[1]};
  cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[2], clml_padding[3]};
  std::vector<cl_uint> v_strides = GetVectorValues(strides);
  std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
  cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0], v_strides[1]};
  cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_dilation[0], v_dilation[1]};
  cl_uint groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
  if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
    ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
  } else {
    groups = 1;  // Don't need to pass groups to depthwise
  }

  // Optional fused activation (only relu / relu6 are supported by the fusion
  // patterns; anything else is rejected by the pattern matcher upstream).
  bool has_act = false;
  std::string activation_type;
  cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
  if (node.HasAttr("activation_type")) {
    activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
    ICHECK(activation_type == "relu" || activation_type == "relu6")
        << "Unknown activation type:" << activation_type;
    if (activation_type == "relu") {
      clml_act_type = CL_ACTIVATION_RELU;
    } else {
      clml_act_type = CL_ACTIVATION_RELU6;
    }
    has_act = true;
  }
  cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
                                            cl_arithmetic_mode};

  // Collect inputs and outputs, handling nn.conv2d. The input count encodes
  // the fusion: 2 = conv, 3 = conv+bias, 6 = conv+bn, 7 = conv+bias+bn.
  std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
  size_t num_inputs = inputs.size();
  ICHECK(num_inputs >= 2 && num_inputs <= 7)
      << "Batchnorm fused convolution requires max 7 arguments";
  bool has_bias = (num_inputs == 3) || (num_inputs == 7);
  bool has_bn = (num_inputs == 6) || (num_inputs == 7);

  // Input
  auto input =
      MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
  // Weight
  auto weight =
      MakeCLMLTensorFromJSONEntry(inputs[1].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
  // Bias: the CLML API requires a tensor handle even when there is no bias,
  // so a shared "unused" placeholder tensor is created for that case.
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias;
  if (has_bias) {
    bias =
        MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
  } else {
    bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    cl_ml_tensor_desc_qcom desc = {};
    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
    CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
                                   &layer_.unusedTensor);
    ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
    bias->tensor = layer_.unusedTensor;
  }
  // Output
  auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
  cl_ml_op_convolution_desc_qcom conv_desc{mode,
                                           groups,
                                           4,
                                           {clml_padding_b[0], clml_padding_b[1]},
                                           {clml_padding_a[0], clml_padding_a[1]},
                                           {clml_strides[0], clml_strides[1]},
                                           {clml_dilation[0], clml_dilation[1]},
                                           0,
                                           cl_arithmetic_mode};

  cl_ml_op_qcom op = nullptr;
  if (!has_bn) {
    if (!has_act) {
      CLML_CALL(clCreateMLOpConvolutionForwardQCOM, CLML_CTX, nullptr, &conv_desc, input->tensor,
                weight->tensor, bias->tensor, output->tensor, &op, nullptr);
    } else {
      CLML_CALL(clCreateMLOpFusedConvolutionActivationForwardQCOM, CLML_CTX, nullptr, &conv_desc,
                &act_desc, input->tensor, weight->tensor, bias->tensor, nullptr, output->tensor,
                &op, layer_.tuning_cache);
    }
    layer->function.push_back(op);
  } else {
    // Fused batchnorm: gamma/beta/mean/var follow the conv inputs, reshaped
    // to broadcast along the batchnorm axis.
    int bn_index = has_bias ? 3 : 2;
    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
    auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]);
    float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("batchnorm")[1]);
    std::vector<cl_ml_op_properties_qcom> opProperties;
    opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
    // NOTE(review): type-punning a 4-byte float through cl_ml_op_properties_qcom
    // reads past the float if the property type is wider — relies on the
    // driver only consuming the low 32 bits. TODO: confirm against the
    // cl_qcom_ml_ops spec and switch to a memcpy into a zeroed property.
    opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
    opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
    std::vector<size_t> bn_shape = {1, 1, 1, 1};
    bn_shape[axis] = bn_dims.n;
    // Assign directly — no need for placeholder make_shared allocations.
    auto bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index].id_, bn_shape,
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1].id_, bn_shape,
                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2].id_, bn_shape,
                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3].id_, bn_shape,
                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
    if (!has_act) {
      CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, opProperties.data(),
                &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
                bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
                layer_.tuning_cache);
    } else {
      CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, CLML_CTX,
                opProperties.data(), &conv_desc, &bn_desc, &act_desc, input->tensor,
                weight->tensor, bias->tensor, output->tensor, nullptr, bn_mean->tensor,
                bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache);
    }
    layer->function.push_back(op);
  }
}