in tensorflow/lite/micro/kernels/arc_mli/conv.cc [357:550]
TfLiteStatus EvalMliQuantizedPerChannel(
TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params,
const OpData& data, const TfLiteEvalTensor* input,
const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
TfLiteEvalTensor* output) {
// Run Conv MLI kernel
// The MLI optimized version only supports the int8_t datatype and a dilation factor of 1.
if (data.is_mli_applicable) {
// Copy configuration data from external to local memory
mli_conv2d_cfg cfg_local = *data.cfg;
ops::micro::MliTensorAttachBuffer<int8_t>(input, &data.mli_in);
ops::micro::MliTensorAttachBuffer<int8_t>(filter, &data.mli_weights);
ops::micro::MliTensorAttachBuffer<int32_t>(bias, &data.mli_bias);
ops::micro::MliTensorAttachBuffer<int8_t>(output, &data.mli_out);
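// The attach calls above bind the runtime data pointers of the TfLiteEvalTensors
// to the MLI tensor descriptors that were configured during Prepare.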
// for height slicing
const int height_dimension = 1;
int in_slice_height = 0;
int out_slice_height = 0;
const int kernel_height =
static_cast<int>(data.mli_weights.Shape()[KRNL_H_DIM_HWC]);
const int overlap = kernel_height - cfg_local.stride_height;
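// Adjacent input slices must overlap by (kernel_height - stride_height) rows
// so that convolution windows crossing a slice boundary still see every input
// row they need.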
// for weight slicing (on output channels)
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
// HWCN layout for weights, so the output channel dimension is the last dimension.
const int weight_out_ch_dimension = 3;
#else
// NHWC layout for weights, output channel dimension is the first dimension.
const int weight_out_ch_dimension = 0;
#endif
// bias has only 1 dimension
const int bias_out_ch_dimension = 0;
int slice_channels =
static_cast<int>(data.mli_weights.Shape()[weight_out_ch_dimension]);
// Batch-Height-Width-Channel layout means last dimension is output
// channels.
const int out_tensor_ch_dimension = 3;
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor weights_local = *data.mli_weights.MliTensor();
mli_tensor bias_local = *data.mli_bias.MliTensor();
mli_tensor in_local = *data.mli_in.MliTensor();
mli_tensor out_local = *data.mli_out.MliTensor();
ops::micro::MliTensorInterface weights_local_interface(&weights_local);
ops::micro::MliTensorInterface bias_local_interface(&bias_local);
ops::micro::MliTensorInterface in_local_interface(&in_local);
ops::micro::MliTensorInterface out_local_interface(&out_local);
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
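// copy_config holds the default copy settings used by mli_mov_tensor_sync()
// below to move tensor slices between external and local memory.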
TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
context, &in_local_interface, &weights_local_interface,
&bias_local_interface, &out_local_interface));
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
&in_local_interface, &out_local_interface, kernel_height,
cfg_local.stride_height, cfg_local.padding_top,
cfg_local.padding_bottom, &in_slice_height, &out_slice_height));
TF_LITE_ENSURE_STATUS(
ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local_interface, &bias_local_interface,
weight_out_ch_dimension, &slice_channels));
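// The two calls above limit in_slice_height/out_slice_height and
// slice_channels so that a single slice of each tensor fits in the local
// scratch buffers obtained above.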
/* is_local indicates that the tensor is already in local memory, so the
original tensor can be used directly and there is no need to copy it to
the local tensor. */
const bool in_is_local =
in_local_interface.Data<int8_t>() == data.mli_in.Data<int8_t>();
const bool out_is_local =
out_local_interface.Data<int8_t>() == data.mli_out.Data<int8_t>();
const bool b_is_local =
bias_local_interface.Data<int32_t>() == data.mli_bias.Data<int32_t>();
#ifndef MLI_2_0_KRNL_TEST
const bool w_is_local = weights_local_interface.Data<int8_t>() ==
data.mli_weights.Data<int8_t>();
#endif
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
weight_out_ch_dimension, slice_channels, 0,
0, 0, true);
#else
ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
weight_out_ch_dimension, slice_channels);
#endif
ops::micro::TensorSlicer b_slice(data.mli_bias.MliTensor(),
bias_out_ch_dimension, slice_channels);
ops::micro::TensorSlicer out_ch_slice(data.mli_out.MliTensor(),
out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
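// w_slice, b_slice and out_ch_slice all step through the output-channel
// dimension in chunks of slice_channels, so each weight slice is processed
// together with its matching bias values and output channels.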
#ifdef MLI_2_0_KRNL_TEST
mli_tensor* w_ptr = &weights_local;
#else
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
#endif
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
uint32_t input_buffer_size = 0;
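// input_buffer_ptr / input_buffer_size cache the last input slice copied to
// local memory, so identical input slices are not copied again on subsequent
// weight-slice iterations.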
while (!w_slice.Done()) {
#ifndef MLI_2_0_KRNL_TEST
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
#endif
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
/* The mli_in tensor contains batches of HWC tensors, so it is a 4-dimensional
tensor. Because the MLI kernel processes one HWC tensor at a time, the
4-dimensional tensor needs to be sliced into nBatch 3-dimensional tensors.
On top of that, the Height dimension may also need to be sliced; for that
in_slice_height has been calculated. The tensor slicer is configured so that
it completely slices the nBatch dimension (0) and slices the height
dimension (1) in chunks of 'in_slice_height'. */
ops::micro::TensorSlicer in_slice(
data.mli_in.MliTensor(), height_dimension, in_slice_height,
cfg_local.padding_top, cfg_local.padding_bottom, overlap);
/* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the amount of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
#ifdef MLI_2_0_KRNL_TEST
/* Permute weights tensor to the HWCN layout */
// Check the conditions here to prevent the use of non-contiguous buffer memory.
if (data.mli_out.Shape()[out_tensor_ch_dimension] !=
out_slice.Sub()->shape[FMAP_C_DIM_HWC] ||
data.mli_out.Shape()[height_dimension] !=
out_slice.Sub()->shape[FMAP_H_DIM_HWC]) {
TF_LITE_KERNEL_LOG(
context, "Slicing is not supported with real-time permutation.");
return kTfLiteError;
}
mli_permute_cfg permute_cfg = {{1, 2, 3, 0}};
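// The permutation {1, 2, 3, 0} moves the output-channel (N) axis from the
// first to the last position, i.e. NHWC weights become HWCN.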
ops::micro::permute_weights(data.mli_weights.MliTensor(), &permute_cfg,
w_ptr, &out_ptr->data);
#endif
while (!out_slice.Done()) {
if (!out_is_local) {
ops::micro::PrepareLocalTensor(out_slice.Sub(), &out_local);
ops::micro::PrepareLocalTensor(in_slice.Sub(), &in_local);
}
TF_LITE_ENSURE(context, !in_slice.Done());
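// The slicer reports the portion of the original top/bottom padding that
// applies to this particular slice; the original padding is only applied at
// the edges of the full tensor.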
cfg_local.padding_top = in_slice.GetPaddingPre();
cfg_local.padding_bottom = in_slice.GetPaddingPost();
// If the input slice is the same as in the previous iteration, skip copying it again.
#ifdef MLI_2_0
if ((in_slice.Sub()->data.mem.pi8 != input_buffer_ptr) ||
(mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data.mem.pi8;
input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
}
data.p_mli_krn_conv2d_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg_local,
out_ptr);
#else
if ((in_slice.Sub()->data != input_buffer_ptr) ||
(mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
}
data.p_mli_krn_conv2d_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg_local,
out_ptr);
#endif
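// Copy the computed output slice from the working tensor back to the output
// tensor slice.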
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
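// By now the inner loop must have consumed all input slices; otherwise the
// input and output slicers went out of sync.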
TF_LITE_ENSURE(context, in_slice.Done());
}
}
return kTfLiteOk;
}