TfLiteStatus EvalMliQuantizedPerChannel()

in tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc [393:593]


TfLiteStatus EvalMliQuantizedPerChannel(
    TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params,
    const OpData& data, const TfLiteEvalTensor* input,
    const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
    TfLiteEvalTensor* output) {
  // Run the depthwise conv MLI kernel.
  // The MLI optimized version only supports the int8_t datatype and a
  // dilation factor of 1.
  if (data.is_mli_applicable) {
    // Copy configuration data from external to local memory
    mli_conv2d_cfg cfg_local = *data.cfg;

    ops::micro::MliTensorAttachBuffer<int8_t>(input, &data.mli_in);
    ops::micro::MliTensorAttachBuffer<int8_t>(filter, &data.mli_weights);
    ops::micro::MliTensorAttachBuffer<int32_t>(bias, &data.mli_bias);
    ops::micro::MliTensorAttachBuffer<int8_t>(output, &data.mli_out);

    // for height slicing
    const int height_dimension = 1;
    int in_slice_height = 0;
    int out_slice_height = 0;
    uint32_t* mli_weights_shape = data.mli_weights.Shape();
#ifdef MLI_2_0
    const int kernel_height =
        static_cast<int>(mli_weights_shape[KRNL_DW_H_DIM_HW1N]);
#else
    const int kernel_height =
        static_cast<int>(mli_weights_shape[KRNL_DW_H_DIM_HWC]);
#endif
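    // Consecutive input height slices must overlap by (kernel_height - stride)
    // rows so that every output row sees its full receptive field.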
    const int overlap = kernel_height - cfg_local.stride_height;

    // for weight slicing (on output channels)
    // HWCN layout for weights: the output channel dimension is the last
    // dimension.
    const int weight_out_ch_dimension = 3;
    // bias has only 1 dimension
    const int bias_out_ch_dimension = 0;
    // Batch-Height-Width-Channel layout means last dimension is output
    // channels.
    const int out_tensor_ch_dimension = 3;
    const int32_t in_channels = data.mli_in.Shape()[out_tensor_ch_dimension];
    const int32_t out_channels = data.mli_out.Shape()[out_tensor_ch_dimension];
    int slice_channels =
        static_cast<int>(mli_weights_shape[weight_out_ch_dimension]);

    // Tensors for data in fast (local) memory
    // and config to copy data from external to local memory
    mli_tensor weights_local = *data.mli_weights.MliTensor();
    mli_tensor bias_local = *data.mli_bias.MliTensor();
    mli_tensor in_local = *data.mli_in.MliTensor();
    mli_tensor out_local =
        *data.mli_out.MliTensor();  // this assumes that output shape
                                    // is already filled in the tensor struct.

    ops::micro::MliTensorInterface weights_local_interface(&weights_local);
    ops::micro::MliTensorInterface bias_local_interface(&bias_local);
    ops::micro::MliTensorInterface in_local_interface(&in_local);
    ops::micro::MliTensorInterface out_local_interface(&out_local);

    mli_mov_cfg_t copy_config;
    mli_mov_cfg_for_copy(&copy_config);
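    // copy_config is used by all mli_mov_tensor_sync() calls below to move
    // tensor data between external and local memory.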

    TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
        context, &in_local_interface, &weights_local_interface,
        &bias_local_interface, &out_local_interface));

    /* is_local indicates that the tensor is already in local memory. In that
       case the original tensor can be used directly and there is no need to
       copy it to the local tensor. */
    const bool in_is_local =
        in_local_interface.Data<int8_t>() == data.mli_in.Data<int8_t>();
    const bool out_is_local =
        out_local_interface.Data<int8_t>() == data.mli_out.Data<int8_t>();
    const bool w_is_local = weights_local_interface.Data<int8_t>() ==
                            data.mli_weights.Data<int8_t>();
    const bool b_is_local =
        bias_local_interface.Data<int32_t>() == data.mli_bias.Data<int32_t>();

    TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
        &in_local_interface, &out_local_interface, kernel_height,
        cfg_local.stride_height, cfg_local.padding_top,
        cfg_local.padding_bottom, &in_slice_height, &out_slice_height));
    TF_LITE_ENSURE_STATUS(
        ops::micro::arc_scratch_buffer_calc_slice_size_weights(
            &weights_local_interface, &bias_local_interface,
            weight_out_ch_dimension, &slice_channels));

    /* If the number of input channels is not equal to the number of output
       channels, a channel multiplier is used. In that case slice_channels
       needs to be rounded down to a multiple of the input channels. */
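    // e.g. (illustrative values): with in_channels == 8 and
    // slice_channels == 30, slice_channels is rounded down to 24.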
    if (in_channels != out_channels) {
      slice_channels = (slice_channels / in_channels) * in_channels;
    }

    ops::micro::TensorSlicer b_slice(data.mli_bias.MliTensor(),
                                     bias_out_ch_dimension, slice_channels);
    ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
                                     weight_out_ch_dimension, slice_channels, 0,
                                     0, 0, true);
    ops::micro::TensorSlicer out_ch_slice(data.mli_out.MliTensor(),
                                          out_tensor_ch_dimension,
                                          slice_channels, 0, 0, 0, true);
    ops::micro::TensorSlicer in_ch_slice(data.mli_in.MliTensor(),
                                         out_tensor_ch_dimension,
                                         slice_channels, 0, 0, 0, true);
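    // The four slicers above walk their channel dimension in chunks of at
    // most slice_channels.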

    mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
    mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;

    void* input_buffer_ptr = NULL;
    uint32_t input_buffer_size = 0;
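    // input_buffer_ptr and input_buffer_size record the last input slice
    // copied to local memory, so an identical slice is not copied again.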
    int padding_top = cfg_local.padding_top;
    int padding_bottom = cfg_local.padding_bottom;

    while (!w_slice.Done()) {
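      // Copy the weight and bias slices for this group of output channels
      // into the working tensors.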
      mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
      mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);

      /* The input tensor is already sliced in the channel dimension.
      in_ch_slice.Sub() is the tensor for the number of channels of this
      iteration of the weight slice loop. This tensor needs to be further
      sliced over the batch and height dimensions. The in_ch_slice.Sub()
      tensor contains batches of HWC tensors, so it is a 4-dimensional tensor.
      Because the MLI kernel processes one HWC tensor at a time, the
      4-dimensional tensor needs to be sliced into nBatch 3-dimensional
      tensors. On top of that, there can also be a need to slice in the height
      dimension, for which in_slice_height has been calculated. The tensor
      slicer is configured so that it completely slices the nBatch dimension
      (0) and slices the height dimension (1) in chunks of in_slice_height. */
      ops::micro::TensorSlicer in_slice(in_ch_slice.Sub(), height_dimension,
                                        in_slice_height, padding_top,
                                        padding_bottom, overlap);

      /* The output tensor is already sliced in the output channel dimension.
      out_ch_slice.Sub() is the tensor for the number of output channels of
      this iteration of the weight slice loop. This tensor needs to be further
      sliced over the batch and height dimensions. */
      ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
                                         out_slice_height);

      /* Set up the pointers to the local or remote tensors to make the code
       * inside the loop simpler. */
      mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
      mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;

      while (!out_slice.Done()) {
        if (!out_is_local) {
          ops::micro::PrepareLocalTensor(out_slice.Sub(), &out_local);
          ops::micro::PrepareLocalTensor(in_slice.Sub(), &in_local);
        }
        TF_LITE_ENSURE(context, !in_slice.Done());
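        // Apply the padding computed by the input slicer for this height
        // slice.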
        cfg_local.padding_top = in_slice.GetPaddingPre();
        cfg_local.padding_bottom = in_slice.GetPaddingPost();

        // If the input slice is the same as in the previous iteration, skip
        // the copy of the input.
#ifdef MLI_2_0
        if ((in_slice.Sub()->data.mem.pi8 != input_buffer_ptr) ||
            (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
          mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
          input_buffer_ptr = in_slice.Sub()->data.mem.pi8;
          input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
        }

#ifdef MLI_2_0_KRNL_TEST
        // Check these conditions here to prevent the use of non-contiguous
        // buffer memory.
        if (mli_weights_shape[weight_out_ch_dimension] !=
            w_slice.Sub()->shape[3]) {
          TF_LITE_KERNEL_LOG(
              context, "Slicing is not supported with real-time permutation.");
          return kTfLiteError;
        }
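        // Reorder the weight tensor dimensions according to dim_order before
        // invoking the kernel.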
        uint8_t dim_order[] = {1, 2, 0, 3};
        ops::micro::change_shape(w_ptr, dim_order);
#endif

        data.p_mli_krn_depthwise_conv2d_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr,
                                                     &cfg_local, out_ptr);
#else
        if ((in_slice.Sub()->data != input_buffer_ptr) ||
            (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
          mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
          input_buffer_ptr = in_slice.Sub()->data;
          input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
        }
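        // Run the MLI depthwise conv kernel on the current slice.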
        data.p_mli_krn_depthwise_conv2d_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr,
                                                     &cfg_local, out_ptr);
#endif

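        // Copy the computed output slice from the working tensor back to the
        // output tensor.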
        mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());

        in_slice.Next();
        out_slice.Next();
      }
      w_slice.Next();
      b_slice.Next();
      out_ch_slice.Next();
      in_ch_slice.Next();
      TF_LITE_ENSURE(context, in_slice.Done());
    }
  }
  return kTfLiteOk;
}