TfLiteStatus EvalMliQuantizedPerChannel()

in tensorflow/lite/micro/kernels/arc_mli/conv.cc [357:550]


TfLiteStatus EvalMliQuantizedPerChannel(
    TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params,
    const OpData& data, const TfLiteEvalTensor* input,
    const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
    TfLiteEvalTensor* output) {
  // Run the Conv MLI kernel.
  // The MLI-optimized version only supports the int8_t datatype and a
  // dilation factor of 1.
  if (data.is_mli_applicable) {
    // Copy configuration data from external to local memory
    mli_conv2d_cfg cfg_local = *data.cfg;

    ops::micro::MliTensorAttachBuffer<int8_t>(input, &data.mli_in);
    ops::micro::MliTensorAttachBuffer<int8_t>(filter, &data.mli_weights);
    ops::micro::MliTensorAttachBuffer<int32_t>(bias, &data.mli_bias);
    ops::micro::MliTensorAttachBuffer<int8_t>(output, &data.mli_out);
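    // The MliTensorAttachBuffer calls above attach the TFLM tensor data
    // buffers to the MLI tensor descriptors; they wire up pointers rather
    // than copying data.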

    // Parameters for slicing the input and output tensors along the height
    // dimension.
    const int height_dimension = 1;
    int in_slice_height = 0;
    int out_slice_height = 0;
    const int kernel_height =
        static_cast<int>(data.mli_weights.Shape()[KRNL_H_DIM_HWC]);
    const int overlap = kernel_height - cfg_local.stride_height;
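    // This overlap is carried between consecutive input slices so that output
    // rows computed at a slice boundary still see a full kernel window.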

// for weight slicing (on output channels)
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
    // HWCN layout for weights: the output channel dimension (N) is the last
    // dimension.
    const int weight_out_ch_dimension = 3;
#else
    // NHWC layout for weights: the output channel dimension (N) is the first
    // dimension.
    const int weight_out_ch_dimension = 0;
#endif
    // The bias tensor has only one dimension.
    const int bias_out_ch_dimension = 0;
    int slice_channels =
        static_cast<int>(data.mli_weights.Shape()[weight_out_ch_dimension]);
    // Batch-Height-Width-Channel layout means last dimension is output
    // channels.
    const int out_tensor_ch_dimension = 3;

    // Tensor descriptors for data in fast (local) memory, plus a move
    // configuration for copying data from external to local memory.
    mli_tensor weights_local = *data.mli_weights.MliTensor();
    mli_tensor bias_local = *data.mli_bias.MliTensor();
    mli_tensor in_local = *data.mli_in.MliTensor();
    mli_tensor out_local = *data.mli_out.MliTensor();

    ops::micro::MliTensorInterface weights_local_interface(&weights_local);
    ops::micro::MliTensorInterface bias_local_interface(&bias_local);
    ops::micro::MliTensorInterface in_local_interface(&in_local);
    ops::micro::MliTensorInterface out_local_interface(&out_local);

    mli_mov_cfg_t copy_config;
    mli_mov_cfg_for_copy(&copy_config);
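    // The move configuration is initialized for a plain copy between external
    // and local memory.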

    TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
        context, &in_local_interface, &weights_local_interface,
        &bias_local_interface, &out_local_interface));
    TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
        &in_local_interface, &out_local_interface, kernel_height,
        cfg_local.stride_height, cfg_local.padding_top,
        cfg_local.padding_bottom, &in_slice_height, &out_slice_height));
    TF_LITE_ENSURE_STATUS(
        ops::micro::arc_scratch_buffer_calc_slice_size_weights(
            &weights_local_interface, &bias_local_interface,
            weight_out_ch_dimension, &slice_channels));
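    // At this point in_slice_height, out_slice_height and slice_channels hold
    // the largest slice sizes that fit in the local scratch buffers reserved
    // above.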

    /* is_local indicates that the tensor is already in local memory, so the
       original tensor can be used directly and there is no need to copy it
       to the local tensor. */
    const bool in_is_local =
        in_local_interface.Data<int8_t>() == data.mli_in.Data<int8_t>();
    const bool out_is_local =
        out_local_interface.Data<int8_t>() == data.mli_out.Data<int8_t>();
    const bool b_is_local =
        bias_local_interface.Data<int32_t>() == data.mli_bias.Data<int32_t>();
#ifndef MLI_2_0_KRNL_TEST
    const bool w_is_local = weights_local_interface.Data<int8_t>() ==
                            data.mli_weights.Data<int8_t>();
#endif

#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
    ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
                                     weight_out_ch_dimension, slice_channels, 0,
                                     0, 0, true);
#else
    ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
                                     weight_out_ch_dimension, slice_channels);
#endif
    ops::micro::TensorSlicer b_slice(data.mli_bias.MliTensor(),
                                     bias_out_ch_dimension, slice_channels);
    ops::micro::TensorSlicer out_ch_slice(data.mli_out.MliTensor(),
                                          out_tensor_ch_dimension,
                                          slice_channels, 0, 0, 0, true);
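    // w_slice and b_slice step through the weights and bias in chunks of
    // slice_channels output channels; out_ch_slice yields the matching output
    // sub-tensor. All three advance together via Next() at the end of the
    // outer loop.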

#ifdef MLI_2_0_KRNL_TEST
    mli_tensor* w_ptr = &weights_local;
#else
    mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
#endif
    mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;

    void* input_buffer_ptr = NULL;
    uint32_t input_buffer_size = 0;
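    // input_buffer_ptr and input_buffer_size remember the most recently
    // copied input slice so that an identical slice is not copied to local
    // memory twice (see the check in the inner loop below).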

    while (!w_slice.Done()) {
#ifndef MLI_2_0_KRNL_TEST
      mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
#endif
      mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);

      /* The mli_in tensor contains batches of HWC tensors, so it is a
      4-dimensional tensor. Because the MLI kernel processes one HWC tensor
      at a time, the 4-dimensional tensor needs to be sliced into nBatch
      3-dimensional tensors. On top of that, there may also be a need to
      slice in the height dimension, for which in_slice_height has been
      calculated. The tensor slicer is configured so that it completely
      slices the nBatch dimension (0) and slices the height dimension (1) in
      chunks of in_slice_height rows. */
      ops::micro::TensorSlicer in_slice(
          data.mli_in.MliTensor(), height_dimension, in_slice_height,
          cfg_local.padding_top, cfg_local.padding_bottom, overlap);

      /* The output tensor is already sliced in the output channel dimension.
      out_ch_slice.Sub() is the tensor for the output channels handled in
      this iteration of the weight slice loop. It needs to be further sliced
      over the batch and height dimensions. */
      ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
                                         out_slice_height);

      /* Set up pointers to the local or remote tensors to simplify the code
       * inside the loop. */
      mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
      mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;

#ifdef MLI_2_0_KRNL_TEST
      /* Permute the weights tensor to the HWCN layout. */
      // These conditions are checked to prevent the use of non-contiguous
      // buffer memory.
      if (data.mli_out.Shape()[out_tensor_ch_dimension] !=
              out_slice.Sub()->shape[FMAP_C_DIM_HWC] ||
          data.mli_out.Shape()[height_dimension] !=
              out_slice.Sub()->shape[FMAP_H_DIM_HWC]) {
        TF_LITE_KERNEL_LOG(
            context, "Slicing is not supported with real-time permutation.");
        return kTfLiteError;
      }
      mli_permute_cfg permute_cfg = {{1, 2, 3, 0}};
      ops::micro::permute_weights(data.mli_weights.MliTensor(), &permute_cfg,
                                  w_ptr, &out_ptr->data);
#endif

      while (!out_slice.Done()) {
        if (!out_is_local) {
          ops::micro::PrepareLocalTensor(out_slice.Sub(), &out_local);
          ops::micro::PrepareLocalTensor(in_slice.Sub(), &in_local);
        }

        TF_LITE_ENSURE(context, !in_slice.Done());
        cfg_local.padding_top = in_slice.GetPaddingPre();
        cfg_local.padding_bottom = in_slice.GetPaddingPost();

        // If this input slice is identical to the one copied in the previous
        // iteration, skip the input copy.
#ifdef MLI_2_0
        if ((in_slice.Sub()->data.mem.pi8 != input_buffer_ptr) ||
            (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
          mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
          input_buffer_ptr = in_slice.Sub()->data.mem.pi8;
          input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
        }

        data.p_mli_krn_conv2d_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg_local,
                                           out_ptr);
#else
        if ((in_slice.Sub()->data != input_buffer_ptr) ||
            (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
          mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
          input_buffer_ptr = in_slice.Sub()->data;
          input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
        }
        data.p_mli_krn_conv2d_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg_local,
                                           out_ptr);
#endif
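        // Copy the computed output slice from local memory back to the output
        // tensor.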
        mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());

        in_slice.Next();
        out_slice.Next();
      }
      w_slice.Next();
      b_slice.Next();
      out_ch_slice.Next();
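      // The inner loop must have consumed all input slices by now.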
      TF_LITE_ENSURE(context, in_slice.Done());
    }
  }
  return kTfLiteOk;
}
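
The function above instantiates a general two-level slicing pattern: an outer
loop over output-channel slices of the weights and bias, an inner loop over
batch/height slices of the input and output, and explicit data moves between
external and local memory around each kernel call. Below is a minimal,
self-contained sketch of just that loop structure, using hypothetical names
and sizes (not the MLI API); padding and row-overlap handling are omitted for
brevity.

// Sketch of the two-level slicing pattern: outer loop over output-channel
// slices, inner loop over row slices. The printf stands in for the
// copy-in / compute / copy-out sequence performed with mli_mov_tensor_sync().
#include <algorithm>
#include <cstdio>

struct Slice {
  int begin;
  int size;
};

int main() {
  const int out_channels = 10, slice_channels = 4;  // weight slicing
  const int height = 9, slice_height = 3;           // height slicing

  // Outer loop: one weight/bias slice per group of output channels.
  for (int c = 0; c < out_channels; c += slice_channels) {
    Slice w{c, std::min(slice_channels, out_channels - c)};
    // ... copy the weight and bias slice into local memory here ...

    // Inner loop: one input/output slice per group of rows.
    for (int h = 0; h < height; h += slice_height) {
      Slice in{h, std::min(slice_height, height - h)};
      // ... copy the input slice in, run the kernel on the local tensors,
      // then copy the output slice back out ...
      std::printf("channels [%d, %d) x rows [%d, %d)\n", w.begin,
                  w.begin + w.size, in.begin, in.begin + in.size);
    }
  }
  return 0;
}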