TfLiteStatus EvalMliQuantizedInt8()

in tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc [209:372]


TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
                                  const TfLiteFullyConnectedParams* params,
                                  const OpData& data,
                                  const TfLiteEvalTensor* input,
                                  const TfLiteEvalTensor* filter,
                                  const TfLiteEvalTensor* bias,
                                  TfLiteEvalTensor* output) {
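  // Overall flow: attach the TFLM buffers to MLI tensors, reserve local
  // (scratch) copies in fast memory, then slice the weights/bias/output over
  // the output channels and the input/output over the batch so that each
  // piece fits in local memory before the MLI kernel runs on it.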
  ops::micro::MliTensorAttachBuffer<int8_t>(input, &data.mli_in);
  ops::micro::MliTensorAttachBuffer<int8_t>(filter, &data.mli_weights);
  ops::micro::MliTensorAttachBuffer<int32_t>(bias, &data.mli_bias);
  ops::micro::MliTensorAttachBuffer<int8_t>(output, &data.mli_out);

  // Tensors for data in fast (local) memory, plus a copy configuration for
  // moving data from external to local memory.
  mli_tensor weights_local = *data.mli_weights.MliTensor();
  mli_tensor bias_local = *data.mli_bias.MliTensor();
  mli_tensor in_local = *data.mli_in.MliTensor();
  mli_tensor out_local = *data.mli_out.MliTensor();

  ops::micro::MliTensorInterface weights_local_interface(&weights_local);
  ops::micro::MliTensorInterface bias_local_interface(&bias_local);
  ops::micro::MliTensorInterface in_local_interface(&in_local);
  ops::micro::MliTensorInterface out_local_interface(&out_local);

  mli_mov_cfg_t copy_config;
  mli_mov_cfg_for_copy(&copy_config);
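  // In MLI 2.0 (outside the kernel-test build) the output-channel axis of the
  // weights tensor is dimension 1; older MLI versions keep it at dimension 0.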
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
  const int weight_out_dimension = 1;
#else
  const int weight_out_dimension = 0;
#endif
  // The bias tensor has only one dimension.
  const int bias_out_ch_dimension = 0;
  const int out_tensor_dimension = 1;
  const int input_size_dimension = 1;
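  // Start from the full output-channel count; the scratch-buffer helpers
  // below may shrink the slice so that it fits in local memory.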
  int slice_size = data.mli_weights.Shape()[weight_out_dimension];

  /* Allocate the local buffers and compute the slice size. */
  TF_LITE_ENSURE_STATUS(
      ops::micro::get_arc_scratch_buffer_for_fully_connect_tensors(
          context, &in_local_interface, &weights_local_interface,
          &bias_local_interface, &out_local_interface));
  TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights(
      &weights_local_interface, &bias_local_interface, weight_out_dimension,
      &slice_size));

  // Clamp the slice size so that the corresponding output slice also fits in
  // the local output buffer.
  int max_out_slice_size = *out_local_interface.DataCapacity() /
                           mli_hlp_tensor_element_size(&out_local);

  if (slice_size > max_out_slice_size) slice_size = max_out_slice_size;

  /* The *_is_local flags indicate that a tensor already resides in local
     memory; in that case the original tensor can be used directly and there
     is no need to copy it to the local tensor. */
  const bool in_is_local =
      in_local_interface.Data<int8_t>() == data.mli_in.Data<int8_t>();
  const bool out_is_local =
      out_local_interface.Data<int8_t>() == data.mli_out.Data<int8_t>();
  const bool b_is_local =
      bias_local_interface.Data<int32_t>() == data.mli_bias.Data<int32_t>();
#ifndef MLI_2_0_KRNL_TEST
  const bool w_is_local =
      weights_local_interface.Data<int8_t>() == data.mli_weights.Data<int8_t>();
#endif

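  // Slice weights, bias, and output along the output-channel axis in steps of
  // slice_size.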
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
  ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
                                   weight_out_dimension, slice_size, 0, 0, 0,
                                   true);
#else
  ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
                                   weight_out_dimension, slice_size);
#endif
  ops::micro::TensorSlicer b_slice(data.mli_bias.MliTensor(),
                                   bias_out_ch_dimension, slice_size);
  ops::micro::TensorSlicer out_ch_slice(data.mli_out.MliTensor(),
                                        out_tensor_dimension, slice_size, 0, 0,
                                        0, true);

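  // Use a slice directly when it already resides in local memory; otherwise
  // stage it through the local tensor. The kernel-test build always permutes
  // the weights into the local tensor further below.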
#ifdef MLI_2_0_KRNL_TEST
  mli_tensor* w_ptr = &weights_local;
#else
  mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
#endif
  mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;

  void* input_buffer_ptr = NULL;

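  // Outer loop: process one weight/bias/output-channel slice per iteration.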
  while (!w_slice.Done()) {
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
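    // Presumably clears stale per-axis scale pointers so the slice copies
    // below can re-establish them (inferred from the surrounding code).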
    w_ptr->el_params.sa.scale.mem.pi16 = NULL;
    b_ptr->el_params.sa.scale.mem.pi16 = NULL;
#endif

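    // Copy the current weight and bias slices into local memory. In the
    // MLI_2_0_KRNL_TEST build the weights are instead permuted into place
    // further below.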
#ifndef MLI_2_0_KRNL_TEST
    mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
#endif
    mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);

    // Slice the input over the batch dimension (one batch at a time, each
    // slice the size of a complete input).
    ops::micro::TensorSlicer in_slice(
        data.mli_in.MliTensor(), input_size_dimension,
        data.mli_in.Shape()[input_size_dimension]);

    /* The output tensor is already sliced along the output-size dimension:
    out_ch_slice.Sub() holds the portion of the output produced by this
    iteration of the weight-slice loop. That tensor still needs to be sliced
    over the batch dimension. */
    ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
                                       slice_size);

    /* Set up pointers to the local or original tensors to simplify the code
     * inside the loop. */
    mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
    mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;

#ifdef MLI_2_0_KRNL_TEST
    /* Permute the weights tensor to the HWCN layout. */
    // This check prevents the use of non-contiguous buffer memory.
    if (data.mli_out.Shape()[out_tensor_dimension] !=
        out_slice.Sub()->shape[0]) {
      TF_LITE_KERNEL_LOG(
          context, "Slicing is not supported with real-time permutation.");
      return kTfLiteError;
    }
    mli_permute_cfg permute_cfg = {{1, 0, 2, 3}};
    ops::micro::permute_weights(data.mli_weights.MliTensor(), &permute_cfg,
                                w_ptr, &out_ptr->data);
#endif

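    // Inner loop: process one batch slice of the input/output per iteration.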
    while (!out_slice.Done()) {
      if (!out_is_local) {
        ops::micro::PrepareLocalTensor(out_slice.Sub(), &out_local);
        ops::micro::PrepareLocalTensor(in_slice.Sub(), &in_local);
      }
      // If the input is the same as in the previous iteration, skip copying
      // it again.
#ifdef MLI_2_0
      if (in_slice.Sub()->data.mem.pi8 != input_buffer_ptr) {
        mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
        input_buffer_ptr = in_slice.Sub()->data.mem.pi8;
      }
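      // MLI 2.0 passes activation options through a config struct; no ReLU is
      // fused here.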
      mli_fully_connected_cfg cfg;
      cfg.relu.type = MLI_RELU_NONE;
      mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr);
#else
      if (in_slice.Sub()->data != input_buffer_ptr) {
        mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
        input_buffer_ptr = in_slice.Sub()->data;
      }
      mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr);
#endif

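      // Copy the result from the local tensor back into the output slice.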
      mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());

      in_slice.Next();
      out_slice.Next();
    }
    w_slice.Next();
    b_slice.Next();
    out_ch_slice.Next();
  }
  return kTfLiteOk;
}