in src/tensorflow/lite/micro/kernels/svdf_common.cpp [52:192]
void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node,
const TfLiteEvalTensor* input_tensor,
const TfLiteEvalTensor* weights_feature_tensor,
const TfLiteEvalTensor* weights_time_tensor,
const TfLiteEvalTensor* bias_tensor,
const TfLiteSVDFParams* params,
TfLiteEvalTensor* activation_state_tensor,
TfLiteEvalTensor* output_tensor,
const OpDataSvdf& data) {
const int n_rank = params->rank;
const int n_batch = input_tensor->dims->data[0];
const int n_input = input_tensor->dims->data[1];
const int n_filter = weights_feature_tensor->dims->data[0];
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
TFLITE_DCHECK(context != nullptr);
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
int32_t* scratch_tensor = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_tensor_index));
int32_t* scratch_output_tensor = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_output_tensor_index));
// Shift states.
T* const state_ptr = tflite::micro::GetTensorData<T>(activation_state_tensor);
// Left shift the activation_state.
{
T* new_state_start = state_ptr;
const T* old_state_start = state_ptr + 1;
const T* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
T* state = tflite::micro::GetTensorData<T>(activation_state_tensor);
const int8_t* input = tflite::micro::GetTensorData<int8_t>(input_tensor);
const int8_t* weight_feature =
tflite::micro::GetTensorData<int8_t>(weights_feature_tensor);
const int32_t output_max = std::numeric_limits<T>::max();
const int32_t output_min = std::numeric_limits<T>::min();
T* result_in_batch = state + (n_memory - 1);
for (int b = 0; b < n_batch; b++) {
const int8_t* matrix_ptr = weight_feature;
for (int r = 0; r < n_filter; r++) {
int32_t dot_prod = 0;
const int8_t* vector_in_batch = input + b * n_input;
for (int c = 0; c < n_input; c++) {
dot_prod +=
*matrix_ptr++ * (*vector_in_batch++ - data.input_zero_point);
}
dot_prod = MultiplyByQuantizedMultiplier(
dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
// The int16 version of the op assumes a zero_point of 0. This
// code accounts for the potentially non-zero zero_point for the int8
// version of the op.
*result_in_batch = data.activation_state_zero_point + dot_prod;
result_in_batch += n_memory;
}
}
}
// Time.
{
for (int b = 0; b < n_batch; ++b) {
int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
// Perform batched vector dot product:
const T* vector1_ptr =
tflite::micro::GetTensorData<T>(weights_time_tensor);
const T* vector2_ptr =
tflite::micro::GetTensorData<T>(activation_state_tensor) +
b * n_memory * n_filter;
for (int i = 0; i < n_filter; i++) {
*scratch_ptr_batch = 0;
for (int j = 0; j < n_memory; j++) {
*scratch_ptr_batch +=
*vector1_ptr++ *
(*vector2_ptr++ - data.activation_state_zero_point);
}
scratch_ptr_batch++;
}
}
}
// Reduce, add bias, rescale, activation.
{
// Add bias.
if (bias_tensor) {
// Vector batch assign:
const int32_t* bias_data =
tflite::micro::GetTensorData<int32_t>(bias_tensor);
for (int i = 0; i < n_batch; ++i) {
int32_t* output_ptr = scratch_output_tensor + i * n_unit;
const int32_t* bias_ptr = bias_data;
for (int j = 0; j < n_unit; ++j) {
*output_ptr++ = *bias_ptr++;
}
}
} else {
int32_t* output_ptr = scratch_output_tensor;
for (int i = 0; i < n_batch * n_unit; ++i) {
*output_ptr++ = 0;
}
}
// Reduce.
for (int b = 0; b < n_batch; ++b) {
int32_t* output_temp_ptr = scratch_output_tensor + b * n_unit;
int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
// Reduction sum vector
for (int i = 0; i < n_unit; ++i) {
for (int j = 0; j < n_rank; ++j) {
output_temp_ptr[i] += *scratch_ptr_batch++;
}
}
}
// Rescale.
const int32_t output_max = std::numeric_limits<int8_t>::max();
const int32_t output_min = std::numeric_limits<int8_t>::min();
for (int i = 0; i < n_batch * n_unit; ++i) {
int32_t x1 = scratch_output_tensor[i];
int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
data.effective_scale_2_b);
int32_t x3 = x2 + data.output_zero_point;
int32_t x4 = std::min(std::max(output_min, x3), output_max);
tflite::micro::GetTensorData<int8_t>(output_tensor)[i] =
static_cast<int8_t>(x4);
}
}
}