in src/tensorflow/lite/kernels/internal/reference/fully_connected.h [165:316]
inline void ShuffledFullyConnected(
const FullyConnectedParams& params, const RuntimeShape& input_shape,
const uint8_t* input_data, const RuntimeShape& weights_shape,
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(b/62193649): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists of overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track, for each
  // array, of which of its dimensions is the batch dimension.
const int output_dim_count = output_shape.DimensionsCount();
const int weights_dim_count = weights_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
output_shape, output_dim_count - 1);
const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
TFLITE_DCHECK((accum_depth % 16) == 0);
TFLITE_DCHECK((output_depth % 4) == 0);
// Shuffling and xoring of input activations into the workspace buffer
uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
if (batches == 1) {
for (int i = 0; i < accum_depth; i++) {
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
}
} else if (batches == 4) {
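    // Interleave the 4 batches in 16-deep slices, so that the workspace
    // layout is effectively [accum_depth / 16][4][16].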
for (int c = 0; c < accum_depth; c += 16) {
for (int b = 0; b < 4; b++) {
const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
for (int j = 0; j < 16; j++) {
uint8_t src_val = *src_data_ptr++;
// Flip the sign bit, so that the kernel will only need to
// reinterpret these uint8_t values as int8_t, getting for free the
// subtraction of the zero_point value 128.
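          // For example, src_val 0x05 (i.e. uint8_t 5) becomes dst_val 0x85,
          // which reinterpreted as int8_t is -123, i.e. 5 - 128.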
uint8_t dst_val = src_val ^ 0x80;
*shuffled_input_workspace_ptr++ = dst_val;
}
}
}
} else {
TFLITE_DCHECK(false);
return;
}
// Actual computation
if (batches == 1) {
int16_t* output_ptr = output_data;
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
// so that just reinterpreting them as int8_t values is equivalent to
// subtracting 128 from them, thus implementing for free the subtraction of
// the zero_point value 128.
const int8_t* shuffled_weights_ptr =
reinterpret_cast<const int8_t*>(shuffled_weights_data);
// Likewise, we preshuffled and pre-xored the input data above.
const int8_t* shuffled_input_data =
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
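    // The storage order consumed below is effectively
    // [output_depth / 4][accum_depth / 16][4][16]: blocks of 4 output rows by
    // 16 input columns, traversed depth-block by depth-block within each
    // block of 4 rows.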
for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation.
      // Initialize the accumulators to zero; the bias is added after the
      // accumulation loop below.
      int32_t accum[4] = {0};
// Accumulation loop.
for (int d = 0; d < accum_depth; d += 16) {
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
int8_t input_val = shuffled_input_data[d + j];
int8_t weights_val = *shuffled_weights_ptr++;
accum[i] += weights_val * input_val;
}
}
}
for (int i = 0; i < 4; i++) {
// Add bias value
int32_t acc = accum[i] + bias_data[c + i];
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
// multiplier and shift here have been pre-computed offline
// (e.g. by toco).
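      // Roughly, MultiplyByQuantizedMultiplier computes
      //   acc * output_multiplier * 2^(output_shift - 31)
      // with round-to-nearest behavior, where output_multiplier is a Q0.31
      // fixed-point value.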
acc =
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16_t, and store to the output array.
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_ptr[c + i] = acc;
}
}
} else if (batches == 4) {
int16_t* output_ptr = output_data;
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
// so that just reinterpreting them as int8_t values is equivalent to
// subtracting 128 from them, thus implementing for free the subtraction of
// the zero_point value 128.
const int8_t* shuffled_weights_ptr =
reinterpret_cast<const int8_t*>(shuffled_weights_data);
// Likewise, we preshuffled and pre-xored the input data above.
const int8_t* shuffled_input_data =
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
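    // The shuffled weights use the same [output_depth / 4][accum_depth / 16]
    // [4][16] blocking described in the batches == 1 path above.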
for (int c = 0; c < output_depth; c += 4) {
const int8_t* shuffled_input_ptr = shuffled_input_data;
      // Internal accumulation.
      // Initialize the accumulators to zero; the bias is added after the
      // accumulation loop below.
      int32_t accum[4][4] = {};
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
for (int j = 0; j < 16; j++) {
int8_t input_val = shuffled_input_ptr[16 * b + j];
int8_t weights_val = shuffled_weights_ptr[16 * i + j];
accum[i][b] += weights_val * input_val;
}
}
}
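        // Each depth step consumes a 4 (batches) x 16 slab of inputs and a
        // 4 (output rows) x 16 slab of weights: 64 bytes each.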
shuffled_input_ptr += 64;
shuffled_weights_ptr += 64;
}
for (int i = 0; i < 4; i++) {
for (int b = 0; b < 4; b++) {
// Add bias value
int32_t acc = accum[i][b] + bias_data[c + i];
// Down-scale the final int32_t accumulator to the scale used by our
// (16-bit, typically 3 integer bits) fixed-point format. The
// quantized multiplier and shift here have been pre-computed offline
// (e.g. by toco).
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
output_shift);
          // Saturate, cast to int16_t, and store to the output array.
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_ptr[b * output_depth + c + i] = acc;
}
}
}
} else {
TFLITE_DCHECK(false);
return;
}
}
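
// A minimal sketch, not part of the original source, of how plain row-major
// uint8_t weights could be pre-shuffled into the layout that
// ShuffledFullyConnected consumes. The helper name below is hypothetical and
// for illustration only; the blocking and the 0x80 sign-bit flip mirror the
// exact order in which the kernel above reads the weights. The same weight
// layout serves both the batches == 1 and batches == 4 paths; only the
// inputs need the additional per-batch interleaving done in the workspace.
inline void ShuffleFullyConnectedWeights(const uint8_t* weights_data,
                                         int output_depth, int accum_depth,
                                         uint8_t* shuffled_weights_data) {
  // Destination layout: [output_depth / 4][accum_depth / 16][4][16], with
  // the sign bit of every weight flipped.
  uint8_t* dst = shuffled_weights_data;
  for (int c = 0; c < output_depth; c += 4) {
    for (int d = 0; d < accum_depth; d += 16) {
      for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 16; j++) {
          // Row (c + i), column (d + j) of the original row-major matrix.
          *dst++ = weights_data[(c + i) * accum_depth + (d + j)] ^ 0x80;
        }
      }
    }
  }
}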