in gemmology.h [1232:1274]
void Engine<Arch>::PrepareB(const float *input, int8_t *output_shadow,
                            float quant_mult, size_t rows, size_t cols) {
  /* Converts the float matrix `input` into int8 registers stored through
   * `output_shadow`.
   *
   * `input` is addressed as input[cols * row + col], i.e. `cols` floats per
   * row. The matrix is walked in blocks of 8 columns; within each column
   * block, rows are consumed sizeof(register) at a time. Every block emits 8
   * consecutive int8 registers, which are then pairwise interleaved and passed
   * through Transpose16InLane in place.
   *
   * NOTE(review): no bounds or divisibility checks are made here. Presumably
   * callers guarantee that `cols` is a multiple of 8, `rows` is a multiple of
   * the register width, and `output_shadow` is suitably aligned for the
   * register type -- confirm against callers. */
  using Int8Reg = xsimd::batch<int8_t, Arch>;

  /* Currently all multipliers have a stride of 8 columns. */
  constexpr size_t kColumnsPerBlock = 8;
  /* Registers written per block; the destination advances by this much. */
  constexpr size_t kRegsPerBlock = 8;
  /* Row offsets, relative to the block's first row, handed to ForReshape in
   * output order. Which further rows ForReshape itself reads starting from
   * each of these is not visible from this function. */
  constexpr size_t kRowOffsets[kRegsPerBlock] = {0, 1, 4, 5, 8, 9, 12, 13};

  /* Quantization multiplier broadcast to every lane. */
  xsimd::batch<float, Arch> scale(quant_mult);

  Int8Reg *block = reinterpret_cast<Int8Reg *>(output_shadow);

  for (size_t col = 0; col < cols; col += kColumnsPerBlock) {
    for (size_t row = 0; row < rows;
         row += sizeof(Int8Reg), block += kRegsPerBlock) {
      /* Step 1: quantize; one register per entry of kRowOffsets. */
      for (size_t i = 0; i < kRegsPerBlock; ++i) {
        block[i] = QuantizeTile8::ForReshape(
            scale, input + cols * (row + kRowOffsets[i]) + col, cols);
      }

      /* Step 2: interleave neighbouring registers (0,1), (2,3), (4,5), (6,7),
       * writing each result pair back over its inputs. */
      for (size_t i = 0; i < kRegsPerBlock; i += 2) {
        std::tie(block[i], block[i + 1]) =
            interleave(xsimd::bitwise_cast<int8_t>(block[i]),
                       xsimd::bitwise_cast<int8_t>(block[i + 1]));
      }

      /* Step 3: in-place transpose across the 8 registers of this block. */
      Transpose16InLane(block[0], block[1], block[2], block[3], block[4],
                        block[5], block[6], block[7]);
    }
  }
}