in gemmology.h [1177:1198]
// Quantizes `input` (judging by the name, a B matrix supplied in transposed
// form) and writes the result to `output` as a run of int8 SIMD registers.
//
// The input is consumed in tiles of kColStride (8) rows. For every tile and
// every column offset `c`, one register per row is produced by
// QuantizeTile8::ConsecutiveWithWrapping and appended to `output`. The column
// offset then advances by one register width (batch8::size); once it reaches
// or passes `cols` it wraps around by `cols` and the row cursor moves on to the
// next tile.
//
// Parameters:
//   input      - row-major floats, addressed as input[row * cols + col].
//   output     - destination; written through a `batch8 *`, so it presumably
//                has to be aligned for batch8 -- TODO confirm with callers.
//   quant_mult - scale factor, broadcast into a float register and forwarded
//                to the quantization helper.
//   cols, rows - dimensions used for addressing `input`.
//
// NOTE(review): each tile always reads rows r .. r+7, so `rows` is presumably
// expected to be a multiple of 8 (and `cols` a multiple of the register width)
// -- confirm against the callers; nothing here enforces it.
void Engine<Arch>::PrepareBTransposed(const float *input, int8_t *output,
                                      float quant_mult, size_t cols,
                                      size_t rows) {
  using batch8 = xsimd::batch<int8_t, Arch>;
  const size_t RegisterElemsInt = batch8::size;
  const size_t kColStride = 8;

  // With cols == 0 the wrap loop below (`c >= cols`) is always true for an
  // unsigned `c` and would spin forever; there is nothing to quantize anyway.
  if (cols == 0)
    return;

  xsimd::batch<float, Arch> q(quant_mult);
  auto *output_it = reinterpret_cast<batch8 *>(output);
  size_t r = 0;  // first row of the current tile
  size_t c = 0;  // column offset inside the current tile

  while (r < rows) {
    // Emit one register per row of the tile, all taken at column offset c.
    // The helper receives the number of columns left in the row (cols - c),
    // the full row length and a step of 8 (same value as kColStride).
    for (size_t ri = 0; ri < kColStride; ++ri)
      *output_it++ = QuantizeTile8::ConsecutiveWithWrapping(
          q, input + (r + ri) * cols + c, cols - c, cols, 8);

    c += RegisterElemsInt;
    // A register can be wider than a row, so the offset may overshoot `cols`
    // more than once; every wrap moves the row cursor one tile forward.
    while (c >= cols) {
      r += kColStride;
      c -= cols;
    }
  }
}