in gemmology.h [1201:1214]
void Engine<Arch>::PrepareBQuantizedTransposed(const int8_t *input,
                                               int8_t *output, size_t cols,
                                               size_t rows) {
  // Repacks `input` into `output` one SIMD register at a time; the int8
  // values themselves are copied unchanged, only their order differs.
  //
  // `input` is indexed as input[row * cols + col]. It is walked in bands of
  // 8 consecutive rows. Inside a band, for every column offset (stepping by
  // one register, i.e. batch8::size elements), the 8 registers taken from
  // the band's 8 rows at that offset are written to `output` back to back.
  //
  // No bounds checks are made: each band always reads 8 full rows and each
  // step always reads a full register.
  // NOTE(review): presumably `rows` must be a multiple of 8, `cols` a
  // multiple of batch8::size, and both buffers aligned for batch8 access --
  // confirm against callers.
  using batch8 = xsimd::batch<int8_t, Arch>;
  const size_t kLanes = batch8::size;  // int8 elements per register
  const size_t kRowsPerBand = 8;       // rows gathered per output group

  auto *dst = reinterpret_cast<batch8 *>(output);
  for (size_t band = 0; band < rows; band += kRowsPerBand) {
    for (size_t col = 0; col < cols; col += kLanes) {
      for (size_t k = 0; k < kRowsPerBand; ++k) {
        // Start of the register-wide chunk at (band + k, col).
        const size_t offset = (band + k) * cols + col;
        *dst = *reinterpret_cast<const batch8 *>(input + offset);
        ++dst;
      }
    }
  }
}