in gemmology.h [815:840]
if constexpr (batchf32::size == 16) {
// This branch is selected at compile time when batchf32 has 16 lanes.
// It quantizes eight input rows' worth of data (rows 0/2, 16/18, 32/34 and
// 48/50, each offset scaled by cols) into a single batch8 and returns it.
// NOTE(review): presumably cols is the row stride of input in elements --
// confirm against the enclosing function's signature.

// Lower bound used below so the packed result never contains -128.
const batch8 neg127(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit
// register, then the second 32-bit values, etc. Grab 4 registers at a
// time in 32-bit format.
// Each QuantizerGrabHalves call receives two pointers that are two rows
// apart plus quant_mult, and yields one batch32.
// NOTE(review): how the two halves are combined inside QuantizerGrabHalves
// is not visible here -- see its definition.
batch32 g0 =
QuantizerGrabHalves(input + 0 * cols, input + 2 * cols, quant_mult);
batch32 g1 =
QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
batch32 g2 =
QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
batch32 g3 =
QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
// Pack 32-bit to 16-bit.
// Two batch32 values go in, one batch16 comes out.
// NOTE(review): whether deinterleave saturates or truncates is not visible
// here -- confirm before relying on out-of-range behaviour.
batch16 packed0 = deinterleave(g0, g1);
batch16 packed1 = deinterleave(g2, g3);
// Pack 16-bit to 8-bit.
batch8 packed = deinterleave(packed0, packed1);
// Ban -128.
// Taking the lane-wise max with -127 raises any -128 lane to -127 and leaves
// every other value unchanged.
// NOTE(review): presumably this keeps the int8 range symmetric for the
// later multiply -- confirm.
packed = xsimd::max(packed, neg127);
// Reinterpret the packed bytes as 32-bit lanes, permute those lanes with the
// compile-time index pattern generated by Tiler<Arch>, then reinterpret the
// result as int8 lanes again for the return value.
// NOTE(review): presumably the permutation undoes the per-128-bit-lane
// ordering left by the packing steps above -- confirm against Tiler.
return xsimd::bitwise_cast<int8_t>(
xsimd::swizzle(xsimd::bitwise_cast<int32_t>(packed),
xsimd::make_batch_constant<uint32_t, Arch, Tiler<Arch>>()));
} else if constexpr (batchf32::size == 8)