in gemmology.h [1129:1163]
void Engine<Arch>::Quantize(const float *const input, int8_t *const output,
                            float quant_mult, size_t size) {
  /* Converts `size` floats starting at `input` into int8 values written to
   * `output`, with `quant_mult` broadcast into a float register and handed to
   * the QuantizeTile8 helpers.
   *
   * The work is split in two:
   *   1. Whole int8 registers (kBatch values each) are produced by
   *      QuantizeTile8::Consecutive and written straight to `output` with
   *      aligned stores.
   *   2. Any leftover (< kBatch values) is produced by one QuantizeTile8::Tile
   *      call into a local aligned buffer; only the leftover bytes are then
   *      copied to `output`.
   */
  using batch8 = xsimd::batch<int8_t, Arch>;
  const std::size_t kBatch = batch8::size;
  const std::size_t lane_mask = kBatch - 1;
  /* Largest multiple of kBatch not exceeding size. */
  const std::size_t vector_end = size & ~lane_mask;

  xsimd::batch<float, Arch> scale(quant_mult);

  std::size_t offset = 0;
  while (offset < vector_end) {
    auto packed = QuantizeTile8::Consecutive(scale, input + offset);
    packed.store_aligned(output + offset);
    offset += kBatch;
  }

  /* Number of trailing values that did not fill a whole register. */
  const std::size_t remainder = size & lane_mask;
  if (remainder == 0) {
    return;
  }

  /* Tile consumes four source pointers, each covering kBatch / 4 floats (one
   * float register). The original reasoning applies: if any float of such a
   * register may be read, the whole register may be read. Quarters that hold
   * no leftover data are pointed back at the start of the tail; whatever they
   * produce is discarded by the partial copy at the end.
   */
  const std::size_t floats_per_quarter = kBatch / 4;
  const std::size_t live_quarters =
      (remainder + floats_per_quarter - 1) / floats_per_quarter;
  const float *quarters[4];
  for (std::size_t k = 0; k < 4; ++k) {
    quarters[k] = (k < live_quarters)
                      ? &input[vector_end + k * floats_per_quarter]
                      : &input[vector_end];
  }

  auto tail_packed = QuantizeTile8::Tile(scale, quarters[0], quarters[1],
                                         quarters[2], quarters[3]);

  /* Stage the full register locally so only `remainder` bytes reach output. */
  alignas(Arch::alignment()) int8_t staging[kBatch];
  tail_packed.store_aligned(staging);
  std::memcpy(output + (size & ~lane_mask), staging, remainder);
}