void Engine::Quantize()

in gemmology.h [1129:1163]


void Engine<Arch>::Quantize(const float *const input, int8_t *const output,
                            float quant_mult, size_t size) {
  /* Quantize `size` floats from `input` into int8 values written to `output`,
   * using `quant_mult` as the multiplier handed to QuantizeTile8.
   *
   * Full registers of kBatch values are written with store_aligned, so
   * `output` has to meet the alignment that store requires for Arch.
   * The trailing `size % kBatch` values are quantized from a zero-padded
   * scratch copy, so no float at or beyond `input + size` is ever read.
   */
  using batch8 = xsimd::batch<int8_t, Arch>;

  const std::size_t kBatch = batch8::size;
  /* The masking below relies on kBatch being a power of two. */
  const std::size_t fast_end = size & ~(kBatch - 1);
  const std::size_t overhang = size - fast_end;

  xsimd::batch<float, Arch> q(quant_mult);
  for (std::size_t i = 0; i < fast_end; i += kBatch) {
    auto tile = QuantizeTile8::Consecutive(q, input + i);
    tile.store_aligned(output + i);
  }

  if (!overhang)
    return;

  /* QuantizeTile8::Tile takes four float pointers and consumes
   * kBatch / 4 floats from each of them. Pointing them straight into `input`
   * would read past the end of the caller's buffer whenever the remainder is
   * not a multiple of kBatch / 4, so stage the remainder in a local buffer
   * whose unused lanes are zero. Those lanes are clipped off below and never
   * reach `output`.
   */
  const std::size_t kQuarter = kBatch / 4;
  alignas(Arch::alignment()) float padded[kBatch] = {};
  std::memcpy(padded, input + fast_end, overhang * sizeof(float));

  auto result = QuantizeTile8::Tile(q, padded, padded + kQuarter,
                                    padded + 2 * kQuarter,
                                    padded + 3 * kQuarter);

  /* Store the whole register to scratch, then copy out only the valid bytes
   * so that nothing beyond output[size - 1] is written.
   */
  alignas(Arch::alignment()) int8_t buffer[kBatch];
  result.store_aligned(buffer);
  std::memcpy(output + fast_end, buffer, overhang);
}