void Engine::PrepareB()

in gemmology.h [1232:1274]


void Engine<Arch>::PrepareB(const float *input, int8_t *output_shadow,
                            float quant_mult, size_t rows, size_t cols) {
  using batch8 = xsimd::batch<int8_t, Arch>;

  xsimd::batch<float, Arch> q(quant_mult);
  /* Currently all multipliers have a stride of 8 columns.*/
  const size_t kColStride = 8;
  auto *output = reinterpret_cast<batch8 *>(output_shadow);
  for (size_t c = 0; c < cols; c += kColStride) {
    for (size_t r = 0; r < rows; r += sizeof(*output), output += 8) {
      output[0] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 0) + c, cols);
      output[1] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 1) + c, cols);
      output[2] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 4) + c, cols);
      output[3] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 5) + c, cols);
      output[4] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 8) + c, cols);
      output[5] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 9) + c, cols);
      output[6] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 12) + c, cols);
      output[7] =
          QuantizeTile8::ForReshape(q, input + cols * (r + 13) + c, cols);
      std::tie(output[0], output[1]) =
          interleave(xsimd::bitwise_cast<int8_t>(output[0]),
                     xsimd::bitwise_cast<int8_t>(output[1]));
      std::tie(output[2], output[3]) =
          interleave(xsimd::bitwise_cast<int8_t>(output[2]),
                     xsimd::bitwise_cast<int8_t>(output[3]));
      std::tie(output[4], output[5]) =
          interleave(xsimd::bitwise_cast<int8_t>(output[4]),
                     xsimd::bitwise_cast<int8_t>(output[5]));
      std::tie(output[6], output[7]) =
          interleave(xsimd::bitwise_cast<int8_t>(output[6]),
                     xsimd::bitwise_cast<int8_t>(output[7]));
      Transpose16InLane(output[0], output[1], output[2], output[3], output[4],
                        output[5], output[6], output[7]);
    }
  }
}