void Engine::Shift::PrepareBias()

in gemmology.h [1357:1401]


/* Walks B in groups of eight columns. For every group it accumulates
 * maddw(ones, chunk) over all register-sized chunks along `width`, reduces the
 * eight accumulators with Pack0123 / PermuteSummer, and passes the result to
 * the callback as C(total, 0, column_index, B_cols).
 *
 * B       : int8 data read with aligned loads; each group of eight columns
 *           occupies 8 * width consecutive bytes starting at B + j * width.
 * width   : extent of the accumulated dimension. Only width / batch::size
 *           whole chunks are visited, and the first chunk is always read.
 * B_cols  : number of columns; the loop advances eight at a time.
 * C       : callback invoked once per group of eight columns.
 *
 * NOTE(review): presumably width is a multiple of the SIMD register size, B_cols
 * a multiple of 8, and B suitably aligned -- confirm against callers. */
void Engine<Arch>::Shift::PrepareBias(const int8_t *B, size_t width,
                                      size_t B_cols, Callback C) {
  using vec_i8 = xsimd::batch<int8_t, Arch>;
  using vec_u8 = xsimd::batch<uint8_t, Arch>;

  /* Elements per register, and elements consumed per accumulation step (one
   * register for each of the eight columns processed together). */
  const size_t lanes = vec_i8::size;
  const size_t stride = 8 * lanes;
  /* Number of register-sized chunks along the width dimension. */
  const size_t chunks = width / lanes;

  /* Every lane is 1, so the "multiply" part of maddw leaves B's values as they
   * are (presumably maddw is a widening multiply-add -- confirm). */
  vec_u8 ones(1);

  for (size_t col = 0; col < B_cols; col += 8) {
    const int8_t *cursor = B + col * width;

    /* Seed the eight accumulators straight from the first chunk instead of
     * zero-initialising them and adding. */
    auto acc0 = maddw(ones, vec_i8::load_aligned(cursor + 0 * lanes));
    auto acc1 = maddw(ones, vec_i8::load_aligned(cursor + 1 * lanes));
    auto acc2 = maddw(ones, vec_i8::load_aligned(cursor + 2 * lanes));
    auto acc3 = maddw(ones, vec_i8::load_aligned(cursor + 3 * lanes));
    auto acc4 = maddw(ones, vec_i8::load_aligned(cursor + 4 * lanes));
    auto acc5 = maddw(ones, vec_i8::load_aligned(cursor + 5 * lanes));
    auto acc6 = maddw(ones, vec_i8::load_aligned(cursor + 6 * lanes));
    auto acc7 = maddw(ones, vec_i8::load_aligned(cursor + 7 * lanes));
    cursor += stride;

    /* Fold in the remaining chunks; the first one was consumed above, hence
     * the countdown stops at 1. */
    for (size_t left = chunks; left > 1; --left) {
      acc0 = maddw(ones, vec_i8::load_aligned(cursor + 0 * lanes), acc0);
      acc1 = maddw(ones, vec_i8::load_aligned(cursor + 1 * lanes), acc1);
      acc2 = maddw(ones, vec_i8::load_aligned(cursor + 2 * lanes), acc2);
      acc3 = maddw(ones, vec_i8::load_aligned(cursor + 3 * lanes), acc3);
      acc4 = maddw(ones, vec_i8::load_aligned(cursor + 4 * lanes), acc4);
      acc5 = maddw(ones, vec_i8::load_aligned(cursor + 5 * lanes), acc5);
      acc6 = maddw(ones, vec_i8::load_aligned(cursor + 6 * lanes), acc6);
      acc7 = maddw(ones, vec_i8::load_aligned(cursor + 7 * lanes), acc7);
      cursor += stride;
    }

    /* Reduce the eight per-column accumulators into one value for the
     * callback (presumably a horizontal add per column -- confirm against
     * Pack0123 / PermuteSummer). */
    auto lower = Pack0123(acc0, acc1, acc2, acc3);
    auto upper = Pack0123(acc4, acc5, acc6, acc7);

    auto combined = PermuteSummer(lower, upper);
    C(combined, 0, col, B_cols);
  }
}