in gemmology.h [1357:1401]
void Engine<Arch>::Shift::PrepareBias(const int8_t *B, size_t width,
                                      size_t B_cols, Callback C) {
  /* Walks B in groups of eight (one group per iteration of the outer loop,
   * starting at B + col * width), reduces each group against an all-ones
   * uint8 vector with maddw, and hands the combined result to the callback
   * as C(total, 0, col, B_cols).
   *
   * B is read exclusively through load_aligned, so every accessed address
   * must satisfy that function's alignment requirement.
   *
   * NOTE(review): the first eight registers of each group are loaded
   * unconditionally and the column index advances by 8; presumably callers
   * guarantee width >= batch8::size and B_cols % 8 == 0 -- confirm. */
  using batch8 = xsimd::batch<int8_t, Arch>;
  using ubatch8 = xsimd::batch<uint8_t, Arch>;

  /* int8 lanes per register, and the distance (in elements) between two
   * consecutive bundles of eight registers. */
  constexpr size_t kLanes = batch8::size;
  constexpr size_t kGroupStride = 8 * kLanes;

  /* Number of eight-register bundles consumed per group of columns. */
  const size_t steps = width / kLanes;

  /* Every lane holds 1, so maddw against it presumably just sums the
   * elements of B (helper semantics not visible here -- verify). */
  const ubatch8 ones(1);

  for (size_t col = 0; col < B_cols; col += 8) {
    const int8_t *block = B + col * width;

    /* Seed the eight accumulators from the first bundle instead of starting
     * from zero and adding. */
    auto acc0 = maddw(ones, batch8::load_aligned(block + 0 * kLanes));
    auto acc1 = maddw(ones, batch8::load_aligned(block + 1 * kLanes));
    auto acc2 = maddw(ones, batch8::load_aligned(block + 2 * kLanes));
    auto acc3 = maddw(ones, batch8::load_aligned(block + 3 * kLanes));
    auto acc4 = maddw(ones, batch8::load_aligned(block + 4 * kLanes));
    auto acc5 = maddw(ones, batch8::load_aligned(block + 5 * kLanes));
    auto acc6 = maddw(ones, batch8::load_aligned(block + 6 * kLanes));
    auto acc7 = maddw(ones, batch8::load_aligned(block + 7 * kLanes));
    block += kGroupStride;

    /* Fold the remaining bundles into the running accumulators. The
     * unrolling is kept manual on purpose; the accumulators stay as eight
     * independent locals. */
    for (size_t step = 1; step < steps; ++step) {
      acc0 = maddw(ones, batch8::load_aligned(block + 0 * kLanes), acc0);
      acc1 = maddw(ones, batch8::load_aligned(block + 1 * kLanes), acc1);
      acc2 = maddw(ones, batch8::load_aligned(block + 2 * kLanes), acc2);
      acc3 = maddw(ones, batch8::load_aligned(block + 3 * kLanes), acc3);
      acc4 = maddw(ones, batch8::load_aligned(block + 4 * kLanes), acc4);
      acc5 = maddw(ones, batch8::load_aligned(block + 5 * kLanes), acc5);
      acc6 = maddw(ones, batch8::load_aligned(block + 6 * kLanes), acc6);
      acc7 = maddw(ones, batch8::load_aligned(block + 7 * kLanes), acc7);
      block += kGroupStride;
    }

    /* Combine the eight accumulators: two Pack0123 calls (four accumulators
     * each) followed by PermuteSummer on the pair. Presumably this performs
     * the horizontal reduction so that `total` carries one sum per column of
     * the group -- helpers are defined elsewhere, confirm there. */
    auto lower = Pack0123(acc0, acc1, acc2, acc3);
    auto upper = Pack0123(acc4, acc5, acc6, acc7);
    auto total = PermuteSummer(lower, upper);

    /* Row index is fixed at 0; col identifies the group of eight. */
    C(total, 0, col, B_cols);
  }
}