in gemmology.h [690:706]
inline xsimd::batch<int32_t, Arch> Pack0123(xsimd::batch<int32_t, Arch> sum0,
                                            xsimd::batch<int32_t, Arch> sum1,
                                            xsimd::batch<int32_t, Arch> sum2,
                                            xsimd::batch<int32_t, Arch> sum3,
                                            xsimd::kernel::requires_arch<xsimd::generic>) {
  // Folds four int32 batches into a single int32 batch in two rounds:
  //   round 1: interleave (sum0, sum1) and (sum2, sum3) as 32-bit lanes and
  //            add the two halves of each interleaved pair;
  //   round 2: reinterpret both intermediate results as 64-bit lanes,
  //            interleave them, reinterpret back to 32-bit and add the halves.
  // NOTE(review): presumably this leaves partial horizontal sums of
  // sum0..sum3 in consecutive lanes -- confirm against the `interleave`
  // overload selected for Arch, which is defined elsewhere.

  // Round 1: 32-bit interleave followed by a lane-wise add of both halves.
  const auto mixed01 = interleave(sum0, sum1, Arch{});
  const auto folded01 = std::get<0>(mixed01) + std::get<1>(mixed01);

  const auto mixed23 = interleave(sum2, sum3, Arch{});
  const auto folded23 = std::get<0>(mixed23) + std::get<1>(mixed23);

  // Round 2: same step, with each pair of adjacent int32 lanes viewed as one
  // int64 lane for the interleave only; the addition stays in int32.
  const auto wide01 = xsimd::bitwise_cast<int64_t>(folded01);
  const auto wide23 = xsimd::bitwise_cast<int64_t>(folded23);
  const auto mixed0123 = interleave(wide01, wide23, Arch{});

  const auto low_half = xsimd::bitwise_cast<int32_t>(std::get<0>(mixed0123));
  const auto high_half = xsimd::bitwise_cast<int32_t>(std::get<1>(mixed0123));
  return low_half + high_half;
}