in gemmology.h [922:1004]
inline void Transpose16InLane(
    xsimd::batch<int8_t, Arch> &r0, xsimd::batch<int8_t, Arch> &r1,
    xsimd::batch<int8_t, Arch> &r2, xsimd::batch<int8_t, Arch> &r3,
    xsimd::batch<int8_t, Arch> &r4, xsimd::batch<int8_t, Arch> &r5,
    xsimd::batch<int8_t, Arch> &r6, xsimd::batch<int8_t, Arch> &r7) {
  /* Transposes the 8 rows x 8 columns tile held in r0..r7, in place.
   *
   * On entry register rN holds columns 0..7 of row N; on exit register rN
   * holds column N gathered from rows 0 through 7.  The work is done as three
   * interleave passes over progressively wider elements (16, 32, then 64
   * bits); the registers are only reinterpreted between passes, never
   * converted.
   *
   * NOTE(review): `interleave` is defined outside this block; the layouts
   * quoted in the comments below assume it returns the (low, high)
   * interleaving of its two operands -- confirm against its definition. */

  /* Pass 1: pair up neighbouring rows at 16-bit granularity. */
  auto w0 = xsimd::bitwise_cast<int16_t>(r0);
  auto w1 = xsimd::bitwise_cast<int16_t>(r1);
  std::tie(w0, w1) = interleave(w0, w1);

  auto w2 = xsimd::bitwise_cast<int16_t>(r2);
  auto w3 = xsimd::bitwise_cast<int16_t>(r3);
  std::tie(w2, w3) = interleave(w2, w3);

  auto w4 = xsimd::bitwise_cast<int16_t>(r4);
  auto w5 = xsimd::bitwise_cast<int16_t>(r5);
  std::tie(w4, w5) = interleave(w4, w5);

  auto w6 = xsimd::bitwise_cast<int16_t>(r6);
  auto w7 = xsimd::bitwise_cast<int16_t>(r7);
  std::tie(w6, w7) = interleave(w6, w7);
  /* w0: columns 0 0 1 1 2 2 3 3 from rows 0 and 1
     w1: columns 4 4 5 5 6 6 7 7 from rows 0 and 1
     w2: columns 0 0 1 1 2 2 3 3 from rows 2 and 3
     w3: columns 4 4 5 5 6 6 7 7 from rows 2 and 3
     w4: columns 0 0 1 1 2 2 3 3 from rows 4 and 5
     w5: columns 4 4 5 5 6 6 7 7 from rows 4 and 5
     w6: columns 0 0 1 1 2 2 3 3 from rows 6 and 7
     w7: columns 4 4 5 5 6 6 7 7 from rows 6 and 7*/

  /* Pass 2: merge row pairs into row quads at 32-bit granularity.  Registers
     covering the same columns are combined: (0,2), (1,3), (4,6), (5,7). */
  auto d0 = xsimd::bitwise_cast<int32_t>(w0);
  auto d2 = xsimd::bitwise_cast<int32_t>(w2);
  std::tie(d0, d2) = interleave(d0, d2);

  auto d1 = xsimd::bitwise_cast<int32_t>(w1);
  auto d3 = xsimd::bitwise_cast<int32_t>(w3);
  std::tie(d1, d3) = interleave(d1, d3);

  auto d4 = xsimd::bitwise_cast<int32_t>(w4);
  auto d6 = xsimd::bitwise_cast<int32_t>(w6);
  std::tie(d4, d6) = interleave(d4, d6);

  auto d5 = xsimd::bitwise_cast<int32_t>(w5);
  auto d7 = xsimd::bitwise_cast<int32_t>(w7);
  std::tie(d5, d7) = interleave(d5, d7);
  /* d0: columns 0 0 0 0 1 1 1 1 from rows 0, 1, 2, and 3
     d1: columns 4 4 4 4 5 5 5 5 from rows 0, 1, 2, and 3
     d2: columns 2 2 2 2 3 3 3 3 from rows 0, 1, 2, and 3
     d3: columns 6 6 6 6 7 7 7 7 from rows 0, 1, 2, and 3
     d4: columns 0 0 0 0 1 1 1 1 from rows 4, 5, 6, and 7
     d5: columns 4 4 4 4 5 5 5 5 from rows 4, 5, 6, and 7
     d6: columns 2 2 2 2 3 3 3 3 from rows 4, 5, 6, and 7
     d7: columns 6 6 6 6 7 7 7 7 from rows 4, 5, 6, and 7*/

  /* Pass 3: merge the upper and lower row quads at 64-bit granularity,
     combining (0,4), (1,5), (2,6), (3,7). */
  auto q0 = xsimd::bitwise_cast<int64_t>(d0);
  auto q4 = xsimd::bitwise_cast<int64_t>(d4);
  std::tie(q0, q4) = interleave(q0, q4);

  auto q1 = xsimd::bitwise_cast<int64_t>(d1);
  auto q5 = xsimd::bitwise_cast<int64_t>(d5);
  std::tie(q1, q5) = interleave(q1, q5);

  auto q2 = xsimd::bitwise_cast<int64_t>(d2);
  auto q6 = xsimd::bitwise_cast<int64_t>(d6);
  std::tie(q2, q6) = interleave(q2, q6);

  auto q3 = xsimd::bitwise_cast<int64_t>(d3);
  auto q7 = xsimd::bitwise_cast<int64_t>(d7);
  std::tie(q3, q7) = interleave(q3, q7);
  /* q0: columns 0 0 0 0 0 0 0 0 from rows 0 through 7
     q1: columns 4 4 4 4 4 4 4 4 from rows 0 through 7
     q2: columns 2 2 2 2 2 2 2 2 from rows 0 through 7
     q3: columns 6 6 6 6 6 6 6 6 from rows 0 through 7
     q4: columns 1 1 1 1 1 1 1 1 from rows 0 through 7
     q5: columns 5 5 5 5 5 5 5 5 from rows 0 through 7
     q6: columns 3 3 3 3 3 3 3 3 from rows 0 through 7
     q7: columns 7 7 7 7 7 7 7 7 from rows 0 through 7*/

  /* Write back so that rN ends up holding column N.  Columns 1/4 and 3/6 sit
     in each other's slots after pass 3, so q4 goes to r1, q1 to r4, q6 to r3
     and q3 to r6; every other result returns to its own slot. */
  r0 = xsimd::bitwise_cast<int8_t>(q0);
  r1 = xsimd::bitwise_cast<int8_t>(q4);
  r2 = xsimd::bitwise_cast<int8_t>(q2);
  r3 = xsimd::bitwise_cast<int8_t>(q6);
  r4 = xsimd::bitwise_cast<int8_t>(q1);
  r5 = xsimd::bitwise_cast<int8_t>(q5);
  r6 = xsimd::bitwise_cast<int8_t>(q3);
  r7 = xsimd::bitwise_cast<int8_t>(q7);
}