inline void Transpose16InLane()

in gemmology.h [922:1004]


inline void Transpose16InLane(
    xsimd::batch<int8_t, Arch> &r0, xsimd::batch<int8_t, Arch> &r1,
    xsimd::batch<int8_t, Arch> &r2, xsimd::batch<int8_t, Arch> &r3,
    xsimd::batch<int8_t, Arch> &r4, xsimd::batch<int8_t, Arch> &r5,
    xsimd::batch<int8_t, Arch> &r6, xsimd::batch<int8_t, Arch> &r7) {
  /* r0: columns 0 1 2 3 4 5 6 7 from row 0
     r1: columns 0 1 2 3 4 5 6 7 from row 1*/
  auto r0_16 = xsimd::bitwise_cast<int16_t>(r0);
  auto r1_16 = xsimd::bitwise_cast<int16_t>(r1);
  auto r2_16 = xsimd::bitwise_cast<int16_t>(r2);
  auto r3_16 = xsimd::bitwise_cast<int16_t>(r3);
  auto r4_16 = xsimd::bitwise_cast<int16_t>(r4);
  auto r5_16 = xsimd::bitwise_cast<int16_t>(r5);
  auto r6_16 = xsimd::bitwise_cast<int16_t>(r6);
  auto r7_16 = xsimd::bitwise_cast<int16_t>(r7);

  std::tie(r0_16, r1_16) = interleave(r0_16, r1_16);
  std::tie(r2_16, r3_16) = interleave(r2_16, r3_16);
  std::tie(r4_16, r5_16) = interleave(r4_16, r5_16);
  std::tie(r6_16, r7_16) = interleave(r6_16, r7_16);
  /* r0: columns 0 0 1 1 2 2 3 3 from rows 0 and 1
     r1: columns 4 4 5 5 6 6 7 7 from rows 0 and 1
     r2: columns 0 0 1 1 2 2 3 3 from rows 2 and 3
     r3: columns 4 4 5 5 6 6 7 7 from rows 2 and 3
     r4: columns 0 0 1 1 2 2 3 3 from rows 4 and 5
     r5: columns 4 4 5 5 6 6 7 7 from rows 4 and 5
     r6: columns 0 0 1 1 2 2 3 3 from rows 6 and 7
     r7: columns 4 4 5 5 6 6 7 7 from rows 6 and 7*/
  auto r0_32 = xsimd::bitwise_cast<int32_t>(r0_16);
  auto r2_32 = xsimd::bitwise_cast<int32_t>(r2_16);
  auto r1_32 = xsimd::bitwise_cast<int32_t>(r1_16);
  auto r3_32 = xsimd::bitwise_cast<int32_t>(r3_16);
  auto r4_32 = xsimd::bitwise_cast<int32_t>(r4_16);
  auto r6_32 = xsimd::bitwise_cast<int32_t>(r6_16);
  auto r5_32 = xsimd::bitwise_cast<int32_t>(r5_16);
  auto r7_32 = xsimd::bitwise_cast<int32_t>(r7_16);

  std::tie(r0_32, r2_32) = interleave(r0_32, r2_32);
  std::tie(r1_32, r3_32) = interleave(r1_32, r3_32);
  std::tie(r4_32, r6_32) = interleave(r4_32, r6_32);
  std::tie(r5_32, r7_32) = interleave(r5_32, r7_32);
  /* r0: columns 0 0 0 0 1 1 1 1 from rows 0, 1, 2, and 3
     r1: columns 4 4 4 4 5 5 5 5 from rows 0, 1, 2, and 3
     r2: columns 2 2 2 2 3 3 3 3 from rows 0, 1, 2, and 3
     r3: columns 6 6 6 6 7 7 7 7 from rows 0, 1, 2, and 3
     r4: columns 0 0 0 0 1 1 1 1 from rows 4, 5, 6, and 7
     r5: columns 4 4 4 4 5 5 5 5 from rows 4, 5, 6, and 7
     r6: columns 2 2 2 2 3 3 3 3 from rows 4, 5, 6, and 7
     r7: columns 6 6 6 6 7 7 7 7 from rows 4, 5, 6, and 7*/

  auto r0_64 = xsimd::bitwise_cast<int64_t>(r0_32);
  auto r2_64 = xsimd::bitwise_cast<int64_t>(r2_32);
  auto r1_64 = xsimd::bitwise_cast<int64_t>(r1_32);
  auto r3_64 = xsimd::bitwise_cast<int64_t>(r3_32);
  auto r4_64 = xsimd::bitwise_cast<int64_t>(r4_32);
  auto r6_64 = xsimd::bitwise_cast<int64_t>(r6_32);
  auto r5_64 = xsimd::bitwise_cast<int64_t>(r5_32);
  auto r7_64 = xsimd::bitwise_cast<int64_t>(r7_32);

  std::tie(r0_64, r4_64) = interleave(r0_64, r4_64);
  std::tie(r1_64, r5_64) = interleave(r1_64, r5_64);
  std::tie(r2_64, r6_64) = interleave(r2_64, r6_64);
  std::tie(r3_64, r7_64) = interleave(r3_64, r7_64);

  r0 = xsimd::bitwise_cast<int8_t>(r0_64);
  r1 = xsimd::bitwise_cast<int8_t>(r1_64);
  r2 = xsimd::bitwise_cast<int8_t>(r2_64);
  r3 = xsimd::bitwise_cast<int8_t>(r3_64);
  r4 = xsimd::bitwise_cast<int8_t>(r4_64);
  r5 = xsimd::bitwise_cast<int8_t>(r5_64);
  r6 = xsimd::bitwise_cast<int8_t>(r6_64);
  r7 = xsimd::bitwise_cast<int8_t>(r7_64);
  /* r0: columns 0 0 0 0 0 0 0 0 from rows 0 through 7
     r1: columns 4 4 4 4 4 4 4 4 from rows 0 through 7
     r2: columns 2 2 2 2 2 2 2 2 from rows 0 through 7
     r3: columns 6 6 6 6 6 6 6 6 from rows 0 through 7
     r4: columns 1 1 1 1 1 1 1 1 from rows 0 through 7
     r5: columns 5 5 5 5 5 5 5 5 from rows 0 through 7*/
  /* Empirically gcc is able to remove these movs and just rename the outputs of
   * Interleave64. */
  std::swap(r1, r4);
  std::swap(r3, r6);
}