in common/checksum/sse2neon.h [7435:7514]
// Compute eight 16-bit sums of absolute differences (SAD) between sliding
// 4-byte windows of a and one fixed 4-byte group taken from b.
//   imm bit 2    : first byte of a that is used (0 -> byte 0, 1 -> byte 4)
//   imm bits 1:0 : index of the 32-bit lane of b used as the reference group
// 16-bit result lane j (j = 0..7) holds
//   sum over t = 0..3 of |a[a_off + j + t] - b[4 * b_sel + t]|
// with the bytes treated as unsigned. Bits of imm above bit 2 are ignored.
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
{
    // Bring the first byte of a that takes part in the computation down to
    // byte 0. For offset 4 the vector is rotated by one 32-bit lane; the bytes
    // that wrap around end up at positions 12..15, beyond the 11 bytes read
    // below.
    uint8x16_t a_win;
    if ((imm & 0x4) != 0) {
        uint32x4_t a_words = vreinterpretq_u32_m128i(a);
        a_win = vreinterpretq_u8_u32(vextq_u32(a_words, a_words, 1));
    } else {
        a_win = vreinterpretq_u8_m128i(a);
    }

    // Pick the reference 32-bit lane of b. The lane index handed to
    // vgetq_lane_u32 has to be a compile-time constant, hence the switch.
    uint32x4_t b_words = vreinterpretq_u32_m128i(b);
    uint32_t ref_word;
    switch (imm & 0x3) {
    case 0:
        ref_word = vgetq_lane_u32(b_words, 0);
        break;
    case 1:
        ref_word = vgetq_lane_u32(b_words, 1);
        break;
    case 2:
        ref_word = vgetq_lane_u32(b_words, 2);
        break;
    default:  // (imm & 0x3) == 3
        ref_word = vgetq_lane_u32(b_words, 3);
        break;
    }

    // Reference bytes repeated twice: |r0 r1 r2 r3|r0 r1 r2 r3|
    uint8x8_t ref = vget_low_u8(vreinterpretq_u8_u32(vdupq_n_u32(ref_word)));

    // Copies of the a window advanced by 1, 2 and 3 bytes.
    uint8x16_t a_sh1 = vextq_u8(a_win, a_win, 1);
    uint8x16_t a_sh2 = vextq_u8(a_win, a_win, 2);
    uint8x16_t a_sh3 = vextq_u8(a_win, a_win, 3);

    // Widening absolute differences. In dNM the low four 16-bit lanes are the
    // per-byte terms of result N and the high four those of result M. Each
    // term is at most 255, so a sum of four always fits in 16 bits.
    int16x8_t d04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(a_win), ref));
    int16x8_t d15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(a_sh1), ref));
    int16x8_t d26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(a_sh2), ref));
    int16x8_t d37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(a_sh3), ref));

#if defined(__aarch64__) || defined(_M_ARM64)
    // First pairwise add; viewed as 32-bit lanes each lane now carries the
    // two partial sums of a single result.
    // |0|4|2|6|
    int32x4_t part_even = vreinterpretq_s32_s16(vpaddq_s16(d04, d26));
    // |1|5|3|7|
    int32x4_t part_odd = vreinterpretq_s32_s16(vpaddq_s16(d15, d37));

    // Interleave the 32-bit lanes to restore ascending result order.
    // |0|1|2|3|
    int16x8_t part_lo = vreinterpretq_s16_s32(vtrn1q_s32(part_even, part_odd));
    // |4|5|6|7|
    int16x8_t part_hi = vreinterpretq_s16_s32(vtrn2q_s32(part_even, part_odd));

    // Second pairwise add collapses each pair of partial sums into the final
    // 16-bit result.
    return vreinterpretq_m128i_s16(vpaddq_s16(part_lo, part_hi));
#else
    // First pairwise add on 64-bit halves: each pNM holds two partial sums of
    // result N followed by two partial sums of result M.
    int16x4_t p01 = vpadd_s16(vget_low_s16(d04), vget_low_s16(d15));
    int16x4_t p23 = vpadd_s16(vget_low_s16(d26), vget_low_s16(d37));
    int16x4_t p45 = vpadd_s16(vget_high_s16(d04), vget_high_s16(d15));
    int16x4_t p67 = vpadd_s16(vget_high_s16(d26), vget_high_s16(d37));

    // Second pairwise add yields results 0..3 and 4..7.
    int16x4_t sad_lo = vpadd_s16(p01, p23);
    int16x4_t sad_hi = vpadd_s16(p45, p67);
    return vreinterpretq_m128i_s16(vcombine_s16(sad_lo, sad_hi));
#endif
}