in common/checksum/sse2neon.h [8885:8951]
// NEON implementation of _mm_aesdec_si128: applies one AES decryption round
// to the 16-byte state `a` and returns the result.
//
// The steps, in order, are: inverse ShiftRows, inverse SubBytes (lookup in
// _sse2neon_rsbox), inverse MixColumns, and finally an XOR with `RoundKey`.
// Two implementations are selected at compile time: a table-lookup/vector
// version when __aarch64__ is defined, and a scalar byte-wise version
// otherwise (labelled ARMv7-A below).
//
// a        - 128-bit input state.
// RoundKey - 128-bit round key, XORed into the result as the last step.
// Returns the 128-bit state after the round.
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
// Byte-index table for vqtbl1q_u8: output byte j is taken from input byte
// inv_shift_rows[j].
static const uint8_t inv_shift_rows[] = {
0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
};
// Byte-index table that, inside every group of four bytes, picks bytes
// 1, 2, 3, 0 - i.e. moves each byte of a 4-byte group down one position with
// wrap-around (a rotate of the 32-bit group by 8 bits, as the name says).
static const uint8_t ror32by8[] = {
0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
};
uint8x16_t v;
uint8x16_t w = vreinterpretq_u8_m128i(a);
// inverse shift rows
w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
// inverse sub bytes
// The 256-entry table is consulted in four 64-byte slices, since a single
// 4-register table lookup only covers indices 0..63. vqtbl4q_u8 produces 0
// for out-of-range indices; each following vqtbx4q_u8 leaves a lane of 'v'
// untouched when its index is out of range. Subtracting the slice base from
// 'w' (wrapping in 8 bits) therefore makes each lane hit exactly one slice.
v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
// inverse mix columns
// Doubling in GF(2^8) is done per byte as (x << 1), XORed with 0x1b when the
// byte's top bit was set; shifting the lanes as signed bytes by 7 spreads
// that top bit into a 0x00/0xff mask which is then ANDed with 0x1b.
// multiplying 'v' by 4 in GF(2^8)
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
// Pre-processing step: v = v ^ 4v ^ rot16(4v), where vrev32q_u16 swaps the
// two 16-bit halves of every 32-bit group (a rotate by two bytes).
v ^= w;
v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
// Second step, on the pre-processed 'v':
//   w = 2v ^ rot16(v) ^ rot8(v ^ 2v ^ rot16(v))
// which expands to 2v ^ 3*rot8(v) ^ rot16(v) ^ rot24(v) per 4-byte group.
// NOTE(review): this has the shape of a forward MixColumns applied after the
// pre-processing above; confirm the factorisation against the AES spec.
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
0x1b); // multiplying 'v' by 2 in GF(2^8)
w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
// add round key
return vreinterpretq_m128i_u8(w) ^ RoundKey;
#else /* ARMv7-A NEON implementation */
/* FIXME: optimized for NEON */
// Scalar path: 'v' holds the state as four 4-byte groups, v[column][row].
uint8_t i, e, f, g, h, v[4][4];
// Byte-wise view of the input state.
uint8_t *_a = (uint8_t *) &a;
// inverse shift rows + inverse sub bytes in a single pass: input byte i is
// substituted through _sse2neon_rsbox and stored at group (i/4 + i%4) % 4,
// position i%4. This is the same byte permutation that the inv_shift_rows
// table encodes in the AArch64 path.
for (i = 0; i < 16; ++i) {
v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
}
// inverse mix columns
// Each 4-byte group is replaced by its product with the circulant matrix
// whose first row is (0x0e, 0x0b, 0x0d, 0x09); every following row is the
// previous one rotated right by one position.
// SSE2NEON_MULTIPLY is defined elsewhere - presumably multiplication in
// GF(2^8); verify against its definition.
for (i = 0; i < 4; ++i) {
// Snapshot the group first: v[i][0..3] are overwritten in place below.
e = v[i][0];
f = v[i][1];
g = v[i][2];
h = v[i][3];
v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
}
// add round key: reload the 16 state bytes as a vector and XOR with the key
return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}