in common/checksum/sse2neon.h [8788:8880]
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__) || defined(_M_ARM64)
static const uint8_t shift_rows[] = {
0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
};
static const uint8_t ror32by8[] = {
0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
};
uint8x16_t v;
uint8x16_t w = vreinterpretq_u8_m128i(a);
/* shift rows */
w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
/* sub bytes */
// Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
// look up each of the table. After each lookup, we load the next table
// which locates at the next 64-bytes. In the meantime, the index in the
// table would be smaller than it was, so the index parameters of
// `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
// 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
/* mix columns */
w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
/* add round key */
return vreinterpretq_m128i_u8(w) ^ RoundKey;
#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
(((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
// this generates a table containing every possible permutation of
// shift_rows() and sub_bytes() with mix_columns().
static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
};
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3
uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
uint32_t x1 =
_mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
uint32_t x2 =
_mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
uint32_t x3 =
_mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]
// finish the modulo addition step in mix_columns()
__m128i out = _mm_set_epi32(
(aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
(aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
(aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
(aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
return _mm_xor_si128(out, RoundKey);
#endif
}