FORCE_INLINE __m128i _mm_aesenc_si128()

in common/checksum/sse2neon.h [8788:8880]


FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
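    // Both arrays are byte indices for table lookups: shift_rows[] moves
    // each state byte to its AES ShiftRows position (output byte i takes
    // input byte shift_rows[i]), while ror32by8[] rotates every 32-bit word
    // of the state right by 8 bits, the rotation needed in mix columns.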

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // The 256-byte S-box is split into four 64-byte tables that are looked
    // up in turn. vqtbx4q_u8() only selects from indices 0..63 and leaves
    // the destination byte untouched for anything larger, so before each
    // lookup the index vector is lowered by the same offset at which the
    // next table was loaded.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
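    // Each output byte of mix_columns() is 2*b0 ^ 3*b1 ^ b2 ^ b3 in
    // GF(2^8). The first line computes 2*v ('xtime': shift left, then XOR
    // the reduction constant 0x1b wherever a top bit was set); the 16-bit
    // rotation folds in b2, and the final 8-bit rotation of (v ^ w)
    // contributes 3*b1 ^ b3.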
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A fallback: table-based AES implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
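// e.g. SSE2NEON_AES_F2(0x80) = 0x100 ^ 0x11b = 0x1b: XORing the full 16-bit
// WPOLY also clears the bit shifted out of the byte, keeping the result in
// range; SSE2NEON_AES_F3(0x80) = 0x1b ^ 0x80 = 0x9b.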
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // The classic AES T-tables: for every byte value they precompute
    // sub_bytes() combined with mix_columns(); shift_rows() is realized by
    // the byte-selection pattern of the lookups below.
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
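    // With these tables a full round collapses into four lookups and XORs
    // per output word, e.g. aes_table[0][s0] ^ aes_table[1][s5] ^
    // aes_table[2][s10] ^ aes_table[3][s15] for column 0, where s5, s10 and
    // s15 come from the other input columns to account for shift_rows().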
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // Combine the lookups: the byte-selection pattern (x0, x1 >> 8,
    // x2 >> 16, x3 >> 24, rotating one column per row) implements
    // shift_rows(), and the XORs finish the GF(2^8) additions of
    // mix_columns().
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}
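
A minimal usage sketch (the state and key bytes are illustrative values, not
standard test vectors; it assumes sse2neon.h is on the include path): a single
call performs one full AES encryption round (shift rows, sub bytes, mix
columns, add round key) on a 128-bit state, matching the behavior of the x86
AESENC instruction. A complete AES-128 encryption would apply nine such rounds
after the initial key whitening, finishing with one _mm_aesenclast_si128 round.

#include <stdint.h>
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    // Illustrative 16-byte state and round key (hypothetical values).
    const uint8_t state_bytes[16] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
                                     0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,
                                     0xcc, 0xdd, 0xee, 0xff};
    const uint8_t key_bytes[16] = {0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a,
                                   0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
                                   0x03, 0x02, 0x01, 0x00};

    __m128i state = _mm_loadu_si128((const __m128i *) state_bytes);
    __m128i round_key = _mm_loadu_si128((const __m128i *) key_bytes);

    // One AES round: ShiftRows, SubBytes, MixColumns, AddRoundKey.
    __m128i out = _mm_aesenc_si128(state, round_key);

    uint8_t out_bytes[16];
    _mm_storeu_si128((__m128i *) out_bytes, out);
    for (int i = 0; i < 16; i++)
        printf("%02x", out_bytes[i]);
    printf("\n");
    return 0;
}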