in ring/crypto/poly1305/poly1305_vec.c [247:435]
// poly1305_blocks absorbs whole 64-byte chunks of |m| into the accumulator
// held in st->H[0..4].
//
// Every xmmi value here is treated as two independent 64-bit lanes. The
// accumulator is kept as five limbs (H0..H4) per lane. Each loop iteration
// consumes four 16-byte blocks: the blocks at m+0 and m+32 feed the low lane,
// the blocks at m+16 and m+48 feed the high lane. Per lane, one iteration
// computes
//
//   H = H * P[0] + M_first * P[1] + M_second
//
// where, per the in-loop comments, st->P[0] holds r^4 and st->P[1] holds r^2.
//
// Parameters:
//   st    - state; st->H and st->P[0], st->P[1] are read, st->H is rewritten
//           on return.
//   m     - input bytes; read with 64-bit loads (_mm_loadl_epi64), 64 bytes
//           per iteration.
//   bytes - number of input bytes available. Only complete 64-byte chunks are
//           processed; a trailing remainder of fewer than 64 bytes is not
//           read by this function.
//
// NOTE(review): the constants poly1305_x64_sse2_message_mask, _5 and _1shl128
// and the layout of poly1305_power are defined outside this chunk. Comments
// below assume MMASK is the 26-bit limb mask, FIVE is 5 and HIBIT is the
// 2^128 padding bit expressed in the top limb -- confirm against their
// definitions.
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
size_t bytes) {
// Loop-invariant constants, loaded once (aligned 128-bit loads).
const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
// Table of precomputed multiplier limbs currently in use (P[0] or P[1]).
poly1305_power *p;
// H0..H4: accumulator limbs. T0..T4: the five output columns of the
// limb-by-limb product. T5/T6: scratch for partial products and raw message
// words. M0..M4: message limbs. C1/C2: carries for the two reduction chains.
xmmi H0, H1, H2, H3, H4;
xmmi T0, T1, T2, T3, T4, T5, T6;
xmmi M0, M1, M2, M3, M4;
xmmi C1, C2;
// Work on a local copy of the accumulator; it is written back after the loop.
H0 = st->H[0];
H1 = st->H[1];
H2 = st->H[2];
H3 = st->H[3];
H4 = st->H[4];
while (bytes >= 64) {
// H *= [r^4,r^4]
//
// Schoolbook 5x5 limb multiply. Column k accumulates Hi * R2(k-i) for
// i <= k, and Hi * S2(k-i+5) for the terms where the limb indices sum to 5
// or more (S2x is presumably the R2x limb pre-multiplied by 5 for the
// modular wrap -- defined outside this chunk). _mm_mul_epu32 multiplies the
// low 32 bits of each 64-bit lane into a 64-bit product.
p = &st->P[0];
// Seed every column with its H0 term.
T0 = _mm_mul_epu32(H0, p->R20.v);
T1 = _mm_mul_epu32(H0, p->R21.v);
T2 = _mm_mul_epu32(H0, p->R22.v);
T3 = _mm_mul_epu32(H0, p->R23.v);
T4 = _mm_mul_epu32(H0, p->R24.v);
// Columns 0 and 1: contributions from H1..H4, computed pairwise.
T5 = _mm_mul_epu32(H1, p->S24.v);
T6 = _mm_mul_epu32(H1, p->R20.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H2, p->S23.v);
T6 = _mm_mul_epu32(H2, p->S24.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H3, p->S22.v);
T6 = _mm_mul_epu32(H3, p->S23.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H4, p->S21.v);
T6 = _mm_mul_epu32(H4, p->S22.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
// Columns 2 and 3: contributions from H1..H4.
T5 = _mm_mul_epu32(H1, p->R21.v);
T6 = _mm_mul_epu32(H1, p->R22.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H2, p->R20.v);
T6 = _mm_mul_epu32(H2, p->R21.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H3, p->S24.v);
T6 = _mm_mul_epu32(H3, p->R20.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H4, p->S23.v);
T6 = _mm_mul_epu32(H4, p->S24.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
// Column 4: no wrapped terms, only R2x multipliers.
T5 = _mm_mul_epu32(H1, p->R23.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H2, p->R22.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H3, p->R21.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
// H += [Mx,My]*[r^2,r^2]
//
// Load the first block pair: T5 holds bytes 0..7 and T6 holds bytes 8..15
// of the block at m+0 (low lane) and of the block at m+16 (high lane).
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
_mm_loadl_epi64((const xmmi *)(m + 16)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
_mm_loadl_epi64((const xmmi *)(m + 24)));
// Split each 128-bit block into limbs at bit offsets 0, 26, 52, 78 and 104.
// The third limb straddles the two 64-bit words, so the words are stitched
// together first (low >> 52 | high << 12). HIBIT is OR-ed into the top limb.
M0 = _mm_and_si128(MMASK, T5);
M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
M2 = _mm_and_si128(MMASK, T5);
M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
// Same column-wise multiply as above, now message limbs times P[1], added
// on top of the columns already accumulated in T0..T4.
p = &st->P[1];
// Columns 0 and 1.
T5 = _mm_mul_epu32(M0, p->R20.v);
T6 = _mm_mul_epu32(M0, p->R21.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(M1, p->S24.v);
T6 = _mm_mul_epu32(M1, p->R20.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(M2, p->S23.v);
T6 = _mm_mul_epu32(M2, p->S24.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(M3, p->S22.v);
T6 = _mm_mul_epu32(M3, p->S23.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(M4, p->S21.v);
T6 = _mm_mul_epu32(M4, p->S22.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
// Columns 2 and 3.
T5 = _mm_mul_epu32(M0, p->R22.v);
T6 = _mm_mul_epu32(M0, p->R23.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(M1, p->R21.v);
T6 = _mm_mul_epu32(M1, p->R22.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(M2, p->R20.v);
T6 = _mm_mul_epu32(M2, p->R21.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(M3, p->S24.v);
T6 = _mm_mul_epu32(M3, p->R20.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(M4, p->S23.v);
T6 = _mm_mul_epu32(M4, p->S24.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
// Column 4.
T5 = _mm_mul_epu32(M0, p->R24.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(M1, p->R23.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(M2, p->R22.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(M3, p->R21.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(M4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
// H += [Mx,My]
//
// Second block pair (m+32 in the low lane, m+48 in the high lane), split
// into limbs exactly as above and added to the columns without any
// multiplication.
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
_mm_loadl_epi64((const xmmi *)(m + 48)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
_mm_loadl_epi64((const xmmi *)(m + 56)));
M0 = _mm_and_si128(MMASK, T5);
M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
M2 = _mm_and_si128(MMASK, T5);
M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
T0 = _mm_add_epi64(T0, M0);
T1 = _mm_add_epi64(T1, M1);
T2 = _mm_add_epi64(T2, M2);
T3 = _mm_add_epi64(T3, M3);
T4 = _mm_add_epi64(T4, M4);
// reduce
//
// Carry propagation: everything above bit 26 of a column is shifted out and
// added to the next column, and the column is masked with MMASK. Two chains
// run interleaved, C1 along T0 -> T1 -> T2 -> T3 -> T4 and C2 along
// T3 -> T4 -> T0 -> T1. The carry leaving the top limb T4 is multiplied by
// FIVE before it is folded into T0. T1 and T4 each receive one last carry
// that is not followed by another mask.
C1 = _mm_srli_epi64(T0, 26);
C2 = _mm_srli_epi64(T3, 26);
T0 = _mm_and_si128(T0, MMASK);
T3 = _mm_and_si128(T3, MMASK);
T1 = _mm_add_epi64(T1, C1);
T4 = _mm_add_epi64(T4, C2);
C1 = _mm_srli_epi64(T1, 26);
C2 = _mm_srli_epi64(T4, 26);
T1 = _mm_and_si128(T1, MMASK);
T4 = _mm_and_si128(T4, MMASK);
T2 = _mm_add_epi64(T2, C1);
T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
C1 = _mm_srli_epi64(T2, 26);
C2 = _mm_srli_epi64(T0, 26);
T2 = _mm_and_si128(T2, MMASK);
T0 = _mm_and_si128(T0, MMASK);
T3 = _mm_add_epi64(T3, C1);
T1 = _mm_add_epi64(T1, C2);
C1 = _mm_srli_epi64(T3, 26);
T3 = _mm_and_si128(T3, MMASK);
T4 = _mm_add_epi64(T4, C1);
// H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
//
// The reduced columns become the accumulator for the next iteration.
H0 = T0;
H1 = T1;
H2 = T2;
H3 = T3;
H4 = T4;
// Advance past the 64 bytes just absorbed.
m += 64;
bytes -= 64;
}
// Publish the updated accumulator back to the state.
st->H[0] = H0;
st->H[1] = H1;
st->H[2] = H2;
st->H[3] = H3;
st->H[4] = H4;
}