in ring/crypto/poly1305/poly1305_vec.c [437:678]
// Collapses the two-lane vector accumulator st->H into the scalar accumulator
// st->HH.
//
// Steps, in order:
//   1. If bytes >= 32, absorb one more pair of 16-byte blocks:
//      H = H * P[1] + [Mx,My]. Otherwise m is not read at all.
//   2. Overwrite the upper-lane multiplier of st->P[1] with a value rebuilt
//      from the odd 32-bit words of P[1].R20..R22, so the two lanes are
//      multiplied by different values (per the inline comments: [r^2, r]).
//   3. H *= P[1], then add the two 64-bit lanes of every limb together.
//   4. Carry-propagate the five 26-bit limbs in scalar code and pack them into
//      st->HH[0..2] as 44/44/42-bit limbs.
//
// Parameters:
//   st    - state. Reads st->H[0..4] and st->P[1]; writes d[2] of
//           st->P[1].R20..R24 and st->P[1].S21..S24, and st->HH[0..2].
//           st->H itself is not written back.
//   m     - message bytes; exactly 32 bytes are read when bytes >= 32.
//   bytes - number of bytes available at m.
//
// Returns the number of bytes consumed from m: 32 or 0.
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
size_t bytes) {
// Constant vectors defined elsewhere. Judging from their use below: MMASK
// isolates one limb after each 26-bit shift (presumably 2^26-1 in each 64-bit
// lane), HIBIT is OR-ed into the top message limb (presumably the 2^128
// padding bit of a full block), and FIVE scales the carry that wraps from
// limb 4 back into limb 0.
const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
poly1305_power *p;
// H0..H4: accumulator limbs; M0..M4: message limbs; T0..T4: product limbs;
// T5/T6: scratch; C1/C2: carries of the two interleaved carry chains.
xmmi H0, H1, H2, H3, H4;
xmmi M0, M1, M2, M3, M4;
xmmi T0, T1, T2, T3, T4, T5, T6;
xmmi C1, C2;
uint64_t r0, r1, r2;
uint64_t t0, t1, t2, t3, t4;
uint64_t c;
size_t consumed = 0;
// Work on a local copy of the accumulator.
H0 = st->H[0];
H1 = st->H[1];
H2 = st->H[2];
H3 = st->H[3];
H4 = st->H[4];
// p = [r^2,r^2]
p = &st->P[1];
if (bytes >= 32) {
// H *= [r^2,r^2]
//
// Schoolbook 5x5 limb product, one independent product per 64-bit lane
// (_mm_mul_epu32 multiplies the low 32 bits of each 64-bit lane). Partial
// products whose limb index would exceed 4 use the S2x values instead of
// R2x; S2x is 5*R2x (see how d[2] is filled in the finalize step below),
// which folds them back into the low limbs:
//   T0 = H0*R0 + H1*S4 + H2*S3 + H3*S2 + H4*S1
//   T1 = H0*R1 + H1*R0 + H2*S4 + H3*S3 + H4*S2
//   T2 = H0*R2 + H1*R1 + H2*R0 + H3*S4 + H4*S3
//   T3 = H0*R3 + H1*R2 + H2*R1 + H3*R0 + H4*S4
//   T4 = H0*R4 + H1*R3 + H2*R2 + H3*R1 + H4*R0
T0 = _mm_mul_epu32(H0, p->R20.v);
T1 = _mm_mul_epu32(H0, p->R21.v);
T2 = _mm_mul_epu32(H0, p->R22.v);
T3 = _mm_mul_epu32(H0, p->R23.v);
T4 = _mm_mul_epu32(H0, p->R24.v);
// Remaining terms of T0 and T1.
T5 = _mm_mul_epu32(H1, p->S24.v);
T6 = _mm_mul_epu32(H1, p->R20.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H2, p->S23.v);
T6 = _mm_mul_epu32(H2, p->S24.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H3, p->S22.v);
T6 = _mm_mul_epu32(H3, p->S23.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H4, p->S21.v);
T6 = _mm_mul_epu32(H4, p->S22.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
// Remaining terms of T2 and T3.
T5 = _mm_mul_epu32(H1, p->R21.v);
T6 = _mm_mul_epu32(H1, p->R22.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H2, p->R20.v);
T6 = _mm_mul_epu32(H2, p->R21.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H3, p->S24.v);
T6 = _mm_mul_epu32(H3, p->R20.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H4, p->S23.v);
T6 = _mm_mul_epu32(H4, p->S24.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
// Remaining terms of T4 (no wrapped terms, so only R2x is used).
T5 = _mm_mul_epu32(H1, p->R23.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H2, p->R22.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H3, p->R21.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
// H += [Mx,My]
//
// Load two 16-byte blocks so that lane 0 holds the first block and lane 1
// the second: T5 = low 8 bytes of each block (m+0, m+16), T6 = high 8 bytes
// of each block (m+8, m+24).
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
_mm_loadl_epi64((const xmmi *)(m + 16)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
_mm_loadl_epi64((const xmmi *)(m + 24)));
// Split each 128-bit block into five limbs: bits 0-25 and 26-51 come
// straight from the low half; T5 is then rebuilt to hold bits 52-115 so
// that bits 52-77 and 78-103 can be extracted the same way; the last limb
// is bits 104-127 with HIBIT OR-ed in.
M0 = _mm_and_si128(MMASK, T5);
M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
M2 = _mm_and_si128(MMASK, T5);
M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
T0 = _mm_add_epi64(T0, M0);
T1 = _mm_add_epi64(T1, M1);
T2 = _mm_add_epi64(T2, M2);
T3 = _mm_add_epi64(T3, M3);
T4 = _mm_add_epi64(T4, M4);
// reduce
//
// Two carry chains are interleaved (C1 starts at limb 0, C2 at limb 3):
//   C1: T0 -> T1 -> T2 -> T3 -> T4
//   C2: T3 -> T4 -> (x5) T0 -> T1
// The carry out of T4 is multiplied by FIVE before being added to T0. On
// exit T0, T2 and T3 are masked with MMASK; T1 and T4 are a masked value
// plus one final, unpropagated carry.
C1 = _mm_srli_epi64(T0, 26);
C2 = _mm_srli_epi64(T3, 26);
T0 = _mm_and_si128(T0, MMASK);
T3 = _mm_and_si128(T3, MMASK);
T1 = _mm_add_epi64(T1, C1);
T4 = _mm_add_epi64(T4, C2);
C1 = _mm_srli_epi64(T1, 26);
C2 = _mm_srli_epi64(T4, 26);
T1 = _mm_and_si128(T1, MMASK);
T4 = _mm_and_si128(T4, MMASK);
T2 = _mm_add_epi64(T2, C1);
T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
C1 = _mm_srli_epi64(T2, 26);
C2 = _mm_srli_epi64(T0, 26);
T2 = _mm_and_si128(T2, MMASK);
T0 = _mm_and_si128(T0, MMASK);
T3 = _mm_add_epi64(T3, C1);
T1 = _mm_add_epi64(T1, C2);
C1 = _mm_srli_epi64(T3, 26);
T3 = _mm_and_si128(T3, MMASK);
T4 = _mm_add_epi64(T4, C1);
// H = (H*[r^2,r^2] + [Mx,My])
H0 = T0;
H1 = T1;
H2 = T2;
H3 = T3;
H4 = T4;
consumed = 32;
}
// finalize, H *= [r^2,r]
//
// Rebuild three 64-bit values from the odd 32-bit words (d[1] = low half,
// d[3] = high half) of R20..R22. _mm_mul_epu32 only reads the low 32 bits of
// each 64-bit lane, so -- assuming d[] overlays v as four 32-bit words --
// those odd words never take part in the multiplies above.
// NOTE(review): presumably they hold r as 44/44/42-bit limbs written by the
// init code; confirm against where P[1] is populated.
r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
// Re-split r0/r1/r2 (44 + 44 + remaining bits) into five 26-bit limbs and
// store them in d[2], i.e. the multiplicand of the upper 64-bit lane. This
// mutates st->P[1] in place; d[0] (the lower lane) is left untouched.
//   limb0 = r0[0..25]
//   limb1 = r0[26..43] | r1[0..7]
//   limb2 = r1[8..33]
//   limb3 = r1[34..43] | r2[0..15]
//   limb4 = r2[16..]   (not masked)
p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
p->R24.d[2] = (uint32_t)((r2 >> 16));
// Matching 5x multiples for the wrapped partial products.
p->S21.d[2] = p->R21.d[2] * 5;
p->S22.d[2] = p->R22.d[2] * 5;
p->S23.d[2] = p->R23.d[2] * 5;
p->S24.d[2] = p->R24.d[2] * 5;
// H *= [r^2,r]
//
// Same 5x5 limb product as in the block-absorbing branch, now with the
// updated upper-lane multiplier.
T0 = _mm_mul_epu32(H0, p->R20.v);
T1 = _mm_mul_epu32(H0, p->R21.v);
T2 = _mm_mul_epu32(H0, p->R22.v);
T3 = _mm_mul_epu32(H0, p->R23.v);
T4 = _mm_mul_epu32(H0, p->R24.v);
T5 = _mm_mul_epu32(H1, p->S24.v);
T6 = _mm_mul_epu32(H1, p->R20.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H2, p->S23.v);
T6 = _mm_mul_epu32(H2, p->S24.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H3, p->S22.v);
T6 = _mm_mul_epu32(H3, p->S23.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H4, p->S21.v);
T6 = _mm_mul_epu32(H4, p->S22.v);
T0 = _mm_add_epi64(T0, T5);
T1 = _mm_add_epi64(T1, T6);
T5 = _mm_mul_epu32(H1, p->R21.v);
T6 = _mm_mul_epu32(H1, p->R22.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H2, p->R20.v);
T6 = _mm_mul_epu32(H2, p->R21.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H3, p->S24.v);
T6 = _mm_mul_epu32(H3, p->R20.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H4, p->S23.v);
T6 = _mm_mul_epu32(H4, p->S24.v);
T2 = _mm_add_epi64(T2, T5);
T3 = _mm_add_epi64(T3, T6);
T5 = _mm_mul_epu32(H1, p->R23.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H2, p->R22.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H3, p->R21.v);
T4 = _mm_add_epi64(T4, T5);
T5 = _mm_mul_epu32(H4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
// Same interleaved two-chain carry reduction as in the branch above.
C1 = _mm_srli_epi64(T0, 26);
C2 = _mm_srli_epi64(T3, 26);
T0 = _mm_and_si128(T0, MMASK);
T3 = _mm_and_si128(T3, MMASK);
T1 = _mm_add_epi64(T1, C1);
T4 = _mm_add_epi64(T4, C2);
C1 = _mm_srli_epi64(T1, 26);
C2 = _mm_srli_epi64(T4, 26);
T1 = _mm_and_si128(T1, MMASK);
T4 = _mm_and_si128(T4, MMASK);
T2 = _mm_add_epi64(T2, C1);
T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
C1 = _mm_srli_epi64(T2, 26);
C2 = _mm_srli_epi64(T0, 26);
T2 = _mm_and_si128(T2, MMASK);
T0 = _mm_and_si128(T0, MMASK);
T3 = _mm_add_epi64(T3, C1);
T1 = _mm_add_epi64(T1, C2);
C1 = _mm_srli_epi64(T3, 26);
T3 = _mm_and_si128(T3, MMASK);
T4 = _mm_add_epi64(T4, C1);
// H = H[0]+H[1]
//
// _mm_srli_si128(x, 8) moves the upper 64-bit lane into the lower one, so
// the low lane of each Hn now holds the sum of both lanes of that limb.
H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
// Scalar carry propagation over the low 32 bits of each summed limb.
// NOTE(review): _mm_cvtsi128_si32 returns a signed int, so a value with bit
// 31 set would sign-extend into the uint64_t. This relies on each summed limb
// staying below 2^31, which appears to hold after the reduction above --
// confirm.
t0 = _mm_cvtsi128_si32(H0);
c = (t0 >> 26);
t0 &= 0x3ffffff;
t1 = _mm_cvtsi128_si32(H1) + c;
c = (t1 >> 26);
t1 &= 0x3ffffff;
t2 = _mm_cvtsi128_si32(H2) + c;
c = (t2 >> 26);
t2 &= 0x3ffffff;
t3 = _mm_cvtsi128_si32(H3) + c;
c = (t3 >> 26);
t3 &= 0x3ffffff;
t4 = _mm_cvtsi128_si32(H4) + c;
c = (t4 >> 26);
t4 &= 0x3ffffff;
// The carry out of the top limb wraps into limb 0 multiplied by 5, followed
// by one more carry step from limb 0 into limb 1.
t0 = t0 + (c * 5);
c = (t0 >> 26);
t0 &= 0x3ffffff;
// NOTE(review): t1 is not re-masked after this add. If it ever reached 2^26,
// that bit would be OR-ed (not added) into HH[1] below -- presumably
// considered unreachable or negligible upstream; confirm.
t1 = t1 + c;
// Repack five 26-bit limbs into three limbs of 44, 44 and 42 bits:
//   HH[0] = t0 | low 18 bits of t1
//   HH[1] = high bits of t1 | t2 | low 10 bits of t3
//   HH[2] = high 16 bits of t3 | t4
st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);
return consumed;
}