static void poly1305_blocks()

in ring/crypto/poly1305/poly1305_vec.c [247:435]
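
Consumes the message in 64-byte chunks, four 16-byte Poly1305 blocks per
iteration: each pass computes H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My],
with two blocks packed into the 64-bit lanes of each SSE2 vector.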


static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
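  // MMASK masks each 64-bit lane down to 26 bits, FIVE is the multiplier for
  // the 2^130 == 5 (mod p) wraparound, and HIBIT is the 2^128 pad bit
  // appended to every full 16-byte block (bit 24 of the top limb).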

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];
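  // Each H register holds one 26-bit limb of the accumulator, one value per
  // 64-bit SIMD lane; the two lanes are combined when the state is finalized.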

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
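    // 5x5 schoolbook limb multiply of H by the power stored in P[0]. Limb
    // products that would spill past limb 4 wrap around through the S2x
    // values, which hold 5*R2x (folding in 2^130 == 5 mod p).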
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
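    // Load blocks m[0..15] and m[16..31] into the two lanes, split each into
    // five 26-bit limbs, and set the 2^128 pad bit in the top limb.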
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
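    // Load blocks m[32..47] and m[48..63] the same way; these are added in
    // directly, their powers of r being supplied by subsequent iterations or
    // by finalization.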
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
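    // Partial carry propagation: two interleaved carry chains (one starting
    // at limb 0, one at limb 3) shorten the dependency path, and the carry
    // out of limb 4 re-enters limb 0 multiplied by 5 (2^130 == 5 mod p).
    // Limbs stay only slightly above 26 bits; full reduction is deferred.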
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
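
For reference, here is a scalar sketch of the limb split that the paired
_mm_loadl_epi64 / shift / mask sequence above performs on each lane: a
16-byte little-endian block becomes five 26-bit limbs, with the 2^128 pad
bit set in the top limb. This is illustrative only; poly1305_split_block is
a hypothetical helper, not a function in the source file.

#include <stdint.h>
#include <string.h>

// Hypothetical helper (not in poly1305_vec.c): split one 16-byte block into
// the five 26-bit limbs used above, including the 2^128 pad bit.
static void poly1305_split_block(const uint8_t m[16], uint32_t limb[5]) {
  uint64_t t0, t1;
  memcpy(&t0, m + 0, 8);  // bytes 0..7, in memory order (little-endian x86)
  memcpy(&t1, m + 8, 8);  // bytes 8..15
  limb[0] = (uint32_t)(t0 & 0x3ffffff);                         // bits 0..25
  limb[1] = (uint32_t)((t0 >> 26) & 0x3ffffff);                 // bits 26..51
  limb[2] = (uint32_t)(((t0 >> 52) | (t1 << 12)) & 0x3ffffff);  // bits 52..77
  limb[3] = (uint32_t)((t1 >> 14) & 0x3ffffff);                 // bits 78..103
  limb[4] = (uint32_t)(t1 >> 40) | (1u << 24);  // bits 104..127, plus 2^128 pad
}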