static size_t poly1305_combine()

in ring/crypto/poly1305/poly1305_vec.c [437:678]
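
Drains the last doubled block (when bytes >= 32) into the two-lane SIMD
accumulator, then multiplies lane 0 by r^2 and lane 1 by r and adds the
lanes, leaving the combined 130-bit accumulator in st->HH for the scalar
finish path. Returns the number of message bytes consumed (0 or 32).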


static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];
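  // (st->P[1] holds the limbs of r^2 broadcast to both 64-bit lanes)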

  if (bytes >= 32) {
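    // One doubled block remains: fold it in just as the main loop
    // does, computing H = H*[r^2,r^2] + [Mx,My].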
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);
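    // T0..T4 now hold the five limb products of H*r^2, one 64-bit sum
    // per lane; 26-bit limbs keep these sums well below 2^64.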

    // H += [Mx,My]
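    // Load the two 16-byte blocks and split each into five 26-bit
    // limbs; HIBIT sets the 2^128 padding bit of a full block.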
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
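    // Two interleaved lazy-carry chains (0->1->2 and 3->4->0); the
    // carry out of limb 4 re-enters limb 0 multiplied by 5, since
    // 2^130 == 5 (mod 2^130 - 5).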
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
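  // The odd 32-bit lanes of the stored powers stash the original r,
  // packed as three 44/44/42-bit words. Rebuild r and write its five
  // 26-bit limbs into the second multiplier lane (d[2]), so each
  // power vector becomes [r^2 limb, r limb].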
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0) & 0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
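  // Same multiply-and-reduce sequence as above, but lane 0 is now
  // multiplied by r^2 and lane 1 by r.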
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
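  // The low 64 bits of each Hi now hold lane0 + lane1: the two
  // polynomial halves, already weighted by r^2 and r, merged into a
  // single 5x26-bit accumulator.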

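  // Full carry propagation in scalar registers; every limb ends up
  // below 2^26. The final reduction mod 2^130-5 is deferred to the
  // scalar finish code.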
  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

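  // Repack the five 26-bit limbs into three limbs of 44, 44, and 42
  // bits (5 x 26 = 130 = 44 + 44 + 42).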
  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}
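
For reference, a minimal standalone sketch of this function's tail: the
scalar carry propagation followed by the 26-bit to 44/44/42-bit repack.
The function and parameter names here are illustrative, not from the
source file.

#include <stdint.h>

// Fold five (possibly overflowing) 26-bit limbs into the three-limb
// form stored in st->HH. Mirrors the arithmetic at the end of
// poly1305_combine.
static void combine_tail_sketch(const uint64_t t[5], uint64_t hh[3]) {
  uint64_t t0 = t[0], t1 = t[1], t2 = t[2], t3 = t[3], t4 = t[4], c;

  // Carry chain: each limb passes its overflow up; the carry out of
  // limb 4 wraps to limb 0 times 5, because 2^130 == 5 (mod 2^130-5).
  c = t0 >> 26; t0 &= 0x3ffffff; t1 += c;
  c = t1 >> 26; t1 &= 0x3ffffff; t2 += c;
  c = t2 >> 26; t2 &= 0x3ffffff; t3 += c;
  c = t3 >> 26; t3 &= 0x3ffffff; t4 += c;
  c = t4 >> 26; t4 &= 0x3ffffff; t0 += c * 5;
  c = t0 >> 26; t0 &= 0x3ffffff; t1 += c;

  // Radix conversion: 5 x 26 = 130 bits = 44 + 44 + 42 bits.
  hh[0] = (t0 | (t1 << 26)) & UINT64_C(0xfffffffffff);
  hh[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  hh[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);
}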