void gf2x_mul_base_pclmul()

in src/gf2x/gf2x_mul_base_pclmul.c [91:131]


// Base-case polynomial multiplication using one level of Karatsuba.
//
// Reads four __m128i lanes from each of `a` and `b` (lane k starts at qword
// index k * QWORDS_IN_XMM) and writes eight __m128i lanes to `c`.
//
// With a = a_lo + a_hi * X^2 and b = b_lo + b_hi * X^2 (X = one lane, addition
// is XOR), three half-size products are formed via gf2x_mul4_int:
//   lo  = a_lo * b_lo
//   hi  = a_hi * b_hi
//   mid = (a_lo + a_hi) * (b_lo + b_hi)
// and recombined as
//   c = lo + (mid + lo + hi) * X^2 + hi * X^4.
//
// NOTE(review): gf2x_mul4_int is assumed to produce a four-lane product of two
// two-lane operands, as the recombination below requires - confirm against its
// definition.
void gf2x_mul_base_pclmul(OUT uint64_t *c,
                          IN const uint64_t *a,
                          IN const uint64_t *b)
{
  __m128i x[4], y[4];
  __m128i prod_lo[4], prod_hi[4], prod_mid[4];
  __m128i out[8];

  // Every input lane is fetched before any output lane is written.
  for(size_t k = 0; k < 4; k++) {
    const size_t off = k * QWORDS_IN_XMM;
    x[k] = LOAD128(&a[off]);
    y[k] = LOAD128(&b[off]);
  }

  // Operands of the middle product: (low half) + (high half) of each input.
  const __m128i x_sum0 = x[0] ^ x[2];
  const __m128i x_sum1 = x[1] ^ x[3];
  const __m128i y_sum0 = y[0] ^ y[2];
  const __m128i y_sum1 = y[1] ^ y[3];

  // The three Karatsuba products.
  gf2x_mul4_int(prod_lo, x[0], x[1], y[0], y[1]);
  gf2x_mul4_int(prod_hi, x[2], x[3], y[2], y[3]);
  gf2x_mul4_int(prod_mid, x_sum0, x_sum1, y_sum0, y_sum1);

  // Recombine. Output lanes 2..5 are where lo, mid and hi overlap; the term
  // (upper half of lo) + (lower half of hi) appears in both lanes 2+j and 4+j,
  // so it is computed once per j and reused.
  for(size_t j = 0; j < 2; j++) {
    const __m128i overlap = prod_lo[2 + j] ^ prod_hi[j];

    out[j]     = prod_lo[j];
    out[2 + j] = prod_mid[j] ^ prod_lo[j] ^ overlap;
    out[4 + j] = prod_mid[2 + j] ^ prod_hi[2 + j] ^ overlap;
    out[6 + j] = prod_hi[2 + j];
  }

  // Write the eight result lanes in ascending order.
  for(size_t k = 0; k < 8; k++) {
    STORE128(&c[k * QWORDS_IN_XMM], out[k]);
  }
}