in src/gf2x/gf2x_mul_base_pclmul.c [91:131]
// Multiply a by b and write the product to c, using a single Karatsuba step
// built from three calls to gf2x_mul4_int.
//
// Reads four XMM-sized blocks (4 * QWORDS_IN_XMM qwords) from each of a and b,
// and writes eight XMM-sized blocks (8 * QWORDS_IN_XMM qwords) to c.
// Both operands are fully loaded before the first store to c.
//
// NOTE(review): the recombination below assumes gf2x_mul4_int(out, x0, x1,
// y0, y1) writes the four-block product of the two-block operands (x0, x1)
// and (y0, y1) into out[0..3] - confirm against its definition.
void gf2x_mul_base_pclmul(OUT uint64_t *c,
                          IN const uint64_t *a,
                          IN const uint64_t *b)
{
  __m128i x[4], y[4];
  __m128i x_sum[2], y_sum[2];
  __m128i prod_lo[4], prod_hi[4], prod_mid[4];
  __m128i overlap[2];
  __m128i res[8];

  for(size_t k = 0; k < 4; k++) {
    x[k] = LOAD128(&a[QWORDS_IN_XMM * k]);
    y[k] = LOAD128(&b[QWORDS_IN_XMM * k]);
  }

  // Operands of the middle product; addition is XOR here:
  //   x_sum <-- a_lo + a_hi
  //   y_sum <-- b_lo + b_hi
  for(size_t k = 0; k < 2; k++) {
    x_sum[k] = x[k] ^ x[k + 2];
    y_sum[k] = y[k] ^ y[k + 2];
  }

  // The three half-size products:
  //   prod_lo  <-- a_lo * b_lo
  //   prod_hi  <-- a_hi * b_hi
  //   prod_mid <-- (a_lo + a_hi) * (b_lo + b_hi)
  gf2x_mul4_int(prod_lo, x[0], x[1], y[0], y[1]);
  gf2x_mul4_int(prod_hi, x[2], x[3], y[2], y[3]);
  gf2x_mul4_int(prod_mid, x_sum[0], x_sum[1], y_sum[0], y_sum[1]);

  // The upper half of prod_lo and the lower half of prod_hi contribute to
  // both central block pairs of the result, so fold them together once.
  overlap[0] = prod_lo[2] ^ prod_hi[0];
  overlap[1] = prod_lo[3] ^ prod_hi[1];

  // Assemble the eight output blocks:
  //   blocks 0-1: low half of prod_lo
  //   blocks 2-5: prod_mid + prod_lo + prod_hi, shifted by two blocks,
  //               plus the overlapping parts of prod_lo and prod_hi
  //   blocks 6-7: high half of prod_hi
  res[0] = prod_lo[0];
  res[1] = prod_lo[1];
  res[2] = prod_mid[0] ^ prod_lo[0] ^ overlap[0];
  res[3] = prod_mid[1] ^ prod_lo[1] ^ overlap[1];
  res[4] = prod_mid[2] ^ prod_hi[2] ^ overlap[0];
  res[5] = prod_mid[3] ^ prod_hi[3] ^ overlap[1];
  res[6] = prod_hi[2];
  res[7] = prod_hi[3];

  for(size_t k = 0; k < 8; k++) {
    STORE128(&c[k * QWORDS_IN_XMM], res[k]);
  }
}