in src/gf2x/gf2x_mul_base_pclmul.c [21:87]
// Carry-less (GF(2)[x]) multiplication of two 4-word operands, producing an
// 8-word product in c[0..3] (c[0] is the least significant 128 bits).
//
// Operand layout, in 64-bit words:
//   a_lo = [a1 | a0], a_hi = [a3 | a2]
//   b_lo = [b1 | b0], b_hi = [b3 | b2]
//
// Two nested Karatsuba levels are used ("+" below is XOR):
//   level 1 splits the 4x4 product into three 2x2 products
//     L = a_lo * b_lo, H = a_hi * b_hi, M = (a_lo + a_hi) * (b_lo + b_hi)
//   level 2 splits every 2x2 product into three 1x1 carry-less products,
//   for nine CLMUL invocations in total.
_INLINE_ void gf2x_mul4_int(OUT __m128i c[4],
                            IN const __m128i a_lo,
                            IN const __m128i a_hi,
                            IN const __m128i b_lo,
                            IN const __m128i b_hi)
{
  // Level-1 operand sums: a_sum = [aa1 | aa0], b_sum = [bb1 | bb0].
  const __m128i a_sum = a_lo ^ a_hi;
  const __m128i b_sum = b_lo ^ b_hi;

  // Level-2 operand sums for L and H, packed into a single register each:
  //   a_pair = [(a2+a3) | (a0+a1)], b_pair = [(b2+b3) | (b0+b1)]
  const __m128i a_even = UNPACKLO(a_lo, a_hi);
  const __m128i a_odd  = UNPACKHI(a_lo, a_hi);
  const __m128i b_even = UNPACKLO(b_lo, b_hi);
  const __m128i b_odd  = UNPACKHI(b_lo, b_hi);
  const __m128i a_pair = a_even ^ a_odd;
  const __m128i b_pair = b_even ^ b_odd;

  // Level-2 operand sums for M. The low word holds aa0+aa1 (resp. bb0+bb1);
  // only that low word is fed to the multiplication below.
  const __m128i a_fold = a_sum ^ BSRLI(a_sum, 8);
  const __m128i b_fold = b_sum ^ BSRLI(b_sum, 8);

  // Outer 1x1 products of L, H and M (low*low and high*high words).
  const __m128i lo_p0  = CLMUL(a_lo, b_lo, 0x00);
  const __m128i lo_p1  = CLMUL(a_lo, b_lo, 0x11);
  const __m128i hi_p0  = CLMUL(a_hi, b_hi, 0x00);
  const __m128i hi_p1  = CLMUL(a_hi, b_hi, 0x11);
  const __m128i mid_p0 = CLMUL(a_sum, b_sum, 0x00);
  const __m128i mid_p1 = CLMUL(a_sum, b_sum, 0x11);

  // Karatsuba cross terms: product of the sums minus (XOR) both outer
  // products.
  const __m128i lo_cross  = lo_p0 ^ lo_p1 ^ CLMUL(a_pair, b_pair, 0x00);
  const __m128i hi_cross  = hi_p0 ^ hi_p1 ^ CLMUL(a_pair, b_pair, 0x11);
  const __m128i mid_cross = mid_p0 ^ mid_p1 ^ CLMUL(a_fold, b_fold, 0x00);

  // Assemble each 2x2 product as two 128-bit halves by folding the cross
  // term, displaced by 8 bytes, into the outer products.
  const __m128i lo_w0  = lo_p0 ^ BSLLI(lo_cross, 8);
  const __m128i lo_w1  = lo_p1 ^ BSRLI(lo_cross, 8);
  const __m128i hi_w0  = hi_p0 ^ BSLLI(hi_cross, 8);
  const __m128i hi_w1  = hi_p1 ^ BSRLI(hi_cross, 8);
  const __m128i mid_w0 = mid_p0 ^ BSLLI(mid_cross, 8);
  const __m128i mid_w1 = mid_p1 ^ BSRLI(mid_cross, 8);

  // Level-1 recombination: result = L + (M + L + H) * x^128 + H * x^256.
  // The term L_high + H_low appears in both middle output words.
  const __m128i overlap = lo_w1 ^ hi_w0;

  c[0] = lo_w0;
  c[1] = lo_w0 ^ mid_w0 ^ overlap;
  c[2] = hi_w1 ^ mid_w1 ^ overlap;
  c[3] = hi_w1;
}