_INLINE_ void gf2x_mul4_int()

in src/gf2x/gf2x_mul_base_pclmul.c [21:87]


// Carry-less (GF(2)[x]) multiplication of two 4-qword operands using two
// levels of Karatsuba: 9 CLMUL invocations in total.
//
// Lane layout, written as [high qword | low qword]:
//   a_lo = [a1 | a0], a_hi = [a3 | a2]
//   b_lo = [b1 | b0], b_hi = [b3 | b2]
//
// The outer level works on 128-bit halves and needs three 2x2 products:
//   L = a_lo * b_lo
//   H = a_hi * b_hi
//   M = (a_lo + a_hi) * (b_lo + b_hi)
// Each 2x2 product is itself assembled from three 1x1 products
// (low*low, high*high, and the product of the two folded halves).
//
// The result is written to c[0..3], c[0] being the least significant word:
//   c[0] = L_lo
//   c[1] = L_hi + (L_lo + M_lo + H_lo)
//   c[2] = H_lo + (L_hi + M_hi + H_hi)
//   c[3] = H_hi
_INLINE_ void gf2x_mul4_int(OUT __m128i      c[4],
                            IN const __m128i a_lo,
                            IN const __m128i a_hi,
                            IN const __m128i b_lo,
                            IN const __m128i b_hi)
{
  // Operands of the middle product M: [a1+a3 | a0+a2] and [b1+b3 | b0+b2].
  const __m128i a_sum = a_lo ^ a_hi;
  const __m128i b_sum = b_lo ^ b_hi;

  // Folded halves needed by L and H, packed side by side:
  //   a_fold = [(a2+a3) | (a0+a1)]
  //   b_fold = [(b2+b3) | (b0+b1)]
  const __m128i a_even = UNPACKLO(a_lo, a_hi);
  const __m128i b_even = UNPACKLO(b_lo, b_hi);
  const __m128i a_fold = a_even ^ UNPACKHI(a_lo, a_hi);
  const __m128i b_fold = b_even ^ UNPACKHI(b_lo, b_hi);

  // Folded halves needed by M; the low qword holds the sum of both qwords
  // of a_sum / b_sum.
  // NOTE(review): assuming BSRLI is a byte-wise right shift, the upper qword
  // is left holding the original upper qword (not zero). Only the low qword
  // is selected by the 0x00 product below, so this would be harmless.
  const __m128i a_sum_fold = a_sum ^ BSRLI(a_sum, 8);
  const __m128i b_sum_fold = b_sum ^ BSRLI(b_sum, 8);

  // L = a_lo * b_lo: outer products plus the Karatsuba cross term.
  const __m128i l_p0    = CLMUL(a_lo, b_lo, 0x00);
  const __m128i l_p1    = CLMUL(a_lo, b_lo, 0x11);
  const __m128i l_cross = l_p0 ^ l_p1 ^ CLMUL(a_fold, b_fold, 0x00);

  // H = a_hi * b_hi.
  const __m128i h_p0    = CLMUL(a_hi, b_hi, 0x00);
  const __m128i h_p1    = CLMUL(a_hi, b_hi, 0x11);
  const __m128i h_cross = h_p0 ^ h_p1 ^ CLMUL(a_fold, b_fold, 0x11);

  // M = (a_lo + a_hi) * (b_lo + b_hi).
  const __m128i m_p0    = CLMUL(a_sum, b_sum, 0x00);
  const __m128i m_p1    = CLMUL(a_sum, b_sum, 0x11);
  const __m128i m_cross = m_p0 ^ m_p1 ^ CLMUL(a_sum_fold, b_sum_fold, 0x00);

  // Each cross term straddles the two 128-bit words of its 256-bit product:
  // shift it by 8 bytes each way and fold it into the lower / upper word.
  const __m128i l_lower = l_p0 ^ BSLLI(l_cross, 8);
  const __m128i l_upper = l_p1 ^ BSRLI(l_cross, 8);
  const __m128i h_lower = h_p0 ^ BSLLI(h_cross, 8);
  const __m128i h_upper = h_p1 ^ BSRLI(h_cross, 8);
  const __m128i m_lower = m_p0 ^ BSLLI(m_cross, 8);
  const __m128i m_upper = m_p1 ^ BSRLI(m_cross, 8);

  // L_hi + H_lo is common to both middle output words.
  const __m128i shared = l_upper ^ h_lower;

  c[0] = l_lower;
  c[1] = l_lower ^ m_lower ^ shared;
  c[2] = h_upper ^ m_upper ^ shared;
  c[3] = h_upper;
}