/*
 * Chunk extracted from src/gf2x/gf2x_mul_base_vpclmul.c [31:83]:
 *   _INLINE_ void gf2x_mul8_512_int()
 */

// Carry-less (GF(2)[x]) multiplication of two 512-bit binary polynomials.
// Computes the 1024-bit product of a and b and returns it split across the
// two outputs as (*zh || *zl) = high / low 512-bit halves.
//
// NOTE(review): this function relies on helper macros declared elsewhere in
// the file (SET_I64, PERMXVAR_I64, PERMX2VAR_I64, VALIGN, MXOR_I64,
// mul2_512). From their use here: mul2_512 appears to produce high/low qwords
// of lane-wise carry-less products; MXOR_I64(dst, k, x, y) appears to be a
// masked XOR (x ^ y in the qword lanes selected by bitmask k, dst elsewhere);
// VALIGN rotates by whole qwords. Confirm against their definitions before
// trusting the per-step comments below.
//
// Structure: a Karatsuba-style combine — exactly three mul2_512 calls
// (a*b itself plus two "sum of halves" products) whose partial products are
// recombined with qword permutations and masked XORs into Z0 + Z1*X + Z2*X^2.
_INLINE_ void gf2x_mul8_512_int(OUT __m512i *zh,
                                OUT __m512i *    zl,
                                IN const __m512i a,
                                IN const __m512i b)
{
  // Qword (64-bit lane) index vectors for the permutation macros below.
  // mask0/mask1 gather matching lanes of the low/high partial products;
  // mask2-mask4 place combined terms for the final recombination;
  // mask_s1/mask_s2 pair up the lanes whose XOR forms the Karatsuba sums.
  const __m512i mask0   = SET_I64(13, 12, 5, 4, 9, 8, 1, 0);
  const __m512i mask1   = SET_I64(15, 14, 7, 6, 11, 10, 3, 2);
  const __m512i mask2   = SET_I64(3, 2, 1, 0, 7, 6, 5, 4);
  const __m512i mask3   = SET_I64(11, 10, 9, 8, 3, 2, 1, 0);
  const __m512i mask4   = SET_I64(15, 14, 13, 12, 7, 6, 5, 4);
  const __m512i mask_s1 = SET_I64(7, 6, 5, 4, 1, 0, 3, 2);
  const __m512i mask_s2 = SET_I64(3, 2, 7, 6, 5, 4, 1, 0);

  __m512i xl, xh, xabl, xabh, xab, xab1, xab2; // a*b partials and cross term
  __m512i yl, yh, yabl, yabh, yab;             // recombination accumulators
  __m512i t[4];                                // Karatsuba "sum" operands

  // Calculate:
  // AX1^AX3|| AX2^AX3 || AX0^AX2 || AX0^AX1
  // BX1^BX3|| BX2^BX3 || BX0^BX2 || BX0^BX1
  // Where (AX1^AX3 || AX0^AX2) stands for (AX1 || AX0)^(AX3 || AX2) = AY0^AY1
  t[0] = PERMXVAR_I64(mask_s1, a) ^ PERMXVAR_I64(mask_s2, a);
  t[1] = PERMXVAR_I64(mask_s1, b) ^ PERMXVAR_I64(mask_s2, b);

  // Calculate:
  // Don't care || AX1^AX3^AX0^AX2
  // Don't care || BX1^BX3^BX0^BX2
  // (Rotating by 4 qwords swaps the 256-bit halves, so the XOR folds them.)
  t[2] = t[0] ^ VALIGN(t[0], t[0], 4);
  t[3] = t[1] ^ VALIGN(t[1], t[1], 4);

  // The three carry-less multiplications (the Karatsuba count):
  //   (xh : xl)     partial products of a * b
  //   (xabh : xabl) products of the per-half sums t[0], t[1]
  //   (yabh : yabl) product of the fully folded sums t[2], t[3]
  mul2_512(&xh, &xl, a, b);
  mul2_512(&xabh, &xabl, t[0], t[1]);
  mul2_512(&yabh, &yabl, t[2], t[3]);

  // Inner middle (cross) term: low ^ high ^ selected lanes of the
  // sum-products.
  xab  = xl ^ xh ^ PERMX2VAR_I64(xabl, mask0, xabh);
  // Assemble the low (yl) and high (yh) halves from the partial products,
  // then XOR the rotated middle term into the central qword lanes
  // (mask 0x3c = lanes 2..5, assuming MXOR_I64 is a qword-masked XOR).
  yl   = PERMX2VAR_I64(xl, mask3, xh);
  yh   = PERMX2VAR_I64(xl, mask4, xh);
  xab1 = VALIGN(xab, xab, 6);
  xab2 = VALIGN(xab, xab, 2);
  yl   = MXOR_I64(yl, 0x3c, yl, xab1);
  yh   = MXOR_I64(yh, 0x3c, yh, xab2);

  // Outer-level middle term, built the same way from the sum-products.
  __m512i oxh = PERMX2VAR_I64(xabl, mask1, xabh);
  __m512i oxl = VALIGN(oxh, oxh, 4);
  yab         = oxl ^ oxh ^ PERMX2VAR_I64(yabl, mask0, yabh);
  yab         = MXOR_I64(oxh, 0x3c, oxh, VALIGN(yab, yab, 2));
  yab ^= yl ^ yh;

  // Z0 (yl) + Z1 (yab) + Z2 (yh)
  // Swap yab's 256-bit halves, then splice: *zl takes Z0 plus Z1's low half
  // (upper four qword lanes, mask 0xf0); *zh takes Z2 plus Z1's high half
  // (lower four qword lanes, mask 0x0f).
  yab = PERMXVAR_I64(mask2, yab);
  *zl = MXOR_I64(yl, 0xf0, yl, yab);
  *zh = MXOR_I64(yh, 0x0f, yh, yab);
}