in src/gf2x/gf2x_mul_base_vpclmul.c [31:83]
// Carry-less multiplication of two 512-bit binary polynomials a and b
// (8 x 64-bit qwords each, qword 0 = least significant), producing the
// 1024-bit product split as (*zh || *zl).
//
// Strategy: view each operand as four 128-bit digits X0..X3 (low to high)
// and apply two levels of Karatsuba:
//   level 1: Y0 = (X1||X0), Y1 = (X3||X2); needs Y0*Y1 cross terms
//   level 2: within each level-1 product, the 128-bit digit cross sums.
// mul2_512 supplies the base-case multiplications; it is defined elsewhere —
// presumably a lane-wise 128x128 -> 256-bit carry-less multiply over the
// four 128-bit lanes of a __m512i (TODO confirm against its definition).
_INLINE_ void gf2x_mul8_512_int(OUT __m512i *zh,
OUT __m512i * zl,
IN const __m512i a,
IN const __m512i b)
{
// Qword-index permutation patterns used below.
// NOTE(review): assumes PERMX2VAR_I64 follows vpermi2q semantics, where
// indices 0-7 select qwords of the first source and 8-15 qwords of the
// second — confirm against the macro definitions.
const __m512i mask0 = SET_I64(13, 12, 5, 4, 9, 8, 1, 0);
const __m512i mask1 = SET_I64(15, 14, 7, 6, 11, 10, 3, 2);
const __m512i mask2 = SET_I64(3, 2, 1, 0, 7, 6, 5, 4);
const __m512i mask3 = SET_I64(11, 10, 9, 8, 3, 2, 1, 0);
const __m512i mask4 = SET_I64(15, 14, 13, 12, 7, 6, 5, 4);
const __m512i mask_s1 = SET_I64(7, 6, 5, 4, 1, 0, 3, 2);
const __m512i mask_s2 = SET_I64(3, 2, 7, 6, 5, 4, 1, 0);
__m512i xl, xh, xabl, xabh, xab, xab1, xab2;
__m512i yl, yh, yabl, yabh, yab;
__m512i t[4];
// Level-1 Karatsuba digit sums. Calculate:
// AX1^AX3|| AX2^AX3 || AX0^AX2 || AX0^AX1
// BX1^BX3|| BX2^BX3 || BX0^BX2 || BX0^BX1
// Where (AX1^AX3 || AX0^AX2) stands for (AX1 || AX0)^(AX3 || AX2) = AY0^AY1
t[0] = PERMXVAR_I64(mask_s1, a) ^ PERMXVAR_I64(mask_s2, a);
t[1] = PERMXVAR_I64(mask_s1, b) ^ PERMXVAR_I64(mask_s2, b);
// Level-2 Karatsuba digit sums (XOR of both 256-bit halves of t[0]/t[1]).
// Calculate:
// Don't care || AX1^AX3^AX0^AX2
// Don't care || BX1^BX3^BX0^BX2
t[2] = t[0] ^ VALIGN(t[0], t[0], 4);
t[3] = t[1] ^ VALIGN(t[1], t[1], 4);
// Base-case products:
//   xh:xl     - products of the original digits of a and b
//   xabh:xabl - products of the level-1 digit sums
//   yabh:yabl - products of the level-2 digit sums
mul2_512(&xh, &xl, a, b);
mul2_512(&xabh, &xabl, t[0], t[1]);
mul2_512(&yabh, &yabl, t[2], t[3]);
// Level-1 middle terms: cross-product ^ low ^ high (Karatsuba combine in
// GF(2), where subtraction is XOR).
xab = xl ^ xh ^ PERMX2VAR_I64(xabl, mask0, xabh);
// Gather the low (yl) and high (yh) 512-bit partial products from the
// interleaved qwords of xl/xh.
yl = PERMX2VAR_I64(xl, mask3, xh);
yh = PERMX2VAR_I64(xl, mask4, xh);
// Rotate the middle terms into position and fold them into the centers of
// yl/yh; mask 0x3c selects qwords 2-5 (the 256-bit overlap region).
xab1 = VALIGN(xab, xab, 6);
xab2 = VALIGN(xab, xab, 2);
yl = MXOR_I64(yl, 0x3c, yl, xab1);
yh = MXOR_I64(yh, 0x3c, yh, xab2);
// Level-2 middle term: same Karatsuba combine applied to the sum products,
// yielding the 1024-bit cross term Z1 = Y0*Y1 ^ (their low/high parts).
__m512i oxh = PERMX2VAR_I64(xabl, mask1, xabh);
__m512i oxl = VALIGN(oxh, oxh, 4);
yab = oxl ^ oxh ^ PERMX2VAR_I64(yabl, mask0, yabh);
yab = MXOR_I64(oxh, 0x3c, oxh, VALIGN(yab, yab, 2));
yab ^= yl ^ yh;
// Final combine of Z0 (yl) + Z1 (yab) + Z2 (yh), with Z1 offset by 512
// bits: swap yab's 256-bit halves (mask2), then XOR its low half into the
// upper qwords of zl (mask 0xf0) and its high half into the lower qwords
// of zh (mask 0x0f).
yab = PERMXVAR_I64(mask2, yab);
*zl = MXOR_I64(yl, 0xf0, yl, yab);
*zh = MXOR_I64(yh, 0x0f, yh, yab);
}