in src/backend/vector/ifma/field.rs [474:612]
fn mul(self, rhs: &'b F51x4Reduced) -> F51x4Unreduced {
unsafe {
// Inputs
let x = &self.0;
let y = &rhs.0;
// Accumulators for terms with coeff 1
let mut z0_1 = u64x4::splat(0);
let mut z1_1 = u64x4::splat(0);
let mut z2_1 = u64x4::splat(0);
let mut z3_1 = u64x4::splat(0);
let mut z4_1 = u64x4::splat(0);
let mut z5_1 = u64x4::splat(0);
let mut z6_1 = u64x4::splat(0);
let mut z7_1 = u64x4::splat(0);
let mut z8_1 = u64x4::splat(0);
// Accumulators for terms with coeff 2
let mut z0_2 = u64x4::splat(0);
let mut z1_2 = u64x4::splat(0);
let mut z2_2 = u64x4::splat(0);
let mut z3_2 = u64x4::splat(0);
let mut z4_2 = u64x4::splat(0);
let mut z5_2 = u64x4::splat(0);
let mut z6_2 = u64x4::splat(0);
let mut z7_2 = u64x4::splat(0);
let mut z8_2 = u64x4::splat(0);
let mut z9_2 = u64x4::splat(0);
// LLVM doesn't seem to do much work reordering IFMA
// instructions, so try to organize them into "waves" of 8
// independent operations (4c latency, 0.5 c throughput
// means 8 in flight)
// Wave 0
z4_1 = madd52lo(z4_1, x[2], y[2]);
z5_2 = madd52hi(z5_2, x[2], y[2]);
z5_1 = madd52lo(z5_1, x[4], y[1]);
z6_2 = madd52hi(z6_2, x[4], y[1]);
z6_1 = madd52lo(z6_1, x[4], y[2]);
z7_2 = madd52hi(z7_2, x[4], y[2]);
z7_1 = madd52lo(z7_1, x[4], y[3]);
z8_2 = madd52hi(z8_2, x[4], y[3]);
// Wave 1
z4_1 = madd52lo(z4_1, x[3], y[1]);
z5_2 = madd52hi(z5_2, x[3], y[1]);
z5_1 = madd52lo(z5_1, x[3], y[2]);
z6_2 = madd52hi(z6_2, x[3], y[2]);
z6_1 = madd52lo(z6_1, x[3], y[3]);
z7_2 = madd52hi(z7_2, x[3], y[3]);
z7_1 = madd52lo(z7_1, x[3], y[4]);
z8_2 = madd52hi(z8_2, x[3], y[4]);
// Wave 2
z8_1 = madd52lo(z8_1, x[4], y[4]);
z9_2 = madd52hi(z9_2, x[4], y[4]);
z4_1 = madd52lo(z4_1, x[4], y[0]);
z5_2 = madd52hi(z5_2, x[4], y[0]);
z5_1 = madd52lo(z5_1, x[2], y[3]);
z6_2 = madd52hi(z6_2, x[2], y[3]);
z6_1 = madd52lo(z6_1, x[2], y[4]);
z7_2 = madd52hi(z7_2, x[2], y[4]);
let z8 = z8_1 + z8_2 + z8_2;
let z9 = z9_2 + z9_2;
// Wave 3
z3_1 = madd52lo(z3_1, x[3], y[0]);
z4_2 = madd52hi(z4_2, x[3], y[0]);
z4_1 = madd52lo(z4_1, x[1], y[3]);
z5_2 = madd52hi(z5_2, x[1], y[3]);
z5_1 = madd52lo(z5_1, x[1], y[4]);
z6_2 = madd52hi(z6_2, x[1], y[4]);
z2_1 = madd52lo(z2_1, x[2], y[0]);
z3_2 = madd52hi(z3_2, x[2], y[0]);
let z6 = z6_1 + z6_2 + z6_2;
let z7 = z7_1 + z7_2 + z7_2;
// Wave 4
z3_1 = madd52lo(z3_1, x[2], y[1]);
z4_2 = madd52hi(z4_2, x[2], y[1]);
z4_1 = madd52lo(z4_1, x[0], y[4]);
z5_2 = madd52hi(z5_2, x[0], y[4]);
z1_1 = madd52lo(z1_1, x[1], y[0]);
z2_2 = madd52hi(z2_2, x[1], y[0]);
z2_1 = madd52lo(z2_1, x[1], y[1]);
z3_2 = madd52hi(z3_2, x[1], y[1]);
let z5 = z5_1 + z5_2 + z5_2;
// Wave 5
z3_1 = madd52lo(z3_1, x[1], y[2]);
z4_2 = madd52hi(z4_2, x[1], y[2]);
z0_1 = madd52lo(z0_1, x[0], y[0]);
z1_2 = madd52hi(z1_2, x[0], y[0]);
z1_1 = madd52lo(z1_1, x[0], y[1]);
z2_1 = madd52lo(z2_1, x[0], y[2]);
z2_2 = madd52hi(z2_2, x[0], y[1]);
z3_2 = madd52hi(z3_2, x[0], y[2]);
let mut t0 = u64x4::splat(0);
let mut t1 = u64x4::splat(0);
let r19 = u64x4::splat(19);
// Wave 6
t0 = madd52hi(t0, r19, z9);
t1 = madd52lo(t1, r19, z9 >> 52);
z3_1 = madd52lo(z3_1, x[0], y[3]);
z4_2 = madd52hi(z4_2, x[0], y[3]);
z1_2 = madd52lo(z1_2, r19, z5 >> 52);
z2_2 = madd52lo(z2_2, r19, z6 >> 52);
z3_2 = madd52lo(z3_2, r19, z7 >> 52);
z0_1 = madd52lo(z0_1, r19, z5);
// Wave 7
z4_1 = madd52lo(z4_1, r19, z9);
z1_1 = madd52lo(z1_1, r19, z6);
z0_2 = madd52lo(z0_2, r19, t0 + t1);
z4_2 = madd52hi(z4_2, r19, z8);
z2_1 = madd52lo(z2_1, r19, z7);
z1_2 = madd52hi(z1_2, r19, z5);
z2_2 = madd52hi(z2_2, r19, z6);
z3_2 = madd52hi(z3_2, r19, z7);
// Wave 8
z3_1 = madd52lo(z3_1, r19, z8);
z4_2 = madd52lo(z4_2, r19, z8 >> 52);
F51x4Unreduced([
z0_1 + z0_2 + z0_2,
z1_1 + z1_2 + z1_2,
z2_1 + z2_2 + z2_2,
z3_1 + z3_2 + z3_2,
z4_1 + z4_2 + z4_2,
])
}
}