fn mul()

in src/backend/vector/ifma/field.rs [474:612]


    fn mul(self, rhs: &'b F51x4Reduced) -> F51x4Unreduced {
        unsafe {
            // Inputs
            let x = &self.0;
            let y = &rhs.0;

            // Accumulators for terms with coeff 1
            let mut z0_1 = u64x4::splat(0);
            let mut z1_1 = u64x4::splat(0);
            let mut z2_1 = u64x4::splat(0);
            let mut z3_1 = u64x4::splat(0);
            let mut z4_1 = u64x4::splat(0);
            let mut z5_1 = u64x4::splat(0);
            let mut z6_1 = u64x4::splat(0);
            let mut z7_1 = u64x4::splat(0);
            let mut z8_1 = u64x4::splat(0);

            // Accumulators for terms with coeff 2
            let mut z0_2 = u64x4::splat(0);
            let mut z1_2 = u64x4::splat(0);
            let mut z2_2 = u64x4::splat(0);
            let mut z3_2 = u64x4::splat(0);
            let mut z4_2 = u64x4::splat(0);
            let mut z5_2 = u64x4::splat(0);
            let mut z6_2 = u64x4::splat(0);
            let mut z7_2 = u64x4::splat(0);
            let mut z8_2 = u64x4::splat(0);
            let mut z9_2 = u64x4::splat(0);

            // LLVM doesn't seem to do much work reordering IFMA
            // instructions, so try to organize them into "waves" of 8
            // independent operations (4c latency, 0.5 c throughput
            // means 8 in flight)

            // Wave 0
            z4_1 = madd52lo(z4_1, x[2], y[2]);
            z5_2 = madd52hi(z5_2, x[2], y[2]);
            z5_1 = madd52lo(z5_1, x[4], y[1]);
            z6_2 = madd52hi(z6_2, x[4], y[1]);
            z6_1 = madd52lo(z6_1, x[4], y[2]);
            z7_2 = madd52hi(z7_2, x[4], y[2]);
            z7_1 = madd52lo(z7_1, x[4], y[3]);
            z8_2 = madd52hi(z8_2, x[4], y[3]);

            // Wave 1
            z4_1 = madd52lo(z4_1, x[3], y[1]);
            z5_2 = madd52hi(z5_2, x[3], y[1]);
            z5_1 = madd52lo(z5_1, x[3], y[2]);
            z6_2 = madd52hi(z6_2, x[3], y[2]);
            z6_1 = madd52lo(z6_1, x[3], y[3]);
            z7_2 = madd52hi(z7_2, x[3], y[3]);
            z7_1 = madd52lo(z7_1, x[3], y[4]);
            z8_2 = madd52hi(z8_2, x[3], y[4]);

            // Wave 2
            z8_1 = madd52lo(z8_1, x[4], y[4]);
            z9_2 = madd52hi(z9_2, x[4], y[4]);
            z4_1 = madd52lo(z4_1, x[4], y[0]);
            z5_2 = madd52hi(z5_2, x[4], y[0]);
            z5_1 = madd52lo(z5_1, x[2], y[3]);
            z6_2 = madd52hi(z6_2, x[2], y[3]);
            z6_1 = madd52lo(z6_1, x[2], y[4]);
            z7_2 = madd52hi(z7_2, x[2], y[4]);

            let z8 = z8_1 + z8_2 + z8_2;
            let z9 = z9_2 + z9_2;

            // Wave 3
            z3_1 = madd52lo(z3_1, x[3], y[0]);
            z4_2 = madd52hi(z4_2, x[3], y[0]);
            z4_1 = madd52lo(z4_1, x[1], y[3]);
            z5_2 = madd52hi(z5_2, x[1], y[3]);
            z5_1 = madd52lo(z5_1, x[1], y[4]);
            z6_2 = madd52hi(z6_2, x[1], y[4]);
            z2_1 = madd52lo(z2_1, x[2], y[0]);
            z3_2 = madd52hi(z3_2, x[2], y[0]);

            let z6 = z6_1 + z6_2 + z6_2;
            let z7 = z7_1 + z7_2 + z7_2;

            // Wave 4
            z3_1 = madd52lo(z3_1, x[2], y[1]);
            z4_2 = madd52hi(z4_2, x[2], y[1]);
            z4_1 = madd52lo(z4_1, x[0], y[4]);
            z5_2 = madd52hi(z5_2, x[0], y[4]);
            z1_1 = madd52lo(z1_1, x[1], y[0]);
            z2_2 = madd52hi(z2_2, x[1], y[0]);
            z2_1 = madd52lo(z2_1, x[1], y[1]);
            z3_2 = madd52hi(z3_2, x[1], y[1]);

            let z5 = z5_1 + z5_2 + z5_2;

            // Wave 5
            z3_1 = madd52lo(z3_1, x[1], y[2]);
            z4_2 = madd52hi(z4_2, x[1], y[2]);
            z0_1 = madd52lo(z0_1, x[0], y[0]);
            z1_2 = madd52hi(z1_2, x[0], y[0]);
            z1_1 = madd52lo(z1_1, x[0], y[1]);
            z2_1 = madd52lo(z2_1, x[0], y[2]);
            z2_2 = madd52hi(z2_2, x[0], y[1]);
            z3_2 = madd52hi(z3_2, x[0], y[2]);

            let mut t0 = u64x4::splat(0);
            let mut t1 = u64x4::splat(0);
            let r19 = u64x4::splat(19);

            // Wave 6
            t0 = madd52hi(t0, r19, z9);
            t1 = madd52lo(t1, r19, z9 >> 52);
            z3_1 = madd52lo(z3_1, x[0], y[3]);
            z4_2 = madd52hi(z4_2, x[0], y[3]);
            z1_2 = madd52lo(z1_2, r19, z5 >> 52);
            z2_2 = madd52lo(z2_2, r19, z6 >> 52);
            z3_2 = madd52lo(z3_2, r19, z7 >> 52);
            z0_1 = madd52lo(z0_1, r19, z5);

            // Wave 7
            z4_1 = madd52lo(z4_1, r19, z9);
            z1_1 = madd52lo(z1_1, r19, z6);
            z0_2 = madd52lo(z0_2, r19, t0 + t1);
            z4_2 = madd52hi(z4_2, r19, z8);
            z2_1 = madd52lo(z2_1, r19, z7);
            z1_2 = madd52hi(z1_2, r19, z5);
            z2_2 = madd52hi(z2_2, r19, z6);
            z3_2 = madd52hi(z3_2, r19, z7);

            // Wave 8
            z3_1 = madd52lo(z3_1, r19, z8);
            z4_2 = madd52lo(z4_2, r19, z8 >> 52);

            F51x4Unreduced([
                z0_1 + z0_2 + z0_2,
                z1_1 + z1_2 + z1_2,
                z2_1 + z2_2 + z2_2,
                z3_1 + z3_2 + z3_2,
                z4_1 + z4_2 + z4_2,
            ])
        }
    }