fn reduce64()

in src/backend/vector/avx2/field.rs [521:588]
39 lines of code
2 McCabe index (conditional complexity)

    fn reduce64(mut z: [u64x4; 10]) -> FieldElement2625x4 {
        // These aren't const because splat isn't a const fn
        let LOW_25_BITS: u64x4 = u64x4::splat((1 << 25) - 1);
        let LOW_26_BITS: u64x4 = u64x4::splat((1 << 26) - 1);

        // Carry the value from limb i = 0..8 to limb i+1
        let carry = |z: &mut [u64x4; 10], i: usize| {
            debug_assert!(i < 9);
            if i % 2 == 0 {
                // Even limbs have 26 bits
                z[i + 1] = z[i + 1] + (z[i] >> 26);
                z[i] = z[i] & LOW_26_BITS;
            } else {
                // Odd limbs have 25 bits
                z[i + 1] = z[i + 1] + (z[i] >> 25);
                z[i] = z[i] & LOW_25_BITS;
            }
        };

        // Perform two halves of the carry chain in parallel.
        carry(&mut z, 0); carry(&mut z, 4);
        carry(&mut z, 1); carry(&mut z, 5);
        carry(&mut z, 2); carry(&mut z, 6);
        carry(&mut z, 3); carry(&mut z, 7);
        // Since z[3] < 2^64, c < 2^(64-25) = 2^39,
        // so    z[4] < 2^26 + 2^39 < 2^39.0002
        carry(&mut z, 4); carry(&mut z, 8);
        // Now z[4] < 2^26
        // and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)

        // Last carry has a multiplication by 19.  In the serial case we
        // do a 64-bit multiplication by 19, but here we want to do a
        // 32-bit multiplication.  However, if we only know z[9] < 2^64,
        // the carry is bounded as c < 2^(64-25) = 2^39, which is too
        // big.  To ensure c < 2^32, we would need z[9] < 2^57.
        // Instead, we split the carry in two, with c = c_0 + c_1*2^26.

        let c = z[9] >> 25;
        z[9] = z[9] & LOW_25_BITS;
        let mut c0: u64x4 = c & LOW_26_BITS; // c0 < 2^26;
        let mut c1: u64x4 = c >> 26;         // c1 < 2^(39-26) = 2^13;

        unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;
            let x19 = u64x4::splat(19);
            c0 = _mm256_mul_epu32(c0.into_bits(), x19.into_bits()).into_bits(); // c0 < 2^30.25
            c1 = _mm256_mul_epu32(c1.into_bits(), x19.into_bits()).into_bits(); // c1 < 2^17.25
        }

        z[0] = z[0] + c0; // z0 < 2^26 + 2^30.25 < 2^30.33
        z[1] = z[1] + c1; // z1 < 2^25 + 2^17.25 < 2^25.0067
        carry(&mut z, 0); // z0 < 2^26, z1 < 2^25.0067 + 2^4.33 = 2^25.007

        // The output coefficients are bounded with
        //
        // b = 0.007  for z[1]
        // b = 0.0004 for z[5]
        // b = 0      for other z[i].
        //
        // So the packed result is bounded with b = 0.007.
        FieldElement2625x4([
            repack_pair(z[0].into_bits(), z[1].into_bits()),
            repack_pair(z[2].into_bits(), z[3].into_bits()),
            repack_pair(z[4].into_bits(), z[5].into_bits()),
            repack_pair(z[6].into_bits(), z[7].into_bits()),
            repack_pair(z[8].into_bits(), z[9].into_bits()),
        ])
    }