in src/backend/serial/u32/field.rs [523:562]
fn square_inner(&self) -> [u64; 10] {
// Optimized version of multiplication for the case of squaring.
// Pre- and post- conditions identical to multiplication function.
let x = &self.0;
let x0_2 = 2 * x[0];
let x1_2 = 2 * x[1];
let x2_2 = 2 * x[2];
let x3_2 = 2 * x[3];
let x4_2 = 2 * x[4];
let x5_2 = 2 * x[5];
let x6_2 = 2 * x[6];
let x7_2 = 2 * x[7];
let x5_19 = 19 * x[5];
let x6_19 = 19 * x[6];
let x7_19 = 19 * x[7];
let x8_19 = 19 * x[8];
let x9_19 = 19 * x[9];
/// Helper function to multiply two 32-bit integers with 64 bits
/// of output.
#[inline(always)]
fn m(x: u32, y: u32) -> u64 { (x as u64) * (y as u64) }
// This block is rearranged so that instead of doing a 32-bit multiplication by 38, we do a
// 64-bit multiplication by 2 on the results. This is because lg(38) is too big: we would
// have less than 1 bit of headroom left, which is too little.
let mut z = [0u64;10];
z[0] = m(x[0],x[0]) + m(x2_2,x8_19) + m(x4_2,x6_19) + (m(x1_2,x9_19) + m(x3_2,x7_19) + m(x[5],x5_19))*2;
z[1] = m(x0_2,x[1]) + m(x3_2,x8_19) + m(x5_2,x6_19) + (m(x[2],x9_19) + m(x[4],x7_19))*2;
z[2] = m(x0_2,x[2]) + m(x1_2,x[1]) + m(x4_2,x8_19) + m(x[6],x6_19) + (m(x3_2,x9_19) + m(x5_2,x7_19))*2;
z[3] = m(x0_2,x[3]) + m(x1_2,x[2]) + m(x5_2,x8_19) + (m(x[4],x9_19) + m(x[6],x7_19))*2;
z[4] = m(x0_2,x[4]) + m(x1_2,x3_2) + m(x[2],x[2]) + m(x6_2,x8_19) + (m(x5_2,x9_19) + m(x[7],x7_19))*2;
z[5] = m(x0_2,x[5]) + m(x1_2,x[4]) + m(x2_2,x[3]) + m(x7_2,x8_19) + m(x[6],x9_19)*2;
z[6] = m(x0_2,x[6]) + m(x1_2,x5_2) + m(x2_2,x[4]) + m(x3_2,x[3]) + m(x[8],x8_19) + m(x7_2,x9_19)*2;
z[7] = m(x0_2,x[7]) + m(x1_2,x[6]) + m(x2_2,x[5]) + m(x3_2,x[4]) + m(x[8],x9_19)*2;
z[8] = m(x0_2,x[8]) + m(x1_2,x7_2) + m(x2_2,x[6]) + m(x3_2,x5_2) + m(x[4],x[4]) + m(x[9],x9_19)*2;
z[9] = m(x0_2,x[9]) + m(x1_2,x[8]) + m(x2_2,x[7]) + m(x3_2,x[6]) + m(x4_2,x[5]) ;
z
}