void rdc_mont()

in src/P503/AMD64/fp_x64.c [382:569]


void rdc_mont(digit_t* ma, digit_t* mc)
{ // Montgomery reduction exploiting special form of the prime.
  // mc = ma*R^-1 mod p503x2, where R = 2^512.
  // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
  // ma is assumed to be in Montgomery representation.
        
#if (OS_TARGET == OS_WIN)
    unsigned int carry;
    digit_t t = 0;
    uint128_t uv = {0};
    
    mc[0] = ma[0];
    mc[1] = ma[1];
    mc[2] = ma[2];
    MUL128(mc[0], ((digit_t*)p503p1)[3], uv);
    ADDC(0, uv[0], ma[3], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    mc[3] = uv[0];
    uv[0] = uv[1];
    uv[1] = 0;

    MULADD128(mc[0], ((digit_t*)p503p1)[4], uv, carry, uv);
    MULADD128(mc[1], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[4], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[4] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[5], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[5] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[6], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[6] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[7], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[7] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[1], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[8], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[0] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[2], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[9], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[1] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[3], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t*)p503p1)[3], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[10], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[2] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[4], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t*)p503p1)[4], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[11], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[3] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[5], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t*)p503p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[12], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[4] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[6], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t*)p503p1)[6], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[13], carry, uv[0]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    t += carry; 
    mc[5] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[7], ((digit_t*)p503p1)[7], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[14], carry, mc[6]); 
    ADDC(carry, uv[1], 0, carry, uv[1]); 
    ADDC(0, uv[1], ma[15], carry, mc[7]); 
    
#elif (OS_TARGET == OS_NIX)                 
    
    rdc503_asm(ma, mc);    

#endif
}