in src/P610/AMD64/fp_x64.c [474:726]
void rdc_mont(digit_t* ma, digit_t* mc)
{ // Montgomery reduction exploiting special form of the prime.
// mc = ma*R^-1 mod p610x2, where R = 2^640.
// If ma < 2^640*p610, the output mc is in the range [0, 2*p610-1].
// ma is assumed to be in Montgomery representation.
#if (OS_TARGET == OS_WIN)
unsigned int carry;
digit_t t = 0;
uint128_t uv = {0};
mc[0] = ma[0];
mc[1] = ma[1];
mc[2] = ma[2];
mc[3] = ma[3];
MUL128(mc[0], ((digit_t*)p610p1)[4], uv);
ADDC(0, uv[0], ma[4], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
mc[4] = uv[0];
uv[0] = uv[1];
uv[1] = 0;
MULADD128(mc[0], ((digit_t*)p610p1)[5], uv, carry, uv);
MULADD128(mc[1], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[5], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[5] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[6], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[7], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[8], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[8] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[9], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[9] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[1], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[10], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[0] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[2], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[11], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[1] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[3], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[12], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[2] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[4], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p610p1)[4], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[13], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[3] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[5], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p610p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[14], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[4] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[6], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p610p1)[6], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[15], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[5] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[7], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p610p1)[7], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[16], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[8], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p610p1)[8], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[17], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[9], ((digit_t*)p610p1)[9], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[18], carry, mc[8]);
ADDC(carry, uv[1], 0, carry, uv[1]);
ADDC(0, uv[1], ma[19], carry, mc[9]);
#elif (OS_TARGET == OS_NIX)
rdc610_asm(ma, mc);
#endif
}