in src/P751/AMD64/fp_x64.c [582:907]
void rdc_mont(digit_t* ma, digit_t* mc)
{ // Montgomery reduction exploiting special form of the prime.
// mc = ma*R^-1 mod p751x2, where R = 2^768.
// If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1].
// ma is assumed to be in Montgomery representation.
#if (OS_TARGET == OS_WIN)
unsigned int carry;
digit_t t = 0;
uint128_t uv = {0};
mc[0] = ma[0];
mc[1] = ma[1];
mc[2] = ma[2];
mc[3] = ma[3];
mc[4] = ma[4];
MUL128(mc[0], ((digit_t*)p751p1)[5], uv);
ADDC(0, uv[0], ma[5], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
mc[5] = uv[0];
uv[0] = uv[1];
uv[1] = 0;
MULADD128(mc[0], ((digit_t*)p751p1)[6], uv, carry, uv);
MULADD128(mc[1], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[6], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[7], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[8], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[8] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[9], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[9] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[10], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[10] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[0], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[11], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[11] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[1], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[12], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[0] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[2], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[13], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[1] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[3], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[14], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[2] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[4], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[15], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[3] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[5], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t*)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[16], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[4] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[6], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t*)p751p1)[6], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[17], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[5] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[7], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t*)p751p1)[7], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[18], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[8], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t*)p751p1)[8], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[19], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[9], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t*)p751p1)[9], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[20], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[8] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[10], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t*)p751p1)[10], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[21], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[9] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
MULADD128(mc[11], ((digit_t*)p751p1)[11], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[22], carry, mc[10]);
ADDC(carry, uv[1], 0, carry, uv[1]);
ADDC(0, uv[1], ma[23], carry, mc[11]);
#elif (OS_TARGET == OS_NIX)
rdc751_asm(ma, mc);
#endif
}