in core/riot/reference/RiotEcc.c [392:785]
static void big_mpyP (bigval_t *tgt, bigval_t const *a, bigval_t const *b, modulus_val_t modselect)
{
int64_t w[2 * BIGLEN];
int64_t s_accum; // signed
int i, minj, maxj, a_words, b_words, cum_carry;
#ifdef SMALL_CODE
int j;
#else
uint32_t const *ap;
uint32_t const *bp;
#endif
#ifdef ARM7_ASM
uint32_t tmpr0, tmpr1, sum0, sum1;
#else
uint64_t u_accum;
#endif
#ifdef ECDSA
#define MODSELECT modselect
#else
#define MODSELECT MOD_MODULUS
#endif
a_words = BIGLEN;
while (a_words > 0 && a->data[a_words - 1] == 0) {
--a_words;
}
//
// i is target index. The j (in comments only) indexes
// through the multiplier.
//
#ifdef ARM7_ASM
sum0 = 0;
sum1 = 0;
cum_carry = 0;
#else
u_accum = 0;
cum_carry = 0;
#endif
#ifndef SPECIAL_SQUARE
#define NO_SPECIAL_SQUARE 1
#else
#define NO_SPECIAL_SQUARE 0
#endif
if (NO_SPECIAL_SQUARE || (a != b)) {
// normal multiply
// compute length of b
b_words = BIGLEN;
while (b_words > 0 && b->data[b_words - 1] == 0) {
--b_words;
}
// iterate over words of output
for (i = 0; i < a_words + b_words - 1; ++i) {
//
// Run j over all possible values such that
// 0 <= j < b_words && 0 <= i-j < a_words.
// Hence
// j >= 0 and j > i - a_words and
// j < b_words and j <= i
//
// (j exists only in the mind of the reader.)
//
maxj = MIN (b_words - 1, i);
minj = MAX (0, i - a_words + 1);
// ACCUM accumulates into <cum_carry, u_accum>.
#ifdef SMALL_CODE
for (j = minj; j <= maxj; ++j) {
ACCUM (a->data + i - j, b->data + j);
}
#else // SMALL_CODE not defined
//
// The inner loop (over j, running from minj to maxj) is
// unrolled. Sequentially increasing case values in the code
// are intended to coax the compiler into emitting a jump
// table. Here j runs from maxj to minj, but addition is
// commutative, so it doesn't matter.
//
ap = &a->data[i - minj];
bp = &b->data[minj];
// the order is opposite the loop, but addition is commutative
switch (8 - (maxj - minj)) {
case 0:
ACCUM (ap - 8, bp + 8); // j = 8
/* fall through */ /* no break */
case 1:
ACCUM (ap - 7, bp + 7);
/* fall through */ /* no break */
case 2:
ACCUM (ap - 6, bp + 6);
/* fall through */ /* no break */
case 3:
ACCUM (ap - 5, bp + 5);
/* fall through */ /* no break */
case 4:
ACCUM (ap - 4, bp + 4);
/* fall through */ /* no break */
case 5:
ACCUM (ap - 3, bp + 3);
/* fall through */ /* no break */
case 6:
ACCUM (ap - 2, bp + 2);
/* fall through */ /* no break */
case 7:
ACCUM (ap - 1, bp + 1);
/* fall through */ /* no break */
case 8:
ACCUM (ap - 0, bp + 0); // j = 0
/* fall through */ /* no break */
}
#endif // SMALL_CODE not defined
// The total value is
// w + u_accum << (32 *i) + cum_carry << (32 * i + 64).
// The steps from here to the end of the i-loop (not counting
// squaring branch) and the increment of i by the loop
// maintain the invariant that the value is constant.
// (Assume w had been initialized to zero, even though we
// really didn't.)
#ifdef ARM7_ASM
w[i] = sum0;
sum0 = sum1;
sum1 = cum_carry;
cum_carry = 0;
#else
w[i] = u_accum & 0xffffffffULL;
u_accum = (u_accum >> 32) + ((uint64_t) cum_carry << 32);
cum_carry = 0;
#endif
}
}
else {
// squaring
#ifdef SPECIAL_SQUARE
// a[i] * a[j] + a[j] * a[i] == 2 * (a[i] * a[j]), so
// we can cut the number of multiplies nearly in half.
for (i = 0; i < 2 * a_words - 1; ++i) {
// Run j over all possible values such that
// 0 <= j < a_words && 0 <= i-j < a_words && j < i-j
// Hence
// j >= 0 and j > i - a_words and
// j < a_words and 2*j < i
//
maxj = MIN (a_words - 1, i);
// Only go half way. Must use (i-1)>> 1, not (i-1)/ 2
maxj = MIN (maxj, (i - 1) >> 1);
minj = MAX (0, i - a_words + 1);
#ifdef SMALL_CODE
for (j = minj; j <= maxj; ++j) {
ACCUMDBL (a->data + i - j, a->data + j);
}
// j live
if ((i & 1) == 0) {
ACCUM (a->data + j, a->data + j);
}
#else // SMALL_CODE not defined
ap = &a->data[i - minj];
bp = &a->data[minj];
switch (8 - (maxj - minj)) {
case 0:
ACCUMDBL (ap - 8, bp + 8); // j = 8
/* fall through */ /* no break */
case 1:
ACCUMDBL (ap - 7, bp + 7);
/* fall through */ /* no break */
case 2:
ACCUMDBL (ap - 6, bp + 6);
/* fall through */ /* no break */
case 3:
ACCUMDBL (ap - 5, bp + 5);
/* fall through */ /* no break */
case 4:
ACCUMDBL (ap - 4, bp + 4);
/* fall through */ /* no break */
case 5:
ACCUMDBL (ap - 3, bp + 3);
/* fall through */ /* no break */
case 6:
ACCUMDBL (ap - 2, bp + 2);
/* fall through */ /* no break */
case 7:
ACCUMDBL (ap - 1, bp + 1);
/* fall through */ /* no break */
case 8:
ACCUMDBL (ap - 0, bp + 0); // j = 0
/* fall through */ /* no break */
}
// Even numbered columns (zero based) have a middle element.
if ((i & 1) == 0) {
ACCUM (a->data + maxj + 1, a->data + maxj + 1);
}
#endif // SMALL_CODE not defined
// The total value is
// w + u_accum << (32 *i) + cum_carry << (32 * i + 64).
// The steps from here to the end of i-loop and
// the increment of i by the loop maintain the invariant
// that the total value is unchanged.
// (Assume w had been initialized to zero, even though we
// really didn't.)
#ifdef ARM7_ASM
w[i] = sum0;
sum0 = sum1;
sum1 = cum_carry;
cum_carry = 0;
#else // ARM7_ASM not defined
w[i] = u_accum & 0xffffffffULL;
u_accum = (u_accum >> 32) + ((uint64_t) cum_carry << 32);
cum_carry = 0;
#endif // ARM7_ASM not defined
}
#endif // SPECIAL_SQUARE
} // false branch of NO_SPECIAL_SQUARE || (a != b)
// The total value as indicated above is maintained invariant
// down to the approximate reduction code below.
// propagate any residual to next to end of array
for (; i < 2 * BIGLEN - 1; ++i) {
#ifdef ARM7_ASM
w[i] = sum0;
sum0 = sum1;
sum1 = 0;
#else
w[i] = u_accum & 0xffffffffULL;
u_accum >>= 32;
#endif
}
// i is still live
// from here on, think of w as containing signed values
// Last value of the array, still using i. We store the entire 64
// bits. There are two reasons for this. The pedantic one is that
// this clearly maintains our invariant that the value has not
// changed. The other one is that this makes w[BIGNUM-1] negative
// if the result was negative, and reduction depends on this.
#ifdef ARM7_ASM
w[i] = ((uint64_t) sum1 << 32) | sum0;
// sum1 = sum0 = 0; maintain invariant
#else
w[i] = u_accum;
// u_accum = 0; maintain invariant
#endif
//
// Apply correction if a or b are negative. It would be nice to
// put this inside the i-loop to reduce memory bandwidth. Later...
//
// signvedval(a) = unsignedval(a) - 2^(32*BIGLEN)*isneg(a).
//
// so signval(a) * signedval(b) = unsignedval(a) * unsignedval[b] -
// isneg(a) * unsignedval(b) * 2^(32*BIGLEN) -
// isneg(b) * unsingedval(a) * 2^ (32*BIGLEN) +
// isneg(a) * isneg(b) * 2 ^(2 * 32 * BIGLEN)
//
// If one arg is zero and the other is negative, obviously no
// correction is needed, but we do not make a special case, since
// the "correction" only adds in zero.
if (big_is_negative (a)) {
for (i = 0; i < BIGLEN; ++i) {
w[i + BIGLEN] -= b->data[i];
}
}
if (big_is_negative (b)) {
for (i = 0; i < BIGLEN; ++i) {
w[i + BIGLEN] -= a->data[i];
}
if (big_is_negative (a)) {
// both negative
w[2 * BIGLEN - 1] += 1ULL << 32;
}
}
//
// The code from here to the end of the function maintains w mod
// modulusP constant, even though it changes the value of w.
//
// reduce (approximate)
if (MODSELECT == MOD_MODULUS) {
for (i = 2 * BIGLEN - 1; i >= MSW; --i) {
int64_t v;
v = w[i];
if (v != 0) {
w[i] = 0;
w[i - 1] += v;
w[i - 2] -= v;
w[i - 5] -= v;
w[i - 8] += v;
}
}
}
else {
// modulo order. Not performance critical
#if ECDSA_SIGN || ECDSA_VERIFY
int64_t carry;
// convert to 32 bit values, except for most signifiant word
carry = 0;
for (i = 0; i < 2 * BIGLEN - 1; ++i) {
w[i] += carry;
carry = w[i] >> 32;
w[i] -= carry << 32;
}
// i is live
w[i] += carry;
// each iteration knocks off word i
for (i = 2 * BIGLEN - 1; i >= MSW; --i) { // most to least significant
int64_t v;
int64_t tmp;
int64_t tmp2;
int j;
int k;
for (k = 0; w[i] != 0 && k < 3; ++k) {
v = w[i];
carry = 0;
for (j = i - MSW; j < 2 * BIGLEN; ++j) {
if (j <= i) {
tmp2 = -(v * orderDBL.data[j - i + MSW]);
tmp = w[j] + tmp2 + carry;
}
else {
tmp = w[j] + carry;
}
if (j < 2 * BIGLEN - 1) {
carry = tmp >> 32;
tmp -= carry << 32;
}
else {
carry = 0;
}
w[j] = tmp;
}
}
}
#endif // ECDSA_SIGN || ECDSA_VERIFY
}
// propagate carries and copy out to tgt in 32 bit chunks.
s_accum = 0;
for (i = 0; i < BIGLEN; ++i) {
s_accum += w[i];
tgt->data[i] = (uint32_t) s_accum;
s_accum >>= 32; // signed, so sign bit propagates
}
// final approximate reduction
if (MODSELECT == MOD_MODULUS) {
big_approx_reduceP (tgt, tgt);
}
else {
#ifdef ECDSA
if (tgt->data[MSW]) {
// Keep it simple! At one time all this was done in place,
// and was totally non-obvious.
bigval_t tmp;
// The most significant word is signed, even though the
// whole array has declared uint32_t.
big_1wd_mpy (&tmp, &orderP, (int32_t) tgt->data[MSW]);
big_sub (tgt, tgt, &tmp);
}
#endif // ECDSA
}
}