static void big_mpyP()

in core/riot/reference/RiotEcc.c [392:785]


static void big_mpyP (bigval_t *tgt, bigval_t const *a, bigval_t const *b, modulus_val_t modselect)
{
	int64_t w[2 * BIGLEN];
	int64_t s_accum;	// signed
	int i, minj, maxj, a_words, b_words, cum_carry;

#ifdef SMALL_CODE
	int j;
#else
	uint32_t const *ap;
	uint32_t const *bp;
#endif

#ifdef ARM7_ASM
	uint32_t tmpr0, tmpr1, sum0, sum1;
#else
	uint64_t u_accum;
#endif

#ifdef ECDSA
#define MODSELECT modselect
#else
#define MODSELECT MOD_MODULUS
#endif

	a_words = BIGLEN;
	while (a_words > 0 && a->data[a_words - 1] == 0) {
		--a_words;
	}
	//
	// i is target index.  The j (in comments only) indexes
	// through the multiplier.
	//
#ifdef ARM7_ASM
	sum0 = 0;
	sum1 = 0;
	cum_carry = 0;
#else
	u_accum = 0;
	cum_carry = 0;
#endif

#ifndef SPECIAL_SQUARE
#define NO_SPECIAL_SQUARE 1
#else
#define NO_SPECIAL_SQUARE 0
#endif

	if (NO_SPECIAL_SQUARE || (a != b)) {
		// normal multiply

		// compute length of b
		b_words = BIGLEN;
		while (b_words > 0 && b->data[b_words - 1] == 0) {
			--b_words;
		}
		// iterate over words of output
		for (i = 0; i < a_words + b_words - 1; ++i) {
			//
			// Run j over all possible values such that
			// 0 <= j < b_words && 0 <= i-j < a_words.
			// Hence
			// j >= 0 and j > i - a_words and
			// j < b_words and j <= i
			//
			// (j exists only in the mind of the reader.)
			//
			maxj = MIN (b_words - 1, i);
			minj = MAX (0, i - a_words + 1);

			// ACCUM accumulates into <cum_carry, u_accum>.
#ifdef SMALL_CODE
			for (j = minj; j <= maxj; ++j) {
				ACCUM (a->data + i - j, b->data + j);
			}
#else	// SMALL_CODE not defined
			//
			// The inner loop (over j, running from minj to maxj) is
			// unrolled.  Sequentially increasing case values in the code
			// are intended to coax the compiler into emitting a jump
			// table. Here j runs from maxj to minj, but addition is
			// commutative, so it doesn't matter.
			//
			ap = &a->data[i - minj];
			bp = &b->data[minj];

			// the order is opposite the loop, but addition is commutative
			switch (8 - (maxj - minj)) {
				case 0:
					ACCUM (ap - 8, bp + 8);	// j = 8
				/* fall through */ /* no break */

				case 1:
					ACCUM (ap - 7, bp + 7);
				/* fall through */ /* no break */

				case 2:
					ACCUM (ap - 6, bp + 6);
				/* fall through */ /* no break */

				case 3:
					ACCUM (ap - 5, bp + 5);
				/* fall through */ /* no break */

				case 4:
					ACCUM (ap - 4, bp + 4);
				/* fall through */ /* no break */

				case 5:
					ACCUM (ap - 3, bp + 3);
				/* fall through */ /* no break */

				case 6:
					ACCUM (ap - 2, bp + 2);
				/* fall through */ /* no break */

				case 7:
					ACCUM (ap - 1, bp + 1);
				/* fall through */ /* no break */

				case 8:
					ACCUM (ap - 0, bp + 0);	// j = 0
					/* fall through */ /* no break */
			}
#endif	// SMALL_CODE not defined

			// The total value is
			// w + u_accum << (32 *i) + cum_carry << (32 * i + 64).
			// The steps from here to the end of the i-loop (not counting
			// squaring branch) and the increment of i by the loop
			// maintain the invariant that the value is constant.
			// (Assume w had been initialized to zero, even though we
			// really didn't.)

#ifdef ARM7_ASM
			w[i] = sum0;
			sum0 = sum1;
			sum1 = cum_carry;
			cum_carry = 0;
#else
			w[i] = u_accum & 0xffffffffULL;
			u_accum = (u_accum >> 32) + ((uint64_t) cum_carry << 32);
			cum_carry = 0;
#endif
		}
	}
	else {
		// squaring

#ifdef SPECIAL_SQUARE
		// a[i] * a[j] + a[j] * a[i] == 2 * (a[i] * a[j]), so
		// we can cut the number of multiplies nearly in half.
		for (i = 0; i < 2 * a_words - 1; ++i) {
			// Run j over all possible values such that
			// 0 <= j < a_words && 0 <= i-j < a_words && j < i-j
			// Hence
			// j >= 0 and j > i - a_words and
			// j < a_words and 2*j < i
			//
			maxj = MIN (a_words - 1, i);
			// Only go half way.  Must use (i-1)>> 1, not (i-1)/ 2
			maxj = MIN (maxj, (i - 1) >> 1);
			minj = MAX (0, i - a_words + 1);
#ifdef SMALL_CODE
			for (j = minj; j <= maxj; ++j) {
				ACCUMDBL (a->data + i - j, a->data + j);
			}
			// j live
			if ((i & 1) == 0) {
				ACCUM (a->data + j, a->data + j);
			}
#else	// SMALL_CODE not defined
			ap = &a->data[i - minj];
			bp = &a->data[minj];

			switch (8 - (maxj - minj)) {
				case 0:
					ACCUMDBL (ap - 8, bp + 8);	// j = 8
				/* fall through */ /* no break */

				case 1:
					ACCUMDBL (ap - 7, bp + 7);
				/* fall through */ /* no break */

				case 2:
					ACCUMDBL (ap - 6, bp + 6);
				/* fall through */ /* no break */

				case 3:
					ACCUMDBL (ap - 5, bp + 5);
				/* fall through */ /* no break */

				case 4:
					ACCUMDBL (ap - 4, bp + 4);
				/* fall through */ /* no break */

				case 5:
					ACCUMDBL (ap - 3, bp + 3);
				/* fall through */ /* no break */

				case 6:
					ACCUMDBL (ap - 2, bp + 2);
				/* fall through */ /* no break */

				case 7:
					ACCUMDBL (ap - 1, bp + 1);
				/* fall through */ /* no break */

				case 8:
					ACCUMDBL (ap - 0, bp + 0);	// j = 0
					/* fall through */ /* no break */
			}

			// Even numbered columns (zero based) have a middle element.
			if ((i & 1) == 0) {
				ACCUM (a->data + maxj + 1, a->data + maxj + 1);
			}
#endif	// SMALL_CODE not defined

			// The total value is
			// w + u_accum << (32 *i) + cum_carry << (32 * i + 64).
			// The steps from here to the end of i-loop and
			// the increment of i by the loop maintain the invariant
			// that the total value is unchanged.
			// (Assume w had been initialized to zero, even though we
			//  really didn't.)
#ifdef ARM7_ASM
			w[i] = sum0;
			sum0 = sum1;
			sum1 = cum_carry;
			cum_carry = 0;
#else	// ARM7_ASM not defined
			w[i] = u_accum & 0xffffffffULL;
			u_accum = (u_accum >> 32) + ((uint64_t) cum_carry << 32);
			cum_carry = 0;
#endif	// ARM7_ASM not defined
		}
#endif	// SPECIAL_SQUARE
	}	// false branch of NO_SPECIAL_SQUARE || (a != b)

	// The total value as indicated above is maintained invariant
	// down to the approximate reduction code below.

	// propagate any residual to next to end of array
	for (; i < 2 * BIGLEN - 1; ++i) {
#ifdef ARM7_ASM
		w[i] = sum0;
		sum0 = sum1;
		sum1 = 0;
#else
		w[i] = u_accum & 0xffffffffULL;
		u_accum >>= 32;
#endif
	}
	// i is still live
	// from here on, think of w as containing signed values

	// Last value of the array, still using i.  We store the entire 64
	// bits.  There are two reasons for this.  The pedantic one is that
	// this clearly maintains our invariant that the value has not
	// changed.  The other one is that this makes w[BIGNUM-1] negative
	// if the result was negative, and reduction depends on this.

#ifdef ARM7_ASM
	w[i] = ((uint64_t) sum1 << 32) | sum0;
	// sum1 = sum0 = 0;  maintain invariant
#else
	w[i] = u_accum;
	// u_accum = 0; maintain invariant
#endif
	//
	// Apply correction if a or b are negative.  It would be nice to
	// put this inside the i-loop to reduce memory bandwidth.  Later...
	//
	// signvedval(a) = unsignedval(a) - 2^(32*BIGLEN)*isneg(a).
	//
	// so signval(a) * signedval(b) = unsignedval(a) * unsignedval[b] -
	//   isneg(a) * unsignedval(b) * 2^(32*BIGLEN) -
	//   isneg(b) * unsingedval(a) * 2^ (32*BIGLEN) +
	//   isneg(a) * isneg(b) * 2 ^(2 * 32 * BIGLEN)
	//
	// If one arg is zero and the other is negative, obviously no
	// correction is needed, but we do not make a special case, since
	// the "correction" only adds in zero.

	if (big_is_negative (a)) {
		for (i = 0; i < BIGLEN; ++i) {
			w[i + BIGLEN] -= b->data[i];
		}
	}
	if (big_is_negative (b)) {
		for (i = 0; i < BIGLEN; ++i) {
			w[i + BIGLEN] -= a->data[i];
		}
		if (big_is_negative (a)) {
			// both negative
			w[2 * BIGLEN - 1] += 1ULL << 32;
		}
	}
	//
	// The code from here to the end of the function maintains w mod
	// modulusP constant, even though it changes the value of w.
	//

	// reduce (approximate)
	if (MODSELECT == MOD_MODULUS) {
		for (i = 2 * BIGLEN - 1; i >= MSW; --i) {
			int64_t v;

			v = w[i];
			if (v != 0) {
				w[i] = 0;
				w[i - 1] += v;
				w[i - 2] -= v;
				w[i - 5] -= v;
				w[i - 8] += v;
			}
		}
	}
	else {
		// modulo order.  Not performance critical
#if ECDSA_SIGN || ECDSA_VERIFY

		int64_t carry;

		// convert to 32 bit values, except for most signifiant word
		carry = 0;
		for (i = 0; i < 2 * BIGLEN - 1; ++i) {
			w[i] += carry;
			carry = w[i] >> 32;
			w[i] -= carry << 32;
		}
		// i is live
		w[i] += carry;

		// each iteration knocks off word i
		for (i = 2 * BIGLEN - 1; i >= MSW; --i) {	// most to least significant
			int64_t v;
			int64_t tmp;
			int64_t tmp2;
			int j;
			int k;

			for (k = 0; w[i] != 0 && k < 3; ++k) {
				v = w[i];
				carry = 0;
				for (j = i - MSW; j < 2 * BIGLEN; ++j) {
					if (j <= i) {
						tmp2 = -(v * orderDBL.data[j - i + MSW]);
						tmp = w[j] + tmp2 + carry;
					}
					else {
						tmp = w[j] + carry;
					}
					if (j < 2 * BIGLEN - 1) {
						carry = tmp >> 32;
						tmp -= carry << 32;
					}
					else {
						carry = 0;
					}
					w[j] = tmp;
				}
			}
		}
#endif	//  ECDSA_SIGN || ECDSA_VERIFY
	}
	// propagate carries and copy out to tgt in 32 bit chunks.
	s_accum = 0;
	for (i = 0; i < BIGLEN; ++i) {
		s_accum += w[i];
		tgt->data[i] = (uint32_t) s_accum;
		s_accum >>= 32;	// signed, so sign bit propagates
	}
	// final approximate reduction

	if (MODSELECT == MOD_MODULUS) {
		big_approx_reduceP (tgt, tgt);
	}
	else {
#ifdef ECDSA
		if (tgt->data[MSW]) {
			// Keep it simple! At one time all this was done in place,
			// and was totally non-obvious.
			bigval_t tmp;

			// The most significant word is signed, even though the
			// whole array has declared uint32_t.
			big_1wd_mpy (&tmp, &orderP, (int32_t) tgt->data[MSW]);
			big_sub (tgt, tgt, &tmp);
		}
#endif	// ECDSA
	}
}