crypto/fipsmodule/ec/ec_nistp.c (395 lines of code) (raw):
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
// In this file we will implement elliptic curve point operations for
// NIST curves P-256, P-384, and P-521. The idea is to implement the operations
// in a generic way such that the code can be reused instead of having
// a separate implementation for each of the curves. We implement:
// 1. point addition,
// 2. point doubling,
// 3. scalar multiplication of a base point,
// 4. scalar multiplication of an arbitrary point,
// 5. scalar multiplication of a base and an arbitrary point.
//
// Matrix of what has been done so far:
//
// | op | P-521 | P-384 | P-256 |
// |----------------------------|
// | 1. | x | x | x* |
// | 2. | x | x | x* |
// | 3. | x | x | x* |
// | 4. | x | x | x* |
// | 5. | x | x | x* |
// * For P-256, only the Fiat-crypto implementation in p256.c is replaced.
#include "ec_nistp.h"
// Some of the functions below need temporary field element variables.
// To avoid dynamic allocation we define nistp_felem type to have the maximum
// size possible (which is currently P-521 curve). The values are hard-coded
// for the moment, this will be fixed when we migrate the whole P-521
// implementation to ec_nistp.c.
#if defined(EC_NISTP_USE_64BIT_LIMB)
#define FELEM_MAX_NUM_OF_LIMBS (9)
#else
#define FELEM_MAX_NUM_OF_LIMBS (19)
#endif
typedef ec_nistp_felem_limb ec_nistp_felem[FELEM_MAX_NUM_OF_LIMBS];
// Conditional copy in constant-time (out = t == 0 ? z : nz).
static void cmovznz(ec_nistp_felem_limb *out,
size_t num_limbs,
ec_nistp_felem_limb t,
const ec_nistp_felem_limb *z,
const ec_nistp_felem_limb *nz) {
ec_nistp_felem_limb mask = constant_time_is_zero_w(t);
for (size_t i = 0; i < num_limbs; i++) {
out[i] = constant_time_select_w(mask, z[i], nz[i]);
}
}
// Group operations
// ----------------
//
// Building on top of the field operations we have the operations on the
// elliptic curve group itself. Points on the curve are represented in Jacobian
// coordinates.
//
// ec_nistp_point_double calculates 2*(x_in, y_in, z_in)
//
// The method is based on:
// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
// for which there is a Coq transcription and correctness proof:
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
//
// However, we slighty changed the computation for efficiency (see the full
// explanation within the function body), which makes the Coq proof above
// not applicable to our implementation.
// TODO(awslc): Write a Coq correctness proof for our version of the algorithm.
//
// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed;
// while x_out == y_in is not (maybe this works, but it's not tested).
void ec_nistp_point_double(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *x_out,
ec_nistp_felem_limb *y_out,
ec_nistp_felem_limb *z_out,
const ec_nistp_felem_limb *x_in,
const ec_nistp_felem_limb *y_in,
const ec_nistp_felem_limb *z_in) {
ec_nistp_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
// delta = z^2
ctx->felem_sqr(delta, z_in);
// gamma = y^2
ctx->felem_sqr(gamma, y_in);
// beta = x*gamma
ctx->felem_mul(beta, x_in, gamma);
// alpha = 3*(x-delta)*(x+delta)
ctx->felem_sub(ftmp, x_in, delta);
ctx->felem_add(ftmp2, x_in, delta);
ctx->felem_add(tmptmp, ftmp2, ftmp2);
ctx->felem_add(ftmp2, ftmp2, tmptmp);
ctx->felem_mul(alpha, ftmp, ftmp2);
// x' = alpha^2 - 8*beta
ctx->felem_sqr(x_out, alpha);
ctx->felem_add(fourbeta, beta, beta);
ctx->felem_add(fourbeta, fourbeta, fourbeta);
ctx->felem_add(tmptmp, fourbeta, fourbeta);
ctx->felem_sub(x_out, x_out, tmptmp);
// z' = (y + z)^2 - gamma - delta
// The following calculation differs from the Coq proof cited above.
// The proof is for:
// add(delta, gamma, delta);
// add(ftmp, y_in, z_in);
// square(z_out, ftmp);
// sub(z_out, z_out, delta);
// Our operations sequence is a bit more efficient because it saves us
// a certain number of conditional moves.
ctx->felem_add(ftmp, y_in, z_in);
ctx->felem_sqr(z_out, ftmp);
ctx->felem_sub(z_out, z_out, gamma);
ctx->felem_sub(z_out, z_out, delta);
// y' = alpha*(4*beta - x') - 8*gamma^2
ctx->felem_sub(y_out, fourbeta, x_out);
ctx->felem_add(gamma, gamma, gamma);
ctx->felem_sqr(gamma, gamma);
ctx->felem_mul(y_out, alpha, y_out);
ctx->felem_add(gamma, gamma, gamma);
ctx->felem_sub(y_out, y_out, gamma);
}
// ec_nistp_point_add calculates (x1, y1, z1) + (x2, y2, z2)
//
// The method is taken from:
// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl
// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
//
// Coq transcription and correctness proof:
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L467>
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L544>
//
// This function includes a branch for checking whether the two input points
// are equal, (while not equal to the point at infinity). This case should
// never happen during single point multiplication, so there is no timing leak
// for ECDH and ECDSA.
void ec_nistp_point_add(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *x3,
ec_nistp_felem_limb *y3,
ec_nistp_felem_limb *z3,
const ec_nistp_felem_limb *x1,
const ec_nistp_felem_limb *y1,
const ec_nistp_felem_limb *z1,
const int mixed,
const ec_nistp_felem_limb *x2,
const ec_nistp_felem_limb *y2,
const ec_nistp_felem_limb *z2) {
ec_nistp_felem x_out, y_out, z_out;
ec_nistp_felem_limb z1nz = ctx->felem_nz(z1);
ec_nistp_felem_limb z2nz = ctx->felem_nz(z2);
// z1z1 = z1**2
ec_nistp_felem z1z1;
ctx->felem_sqr(z1z1, z1);
ec_nistp_felem u1, s1, two_z1z2;
if (!mixed) {
// z2z2 = z2**2
ec_nistp_felem z2z2;
ctx->felem_sqr(z2z2, z2);
// u1 = x1*z2z2
ctx->felem_mul(u1, x1, z2z2);
// two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
ctx->felem_add(two_z1z2, z1, z2);
ctx->felem_sqr(two_z1z2, two_z1z2);
ctx->felem_sub(two_z1z2, two_z1z2, z1z1);
ctx->felem_sub(two_z1z2, two_z1z2, z2z2);
// s1 = y1 * z2**3
ctx->felem_mul(s1, z2, z2z2);
ctx->felem_mul(s1, s1, y1);
} else {
// We'll assume z2 = 1 (special case z2 = 0 is handled later).
// u1 = x1*z2z2
OPENSSL_memcpy(u1, x1, ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb));
// two_z1z2 = 2z1z2
ctx->felem_add(two_z1z2, z1, z1);
// s1 = y1 * z2**3
OPENSSL_memcpy(s1, y1, ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb));
}
// u2 = x2*z1z1
ec_nistp_felem u2;
ctx->felem_mul(u2, x2, z1z1);
// h = u2 - u1
ec_nistp_felem h;
ctx->felem_sub(h, u2, u1);
ec_nistp_felem_limb xneq = ctx->felem_nz(h);
// z_out = two_z1z2 * h
ctx->felem_mul(z_out, h, two_z1z2);
// z1z1z1 = z1 * z1z1
ec_nistp_felem z1z1z1;
ctx->felem_mul(z1z1z1, z1, z1z1);
// s2 = y2 * z1**3
ec_nistp_felem s2;
ctx->felem_mul(s2, y2, z1z1z1);
// r = (s2 - s1)*2
ec_nistp_felem r;
ctx->felem_sub(r, s2, s1);
ctx->felem_add(r, r, r);
ec_nistp_felem_limb yneq = ctx->felem_nz(r);
// This case will never occur in the constant-time |ec_GFp_mont_mul|.
ec_nistp_felem_limb is_nontrivial_double =
constant_time_is_zero_w(xneq | yneq) &
~constant_time_is_zero_w(z1nz) &
~constant_time_is_zero_w(z2nz);
if (constant_time_declassify_w(is_nontrivial_double)) {
ec_nistp_point_double(ctx, x3, y3, z3, x1, y1, z1);
return;
}
// I = (2h)**2
ec_nistp_felem i;
ctx->felem_add(i, h, h);
ctx->felem_sqr(i, i);
// J = h * I
ec_nistp_felem j;
ctx->felem_mul(j, h, i);
// V = U1 * I
ec_nistp_felem v;
ctx->felem_mul(v, u1, i);
// x_out = r**2 - J - 2V
ctx->felem_sqr(x_out, r);
ctx->felem_sub(x_out, x_out, j);
ctx->felem_sub(x_out, x_out, v);
ctx->felem_sub(x_out, x_out, v);
// y_out = r(V-x_out) - 2 * s1 * J
ctx->felem_sub(y_out, v, x_out);
ctx->felem_mul(y_out, y_out, r);
ec_nistp_felem s1j;
ctx->felem_mul(s1j, s1, j);
ctx->felem_sub(y_out, y_out, s1j);
ctx->felem_sub(y_out, y_out, s1j);
cmovznz(x_out, ctx->felem_num_limbs, z1nz, x2, x_out);
cmovznz(y_out, ctx->felem_num_limbs, z1nz, y2, y_out);
cmovznz(z_out, ctx->felem_num_limbs, z1nz, z2, z_out);
cmovznz(x3, ctx->felem_num_limbs, z2nz, x1, x_out);
cmovznz(y3, ctx->felem_num_limbs, z2nz, y1, y_out);
cmovznz(z3, ctx->felem_num_limbs, z2nz, z1, z_out);
}
// Returns i-th bit of the scalar (zero or one).
// The caller is responsible for making sure i is within bounds of the scalar.
static int16_t get_bit(const EC_SCALAR *in, size_t i) {
// |in->words| is an array of BN_ULONGs which can be either 8 or 4 bytes long.
#if defined(OPENSSL_64_BIT)
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) == 8, bn_ulong_not_eight_bytes);
return (in->words[i >> 6] >> (i & 63)) & 1;
#else
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) == 4, bn_ulong_not_four_bytes);
return (in->words[i >> 5] >> (i & 31)) & 1;
#endif
}
#define DIV_AND_CEIL(a, b) ((a + b - 1) / b)
// Compute "regular" wNAF representation of a scalar, see
// Joye, Tunstall, "Exponent Recoding and Regular Exponentiation Algorithms",
// AfricaCrypt 2009, Alg 6.
// It forces an odd scalar and outputs digits in
// {\pm 1, \pm 3, \pm 5, \pm 7, \pm 9, ...}
// i.e. signed odd digits with _no zeroes_ -- that makes it "regular".
static void scalar_rwnaf(int16_t *out, size_t window_size,
const EC_SCALAR *scalar, size_t scalar_bit_size) {
assert(window_size < 14);
// The assert above ensures this works correctly.
const int16_t window_mask = (1 << (window_size + 1)) - 1;
int16_t window = (int16_t)(scalar->words[0] & (BN_ULONG)window_mask);
window |= 1;
const size_t num_windows = DIV_AND_CEIL(scalar_bit_size, window_size);
for (size_t i = 0; i < num_windows - 1; i++) {
int16_t d = (window & window_mask) - (int16_t)(1 << window_size);
out[i] = d;
window = (window - d) >> window_size;
for (size_t j = 1; j <= window_size; j++) {
size_t idx = (i + 1) * window_size + j;
if (idx < scalar_bit_size) {
window |= get_bit(scalar, idx) << j;
}
}
}
out[num_windows - 1] = window;
}
// The window size for scalar multiplication is hard coded for now.
#define SCALAR_MUL_WINDOW_SIZE (5)
#define SCALAR_MUL_TABLE_NUM_POINTS (1 << (SCALAR_MUL_WINDOW_SIZE - 1))
// To avoid dynamic allocation and freeing of memory in functions below
// we define maximum values of certain variables.
//
// The maximum number of limbs the table in |ec_nistp_scalar_mul| can have.
// Each point in the table has 3 coordinates that are field elements,
// and each field element has a defined maximum number of limbs.
#define SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS \
(SCALAR_MUL_TABLE_NUM_POINTS * 3 * FELEM_MAX_NUM_OF_LIMBS)
// The maximum number of bits for a scalar.
#define SCALAR_MUL_MAX_SCALAR_BITS (521)
// Maximum number of windows (digits) for a scalar encoding which is
// determined by the maximum scalar bit size -- 521 bits in our case.
#define SCALAR_MUL_MAX_NUM_WINDOWS \
DIV_AND_CEIL(SCALAR_MUL_MAX_SCALAR_BITS, SCALAR_MUL_WINDOW_SIZE)
// Generate table of multiples of the input point P = (x_in, y_in, z_in):
// table <-- [2i + 1]P for i in [0, SCALAR_MUL_TABLE_NUM_POINTS - 1].
static void generate_table(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *table,
const ec_nistp_felem_limb *x_in,
const ec_nistp_felem_limb *y_in,
const ec_nistp_felem_limb *z_in)
{
const size_t felem_num_limbs = ctx->felem_num_limbs;
const size_t felem_num_bytes = felem_num_limbs * sizeof(ec_nistp_felem_limb);
// Helper variables to access individual coordinates of a point.
const size_t x_idx = 0;
const size_t y_idx = felem_num_limbs;
const size_t z_idx = felem_num_limbs * 2;
// table[0] <-- P.
OPENSSL_memcpy(&table[x_idx], x_in, felem_num_bytes);
OPENSSL_memcpy(&table[y_idx], y_in, felem_num_bytes);
OPENSSL_memcpy(&table[z_idx], z_in, felem_num_bytes);
// Compute 2P.
ec_nistp_felem x_in_dbl, y_in_dbl, z_in_dbl;
ctx->point_dbl(x_in_dbl, y_in_dbl, z_in_dbl,
&table[x_idx], &table[y_idx], &table[z_idx]);
// Compute the rest of the table.
for (size_t i = 1; i < SCALAR_MUL_TABLE_NUM_POINTS; i++) {
// Just getting pointers to i-th and (i-1)-th point in the table.
ec_nistp_felem_limb *point_i = &table[i * 3 * felem_num_limbs];
ec_nistp_felem_limb *point_im1 = &table[(i - 1) * 3 * felem_num_limbs];
// table[i] <-- table[i - 1] + 2P
ctx->point_add(&point_i[x_idx], &point_i[y_idx], &point_i[z_idx],
&point_im1[x_idx], &point_im1[y_idx], &point_im1[z_idx],
0, x_in_dbl, y_in_dbl, z_in_dbl);
}
}
// Writes to out the idx-th point from table in constant-time.
static inline void select_point_from_table(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *out,
const ec_nistp_felem_limb *table,
const size_t idx,
const size_t projective) {
// if projective != 0 then a point is (x, y, z), otherwise (x, y).
size_t point_num_coord = 2 + (projective != 0 ? 1 : 0);
size_t point_num_limbs = ctx->felem_num_limbs * point_num_coord;
// The ifdef branching below is temporary. Using only constant_..._table_8
// would be best for simplicity, but unfortunatelly, on x86 systems it is
// significantly slower than constant_..._table_w.
#if defined(EC_NISTP_USE_64BIT_LIMB) && defined(OPENSSL_64_BIT)
constant_time_select_entry_from_table_w(out, (crypto_word_t*) table, idx,
SCALAR_MUL_TABLE_NUM_POINTS, point_num_limbs);
#else
size_t entry_size = point_num_limbs * sizeof(ec_nistp_felem_limb);
constant_time_select_entry_from_table_8((uint8_t*)out, (uint8_t*)table,
idx, SCALAR_MUL_TABLE_NUM_POINTS, entry_size);
#endif
}
// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
// The product is computed with the use of a small table generated on-the-fly
// and the scalar recoded in the regular-wNAF representation.
//
// The precomputed (on-the-fly) table |table| holds odd multiples of P:
// [2i + 1]P for i in [0, SCALAR_MUL_TABLE_NUM_POINTS - 1].
// Computing the negation of a point P = (x, y, z) is relatively easy:
// -P = (x, -y, z),
// so we may assume that for each point we have its negative as well.
//
// The scalar is recoded (regular-wNAF encoding) into signed digits as explained
// in |scalar_rwnaf| function. Namely, for a window size |w| we have:
// scalar' = s_0 + s_1*2^w + s_2*2^(2*w) + ... + s_{m-1}*2^((m-1)*w),
// where digits s_i are in [\pm 1, \pm 3, ..., \pm (2^w-1)] and
// m = ceil(scalar_bit_size / w). Note that for an odd scalar we have that
// scalar = scalar', while in the case of an even scalar we have that
// scalar = scalar' - 1.
//
// The required product, [scalar]P, is computed by the following algorithm.
// 1. Initialize the accumulator with the point from |table|
// corresponding to the most significant digit s_{m-1} of the scalar.
// 2. For digits s_i starting from s_{m-2} down to s_0:
// 3. Double the accumulator w times. (note that doubling a point [a]P
// w times results in [2^w*a]P).
// 4. Read from |table| the point corresponding to abs(s_i),
// negate it if s_i is negative, and add it to the accumulator.
// 5. Subtract P from the result if the scalar is even.
//
// Note: this function is constant-time.
void ec_nistp_scalar_mul(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *x_out,
ec_nistp_felem_limb *y_out,
ec_nistp_felem_limb *z_out,
const ec_nistp_felem_limb *x_in,
const ec_nistp_felem_limb *y_in,
const ec_nistp_felem_limb *z_in,
const EC_SCALAR *scalar) {
// Make sure that the max table size is large enough.
assert(SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS >=
SCALAR_MUL_TABLE_NUM_POINTS * ctx->felem_num_limbs * 3);
// Table of multiples of P = (x_in, y_in, z_in).
ec_nistp_felem_limb table[SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS];
generate_table(ctx, table, x_in, y_in, z_in);
// Regular-wNAF encoding of the scalar.
int16_t rwnaf[SCALAR_MUL_MAX_NUM_WINDOWS];
scalar_rwnaf(rwnaf, SCALAR_MUL_WINDOW_SIZE, scalar, ctx->felem_num_bits);
// We need two point accumulators, so we define them of maximum size
// to avoid allocation, and just take pointers to individual coordinates.
// (This cruft will dissapear when we refactor point_add/dbl to work with
// whole points instead of individual coordinates).
ec_nistp_felem_limb res[3 * FELEM_MAX_NUM_OF_LIMBS];
ec_nistp_felem_limb tmp[3 * FELEM_MAX_NUM_OF_LIMBS];
ec_nistp_felem_limb *x_res = &res[0];
ec_nistp_felem_limb *y_res = &res[ctx->felem_num_limbs];
ec_nistp_felem_limb *z_res = &res[ctx->felem_num_limbs * 2];
ec_nistp_felem_limb *x_tmp = &tmp[0];
ec_nistp_felem_limb *y_tmp = &tmp[ctx->felem_num_limbs];
ec_nistp_felem_limb *z_tmp = &tmp[ctx->felem_num_limbs * 2];
// The actual number of windows (digits) of the scalar (denoted by m in the
// description above the function).
const size_t num_windows = DIV_AND_CEIL(ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
// Step 1. Initialize the accmulator (res) with the input point multiplied by
// the most significant digit of the scalar s_{m-1} (note that this digit
// can't be negative).
int16_t idx = rwnaf[num_windows - 1];
idx >>= 1;
select_point_from_table(ctx, res, table, idx, 1);
// Step 2. Process the remaining digits of the scalar (s_{m-2} to s_0).
for (int i = num_windows - 2; i >= 0; i--) {
// Step 3. Double the accumulator w times.
for (size_t j = 0; j < SCALAR_MUL_WINDOW_SIZE; j++) {
ctx->point_dbl(x_res, y_res, z_res, x_res, y_res, z_res);
}
// Step 4a. Compute abs(s_i).
int16_t d = rwnaf[i];
int16_t is_neg = (d >> 15) & 1; // is_neg = (d < 0) ? 1 : 0
d = (d ^ -is_neg) + is_neg; // d = abs(d)
// Step 4b. Select from table the point corresponding to abs(s_i).
idx = d >> 1;
select_point_from_table(ctx, tmp, table, idx, 1);
// Step 4c. Negate the point if s_i < 0.
ec_nistp_felem ftmp;
ctx->felem_neg(ftmp, y_tmp);
cmovznz(y_tmp, ctx->felem_num_limbs, is_neg, y_tmp, ftmp);
// Step 4d. Add the point to the accumulator.
ctx->point_add(x_res, y_res, z_res, x_res, y_res, z_res, 0, x_tmp, y_tmp, z_tmp);
}
// Step 5a. Negate the input point P (we negate it in-place since we already
// have it stored as the first entry in the table).
ec_nistp_felem_limb *x_mp = &table[0];
ec_nistp_felem_limb *y_mp = &table[ctx->felem_num_limbs];
ec_nistp_felem_limb *z_mp = &table[ctx->felem_num_limbs * 2];
ctx->felem_neg(y_mp, y_mp);
// Step 5b. Subtract P from the accumulator.
ctx->point_add(x_tmp, y_tmp, z_tmp, x_res, y_res, z_res, 0, x_mp, y_mp, z_mp);
// Step 5c. Select |res| or |res - P| based on parity of the scalar.
ec_nistp_felem_limb t = scalar->words[0] & 1;
cmovznz(x_out, ctx->felem_num_limbs, t, x_tmp, x_res);
cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
}
// Multiplication of the base point G of the curve with the given scalar.
// The product is computed with the Comb method using a precomputed table
// and the regular-wNAF scalar encoding.
//
// While the algorithm is generic and works for different curves, window sizes,
// and scalar sizes, for clarity, we describe it by using the example of P-521.
//
// The precomputed table has 27 sub-tables each holding 16 points:
//
// 0 : [1]G, [3]G, ..., [31]G
// 1 : [1*2^20]G, [3*2^20]G, ..., [31*2^20]G
// ...
// i : [1*2^20i]G, [3*2^20i]G, ..., [31*2^20i]G
// ...
// 26 : [2^520]G, [3*2^520]G, ..., [31*2^520]G
// Computing the negation of a point P = (x, y) is relatively easy:
// -P = (x, -y).
// So we may assume that for each sub-table we have 32 points instead of 16:
// [\pm 1*2^20i]G, [\pm 3*2^20i]G, ..., [\pm 31*2^20i]G.
//
// The 521-bit |scalar| is recoded (regular-wNAF encoding) into 105 signed
// digits, each of length 5 bits, as explained in the
// |p521_felem_mul_scalar_rwnaf| function. Namely,
// scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_104*2^520,
// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
// scalar we have that scalar = scalar', while in the case of an even
// scalar we have that scalar = scalar' - 1.
//
// To compute the required product, [scalar]G, we may do the following.
// Group the recoded digits of the scalar in 4 groups:
// | corresponding multiples in
// digits | the recoded representation
// -------------------------------------------------------------------------
// (0): {s_0, s_4, s_8, ..., s_100, s_104} | { 2^0, 2^20, ..., 2^500, 2^520}
// (1): {s_1, s_5, s_9, ..., s_101} | { 2^5, 2^25, ..., 2^505}
// (2): {s_2, s_6, s_10, ..., s_102} | {2^10, 2^30, ..., 2^510}
// (3): {s_3, s_7, s_11, ..., s_103} | {2^15, 2^35, ..., 2^515}
// corresponding sub-table lookup | { T0, T1, ..., T25, T26}
//
// The group (0) digits correspond precisely to the multiples of G that are
// held in the 27 precomputed sub-tables, so we may simply read the appropriate
// points from the sub-tables and sum them all up (negating if needed, i.e., if
// a digit s_i is negative, we read the point corresponding to the abs(s_i) and
// negate it before adding it to the sum).
// The remaining three groups (1), (2), and (3), correspond to the multiples
// of G from the sub-tables multiplied additionally by 2^5, 2^10, and 2^15,
// respectively. Therefore, for these groups we may read the appropriate points
// from the table, double them 5, 10, or 15 times, respectively, and add them
// to the final result.
//
// To minimize the number of required doubling operations we process the digits
// of the scalar from left to right. In other words, the algorithm is:
// 1. For group (i) in this order (3, 2, 1, 0):
// 2. Double the accumulator 5 times except in the first iteration.
// 3. Read the points corresponding to the group (i) digits from the tables
// and add them to an accumulator.
// 4. If the scalar is even subtract G from the accumulator.
//
// Note: this function is designed to be constant-time.
void ec_nistp_scalar_mul_base(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *x_out,
ec_nistp_felem_limb *y_out,
ec_nistp_felem_limb *z_out,
const EC_SCALAR *scalar) {
// Regular-wNAF encoding of the scalar.
int16_t rwnaf[SCALAR_MUL_MAX_NUM_WINDOWS];
scalar_rwnaf(rwnaf, SCALAR_MUL_WINDOW_SIZE, scalar, ctx->felem_num_bits);
size_t num_windows = DIV_AND_CEIL(ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
// We need two point accumulators, so we define them of maximum size
// to avoid allocation, and just take pointers to individual coordinates.
// (This cruft will disapear when we refactor point_add/dbl to work with
// whole points instead of individual coordinates).
ec_nistp_felem_limb res[3 * FELEM_MAX_NUM_OF_LIMBS] = {0};
ec_nistp_felem_limb tmp[3 * FELEM_MAX_NUM_OF_LIMBS] = {0};
ec_nistp_felem_limb *x_res = &res[0];
ec_nistp_felem_limb *y_res = &res[ctx->felem_num_limbs];
ec_nistp_felem_limb *z_res = &res[ctx->felem_num_limbs * 2];
ec_nistp_felem_limb *x_tmp = &tmp[0];
ec_nistp_felem_limb *y_tmp = &tmp[ctx->felem_num_limbs];
ec_nistp_felem_limb *z_tmp = &tmp[ctx->felem_num_limbs * 2];
// Process the 4 groups of digits starting from group (3) down to group (0).
for (int i = 3; i >= 0; i--) {
// Double |res| 5 times in each iteration, except in the first one.
for (size_t j = 0; i != 3 && j < SCALAR_MUL_WINDOW_SIZE; j++) {
ctx->point_dbl(x_res, y_res, z_res, x_res, y_res, z_res);
}
// Process the digits in the current group from the most to the least
// significant one.
size_t start_idx = ((num_windows - i - 1) / 4) * 4 + i;
for (int j = start_idx; j >= 0; j -= 4) {
// For each digit |d| in the current group read the corresponding point
// from the table and add it to |res|. If |d| is negative, negate
// the point before adding it to |res|.
int16_t d = rwnaf[j];
int16_t is_neg = (d >> 15) & 1; // is_neg = (d < 0) ? 1 : 0
d = (d ^ -is_neg) + is_neg; // d = abs(d)
int16_t idx = d >> 1;
// Select the point to add, in constant time.
size_t point_num_limbs = 2 * ctx->felem_num_limbs; // Affine points.
size_t subtable_num_limbs = SCALAR_MUL_TABLE_NUM_POINTS * point_num_limbs;
size_t table_idx = (j / 4) * subtable_num_limbs;
const ec_nistp_felem_limb *table = &ctx->scalar_mul_base_table[table_idx];
select_point_from_table(ctx, tmp, table, idx, 0);
// Negate y coordinate of the point tmp = (x, y); ftmp = -y.
ec_nistp_felem ftmp;
ctx->felem_neg(ftmp, y_tmp);
cmovznz(y_tmp, ctx->felem_num_limbs, is_neg, y_tmp, ftmp);
// Add the point to the accumulator |res|.
ctx->point_add(x_res, y_res, z_res, x_res, y_res, z_res, 1,
x_tmp, y_tmp, ctx->felem_one);
}
}
// Conditionally subtract G if the scalar is even, in constant-time.
const ec_nistp_felem_limb *x_mp = &ctx->scalar_mul_base_table[0];
const ec_nistp_felem_limb *y_mp = &ctx->scalar_mul_base_table[ctx->felem_num_limbs];
ec_nistp_felem ftmp;
ctx->felem_neg(ftmp, y_mp);
// Subtract P from the accumulator.
ctx->point_add(x_tmp, y_tmp, z_tmp, x_res, y_res, z_res, 1, x_mp, ftmp, ctx->felem_one);
// Select |res| or |res - P| based on parity of the scalar.
ec_nistp_felem_limb t = scalar->words[0] & 1;
cmovznz(x_out, ctx->felem_num_limbs, t, x_tmp, x_res);
cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
}
// Computes [g_scalar]G + [p_scalar]P, where G is the base point of the curve
// curve, and P is the given point (x_p, y_p, z_p).
//
// Both scalar products are computed by the same "textbook" wNAF method,
// with w = 5 for g_scalar and w = 5 for p_scalar.
// For the base point G product we use the first sub-table of the precomputed
// table, while for P we generate the table on-the-fly. The tables hold the
// first 16 odd multiples of G or P:
// g_pre_comp = {[1]G, [3]G, ..., [31]G},
// p_pre_comp = {[1]P, [3]P, ..., [31]P}.
// Computing the negation of a point P = (x, y) is relatively easy:
// -P = (x, -y).
// So we may assume that we also have the negatives of the points in the tables.
//
// The scalars are recoded by the textbook wNAF method to digits, where a digit
// is either a zero or an odd integer in [-31, 31]. The method guarantees that
// each non-zero digit is followed by at least four zeroes.
//
// The result [g_scalar]G + [p_scalar]P is computed by the following algorithm:
// 1. Initialize the accumulator with the point-at-infinity.
// 2. For i starting from 521 down to 0:
// 3. Double the accumulator (doubling can be skipped while the
// accumulator is equal to the point-at-infinity).
// 4. Read from |p_pre_comp| the point corresponding to the i-th digit of
// p_scalar, negate it if the digit is negative, and add it to the
// accumulator.
// 5. Read from |g_pre_comp| the point corresponding to the i-th digit of
// g_scalar, negate it if the digit is negative, and add it to the
// accumulator.
// Note: this function is NOT constant-time.
void ec_nistp_scalar_mul_public(const ec_nistp_meth *ctx,
ec_nistp_felem_limb *x_out,
ec_nistp_felem_limb *y_out,
ec_nistp_felem_limb *z_out,
const EC_SCALAR *g_scalar,
const ec_nistp_felem_limb *x_p,
const ec_nistp_felem_limb *y_p,
const ec_nistp_felem_limb *z_p,
const EC_SCALAR *p_scalar) {
const size_t felem_num_bytes = ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb);
// Table of multiples of P.
ec_nistp_felem_limb p_table[SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS];
generate_table(ctx, p_table, x_p, y_p, z_p);
const size_t p_point_num_limbs = 3 * ctx->felem_num_limbs; // Projective.
// Table of multiples of G.
const ec_nistp_felem_limb *g_table = ctx->scalar_mul_base_table;
const size_t g_point_num_limbs = 2 * ctx->felem_num_limbs; // Affine.
// Recode the scalars.
int8_t p_wnaf[SCALAR_MUL_MAX_SCALAR_BITS + 1] = {0};
int8_t g_wnaf[SCALAR_MUL_MAX_SCALAR_BITS + 1] = {0};
ec_compute_wNAF(p_wnaf, p_scalar, ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
ec_compute_wNAF(g_wnaf, g_scalar, ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
// In the beginning res is set to point-at-infinity, so we set the flag.
int16_t res_is_inf = 1;
int16_t d, is_neg, idx;
ec_nistp_felem ftmp;
for (int i = ctx->felem_num_bits; i >= 0; i--) {
// If |res| is point-at-infinity there is no point in doubling so skip it.
if (!res_is_inf) {
ctx->point_dbl(x_out, y_out, z_out, x_out, y_out, z_out);
}
// Process the p_scalar digit.
d = p_wnaf[i];
if (d != 0) {
is_neg = d < 0 ? 1 : 0;
idx = (is_neg) ? (-d - 1) >> 1 : (d - 1) >> 1;
if (res_is_inf) {
// If |res| is point-at-infinity there is no need to add the new point,
// we can simply copy it.
const size_t table_idx = idx * p_point_num_limbs;
OPENSSL_memcpy(x_out, &p_table[table_idx], felem_num_bytes);
OPENSSL_memcpy(y_out, &p_table[table_idx + ctx->felem_num_limbs], felem_num_bytes);
OPENSSL_memcpy(z_out, &p_table[table_idx + ctx->felem_num_limbs * 2], felem_num_bytes);
res_is_inf = 0;
} else {
// Otherwise, add to the accumulator either the point at position idx
// in the table or its negation.
const ec_nistp_felem_limb *y_tmp;
y_tmp = &p_table[idx * p_point_num_limbs + ctx->felem_num_limbs];
if (is_neg) {
ctx->felem_neg(ftmp, y_tmp);
y_tmp = ftmp;
}
ctx->point_add(x_out, y_out, z_out, x_out, y_out, z_out, 0,
&p_table[idx * p_point_num_limbs],
y_tmp,
&p_table[idx * p_point_num_limbs + ctx->felem_num_limbs * 2]);
}
}
/* // Process the g_scalar digit. */
d = g_wnaf[i];
if (d != 0) {
is_neg = d < 0 ? 1 : 0;
idx = (is_neg) ? (-d - 1) >> 1 : (d - 1) >> 1;
if (res_is_inf) {
// If |res| is point-at-infinity there is no need to add the new point,
// we can simply copy it.
const size_t table_idx = idx * g_point_num_limbs;
OPENSSL_memcpy(x_out, &g_table[table_idx], felem_num_bytes);
OPENSSL_memcpy(y_out, &g_table[table_idx + ctx->felem_num_limbs], felem_num_bytes);
OPENSSL_memcpy(z_out, ctx->felem_one, felem_num_bytes);
res_is_inf = 0;
} else {
// Otherwise, add to the accumulator either the point at position idx
// in the table or its negation.
const ec_nistp_felem_limb *y_tmp ;
y_tmp = &g_table[idx * g_point_num_limbs + ctx->felem_num_limbs];
if (is_neg) {
ctx->felem_neg(ftmp, &g_table[idx * g_point_num_limbs + ctx->felem_num_limbs]);
y_tmp = ftmp;
}
ctx->point_add(x_out, y_out, z_out, x_out, y_out, z_out, 1,
&g_table[idx * g_point_num_limbs], y_tmp, ctx->felem_one);
}
}
}
}
void ec_nistp_point_to_coordinates(ec_nistp_felem_limb *x_out,
ec_nistp_felem_limb *y_out,
ec_nistp_felem_limb *z_out,
const ec_nistp_felem_limb *xyz_in,
size_t num_limbs_per_coord) {
size_t num_bytes_per_coord = num_limbs_per_coord * sizeof(ec_nistp_felem_limb);
OPENSSL_memcpy(x_out, xyz_in, num_bytes_per_coord);
OPENSSL_memcpy(y_out, &xyz_in[num_limbs_per_coord], num_bytes_per_coord);
OPENSSL_memcpy(z_out, &xyz_in[num_limbs_per_coord * 2], num_bytes_per_coord);
}
void ec_nistp_coordinates_to_point(ec_nistp_felem_limb *xyz_out,
const ec_nistp_felem_limb *x_in,
const ec_nistp_felem_limb *y_in,
const ec_nistp_felem_limb *z_in,
size_t num_limbs_per_coord) {
size_t num_bytes_per_coord = num_limbs_per_coord * sizeof(ec_nistp_felem_limb);
OPENSSL_memcpy(xyz_out, x_in, num_bytes_per_coord);
OPENSSL_memcpy(&xyz_out[num_limbs_per_coord], y_in, num_bytes_per_coord);
OPENSSL_memcpy(&xyz_out[num_limbs_per_coord * 2], z_in, num_bytes_per_coord);
}