crypto/fipsmodule/ec/p384.c (401 lines of code) (raw):
/*
------------------------------------------------------------------------------------
Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0 OR ISC
------------------------------------------------------------------------------------
*/
#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "../bn/internal.h"
#include "../cpucap/internal.h"
#include "../delocate.h"
#include "internal.h"
#include "ec_nistp.h"
#if !defined(OPENSSL_SMALL)
#if defined(EC_NISTP_USE_S2N_BIGNUM)
# include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#else
# if defined(EC_NISTP_USE_64BIT_LIMB)
# include "../../../third_party/fiat/p384_64.h"
# else
# include "../../../third_party/fiat/p384_32.h"
# endif
#endif
#if defined(EC_NISTP_USE_64BIT_LIMB)
#define P384_NLIMBS (6)
typedef uint64_t p384_limb_t;
typedef uint64_t p384_felem[P384_NLIMBS];
static const p384_felem p384_felem_one = {
0xffffffff00000001, 0xffffffff, 0x1, 0x0, 0x0, 0x0};
#else // 64BIT; else 32BIT
#define P384_NLIMBS (12)
typedef uint32_t p384_limb_t;
typedef uint32_t p384_felem[P384_NLIMBS];
static const p384_felem p384_felem_one = {
0x1, 0xffffffff, 0xffffffff, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
#endif // 64BIT
#if defined(EC_NISTP_USE_S2N_BIGNUM)
#define p384_felem_add(out, in0, in1) bignum_add_p384(out, in0, in1)
#define p384_felem_sub(out, in0, in1) bignum_sub_p384(out, in0, in1)
#define p384_felem_opp(out, in0) bignum_neg_p384(out, in0)
#define p384_felem_to_bytes(out, in0) bignum_tolebytes_6(out, in0)
#define p384_felem_from_bytes(out, in0) bignum_fromlebytes_6(out, in0)
#define p384_felem_to_mont(out, in0) bignum_tomont_p384_selector(out, in0)
#define p384_felem_from_mont(out, in0) bignum_deamont_p384_selector(out, in0)
#define p384_felem_mul(out, in0, in1) bignum_montmul_p384_selector(out, in0, in1)
#define p384_felem_sqr(out, in0) bignum_montsqr_p384_selector(out, in0)
static p384_limb_t p384_felem_nz(const p384_limb_t in1[P384_NLIMBS]) {
return bignum_nonzero_6(in1);
}
#else // EC_NISTP_USE_S2N_BIGNUM
// Fiat-crypto implementation of field arithmetic
#define p384_felem_add(out, in0, in1) fiat_p384_add(out, in0, in1)
#define p384_felem_sub(out, in0, in1) fiat_p384_sub(out, in0, in1)
#define p384_felem_opp(out, in0) fiat_p384_opp(out, in0)
#define p384_felem_mul(out, in0, in1) fiat_p384_mul(out, in0, in1)
#define p384_felem_sqr(out, in0) fiat_p384_square(out, in0)
#define p384_felem_to_mont(out, in0) fiat_p384_to_montgomery(out, in0)
#define p384_felem_from_mont(out, in0) fiat_p384_from_montgomery(out, in0)
#define p384_felem_to_bytes(out, in0) fiat_p384_to_bytes(out, in0)
#define p384_felem_from_bytes(out, in0) fiat_p384_from_bytes(out, in0)
static p384_limb_t p384_felem_nz(const p384_limb_t in1[P384_NLIMBS]) {
p384_limb_t ret;
fiat_p384_nonzero(&ret, in1);
return ret;
}
#endif // EC_NISTP_USE_S2N_BIGNUM
// The wrapper functions are needed for FIPS static build.
// Otherwise, initializing ec_nistp_meth with pointers to s2n-bignum
// functions directly generates :got: references that are also thought
// to be local_target by the delocator.
static inline void p384_felem_add_wrapper(ec_nistp_felem_limb *c,
const ec_nistp_felem_limb *a,
const ec_nistp_felem_limb *b) {
p384_felem_add(c, a, b);
}
static inline void p384_felem_sub_wrapper(ec_nistp_felem_limb *c,
const ec_nistp_felem_limb *a,
const ec_nistp_felem_limb *b) {
p384_felem_sub(c, a, b);
}
static inline void p384_felem_neg_wrapper(ec_nistp_felem_limb *c,
const ec_nistp_felem_limb *a) {
p384_felem_opp(c, a);
}
static void p384_from_generic(p384_felem out, const EC_FELEM *in) {
#ifdef OPENSSL_BIG_ENDIAN
uint8_t tmp[P384_EC_FELEM_BYTES];
bn_words_to_little_endian(tmp, P384_EC_FELEM_BYTES, in->words, P384_EC_FELEM_WORDS);
p384_felem_from_bytes(out, tmp);
#else
p384_felem_from_bytes(out, (const uint8_t *)in->words);
#endif
}
static void p384_to_generic(EC_FELEM *out, const p384_felem in) {
// This works because 384 is a multiple of 64, so there are no excess bytes to
// zero when rounding up to |BN_ULONG|s.
OPENSSL_STATIC_ASSERT(
384 / 8 == sizeof(BN_ULONG) * ((384 + BN_BITS2 - 1) / BN_BITS2),
p384_felem_to_bytes_leaves_bytes_uninitialized);
#ifdef OPENSSL_BIG_ENDIAN
uint8_t tmp[P384_EC_FELEM_BYTES];
p384_felem_to_bytes(tmp, in);
bn_little_endian_to_words(out->words, P384_EC_FELEM_WORDS, tmp, P384_EC_FELEM_BYTES);
#else
p384_felem_to_bytes((uint8_t *)out->words, in);
#endif
}
static void p384_from_scalar(p384_felem out, const EC_SCALAR *in) {
#ifdef OPENSSL_BIG_ENDIAN
uint8_t tmp[P384_EC_FELEM_BYTES];
bn_words_to_little_endian(tmp, P384_EC_FELEM_BYTES, in->words, P384_EC_FELEM_WORDS);
p384_felem_from_bytes(out, tmp);
#else
p384_felem_from_bytes(out, (const uint8_t *)in->words);
#endif
}
// p384_inv_square calculates |out| = |in|^{-2}
//
// Based on Fermat's Little Theorem:
// a^p = a (mod p)
// a^{p-1} = 1 (mod p)
// a^{p-3} = a^{-2} (mod p)
// p = 2^384 - 2^128 - 2^96 + 2^32 - 1
// Hexadecimal representation of p − 3:
// p-3 = ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe
// ffffffff 00000000 00000000 fffffffc
static void p384_inv_square(p384_felem out,
const p384_felem in) {
#if defined(EC_NISTP_USE_S2N_BIGNUM)
ec_nistp_felem_limb in_sqr[P384_NLIMBS];
p384_felem_sqr(in_sqr, in);
bignum_montinv_p384(out, in_sqr);
#else
// This implements the addition chain described in
// https://briansmith.org/ecc-inversion-addition-chains-01#p384_field_inversion
// The side comments show the value of the exponent:
// squaring the element => doubling the exponent
// multiplying by an element => adding to the exponent the power of that element
p384_felem x2, x3, x6, x12, x15, x30, x60, x120;
p384_felem_sqr(x2, in); // 2^2 - 2^1
p384_felem_mul(x2, x2, in); // 2^2 - 2^0
p384_felem_sqr(x3, x2); // 2^3 - 2^1
p384_felem_mul(x3, x3, in); // 2^3 - 2^0
p384_felem_sqr(x6, x3);
for (int i = 1; i < 3; i++) {
p384_felem_sqr(x6, x6);
} // 2^6 - 2^3
p384_felem_mul(x6, x6, x3); // 2^6 - 2^0
p384_felem_sqr(x12, x6);
for (int i = 1; i < 6; i++) {
p384_felem_sqr(x12, x12);
} // 2^12 - 2^6
p384_felem_mul(x12, x12, x6); // 2^12 - 2^0
p384_felem_sqr(x15, x12);
for (int i = 1; i < 3; i++) {
p384_felem_sqr(x15, x15);
} // 2^15 - 2^3
p384_felem_mul(x15, x15, x3); // 2^15 - 2^0
p384_felem_sqr(x30, x15);
for (int i = 1; i < 15; i++) {
p384_felem_sqr(x30, x30);
} // 2^30 - 2^15
p384_felem_mul(x30, x30, x15); // 2^30 - 2^0
p384_felem_sqr(x60, x30);
for (int i = 1; i < 30; i++) {
p384_felem_sqr(x60, x60);
} // 2^60 - 2^30
p384_felem_mul(x60, x60, x30); // 2^60 - 2^0
p384_felem_sqr(x120, x60);
for (int i = 1; i < 60; i++) {
p384_felem_sqr(x120, x120);
} // 2^120 - 2^60
p384_felem_mul(x120, x120, x60); // 2^120 - 2^0
p384_felem ret;
p384_felem_sqr(ret, x120);
for (int i = 1; i < 120; i++) {
p384_felem_sqr(ret, ret);
} // 2^240 - 2^120
p384_felem_mul(ret, ret, x120); // 2^240 - 2^0
for (int i = 0; i < 15; i++) {
p384_felem_sqr(ret, ret);
} // 2^255 - 2^15
p384_felem_mul(ret, ret, x15); // 2^255 - 2^0
// Why (1 + 30) in the loop?
// This is as expressed in:
// https://briansmith.org/ecc-inversion-addition-chains-01#p384_field_inversion
// My guess is to say that we're going to shift 31 bits, but this time we
// won't add x31 to make all the new bits 1s, as was done in previous steps,
// but we're going to add x30 so there will be 255 1s, then a 0, then 30 1s
// to form this pattern:
// ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe ffffffff
// (the last 2 1s are appended in the following step).
for (int i = 0; i < (1 + 30); i++) {
p384_felem_sqr(ret, ret);
} // 2^286 - 2^31
p384_felem_mul(ret, ret, x30); // 2^286 - 2^30 - 2^0
p384_felem_sqr(ret, ret);
p384_felem_sqr(ret, ret); // 2^288 - 2^32 - 2^2
p384_felem_mul(ret, ret, x2); // 2^288 - 2^32 - 2^0
// Why not 94 instead of (64 + 30) in the loop?
// Similarly to the comment above, there is a shift of 94 bits
// but what will be added is x30, which will cause 64 of those bits
// to be 64 0s and 30 1s to complete the pattern above with:
// 00000000 00000000 fffffffc
// (the last 2 0s are appended by the last 2 shifts).
for (int i = 0; i < (64 + 30); i++) {
p384_felem_sqr(ret, ret);
} // 2^382 - 2^126 - 2^94
p384_felem_mul(ret, ret, x30); // 2^382 - 2^126 - 2^94 + 2^30 - 2^0
p384_felem_sqr(ret, ret);
p384_felem_sqr(out, ret); // 2^384 - 2^128 - 2^96 + 2^32 - 2^2 = p - 3
#endif
}
static void p384_point_double(p384_felem x_out,
p384_felem y_out,
p384_felem z_out,
const p384_felem x_in,
const p384_felem y_in,
const p384_felem z_in) {
#if defined(EC_NISTP_USE_S2N_BIGNUM)
ec_nistp_felem_limb in[P384_NLIMBS * 3];
ec_nistp_felem_limb out[P384_NLIMBS * 3];
ec_nistp_coordinates_to_point(in, x_in, y_in, z_in, P384_NLIMBS);
p384_montjdouble_selector(out, in);
ec_nistp_point_to_coordinates(x_out, y_out, z_out, out, P384_NLIMBS);
#else
ec_nistp_point_double(p384_methods(), x_out, y_out, z_out, x_in, y_in, z_in);
#endif
}
// p384_point_add calculates (x1, y1, z1) + (x2, y2, z2)
//
// The method is taken from:
// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl
// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
//
// Coq transcription and correctness proof:
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L467>
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L544>
static void p384_point_add(p384_felem x3, p384_felem y3, p384_felem z3,
const p384_felem x1,
const p384_felem y1,
const p384_felem z1,
const int mixed,
const p384_felem x2,
const p384_felem y2,
const p384_felem z2) {
ec_nistp_point_add(p384_methods(), x3, y3, z3, x1, y1, z1, mixed, x2, y2, z2);
}
#include "p384_table.h"
#if defined(EC_NISTP_USE_S2N_BIGNUM)
DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
out->felem_num_limbs = P384_NLIMBS;
out->felem_num_bits = 384;
out->felem_add = p384_felem_add_wrapper;
out->felem_sub = p384_felem_sub_wrapper;
out->felem_mul = bignum_montmul_p384_selector;
out->felem_sqr = bignum_montsqr_p384_selector;
out->felem_neg = p384_felem_neg_wrapper;
out->felem_nz = p384_felem_nz;
out->felem_one = p384_felem_one;
out->point_dbl = p384_point_double;
out->point_add = p384_point_add;
out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p384_g_pre_comp;
}
#else
DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
out->felem_num_limbs = P384_NLIMBS;
out->felem_num_bits = 384;
out->felem_add = fiat_p384_add;
out->felem_sub = fiat_p384_sub;
out->felem_mul = fiat_p384_mul;
out->felem_sqr = fiat_p384_square;
out->felem_neg = fiat_p384_opp;
out->felem_nz = p384_felem_nz;
out->felem_one = p384_felem_one;
out->point_dbl = p384_point_double;
out->point_add = p384_point_add;
out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p384_g_pre_comp;
}
#endif
// OPENSSL EC_METHOD FUNCTIONS
// Takes the Jacobian coordinates (X, Y, Z) of a point and returns:
// (X', Y') = (X/Z^2, Y/Z^3).
static int ec_GFp_nistp384_point_get_affine_coordinates(
const EC_GROUP *group, const EC_JACOBIAN *point,
EC_FELEM *x_out, EC_FELEM *y_out) {
if (constant_time_declassify_w(ec_GFp_simple_is_at_infinity(group, point))) {
OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
return 0;
}
p384_felem z1, z2;
p384_from_generic(z1, &point->Z);
p384_inv_square(z2, z1);
if (x_out != NULL) {
p384_felem x;
p384_from_generic(x, &point->X);
p384_felem_mul(x, x, z2);
p384_to_generic(x_out, x);
}
if (y_out != NULL) {
p384_felem y;
p384_from_generic(y, &point->Y);
p384_felem_sqr(z2, z2); // z^-4
p384_felem_mul(y, y, z1); // y * z
p384_felem_mul(y, y, z2); // y * z^-3
p384_to_generic(y_out, y);
}
return 1;
}
static void ec_GFp_nistp384_add(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *a, const EC_JACOBIAN *b) {
p384_felem x1, y1, z1, x2, y2, z2;
p384_from_generic(x1, &a->X);
p384_from_generic(y1, &a->Y);
p384_from_generic(z1, &a->Z);
p384_from_generic(x2, &b->X);
p384_from_generic(y2, &b->Y);
p384_from_generic(z2, &b->Z);
p384_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, z2);
p384_to_generic(&r->X, x1);
p384_to_generic(&r->Y, y1);
p384_to_generic(&r->Z, z1);
}
static void ec_GFp_nistp384_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *a) {
p384_felem x, y, z;
p384_from_generic(x, &a->X);
p384_from_generic(y, &a->Y);
p384_from_generic(z, &a->Z);
p384_point_double(x, y, z, x, y, z);
p384_to_generic(&r->X, x);
p384_to_generic(&r->Y, y);
p384_to_generic(&r->Z, z);
}
// The calls to from/to_generic are needed for the case
// when BORINGSSL_HAS_UINT128 is undefined, i.e. p384_32.h fiat code is used;
// while OPENSSL_64_BIT is defined, i.e. BN_ULONG is uint64_t
static void ec_GFp_nistp384_mont_felem_to_bytes(
const EC_GROUP *group, uint8_t *out, size_t *out_len, const EC_FELEM *in) {
size_t len = BN_num_bytes(&group->field.N);
EC_FELEM felem_tmp;
p384_felem tmp;
p384_from_generic(tmp, in);
p384_felem_from_mont(tmp, tmp);
p384_to_generic(&felem_tmp, tmp);
bn_words_to_big_endian(out, len, felem_tmp.words, group->order.N.width);
*out_len = len;
}
static int ec_GFp_nistp384_mont_felem_from_bytes(
const EC_GROUP *group, EC_FELEM *out, const uint8_t *in, size_t len) {
EC_FELEM felem_tmp;
p384_felem tmp;
// This function calls bn_cmp_words_consttime
if (!ec_GFp_simple_felem_from_bytes(group, &felem_tmp, in, len)) {
return 0;
}
p384_from_generic(tmp, &felem_tmp);
p384_felem_to_mont(tmp, tmp);
p384_to_generic(out, tmp);
return 1;
}
static int ec_GFp_nistp384_cmp_x_coordinate(const EC_GROUP *group,
const EC_JACOBIAN *p,
const EC_SCALAR *r) {
if (ec_GFp_simple_is_at_infinity(group, p)) {
return 0;
}
// We wish to compare X/Z^2 with r. This is equivalent to comparing X with
// r*Z^2. Note that X and Z are represented in Montgomery form, while r is
// not.
p384_felem Z2_mont;
p384_from_generic(Z2_mont, &p->Z);
p384_felem_mul(Z2_mont, Z2_mont, Z2_mont);
p384_felem r_Z2;
p384_from_scalar(r_Z2, r); // r < order < p, so this is valid.
p384_felem_mul(r_Z2, r_Z2, Z2_mont);
p384_felem X;
p384_from_generic(X, &p->X);
p384_felem_from_mont(X, X);
if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
return 1;
}
// During signing the x coefficient is reduced modulo the group order.
// Therefore there is a small possibility, less than 2^189/2^384 = 1/2^195,
// that group_order < p.x < p.
// In that case, we need not only to compare against |r| but also to
// compare against r+group_order.
assert(group->field.N.width == group->order.N.width);
EC_FELEM tmp;
BN_ULONG carry =
bn_add_words(tmp.words, r->words, group->order.N.d, group->field.N.width);
if (carry == 0 &&
bn_less_than_words(tmp.words, group->field.N.d, group->field.N.width)) {
p384_from_generic(r_Z2, &tmp);
p384_felem_mul(r_Z2, r_Z2, Z2_mont);
if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
return 1;
}
}
return 0;
}
// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *p,
const EC_SCALAR *scalar) {
p384_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}};
p384_from_generic(tmp[0], &p->X);
p384_from_generic(tmp[1], &p->Y);
p384_from_generic(tmp[2], &p->Z);
#if defined(EC_NISTP_USE_S2N_BIGNUM)
p384_montjscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
#else
ec_nistp_scalar_mul(p384_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
#endif
p384_to_generic(&r->X, res[0]);
p384_to_generic(&r->Y, res[1]);
p384_to_generic(&r->Z, res[2]);
}
// Multiplication of the base point G of P-384 curve with the given scalar.
static void ec_GFp_nistp384_point_mul_base(const EC_GROUP *group,
EC_JACOBIAN *r,
const EC_SCALAR *scalar) {
p384_felem res[3] = {{0}, {0}, {0}};
ec_nistp_scalar_mul_base(p384_methods(), res[0], res[1], res[2], scalar);
p384_to_generic(&r->X, res[0]);
p384_to_generic(&r->Y, res[1]);
p384_to_generic(&r->Z, res[2]);
}
// Computes [g_scalar]G + [p_scalar]P, where G is the base point of the P-384
// curve, and P is the given point |p|.
// Note: this function is NOT constant-time.
static void ec_GFp_nistp384_point_mul_public(const EC_GROUP *group,
EC_JACOBIAN *r,
const EC_SCALAR *g_scalar,
const EC_JACOBIAN *p,
const EC_SCALAR *p_scalar) {
p384_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}};
p384_from_generic(tmp[0], &p->X);
p384_from_generic(tmp[1], &p->Y);
p384_from_generic(tmp[2], &p->Z);
ec_nistp_scalar_mul_public(p384_methods(), res[0], res[1], res[2], g_scalar, tmp[0], tmp[1], tmp[2], p_scalar);
p384_to_generic(&r->X, res[0]);
p384_to_generic(&r->Y, res[1]);
p384_to_generic(&r->Z, res[2]);
}
DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp384_method) {
out->point_get_affine_coordinates =
ec_GFp_nistp384_point_get_affine_coordinates;
out->jacobian_to_affine_batch =
ec_GFp_mont_jacobian_to_affine_batch; // needed for TrustToken tests
out->add = ec_GFp_nistp384_add;
out->dbl = ec_GFp_nistp384_dbl;
out->mul = ec_GFp_nistp384_point_mul;
out->mul_base = ec_GFp_nistp384_point_mul_base;
out->mul_public = ec_GFp_nistp384_point_mul_public;
out->mul_batch = ec_GFp_mont_mul_batch; // needed for TrustToken tests
out->mul_public_batch = ec_GFp_mont_mul_public_batch;
out->init_precomp = ec_GFp_mont_init_precomp; // needed for TrustToken tests
out->mul_precomp = ec_GFp_mont_mul_precomp; // needed for TrustToken tests
out->felem_mul = ec_GFp_mont_felem_mul;
out->felem_sqr = ec_GFp_mont_felem_sqr;
out->felem_to_bytes = ec_GFp_nistp384_mont_felem_to_bytes;
out->felem_from_bytes = ec_GFp_nistp384_mont_felem_from_bytes;
out->felem_reduce = ec_GFp_mont_felem_reduce; // needed for ECTest.HashToCurve
out->felem_exp = ec_GFp_mont_felem_exp; // needed for ECTest.HashToCurve
out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery;
out->scalar_to_montgomery_inv_vartime =
ec_simple_scalar_to_montgomery_inv_vartime;
out->cmp_x_coordinate = ec_GFp_nistp384_cmp_x_coordinate;
}
// ----------------------------------------------------------------------------
// Analysis of the doubling case occurrence in the Joye-Tunstall recoding:
// p384_felem_mul_scalar_rwnaf()
// ----------------------------------------------------------------------------
//
// The JT scalar recoding is Algorithm 6: (Odd) Signed-Digit Recoding Algorithm in
// Joye, Tunstall, "Exponent Recoding and Regular Exponentiation Algorithms",
// AfricaCrypt 2009, available from
// https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.477.1245&rep=rep1&type=pdf
//
// We write the algorithm using variables similar to those used in the code and
// in the proof detailed in util.c (t_i in the algorithm below is d in
// p384_felem_mul_scalar_rwnaf()):
//
// Input: k: odd scalar, where k = (b_{l-1}, ..., b_1, b_0) in binary form,
// w: window width
// Output: k = (t_{h-1}, ..., t_1, t_0)
// with t_i \in {\pm 1, \pm 3, ..., \pm (2^{w} - 1)},
// h = ceil(t/w), i.e. t_i are positive and negative odd digits
// which absolute value is less than (2^{w} - 1).
// i := 0
// j := 0
// while (k > 2^w):
// window := (b_{j+w}, ..., b_j) # (w+1)-bit window in k where
// # the least significant bit is b_j
// t_i := window - 2^w
// k := k - t_i
// k := k / 2^w # k >> w
// i += 1
// j += w
// t_{h-1} := k
//
// Note that if b_{j+w} = 0, t_i will be negative;
// otherwise, if b_{j+w} = 1, t_i will be positive.
//
// The algorithm recodes the least (w+1) bits into a (odd) digit in the range
// [-(2^{w}-1), (2^{w}-1)] by subtracting 2^w from that digit and adding it back
// to the remaining bits of the scalar. This ensures that, after the w-bit right
// shift, the next least significant bit is 1, i.e. next digit is odd.
//
// In the following we will show that the non-trivial doubling case in
// single-point left-to-right windowed (or m-ary, m = 2^w) scalar multiplication
// may occur if and only if the (2^w)th bit of the group order is 1. This only
// holds if the scalar is fully reduced and the group order is a prime that is
// much larger than 2^{w+1}.
//
// PROOF:
//
// Let n be the group order. Let l be the number of bits needed to represent n.
// Assume there exists some 0 <= k < n such that signed w-bit windowed
// multiplication hits the doubling case.
//
// Windowed multiplication consists of iterating over the digits t_i defined
// above by the algorithm from most to least significant. At iteration i
// (for i = ..., 3w, 2w, w, 0, starting from the most significant window), we:
//
// 1. Double the accumulator A, w times. Let A_i be the value of A at this
// point, and it corresponds to a value [a_i]P.
//
// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P, [t_i]P
// From the algorithm steps we can see that the current digit
// t_i = (b_{w+i} ... b_i) - 2^w, b_i = 1 => -2^w < t_i < 2^w, t_i: odd
// which can also be written using C notation as
// t_i = [(k >> i) & ((1<<(w+1)) - 1)] - (1 << w) -- (1)
//
// and the accumulator value
// a_i = b_{l-1} ... b_{i+w+1} 1
// when written as C notation
// a_i = (k >> (i+w+1)) << (w+1)) + (1 << w) -- (2)
//
// Similarly to the recoding in util.c, a_i is bounded by
// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up
// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for
// all groups. That would imply the subsequent a_i is zero, which means all
// terms thus far were zero.)
//
// Let j be the index such that A_j = T_j ≠ ∞. We have a_j = t_j (mod n). We now
// determine the value of a_j - t_j, which must be divisible by n.
// Our bounds on a_j and t_j imply a_j - t_j is 0 or n.
// If it is 0, a_j = t_j. However, 2^w divides a_j and -2^w < t_i < 2^w, so this
// can only happen if a_j = t_j = 0, which is a trivial doubling.
// Therefore, a_j - t_j = n.
//
// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then,
//
// n = a_j - t_j = (k >> (j+w+1)) << (w+1)) + (1 << w) - t_j
// = k/2^j + 2^w - t_j
// < n/2^w + 2^w + 2^w-1
//
// n is much larger than 2^{w+1}, so this is impossible. Thus, j = 0: only the
// final addition may hit the doubling case.
//
// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L
// such that k_H is the contribution from b_(l-1) .. b_{w+1},
// k_M is the contribution from b_w,
// and k_L is the contribution from b_(w-1) ... b_0.
// That is:
//
// - 2^{w+1} divides k_H
// - k_M is 0 or 2^w
// - 0 <= k_L < 2^w
//
// Divide n into n_H + n_M + n_L similarly.
// From (1) and (2), we have
//
// t_0 = (k_M + k_L) - 2^w
// a_0 = k_H + 2^w
//
// We try to find t_0 and a_0 such that
//
// n = a_0 - t_0
// n_H + n_M + n_L = k_H + 2^w - (k_M + k_L - 2^w)
// = k_H + 2^{w+1} - (k_M + k_L)
//
// We know that k_H <= n_H.
//
// If k_H < n_H, then k_H <= n_H - 2^{w+1} (Note that 2^{w+1} divides both k_H
// and n_H). Then we would have
//
// n_H + n_M + n_L <= n_H - 2^{w+1} + 2^{w+1} - (k_M + k_L)
// n_M + n_L <= - (k_M + k_L)
//
// Contradiction. Thus,
//
// k_H = n_H -- (3)
// => n_M + n_L = 2^{w+1} - (k_M + k_L) -- (4)
//
// We also have n > k; hence,
// n_M + n_L > k_M + k_L -- (5)
//
// For (3), (4) and (5) to hold,
// n_M = 2^w, k_M = 0.
//
// Otherwise, if n_M = 0 and k_M = 0
// n_L = 2^{w+1} - k_L
// n_L >= 2^{w+1} - (2^w - 1)
// n_L >= 2^w + 1
// Contradiction since n_L < 2^w.
//
// And if n_M = 0 and k_M = 2^w, (5) would not hold.
//
// Since n_M = 2^w, n_L >= 1, k_L >= 1, from (4) we have
// k_M + k_L = 2^{w+1} - (n_M + n_L)
// <= 2^{w+1} - 2^w - 1
// <= 2^{w} - 1
// => k_M = 0
//
// Putting this together, from the group order of the curve, n, we can construct
// the scalar, k, that would incur a doubling in the last iteration as:
//
// if n_M = 2^w,
// k_H = n_H and
// k_M + k_L = 2^{w+1} - (n_M + n_L)
//
// COMMON CURVES:
//
// The group orders for common curves end in the following bit patterns:
//
// P-521: ...00001001; w = 4, 5, 6, 7 are okay
// P-384: ...01110011; w = 2, 3, 7 are okay
// P-256: ...01010001; w = 2, 3, 5, 7 are okay
//
//
// CAN DOUBLING OCCUR IN RIGHT-TO-LEFT ALGORITHMS OR COMB ALGORITHMS?
//
// This question was answered empirically for P-384 group order n, w = 5,
// by asking:
// Is there a value d_76, such that
// - d_{76} * 2^{380} - a_{76} = n and
// - d_{76} * 2^{380} + a_{76} = k < n ?
//
// Setting
// d_76 = 0xf
// a_76 = (d_76 << 380) - n =
// -0xfffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973
// k = a_76 + (d_76 << 380) =
// 0xe00000000000000000000000000000000000000000000000389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
//
// n-k =
// 0x1fffffffffffffffffffffffffffffffffffffffffffffff8ec69b03e86e5bbeb0341b6491614ef5d9d832d5998a52e6
// => 0 < k < n
//
// -(1<<380)-a_76 = -0x389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
// => -2^380 < a_76 < 2^380
//
// This shows that such a k value exists.
//
// This resulted in modifying the comb algorithm used in
// ec_GFp_nistp384_point_mul_base() to proceed in a left-to-right fashion in
// order to add the least significant digit in the last iteration.
//
// We can probably construct values of k that would incur doubling for whenever
// any of the higher digits, t_{j-1}, (down to the middle digit, roughly) is
// added last. This is because the upper half of the group order of P-384 is all
// 1s, therefore we can find a value k < n, having a 0 at the (j*w)th bit which
// would become 1 in the recoding of t_j (being the least significant bit in
// t_j) and making t_{j-1} a negative digit. Hence, the difference between the
// accumulator value containing all digits and t_{j-1} * 2^{(j-1)*w} can be n.
//
// This was tested as follows in Python for j = 75, i.e. the second last digit:
// let that digit's value be the smallest possible value for w = 5, i.e. -31
// d_75 = -31
// # assuming the accumulator contains the most significant digit d_76
// a = n + (d_75 << 375); hex(a)
// '0xf07fffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973L'
// hex(n)
// '0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973L'
// k = a + (d_75 << 375); hex(k)
// '0xe0ffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973L'
//
// # Checks
// hex(n-k)
// '0x1f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000L'
// hex(n - (a - (d_75 << 375)))
// '0x0L'
// => n > k
// and a value k was found such that when adding d_75 last, the difference
// between the accumulator a and (d_75 << 375) is n
//
//
// ----------------------------------------------------------------------------
// Python code showing the doubling case occurrence in the Joye-Tunstall
// recoding:
// ----------------------------------------------------------------------------
//
// from array import *
//
// # P-384 group order
// n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFC7634D81F4372DDF581A0DB248B0A77AECEC196ACCC52973
//
// # k value that causes a doubling case in left-to-right reconstruction
// k = 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc5294d
// # k value that causes a doubling case in right-to-left reconstruction
// k_r2l = 0xe00000000000000000000000000000000000000000000000389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
//
//
// def recode(k, w):
// rec = array('i', [])
// while k > (2 ** w):
// window = k & ((2 ** (w + 1)) - 1)
// d = window - (2 ** w)
// k = k - d
// k = (k >> w)
// rec.append(d)
// rec.append(k)
// return rec
//
// # Rebuild k from the recoded scalar proceeding from left to right
// def rebuild_l2r(rec, w):
// l = rec.buffer_info()[1] # length of the recoded scalar array
// # initialise accumulator
// a = rec[l-1]
// # for i from l-2 downto 0
// for i in range(l-2,-1,-1):
// a = (a << w)
// if (a - rec[i]) == n:
// print("L2R Doubling case: ")
// print(" i =", i, " digit =", hex(rec[i]))
// print(" a =", hex(a))
// a += rec[i]
// return a
//
// # Rebuild k from the recoded scalar proceeding from right to left
// def rebuild_r2l(rec, w):
// l = rec.buffer_info()[1] # length of the recoded scalar array
// # initialise accumulator
// a = rec[0]
// # for i from 1 to l-1
// for i in range(1,l):
// shifted_d = rec[i] << (w*i)
// if (shifted_d - a) == n:
// print("R2L Doubling case: ")
// print(" i =", i, " digit =", hex(rec[i]))
// print(" a =", hex(a))
// a += shifted_d
// return a
//
// def test_recode():
// w = 5
//
// # Left-to-right recoding of k which causes a doubling case
// assert k < n
// print("k = ", hex(k))
// # recode k
// rec_k = recode(k,w)
// # print(rec_k)
// print("Digits of the recoded scalar:")
// for a in rec_k:
// print(hex(a), end=', ')
// print()
// # rebuild k
// out_k = rebuild_l2r(rec_k, w)
// if out_k != k:
// print("ERROR: rebuilt value is different from recoded value")
// print()
//
// # Right-to-left recoding of k_r2l which causes a doubling case
// assert k_r2l < n
// print("k = ", hex(k_r2l))
// # recode k_r2l
// rec_k_r2l = recode(k_r2l,w)
// # print(rec_k_r2l)
// print("Digits of the recoded scalar:")
// for a in rec_k_r2l:
// print(hex(a), end=', ')
// print()
// # rebuild k_r2l
// out_k_r2l = rebuild_r2l(rec_k_r2l, w)
// if out_k_r2l != k_r2l:
// print("ERROR: rebuilt R2L value is different from recoded value")
// print()
//
// test_recode()
// '''
// Output:
// -------
// k = 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc5294d
// Digits of the recoded scalar:
// -0x13, -0x15, -0x15, -0x15, -0x13, 0x7, 0xb, 0xd, -0x7, 0x1, 0x1b, -0x7, 0xf, 0x1d, -0x3, -0xb, 0x11, -0x1b, -0xd, 0x5, -0x5, -0x19, 0x9, -0x1d, -0x7, 0x1b, 0x17, -0x5, 0x13, -0x5, -0xf, 0x1f, -0x1f, 0xd, -0xd, -0x19, 0x17, 0x3, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0xf,
// L2R Doubling case:
// i = 0 digit = -0x13
// a = 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52960
//
// k = 0xe00000000000000000000000000000000000000000000000389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
// Digits of the recoded scalar:
// -0x13, 0x15, 0x15, 0x15, 0x13, -0x7, -0xb, -0xd, 0x7, -0x1, -0x1b, 0x7, -0xf, -0x1d, 0x3, 0xb, -0x11, 0x1b, 0xd, -0x5, 0x5, 0x19, -0x9, 0x1d, 0x7, -0x1b, -0x17, 0x5, -0x13, 0x5, 0xf, -0x1f, 0x1f, -0xd, 0xd, 0x19, -0x17, -0x3, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, 0xf,
// R2L Doubling case:
// i = 76 digit = 0xf
// a = -0xfffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973
// '''
//
#endif // !defined(OPENSSL_SMALL)