/* crypto/fipsmodule/ml_kem/mlkem/poly.c */
/*
* Copyright (c) 2024-2025 The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0
*/
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include <string.h>
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
#include "sampling.h"
#include "symmetric.h"
#include "verify.h"
#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
!defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
!defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
* Description: Montgomery multiplication modulo MLKEM_Q
*
* Arguments: - int16_t a: first factor
* Can be any int16_t.
* - int16_t b: second factor.
* Must be signed canonical (abs value < (MLKEM_Q+1)/2)
*
* Returns 16-bit integer congruent to a*b*R^{-1} mod MLKEM_Q, and
* smaller than MLKEM_Q in absolute value.
*
**************************************************/
/* Reference: `fqmul()` in reference implementation. */
static MLK_INLINE int16_t mlk_fqmul(int16_t a, int16_t b)
__contract__(
requires(b > -MLKEM_Q_HALF && b < MLKEM_Q_HALF)
ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
)
{
int16_t res;
mlk_assert_abs_bound(&b, 1, MLKEM_Q_HALF);
res = mlk_montgomery_reduce((int32_t)a * (int32_t)b);
/* Bounds:
* |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
* <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
* <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
* < MLKEM_Q
*/
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
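/*
 * Illustrative sketch (not compiled): a textbook signed Montgomery
 * reduction for q = 3329 and R = 2^16, of the kind mlk_fqmul relies on.
 * The actual mlk_montgomery_reduce is defined elsewhere in this module
 * and may differ in detail; the helper name and inline constants below
 * are for exposition only.
 */
#if 0
static int16_t example_montgomery_reduce(int32_t a)
{
  /* QINV == q^{-1} mod 2^16 as a signed 16-bit value
   * (check: -3327 * 3329 == 1 (mod 2^16)) */
  const int16_t QINV = -3327;
  /* t == a * q^{-1} mod 2^16, so a - t*q is divisible by 2^16 */
  int16_t t = (int16_t)((int16_t)a * QINV);
  /* PORTABILITY: as for mlk_barrett_reduce below, the right-shift of a
   * negative value is assumed to be an arithmetic shift.
   * The result is congruent to a * 2^{-16} mod q. */
  return (int16_t)((a - (int32_t)t * 3329) >> 16);
}
#endif /* illustrative sketch */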
#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
|| !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
* Description: Barrett reduction; given a 16-bit integer a, computes
* centered representative congruent to a mod q in
* {-(q-1)/2,...,(q-1)/2}
*
* Arguments: - int16_t a: input integer to be reduced
*
* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
*
**************************************************/
/* Reference: `barrett_reduce()` in reference implementation. */
static MLK_INLINE int16_t mlk_barrett_reduce(int16_t a)
__contract__(
ensures(return_value > -MLKEM_Q_HALF && return_value < MLKEM_Q_HALF)
)
{
/* Barrett reduction approximates
* ```
* round(a/MLKEM_Q)
* = round(a*(2^N/MLKEM_Q)/2^N)
* ~= round(a*round(2^N/MLKEM_Q)/2^N)
* ```
* Here, we pick N=26.
*/
const int32_t magic = 20159; /* check-magic: 20159 == round(2^26 / MLKEM_Q) */
/*
* PORTABILITY: Right-shift on a signed integer is
* implementation-defined for negative left argument.
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
const int32_t t = (magic * a + (1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
* evaluate t * MLKEM_Q and the subsequent subtraction
*/
int16_t res = (int16_t)(a - t * MLKEM_Q);
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
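/*
 * Illustrative check (not compiled): concrete values for the Barrett
 * reduction above, assuming MLKEM_Q == 3329.
 */
#if 0
#include <assert.h>
static void example_barrett_reduce_check(void)
{
  /* 10000 == 3*3329 + 13, so the centered representative is 13 */
  assert(mlk_barrett_reduce(10000) == 13);
  /* -10000 == -3*3329 - 13, so the centered representative is -13 */
  assert(mlk_barrett_reduce(-10000) == -13);
  /* MLKEM_Q itself reduces to 0 */
  assert(mlk_barrett_reduce(MLKEM_Q) == 0);
}
#endif /* illustrative check */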
#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in reference implementation. */
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
{
r->coeffs[i] = mlk_fqmul(r->coeffs[i], f);
}
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
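/*
 * Illustrative check (not compiled): where the constant 1353 comes from.
 * mlk_fqmul multiplies by 2^{-16} mod MLKEM_Q, so scaling by
 * f == 2^32 mod MLKEM_Q yields x * 2^16 mod MLKEM_Q, i.e. x in
 * Montgomery form. Assumes MLKEM_Q == 3329.
 */
#if 0
#include <assert.h>
static void example_tomont_factor_check(void)
{
  assert(((1ULL << 32) % 3329) == 1353);
}
#endif /* illustrative check */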
#else /* !MLK_USE_NATIVE_POLY_TOMONT */
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
mlk_poly_tomont_native(r->coeffs);
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
* Description: Constant-time conversion of signed representatives
* modulo MLKEM_Q within range (-(MLKEM_Q-1) .. (MLKEM_Q-1))
* into unsigned representatives within range (0..(MLKEM_Q-1)).
*
* Arguments: c: signed coefficient to be converted
*
************************************************************/
/* Reference: Not present in reference implementation.
* - Used here to implement different semantics of `poly_reduce()`;
* see below. In the reference implementation, this logic is
* part of all compression functions (see `compress.c`). */
static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
/* Add Q if c is negative, but in constant time */
c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
/* c is now in [0, MLKEM_Q), so the cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
return (uint16_t)c;
}
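/*
 * Illustrative check (not compiled): behaviour of the conversion above
 * at boundary values, assuming MLKEM_Q == 3329.
 */
#if 0
#include <assert.h>
static void example_signed_to_unsigned_check(void)
{
  assert(mlk_scalar_signed_to_unsigned_q(-1) == 3328);  /* -1 + MLKEM_Q */
  assert(mlk_scalar_signed_to_unsigned_q(-3328) == 1);  /* -(MLKEM_Q-1) + MLKEM_Q */
  assert(mlk_scalar_signed_to_unsigned_q(5) == 5);      /* non-negative: unchanged */
}
#endif /* illustrative check */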
/* Reference: `poly_reduce()` in reference implementation
* - We use _unsigned_ canonical outputs, while the reference
* implementation uses _signed_ canonical outputs.
* Accordingly, we need a conditional addition of MLKEM_Q
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
unsigned i;
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
{
/* Barrett reduction, giving signed canonical representative */
int16_t t = mlk_barrett_reduce(r->coeffs[i]);
/* Conditional addition to get unsigned canonical representative */
r->coeffs[i] = mlk_scalar_signed_to_unsigned_q(t);
}
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
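/*
 * Illustrative check (not compiled): mlk_poly_reduce maps arbitrary
 * int16_t coefficients to unsigned canonical representatives in
 * [0, MLKEM_Q), assuming MLKEM_Q == 3329.
 */
#if 0
#include <assert.h>
static void example_poly_reduce_check(void)
{
  mlk_poly p;
  memset(&p, 0, sizeof(p));
  p.coeffs[0] = -1;   /* congruent to 3328 */
  p.coeffs[1] = 3330; /* congruent to 1 */
  mlk_poly_reduce(&p);
  assert(p.coeffs[0] == 3328 && p.coeffs[1] == 1);
}
#endif /* illustrative check */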
#else /* !MLK_USE_NATIVE_POLY_REDUCE */
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
mlk_poly_reduce_native(r->coeffs);
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
/* Reference: `poly_add()` in the reference implementation.
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
{
unsigned i;
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
}
}
/* Reference: `poly_sub()` in the reference implementation.
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
{
unsigned i;
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
}
}
/* Include zeta table unless NTT, invNTT and mulcache computation
* have been replaced by native implementations. */
#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
!defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
#endif
#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation.
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This is an idea
* originally taken from https://ia.cr/2021/986
* and used at the C level here. */
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
__loop__(
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
}
/*
* This bound is true for the C implementation, but not needed
* in the higher-level bounds reasoning. It is thus omitted
* from the spec so as not to unnecessarily constrain native
* implementations, but it is checked here nonetheless.
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
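/*
 * Illustrative sketch (not compiled): how a base multiplication could
 * consume the cache computed above. For a degree-2 pair,
 * (a0 + a1 X) * (b0 + b1 X) mod (X^2 - zeta) needs b1*zeta, which is
 * exactly what the mulcache stores. Function and parameter names here
 * are hypothetical; the actual consumer is the cached basemul routine
 * elsewhere in this codebase.
 */
#if 0
static void example_basemul_pair(int16_t r[2], const int16_t a[2],
                                 const int16_t b[2], int16_t b1_zeta_cached)
{
  /* (a0*b0 + a1*(b1*zeta)) + (a0*b1 + a1*b0) X.
   * Since zetas[] are stored in Montgomery form, the cached value is
   * b1*zeta without a stray Montgomery factor, and each output term
   * carries a single 2^{-16} from mlk_fqmul. Assumes the b-operands
   * are signed canonical, as required by mlk_fqmul. */
  r[0] = (int16_t)(mlk_fqmul(a[0], b[0]) + mlk_fqmul(a[1], b1_zeta_cached));
  r[1] = (int16_t)(mlk_fqmul(a[0], b[1]) + mlk_fqmul(a[1], b[0]));
}
#endif /* illustrative sketch */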
#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
/* Omitting bounds assertion since native implementations may
* decide not to use a mulcache. Note that the C backend implementation
* of poly_basemul_montgomery_cached() does still include the check. */
}
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
#if !defined(MLK_USE_NATIVE_NTT)
/*
* Computes a block of CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
* Parameters:
* - r: Pointer to base of polynomial (_not_ the base of butterfly block)
* - root: Twiddle factor to use for the butterfly. This must be in
* Montgomery form and signed canonical.
* - start: Offset to the beginning of the butterfly block
* - len: Index difference between coefficients subject to a butterfly
* - bound: Ghost variable describing coefficient bound: Prior to `start`,
* coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
* they must be bound by `bound`.
* When this function returns, output coefficients in the index range
* [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
* Example:
* - start=8, len=4
* This would compute the following four butterflies
* 8 -- 12
* 9 -- 13
* 10 -- 14
* 11 -- 15
* - start=4, len=2
* This would compute the following two butterflies
* 4 -- 6
* 5 -- 7
*/
/* Reference: Embedded in `ntt()` in the reference implementation. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
unsigned start, unsigned len, int bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
requires(-MLKEM_Q_HALF < zeta && zeta < MLKEM_Q_HALF)
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
requires(array_abs_bound(r, start, MLKEM_N, bound))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
{
/* `bound` is a ghost variable only needed in the CBMC specification */
unsigned j;
((void)bound);
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
/*
* Coefficients are updated in strided pairs, so the bounds for the
* intermediate states alternate twice between the old and new bound
*/
invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q))
invariant(array_abs_bound(r, j, start + len, bound))
invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q))
invariant(array_abs_bound(r, j + len, MLKEM_N, bound)))
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
r[j + len] = r[j] - t;
r[j] = r[j] + t;
}
}
/*
* Compute one layer of forward NTT
* Parameters:
* - r: Pointer to base of polynomial
* - layer: Variable indicating which layer is being applied.
*/
/* Reference: Embedded in `ntt()` in the reference implementation. */
static void mlk_ntt_layer(int16_t r[MLKEM_N], unsigned layer)
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(1 <= layer && layer <= 7)
requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
{
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
len = MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
int16_t zeta = zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
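/*
 * Illustrative sketch (not compiled): the (layer, len, zeta-index)
 * schedule that the loop above walks through, assuming MLKEM_N == 256.
 */
#if 0
#include <stdio.h>
static void example_print_ntt_schedule(void)
{
  unsigned layer;
  for (layer = 1; layer <= 7; layer++)
  {
    unsigned len = 256u >> layer;          /* MLKEM_N >> layer */
    unsigned k_first = 1u << (layer - 1);  /* first twiddle index */
    unsigned k_last = (1u << layer) - 1;   /* last twiddle index */
    /* e.g. layer 1: len=128, zetas[1..1]; layer 7: len=2, zetas[64..127] */
    printf("layer %u: len=%u, zetas[%u..%u]\n", layer, len, k_first, k_last);
  }
}
#endif /* illustrative sketch */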
/*
* Compute full forward NTT
* NOTE: This particular implementation satisfies a much tighter
* bound on the output coefficients (5*q) than the contractual one (8*q),
* but this is not needed in the calling code. Should we change the
* base multiplication strategy to require smaller NTT output bounds,
* the proof may need strengthening.
*/
/* Reference: `ntt()` in the reference implementation.
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
unsigned layer;
int16_t *r;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
__loop__(
invariant(1 <= layer && layer <= 8)
invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
{
mlk_ntt_layer(r, layer);
}
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
mlk_ntt_native(p->coeffs);
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
#endif /* MLK_USE_NATIVE_NTT */
#if !defined(MLK_USE_NATIVE_INTT)
/* Compute one layer of inverse NTT */
/* Reference: Embedded into `invntt()` in the reference implementation */
static void mlk_invntt_layer(int16_t *r, unsigned layer)
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(1 <= layer && layer <= 7)
requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
len = (MLKEM_N >> layer);
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
invariant(start <= MLKEM_N && k <= 127)
/* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
int16_t zeta = zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
invariant(start <= MLKEM_N && k <= 127)
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
r[j] = mlk_barrett_reduce(t + r[j + len]);
r[j + len] = r[j + len] - t;
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
}
/* Reference: `invntt()` in the reference implementation
* - We normalize at the beginning of the inverse NTT,
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
unsigned j, layer;
const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
int16_t *r = p->coeffs;
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
{
r[j] = mlk_fqmul(r[j], f);
}
/* Run the invNTT layers */
for (layer = 7; layer > 0; layer--)
__loop__(
invariant(0 <= layer && layer < 8)
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
mlk_invntt_layer(r, layer);
}
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
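/*
 * Illustrative check (not compiled): where the constant 1441 comes from.
 * fqmul by f == 2^(32-7) mod MLKEM_Q contributes x * 2^16 / 2^7 mod
 * MLKEM_Q, folding one Montgomery factor and the 1/128 normalization of
 * the 7-layer inverse NTT into a single multiplication.
 * Assumes MLKEM_Q == 3329.
 */
#if 0
#include <assert.h>
static void example_invntt_factor_check(void)
{
  assert(((1ULL << 25) % 3329) == 1441);
}
#endif /* illustrative check */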
#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
mlk_intt_native(p->coeffs);
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
#endif /* MLK_USE_NATIVE_INTT */
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */