crypto/fipsmodule/ml_kem/mlkem/poly_k.c (279 lines of code) (raw):

/* * Copyright (c) 2024-2025 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ #include <stdint.h> #include <string.h> #include "compress.h" #include "debug.h" #include "poly_k.h" #include "sampling.h" #include "symmetric.h" /* Level namespacing * This is to facilitate building multiple instances * of mlkem-native (e.g. with varying security levels) * within a single compilation unit. */ #define mlk_poly_cbd_eta1 MLK_ADD_LEVEL(mlk_poly_cbd_eta1) #define mlk_poly_cbd_eta2 MLK_ADD_LEVEL(mlk_poly_cbd_eta2) /* End of level namespacing */ /* Reference: `polyvec_compress()` in the reference implementation * - In contrast to the reference implementation, we assume * unsigned canonical coefficients here. * The reference implementation works with coefficients * in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */ MLK_INTERNAL_API void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], const mlk_polyvec a) { unsigned i; mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); for (i = 0; i < MLKEM_K; i++) { mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]); } } /* Reference: `polyvec_decompress()` in the reference implementation. */ MLK_INTERNAL_API void mlk_polyvec_decompress_du(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); } mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); } /* Reference: `polyvec_tobytes()` in the reference implementation. * - In contrast to the reference implementation, we assume * unsigned canonical coefficients here. * The reference implementation works with coefficients * in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */ MLK_INTERNAL_API void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a) { unsigned i; mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); for (i = 0; i < MLKEM_K; i++) { mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]); } } /* Reference: `polyvec_frombytes()` in the reference implementation. */ MLK_INTERNAL_API void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES]) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES); } mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT); } /* Reference: `polyvec_ntt()` in the reference implementation. */ MLK_INTERNAL_API void mlk_polyvec_ntt(mlk_polyvec r) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_ntt(&r[i]); } mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND); } /* Reference: `polyvec_invntt_tomont()` in the reference implementation. * - We normalize at the beginning of the inverse NTT, * while the reference implementation normalizes at * the end. This allows us to drop a call to `poly_reduce()` * from the base multiplication. */ MLK_INTERNAL_API void mlk_polyvec_invntt_tomont(mlk_polyvec r) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_invntt_tomont(&r[i]); } mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND); } #if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) /* Reference: `polyvec_basemul_acc_montgomery()` in the * reference implementation. * - We use a multiplication cache ('mulcache') here * which is not present in the reference implementation. * This is an idea originally taken from https://ia.cr/2021/986 * and used at the C level here. * - We compute the coefficients of the scalar product in 32-bit * coefficients and perform only a single modular reduction * at the end. The reference implementation uses 2 * MLKEM_K * more modular reductions since it reduces after every modular * multiplication. */ MLK_INTERNAL_API void mlk_polyvec_basemul_acc_montgomery_cached( mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b, const mlk_polyvec_mulcache b_cache) { unsigned i; mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT); for (i = 0; i < MLKEM_N / 2; i++) __loop__(invariant(i <= MLKEM_N / 2)) { unsigned k; int32_t t[2] = {0}; for (k = 0; k < MLKEM_K; k++) __loop__( invariant(k <= MLKEM_K && t[0] <= (int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768 && t[0] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) && t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) && t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768))) { t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i]; t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i]; t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1]; t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i]; } r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]); r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]); } } #else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ MLK_INTERNAL_API void mlk_polyvec_basemul_acc_montgomery_cached( mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b, const mlk_polyvec_mulcache b_cache) { mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT); /* Omitting bounds assertion for cache since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ #if MLKEM_K == 2 mlk_polyvec_basemul_acc_montgomery_cached_k2_native( r->coeffs, (const int16_t *)a, (const int16_t *)b, (const int16_t *)b_cache); #elif MLKEM_K == 3 mlk_polyvec_basemul_acc_montgomery_cached_k3_native( r->coeffs, (const int16_t *)a, (const int16_t *)b, (const int16_t *)b_cache); #elif MLKEM_K == 4 mlk_polyvec_basemul_acc_montgomery_cached_k4_native( r->coeffs, (const int16_t *)a, (const int16_t *)b, (const int16_t *)b_cache); #endif } #endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ /* Reference: Does not exist in the reference implementation. * - The reference implementation does not use a * multiplication cache ('mulcache'). This is an idea * originally taken from https://ia.cr/2021/986 * and used at the C level here. */ MLK_INTERNAL_API void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_mulcache_compute(&x[i], &a[i]); } } /* Reference: `polyvec_reduce()` in the reference implementation. * - We use _unsigned_ canonical outputs, while the reference * implementation uses _signed_ canonical outputs. * Accordingly, we need a conditional addition of MLKEM_Q * here to go from signed to unsigned representatives. * This conditional addition is then dropped from all * polynomial compression functions instead (see `compress.c`). */ MLK_INTERNAL_API void mlk_polyvec_reduce(mlk_polyvec r) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_reduce(&r[i]); } mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); } /* Reference: `polyvec_add()` in the reference implementation. * - We use destructive version (output=first input) to avoid * reasoning about aliasing in the CBMC specification */ MLK_INTERNAL_API void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_add(&r[i], &b[i]); } } /* Reference: `polyvec_tomont()` in the reference implementation. */ MLK_INTERNAL_API void mlk_polyvec_tomont(mlk_polyvec r) { unsigned i; for (i = 0; i < MLKEM_K; i++) { mlk_poly_tomont(&r[i]); } mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); } /************************************************* * Name: mlk_poly_cbd_eta1 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter MLKEM_ETA1. * * Arguments: - mlk_poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array * * Specification: Implements [FIPS 203, Algorithm 8, SamplePolyCBD_eta1], where * eta1 is specified per level in [FIPS 203, Table 2] * and represented as MLKEM_ETA1 here. * **************************************************/ /* Reference: `poly_cbd_eta1` in the reference implementation. */ static MLK_INLINE void mlk_poly_cbd_eta1( mlk_poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) __contract__( requires(memory_no_alias(r, sizeof(mlk_poly))) requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) assigns(memory_slice(r, sizeof(mlk_poly))) ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) ) { #if MLKEM_ETA1 == 2 mlk_poly_cbd2(r, buf); #elif MLKEM_ETA1 == 3 mlk_poly_cbd3(r, buf); #else #error "Invalid value of MLKEM_ETA1" #endif } /* Reference: Does not exist in the reference implementation. * - This implements a x4-batched version of `poly_getnoise_eta1()` * from the reference implementation, to leverage * batched Keccak-f1600.*/ MLK_INTERNAL_API void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2, mlk_poly *r3, const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)]; MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)]; memcpy(extkey[0], seed, MLKEM_SYMBYTES); memcpy(extkey[1], seed, MLKEM_SYMBYTES); memcpy(extkey[2], seed, MLKEM_SYMBYTES); memcpy(extkey[3], seed, MLKEM_SYMBYTES); extkey[0][MLKEM_SYMBYTES] = nonce0; extkey[1][MLKEM_SYMBYTES] = nonce1; extkey[2][MLKEM_SYMBYTES] = nonce2; extkey[3][MLKEM_SYMBYTES] = nonce3; mlk_prf_eta1_x4(buf, extkey); mlk_poly_cbd_eta1(r0, buf[0]); mlk_poly_cbd_eta1(r1, buf[1]); mlk_poly_cbd_eta1(r2, buf[2]); mlk_poly_cbd_eta1(r3, buf[3]); mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); /* Specification: Partially implements * [FIPS 203, Section 3.3, Destruction of intermediate values] */ mlk_zeroize(buf, sizeof(buf)); mlk_zeroize(extkey, sizeof(extkey)); } #if MLKEM_K == 2 || MLKEM_K == 4 /************************************************* * Name: mlk_poly_cbd_eta2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter MLKEM_ETA2. * * Arguments: - mlk_poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array * * Specification: Implements [FIPS 203, Algorithm 8, SamplePolyCBD_eta2], where * eta2 is specified per level in [FIPS 203, Table 2] * and represented as MLKEM_ETA2 here. * **************************************************/ /* Reference: `poly_cbd_eta2` in the reference implementation. */ static MLK_INLINE void mlk_poly_cbd_eta2( mlk_poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) __contract__( requires(memory_no_alias(r, sizeof(mlk_poly))) requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) assigns(memory_slice(r, sizeof(mlk_poly))) ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) { #if MLKEM_ETA2 == 2 mlk_poly_cbd2(r, buf); #else #error "Invalid value of MLKEM_ETA2" #endif } /* Reference: `poly_getnoise_eta1()` in the reference implementation. * - We include buffer zeroization. */ MLK_INTERNAL_API void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce) { MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; memcpy(extkey, seed, MLKEM_SYMBYTES); extkey[MLKEM_SYMBYTES] = nonce; mlk_prf_eta2(buf, extkey); mlk_poly_cbd_eta2(r, buf); mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); /* Specification: Partially implements * [FIPS 203, Section 3.3, Destruction of intermediate values] */ mlk_zeroize(buf, sizeof(buf)); mlk_zeroize(extkey, sizeof(extkey)); } #endif /* MLKEM_K == 2 || MLKEM_K == 4 */ #if MLKEM_K == 2 /* Reference: Does not exist in the reference implementation. * - This implements a x4-batched version of `poly_getnoise_eta1()` * and `poly_getnoise_eta1()` from the reference implementation, * leveraging batched Keccak-f1600. * - If a x4-batched Keccak-f1600 is available, we squeeze * more random data than needed for the eta2 calls, to be * be able to use a x4-batched Keccak-f1600. */ MLK_INTERNAL_API void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2, mlk_poly *r3, const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { #if MLKEM_ETA2 >= MLKEM_ETA1 #error mlk_poly_getnoise_eta1122_4x assumes MLKEM_ETA1 > MLKEM_ETA2 #endif MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)]; MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)]; memcpy(extkey[0], seed, MLKEM_SYMBYTES); memcpy(extkey[1], seed, MLKEM_SYMBYTES); memcpy(extkey[2], seed, MLKEM_SYMBYTES); memcpy(extkey[3], seed, MLKEM_SYMBYTES); extkey[0][MLKEM_SYMBYTES] = nonce0; extkey[1][MLKEM_SYMBYTES] = nonce1; extkey[2][MLKEM_SYMBYTES] = nonce2; extkey[3][MLKEM_SYMBYTES] = nonce3; /* On systems with fast batched Keccak, we use 4-fold batched PRF, * even though that means generating more random data in buf[2] and buf[3] * than necessary. */ #if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) mlk_prf_eta1_x4(buf, extkey); #else mlk_prf_eta1(buf[0], extkey[0]); mlk_prf_eta1(buf[1], extkey[1]); mlk_prf_eta2(buf[2], extkey[2]); mlk_prf_eta2(buf[3], extkey[3]); #endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */ mlk_poly_cbd_eta1(r0, buf[0]); mlk_poly_cbd_eta1(r1, buf[1]); mlk_poly_cbd_eta2(r2, buf[2]); mlk_poly_cbd_eta2(r3, buf[3]); mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); /* Specification: Partially implements * [FIPS 203, Section 3.3, Destruction of intermediate values] */ mlk_zeroize(buf, sizeof(buf)); mlk_zeroize(extkey, sizeof(extkey)); } #endif /* MLKEM_K == 2 */ /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ #undef mlk_poly_cbd_eta1 #undef mlk_poly_cbd_eta2