inner.h
#ifndef FALCON_INNER_H__
#define FALCON_INNER_H__

/*
 * Internal functions for Falcon. This is not the API intended to be
 * used by applications; instead, this internal API provides all the
 * primitives on which wrappers build to provide external APIs.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2017-2019  Falcon Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */

/*
 * IMPORTANT API RULES
 * -------------------
 *
 * This API has some non-trivial usage rules:
 *
 * - All public functions (i.e. the non-static ones) must be referenced
 *   with the Zf() macro (e.g. Zf(verify_raw) for the verify_raw()
 *   function). That macro adds a prefix to the name, which is
 *   configurable with the FALCON_PREFIX macro. This allows compiling
 *   the code into a specific "namespace" and potentially including
 *   several versions of this code into a single application (e.g. to
 *   have an AVX2 and a non-AVX2 variant and select the one to use at
 *   runtime based on availability of AVX2 opcodes).
 *
 * - Functions that need temporary buffers expect them as a final
 *   tmp[] array of type uint8_t*, with a size which is documented for
 *   each function. However, most have some alignment requirements,
 *   because they will use the array to store 16-bit, 32-bit or 64-bit
 *   values (e.g. uint64_t or double). The caller must ensure proper
 *   alignment. What happens on unaligned access depends on the
 *   underlying architecture, ranging from a slight time penalty
 *   to immediate termination of the process.
 *
 * - Some functions rely on specific rounding rules and precision for
 *   floating-point numbers. On some systems (in particular 32-bit x86
 *   with the 387 FPU), this requires setting a hardware control
 *   word. The caller MUST use set_fpu_cw() to ensure proper precision:
 *
 *      oldcw = set_fpu_cw(2);
 *      Zf(sign_dyn)(...);
 *      set_fpu_cw(oldcw);
 *
 * On systems where the native floating-point precision is already
 * proper, or integer-based emulation is used, the set_fpu_cw()
 * function does nothing, so it can be called systematically.
 */
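
/*
 * Illustrative sketch (not part of the API): one way for a caller to
 * satisfy the alignment rules on tmp[] is to declare the buffer inside
 * a union with a 64-bit member and pass the byte view. The buffer size
 * below matches the documented 72*2^logn minimum of Zf(sign_dyn) for
 * logn = 10; all names are hypothetical.
 *
 *      union { uint8_t b[72 << 10]; uint64_t align; } tmp;
 *      unsigned oldcw;
 *
 *      oldcw = set_fpu_cw(2);
 *      Zf(sign_dyn)(sig, &rng, f, g, F, G, hm, 10, tmp.b);
 *      set_fpu_cw(oldcw);
 */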

// yyyPQCLEAN+0 yyyNIST+0 yyySUPERCOP+0
#include "config.h"
// yyyPQCLEAN- yyyNIST- yyySUPERCOP-
// yyySUPERCOP+1
// yyyCONF*
// yyySUPERCOP-

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if defined FALCON_AVX2 && FALCON_AVX2 // yyyAVX2+1
/*
 * This implementation uses AVX2 and optionally FMA intrinsics.
 */
#include <immintrin.h>
#ifndef FALCON_LE
#define FALCON_LE 1
#endif
#ifndef FALCON_UNALIGNED
#define FALCON_UNALIGNED 1
#endif
#if defined __GNUC__
#if defined FALCON_FMA && FALCON_FMA
#define TARGET_AVX2 __attribute__((target("avx2,fma")))
#else
#define TARGET_AVX2 __attribute__((target("avx2")))
#endif
#elif defined _MSC_VER && _MSC_VER
#pragma warning( disable : 4752 )
#endif
#if defined FALCON_FMA && FALCON_FMA
#define FMADD(a, b, c) _mm256_fmadd_pd(a, b, c)
#define FMSUB(a, b, c) _mm256_fmsub_pd(a, b, c)
#else
#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)
#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)
#endif
#endif // yyyAVX2-

// yyyNIST+0 yyyPQCLEAN+0
/*
 * On MSVC, disable warning about applying unary minus on an unsigned
 * type: this is perfectly defined standard behaviour and we do it
 * quite often.
 */
#if defined _MSC_VER && _MSC_VER
#pragma warning( disable : 4146 )
#endif

// yyySUPERCOP+0
/*
 * Enable ARM assembly on any ARMv7E-M platform (if it was not done before).
 */
#ifndef FALCON_ASM_CORTEXM4
#if (defined __ARM_ARCH_7EM__ && __ARM_ARCH_7EM__) \
	&& (defined __ARM_FEATURE_DSP && __ARM_FEATURE_DSP)
#define FALCON_ASM_CORTEXM4 1
#else
#define FALCON_ASM_CORTEXM4 0
#endif
#endif
// yyySUPERCOP-

#if defined __i386__ || defined _M_IX86 \
	|| defined __x86_64__ || defined _M_X64 || \
	(defined _ARCH_PWR8 && \
		(defined __LITTLE_ENDIAN || defined __LITTLE_ENDIAN__))

#ifndef FALCON_LE
#define FALCON_LE 1
#endif
#ifndef FALCON_UNALIGNED
#define FALCON_UNALIGNED 1
#endif

#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4

#ifndef FALCON_LE
#define FALCON_LE 1
#endif
#ifndef FALCON_UNALIGNED
#define FALCON_UNALIGNED 0
#endif

#elif (defined __LITTLE_ENDIAN__ && __LITTLE_ENDIAN__) \
	|| (defined __BYTE_ORDER__ && defined __ORDER_LITTLE_ENDIAN__ \
		&& __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

#ifndef FALCON_LE
#define FALCON_LE 1
#endif
#ifndef FALCON_UNALIGNED
#define FALCON_UNALIGNED 0
#endif

#else

#ifndef FALCON_LE
#define FALCON_LE 0
#endif
#ifndef FALCON_UNALIGNED
#define FALCON_UNALIGNED 0
#endif

#endif

/*
 * We ensure that both FALCON_FPEMU and FALCON_FPNATIVE are defined,
 * with compatible values (exactly one of them must be non-zero).
 * If neither is defined, then the default FP implementation is
 * 'native', except on the ARM Cortex-M4.
 */
#if !defined FALCON_FPEMU && !defined FALCON_FPNATIVE

#if (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \
	|| (!defined __ARM_FP && defined __ARM_VFPV2__)
#define FALCON_FPEMU 0
#define FALCON_FPNATIVE 1
#elif defined FALCON_ASM_CORTEXM4 && FALCON_ASM_CORTEXM4
#define FALCON_FPEMU 1
#define FALCON_FPNATIVE 0
#else
#define FALCON_FPEMU 0
#define FALCON_FPNATIVE 1
#endif

#elif defined FALCON_FPEMU && !defined FALCON_FPNATIVE

#if FALCON_FPEMU
#define FALCON_FPNATIVE 0
#else
#define FALCON_FPNATIVE 1
#endif

#elif defined FALCON_FPNATIVE && !defined FALCON_FPEMU

#if FALCON_FPNATIVE
#define FALCON_FPEMU 0
#else
#define FALCON_FPEMU 1
#endif

#endif

#if (FALCON_FPEMU && FALCON_FPNATIVE) || (!FALCON_FPEMU && !FALCON_FPNATIVE)
#error Exactly one of FALCON_FPEMU and FALCON_FPNATIVE must be selected
#endif

// yyySUPERCOP+0
/*
 * For seed generation from the operating system:
 *  - On Linux with glibc 2.25+, FreeBSD 12+ and OpenBSD, use
 *    getentropy().
 *  - On Unix-like systems, use /dev/urandom (including as a fallback
 *    for failed getentropy() calls).
 *  - On Windows, use CryptGenRandom().
 */

#ifndef FALCON_RAND_GETENTROPY
#if (defined __linux__ && defined __GLIBC__ \
	&& (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25))) \
	|| (defined __FreeBSD__ && __FreeBSD__ >= 12) \
	|| defined __OpenBSD__
#define FALCON_RAND_GETENTROPY 1
#else
#define FALCON_RAND_GETENTROPY 0
#endif
#endif

#ifndef FALCON_RAND_URANDOM
#if defined _AIX \
	|| defined __ANDROID__ \
	|| defined __FreeBSD__ \
	|| defined __NetBSD__ \
	|| defined __OpenBSD__ \
	|| defined __DragonFly__ \
	|| defined __linux__ \
	|| (defined __sun && (defined __SVR4 || defined __svr4__)) \
	|| (defined __APPLE__ && defined __MACH__)
#define FALCON_RAND_URANDOM 1
#else
#define FALCON_RAND_URANDOM 0
#endif
#endif

#ifndef FALCON_RAND_WIN32
#if defined _WIN32 || defined _WIN64
#define FALCON_RAND_WIN32 1
#else
#define FALCON_RAND_WIN32 0
#endif
#endif
// yyySUPERCOP-

/*
 * For still undefined compile-time macros, define them to 0 to avoid
 * warnings with -Wundef.
 */
#ifndef FALCON_AVX2
#define FALCON_AVX2 0
#endif
#ifndef FALCON_FMA
#define FALCON_FMA 0
#endif
#ifndef FALCON_KG_CHACHA20
#define FALCON_KG_CHACHA20 0
#endif
// yyyNIST- yyyPQCLEAN-

// yyyPQCLEAN+0 yyySUPERCOP+0
/*
 * "Naming" macro used to apply a consistent prefix over all global
 * symbols.
 */
#ifndef FALCON_PREFIX
#define FALCON_PREFIX falcon_inner
#endif
#define Zf(name) Zf_(FALCON_PREFIX, name)
#define Zf_(prefix, name) Zf__(prefix, name)
#define Zf__(prefix, name) prefix ## _ ## name
// yyyPQCLEAN- yyySUPERCOP-
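
/*
 * For instance, with the default prefix, Zf(verify_raw) expands to
 * falcon_inner_verify_raw; building with -DFALCON_PREFIX=falcon_avx2
 * (a hypothetical alternate prefix) would yield falcon_avx2_verify_raw
 * instead.
 */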

// yyyAVX2+1
/*
 * We use the TARGET_AVX2 macro to tag some functions which, in some
 * configurations, may use AVX2 and FMA intrinsics; this depends on
 * the compiler. In all other cases, we just define it as empty
 * (i.e. it will have no effect).
 */
#ifndef TARGET_AVX2
#define TARGET_AVX2
#endif
// yyyAVX2-

/*
 * Some computations with floating-point elements, in particular
 * rounding to the nearest integer, rely on operations using _exactly_
 * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
 * x86, the 387 FPU may be used (depending on the target OS) and, in
 * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
 * total type length); to prevent miscomputations, we define an explicit
 * function that modifies the precision in the FPU control word.
 *
 * set_fpu_cw() sets the precision to the provided value, and returns
 * the previously set precision; callers are supposed to restore the
 * previous precision on exit. The correct (52-bit) precision is
 * configured with the value "2". On unsupported compilers, or on
 * targets other than 32-bit x86, or when the native 'double' type is
 * not used, the set_fpu_cw() function does nothing at all.
 */
#if FALCON_FPNATIVE // yyyFPNATIVE+1
#if defined __GNUC__ && defined __i386__
static inline unsigned
set_fpu_cw(unsigned x)
{
	unsigned short t;
	unsigned old;

	__asm__ __volatile__ ("fstcw %0" : "=m" (t) : : );
	old = (t & 0x0300u) >> 8;
	t = (unsigned short)((t & ~0x0300u) | (x << 8));
	__asm__ __volatile__ ("fldcw %0" : : "m" (t) : );
	return old;
}
#elif defined _M_IX86
static inline unsigned
set_fpu_cw(unsigned x)
{
	unsigned short t;
	unsigned old;

	__asm { fstcw t }
	old = (t & 0x0300u) >> 8;
	t = (unsigned short)((t & ~0x0300u) | (x << 8));
	__asm { fldcw t }
	return old;
}
#else
static inline unsigned
set_fpu_cw(unsigned x)
{
	return x;
}
#endif
#else // yyyFPNATIVE+0
static inline unsigned
set_fpu_cw(unsigned x)
{
	return x;
}
#endif // yyyFPNATIVE-

#if FALCON_FPNATIVE && !FALCON_AVX2 // yyyFPNATIVE+1 yyyAVX2+0
/*
 * If using the native 'double' type but not AVX2 code, on an x86
 * machine with SSE2 activated for maths, then we will use the
 * SSE2 intrinsics.
 */
#if defined __GNUC__ && defined __SSE2_MATH__
#include <immintrin.h>
#endif
#endif // yyyFPNATIVE- yyyAVX2-

#if FALCON_FPNATIVE // yyyFPNATIVE+1
/*
 * For optimal reproducibility of values, we need to disable contraction
 * of floating-point expressions; otherwise, on some architectures (e.g.
 * PowerPC), the compiler may generate fused multiply-add opcodes that
 * may round differently from two successive separate opcodes. C99 defines
 * a standard pragma for that, but GCC 6.2.2 appears to ignore it,
 * hence the GCC-specific pragma (which Clang does not support).
 */
#if defined __clang__
#pragma STDC FP_CONTRACT OFF
#elif defined __GNUC__
#pragma GCC optimize ("fp-contract=off")
#endif
#endif // yyyFPNATIVE-

// yyyPQCLEAN+0
/*
 * MSVC 2015 does not know the C99 keyword 'restrict'.
 */
#if defined _MSC_VER && _MSC_VER
#ifndef restrict
#define restrict __restrict
#endif
#endif
// yyyPQCLEAN-

/* ==================================================================== */
/*
 * SHAKE256 implementation (shake.c).
 *
 * API is defined to be easily replaced with the fips202.h API defined
 * as part of PQClean.
 */

// yyyPQCLEAN+0
typedef struct {
	union {
		uint64_t A[25];
		uint8_t dbuf[200];
	} st;
	uint64_t dptr;
} inner_shake256_context;

#define inner_shake256_init Zf(i_shake256_init)
#define inner_shake256_inject Zf(i_shake256_inject)
#define inner_shake256_flip Zf(i_shake256_flip)
#define inner_shake256_extract Zf(i_shake256_extract)

void Zf(i_shake256_init)(
	inner_shake256_context *sc);
void Zf(i_shake256_inject)(
	inner_shake256_context *sc, const uint8_t *in, size_t len);
void Zf(i_shake256_flip)(
	inner_shake256_context *sc);
void Zf(i_shake256_extract)(
	inner_shake256_context *sc, uint8_t *out, size_t len);

/*
// yyyPQCLEAN+1

#include "fips202.h"

#define inner_shake256_context shake256incctx
#define inner_shake256_init(sc) shake256_inc_init(sc)
#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)

// yyyPQCLEAN+0
*/
// yyyPQCLEAN-
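
/*
 * Typical usage sketch (illustrative; seed/out buffer names are
 * hypothetical): inject may be called several times before the single
 * flip, and extract may be called several times after it.
 *
 *      inner_shake256_context sc;
 *
 *      inner_shake256_init(&sc);
 *      inner_shake256_inject(&sc, seed, seed_len);
 *      inner_shake256_flip(&sc);
 *      inner_shake256_extract(&sc, out, out_len);
 */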

/* ==================================================================== */
/*
 * Encoding/decoding functions (codec.c).
 *
 * Encoding functions take as parameters an output buffer (out) with
 * a given maximum length (max_out_len); returned value is the actual
 * number of bytes which have been written. If the output buffer is
 * not large enough, then 0 is returned (some bytes may have been
 * written to the buffer). If 'out' is NULL, then 'max_out_len' is
 * ignored; instead, the function computes and returns the actual
 * required output length (in bytes).
 *
 * Decoding functions take as parameters an input buffer (in) with
 * its maximum length (max_in_len); returned value is the actual number
 * of bytes that have been read from the buffer. If the provided length
 * is too short, then 0 is returned.
 *
 * Values to encode or decode are vectors of integers, with N = 2^logn
 * elements.
 *
 * Three encoding formats are defined:
 *
 *   - modq: sequence of values modulo 12289, each encoded over exactly
 *     14 bits. The encoder and decoder verify that integers are within
 *     the valid range (0..12288). Values are arrays of uint16.
 *
 *   - trim: sequence of signed integers, a specified number of bits
 *     each. The number of bits is provided as parameter and includes
 *     the sign bit. Each integer x must be such that |x| < 2^(bits-1)
 *     (which means that the -2^(bits-1) value is forbidden); encode and
 *     decode functions check that property. Values are arrays of
 *     int16_t or int8_t, corresponding to names 'trim_i16' and
 *     'trim_i8', respectively.
 *
 *   - comp: variable-length encoding for signed integers; each integer
 *     uses a minimum of 9 bits, possibly more. This is normally used
 *     only for signatures.
 */

size_t Zf(modq_encode)(void *out, size_t max_out_len,
	const uint16_t *x, unsigned logn);
size_t Zf(trim_i16_encode)(void *out, size_t max_out_len,
	const int16_t *x, unsigned logn, unsigned bits);
size_t Zf(trim_i8_encode)(void *out, size_t max_out_len,
	const int8_t *x, unsigned logn, unsigned bits);
size_t Zf(comp_encode)(void *out, size_t max_out_len,
	const int16_t *x, unsigned logn);

size_t Zf(modq_decode)(uint16_t *x, unsigned logn,
	const void *in, size_t max_in_len);
size_t Zf(trim_i16_decode)(int16_t *x, unsigned logn, unsigned bits,
	const void *in, size_t max_in_len);
size_t Zf(trim_i8_decode)(int8_t *x, unsigned logn, unsigned bits,
	const void *in, size_t max_in_len);
size_t Zf(comp_decode)(int16_t *x, unsigned logn,
	const void *in, size_t max_in_len);
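
/*
 * Example (sketch): the NULL-output convention supports a two-pass
 * pattern, first querying the required length, then encoding for real.
 * The sig_buf/sig_len names are hypothetical.
 *
 *      len = Zf(comp_encode)(NULL, 0, s2, logn);
 *      if (len == 0 || len > sizeof sig_buf) {
 *              (handle error)
 *      }
 *      sig_len = Zf(comp_encode)(sig_buf, sizeof sig_buf, s2, logn);
 */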

/*
 * Number of bits for key elements, indexed by logn (1 to 10). This
 * is at most 8 bits for all degrees, but some degrees may have shorter
 * elements.
 */
extern const uint8_t Zf(max_fg_bits)[];
extern const uint8_t Zf(max_FG_bits)[];

/*
 * Maximum size, in bits, of elements in a signature, indexed by logn
 * (1 to 10). The size includes the sign bit.
 */
extern const uint8_t Zf(max_sig_bits)[];

/* ==================================================================== */
/*
 * Support functions used for both signature generation and signature
 * verification (common.c).
 */

/*
 * From a SHAKE256 context (must be already flipped), produce a new
 * point. This is the non-constant-time version, which may leak enough
 * information to serve as a stop condition on a brute force attack on
 * the hashed message (provided that the nonce value is known).
 */
void Zf(hash_to_point_vartime)(inner_shake256_context *sc,
	uint16_t *x, unsigned logn);

/*
 * From a SHAKE256 context (must be already flipped), produce a new
 * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
 * This function is constant-time but is typically more expensive than
 * Zf(hash_to_point_vartime)().
 *
 * tmp[] must have 16-bit alignment.
 */
void Zf(hash_to_point_ct)(inner_shake256_context *sc,
	uint16_t *x, unsigned logn, uint8_t *tmp);

/*
 * Tell whether a given vector (2N coordinates, in two halves) is
 * acceptable as a signature. This compares the appropriate norm of the
 * vector with the acceptance bound. Returned value is 1 on success
 * (vector is short enough to be acceptable), 0 otherwise.
 */
int Zf(is_short)(const int16_t *s1, const int16_t *s2, unsigned logn);

/*
 * Tell whether a given vector (2N coordinates, in two halves) is
 * acceptable as a signature. Instead of the first half s1, this
 * function receives the "saturated squared norm" of s1, i.e. the
 * sum of the squares of the coordinates of s1 (saturated at 2^32-1
 * if the sum exceeds 2^31-1).
 *
 * Returned value is 1 on success (vector is short enough to be
 * acceptable), 0 otherwise.
 */
int Zf(is_short_half)(uint32_t sqn, const int16_t *s2, unsigned logn);

/* ==================================================================== */
/*
 * Signature verification functions (vrfy.c).
 */

/*
 * Convert a public key to NTT + Montgomery format. Conversion is done
 * in place.
 */
void Zf(to_ntt_monty)(uint16_t *h, unsigned logn);

/*
 * Internal signature verification code:
 *   c0[]   contains the hashed nonce+message
 *   s2[]   is the decoded signature
 *   h[]    contains the public key, in NTT + Montgomery format
 *   logn   is the degree log
 *   tmp[]  temporary, must have at least 2*2^logn bytes
 * Returned value is 1 on success, 0 on error.
 *
 * tmp[] must have 16-bit alignment.
 */
int Zf(verify_raw)(const uint16_t *c0, const int16_t *s2,
	const uint16_t *h, unsigned logn, uint8_t *tmp);
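
/*
 * Sketch of the verification flow built from these primitives (buffer
 * names are hypothetical; nonce/message framing is up to the wrapper):
 *
 *      Zf(to_ntt_monty)(h, logn);
 *      inner_shake256_init(&sc);
 *      inner_shake256_inject(&sc, nonce, nonce_len);
 *      inner_shake256_inject(&sc, msg, msg_len);
 *      inner_shake256_flip(&sc);
 *      Zf(hash_to_point_vartime)(&sc, hm, logn);
 *      r = Zf(verify_raw)(hm, s2, h, logn, tmp);
 */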

/*
 * Compute the public key h[], given the private key elements f[] and
 * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
 * modulus. This function returns 1 on success, 0 on error (an error is
 * reported if f is not invertible mod phi mod q).
 *
 * The tmp[] array must have room for at least 2*2^logn elements.
 * tmp[] must have 16-bit alignment.
 */
int Zf(compute_public)(uint16_t *h,
	const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);

/*
 * Recompute the fourth private key element. The private key consists of
 * four polynomials with small coefficients f, g, F and G, which are
 * such that fG - gF = q mod phi; furthermore, f is invertible modulo
 * phi and modulo q. This function recomputes G from f, g and F.
 *
 * The tmp[] array must have room for at least 4*2^logn bytes.
 *
 * Returned value is 1 on success, 0 on error (f not invertible).
 * tmp[] must have 16-bit alignment.
 */
int Zf(complete_private)(int8_t *G,
	const int8_t *f, const int8_t *g, const int8_t *F,
	unsigned logn, uint8_t *tmp);

/*
 * Test whether a given polynomial is invertible modulo phi and q.
 * Polynomial coefficients are small integers.
 *
 * tmp[] must have 16-bit alignment.
 */
int Zf(is_invertible)(
	const int16_t *s2, unsigned logn, uint8_t *tmp);

/*
 * Count the number of elements of value zero in the NTT representation
 * of the given polynomial: this is the number of primitive 2n-th roots
 * of unity (modulo q = 12289) that are roots of the provided polynomial
 * (taken modulo q).
 *
 * tmp[] must have 16-bit alignment.
 */
int Zf(count_nttzero)(const int16_t *sig, unsigned logn, uint8_t *tmp);

/*
 * Internal signature verification with public key recovery:
 *   h[]    receives the public key (NOT in NTT/Montgomery format)
 *   c0[]   contains the hashed nonce+message
 *   s1[]   is the first signature half
 *   s2[]   is the second signature half
 *   logn   is the degree log
 *   tmp[]  temporary, must have at least 2*2^logn bytes
 * Returned value is 1 on success, 0 on error. Success is returned if
 * the signature is a short enough vector; in that case, the public
 * key has been written to h[]. However, the caller must still
 * verify that h[] is the correct value (e.g. with regards to a known
 * hash of the public key).
 *
 * h[] may not overlap with any of the other arrays.
 *
 * tmp[] must have 16-bit alignment.
 */
int Zf(verify_recover)(uint16_t *h,
	const uint16_t *c0, const int16_t *s1, const int16_t *s2,
	unsigned logn, uint8_t *tmp);

/* ==================================================================== */
/*
 * Implementation of floating-point real numbers (fpr.h, fpr.c).
 */

/*
 * Real numbers are implemented by an extra header file, included below.
 * This is meant to support pluggable implementations. The default
 * implementation relies on the C type 'double'.
 *
 * The included file must define the following types, functions and
 * constants:
 *
 *   fpr
 *         type for a real number
 *
 *   fpr fpr_of(int64_t i)
 *         cast an integer into a real number; source must be in the
 *         -(2^63-1)..+(2^63-1) range
 *
 *   fpr fpr_scaled(int64_t i, int sc)
 *         compute i*2^sc as a real number; source 'i' must be in the
 *         -(2^63-1)..+(2^63-1) range
 *
 *   fpr fpr_ldexp(fpr x, int e)
 *         compute x*2^e
 *
 *   int64_t fpr_rint(fpr x)
 *         round x to the nearest integer; x must be in the -(2^63-1)
 *         to +(2^63-1) range
 *
 *   int64_t fpr_trunc(fpr x)
 *         round to an integer; this rounds towards zero; value must
 *         be in the -(2^63-1) to +(2^63-1) range
 *
 *   fpr fpr_add(fpr x, fpr y)
 *         compute x + y
 *
 *   fpr fpr_sub(fpr x, fpr y)
 *         compute x - y
 *
 *   fpr fpr_neg(fpr x)
 *         compute -x
 *
 *   fpr fpr_half(fpr x)
 *         compute x/2
 *
 *   fpr fpr_double(fpr x)
 *         compute x*2
 *
 *   fpr fpr_mul(fpr x, fpr y)
 *         compute x * y
 *
 *   fpr fpr_sqr(fpr x)
 *         compute x * x
 *
 *   fpr fpr_inv(fpr x)
 *         compute 1/x
 *
 *   fpr fpr_div(fpr x, fpr y)
 *         compute x/y
 *
 *   fpr fpr_sqrt(fpr x)
 *         compute the square root of x
 *
 *   int fpr_lt(fpr x, fpr y)
 *         return 1 if x < y, 0 otherwise
 *
 *   uint64_t fpr_expm_p63(fpr x)
 *         return exp(-x), assuming that 0 <= x < log(2). Returned value
 *         is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
 *         rounded to the nearest integer). Computation should have a
 *         precision of at least 45 bits.
 *
 *   const fpr fpr_gm_tab[]
 *         array of constants for FFT / iFFT
 *
 *   const fpr fpr_p2_tab[]
 *         precomputed powers of 2 (by index, 0 to 10)
 *
 * Constants of type 'fpr':
 *
 *   fpr fpr_q                 12289
 *   fpr fpr_inverse_of_q      1/12289
 *   fpr fpr_inv_2sqrsigma0    1/(2*(1.8205^2))
 *   fpr fpr_inv_sigma[]       1/sigma (indexed by logn, 1 to 10)
 *   fpr fpr_sigma_min[]       sigma_min (indexed by logn, 1 to 10)
 *   fpr fpr_log2              log(2)
 *   fpr fpr_inv_log2          1/log(2)
 *   fpr fpr_bnorm_max         16822.4121
 *   fpr fpr_zero              0
 *   fpr fpr_one               1
 *   fpr fpr_two               2
 *   fpr fpr_onehalf           0.5
 *   fpr fpr_ptwo31            2^31
 *   fpr fpr_ptwo31m1          2^31-1
 *   fpr fpr_mtwo31m1          -(2^31-1)
 *   fpr fpr_ptwo63m1          2^63-1
 *   fpr fpr_mtwo63m1          -(2^63-1)
 *   fpr fpr_ptwo63            2^63
 */
#include "fpr.h"
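
/*
 * Small usage sketch for the abstract fpr API (values are arbitrary):
 * all arithmetic goes through the named functions, so the same code
 * works with both the native and the emulated back-ends.
 *
 *      fpr x, y;
 *      int64_t r;
 *
 *      x = fpr_of(3);                 (the real value 3.0)
 *      y = fpr_div(x, fpr_of(2));     (1.5)
 *      r = fpr_rint(y);               (2, with round-to-nearest-even)
 */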

/* ==================================================================== */
/*
 * RNG (rng.c).
 *
 * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
 * context (flipped) and is used for bulk pseudorandom generation.
 * A system-dependent seed generator is also provided.
 */

/*
 * Obtain a random seed from the system RNG.
 *
 * Returned value is 1 on success, 0 on error.
 */
int Zf(get_seed)(void *seed, size_t seed_len);

/*
 * Structure for a PRNG. This includes a large buffer so that values
 * get generated in advance. The 'state' is used to keep the current
 * PRNG algorithm state (contents depend on the selected algorithm).
 *
 * The unions with 'dummy_u64' are there to ensure proper alignment for
 * 64-bit direct access.
 */
typedef struct {
	union {
		uint8_t d[512]; /* MUST be 512, exactly */
		uint64_t dummy_u64;
	} buf;
	size_t ptr;
	union {
		uint8_t d[256];
		uint64_t dummy_u64;
	} state;
	int type;
} prng;

/*
 * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
 * context (in "flipped" state) to obtain its initial state.
 */
void Zf(prng_init)(prng *p, inner_shake256_context *src);

/*
 * Refill the PRNG buffer. This is normally invoked automatically, and
 * is declared here only so that prng_get_u64() may be inlined.
 */
void Zf(prng_refill)(prng *p);

/*
 * Get some bytes from a PRNG.
 */
void Zf(prng_get_bytes)(prng *p, void *dst, size_t len);

/*
 * Get a 64-bit random value from a PRNG.
 */
static inline uint64_t
prng_get_u64(prng *p)
{
	size_t u;

	/*
	 * If there are fewer than 9 bytes in the buffer, we refill it.
	 * This means that we may drop the last few bytes, but this allows
	 * for faster extraction code. Also, it means that we never leave
	 * an empty buffer.
	 */
	u = p->ptr;
	if (u >= (sizeof p->buf.d) - 9) {
		Zf(prng_refill)(p);
		u = 0;
	}
	p->ptr = u + 8;

	/*
	 * On systems that use little-endian encoding and allow
	 * unaligned accesses, we can simply read the data where it is.
	 */
#if FALCON_LE && FALCON_UNALIGNED // yyyLEU+1
	return *(uint64_t *)(p->buf.d + u);
#else // yyyLEU+0
	return (uint64_t)p->buf.d[u + 0]
		| ((uint64_t)p->buf.d[u + 1] << 8)
		| ((uint64_t)p->buf.d[u + 2] << 16)
		| ((uint64_t)p->buf.d[u + 3] << 24)
		| ((uint64_t)p->buf.d[u + 4] << 32)
		| ((uint64_t)p->buf.d[u + 5] << 40)
		| ((uint64_t)p->buf.d[u + 6] << 48)
		| ((uint64_t)p->buf.d[u + 7] << 56);
#endif // yyyLEU-
}

/*
 * Get an 8-bit random value from a PRNG.
 */
static inline unsigned
prng_get_u8(prng *p)
{
	unsigned v;

	v = p->buf.d[p->ptr ++];
	if (p->ptr == sizeof p->buf.d) {
		Zf(prng_refill)(p);
	}
	return v;
}
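
/*
 * Usage sketch (the SHAKE256 context sc is assumed to be already seeded
 * and flipped; x, b and dst are hypothetical):
 *
 *      prng p;
 *
 *      Zf(prng_init)(&p, &sc);
 *      x = prng_get_u64(&p);
 *      b = prng_get_u8(&p);
 *      Zf(prng_get_bytes)(&p, dst, dst_len);
 */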

/* ==================================================================== */
/*
 * FFT (falcon-fft.c).
 *
 * A real polynomial is represented as an array of N 'fpr' elements.
 * The FFT representation of a real polynomial contains N/2 complex
 * elements; each is stored as two real numbers, for the real and
 * imaginary parts, respectively. See falcon-fft.c for details on the
 * internal representation.
 */

/*
 * Compute FFT in-place: the source array should contain a real
 * polynomial (N coefficients); its storage area is reused to store
 * the FFT representation of that polynomial (N/2 complex numbers).
 *
 * 'logn' MUST lie between 1 and 10 (inclusive).
 */
void Zf(FFT)(fpr *f, unsigned logn);

/*
 * Compute the inverse FFT in-place: the source array should contain the
 * FFT representation of a real polynomial (N/2 elements); the resulting
 * real polynomial (N coefficients of type 'fpr') is written over the
 * array.
 *
 * 'logn' MUST lie between 1 and 10 (inclusive).
 */
void Zf(iFFT)(fpr *f, unsigned logn);

/*
 * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
 * function works in both normal and FFT representations.
 */
void Zf(poly_add)(fpr *restrict a, const fpr *restrict b, unsigned logn);

/*
 * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
 * function works in both normal and FFT representations.
 */
void Zf(poly_sub)(fpr *restrict a, const fpr *restrict b, unsigned logn);

/*
 * Negate polynomial a. This function works in both normal and FFT
 * representations.
 */
void Zf(poly_neg)(fpr *a, unsigned logn);

/*
 * Compute adjoint of polynomial a. This function works only in FFT
 * representation.
 */
void Zf(poly_adj_fft)(fpr *a, unsigned logn);

/*
 * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
 * This function works only in FFT representation.
 */
void Zf(poly_mul_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn);
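
/*
 * Example (sketch): multiplying two real polynomials a and b modulo
 * X^N+1 through the FFT domain; a[] and b[] each hold N = 2^logn
 * 'fpr' coefficients and must not overlap.
 *
 *      Zf(FFT)(a, logn);
 *      Zf(FFT)(b, logn);
 *      Zf(poly_mul_fft)(a, b, logn);    (a <- a*b, FFT representation)
 *      Zf(iFFT)(a, logn);               (back to N real coefficients)
 */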

/*
 * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
 * overlap. This function works only in FFT representation.
 */
void Zf(poly_muladj_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn);

/*
 * Multiply polynomial with its own adjoint. This function works only in FFT
 * representation.
 */
void Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn);

/*
 * Multiply polynomial with a real constant. This function works in both
 * normal and FFT representations.
 */
void Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn);

/*
 * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
 * a and b MUST NOT overlap.
 */
void Zf(poly_div_fft)(fpr *restrict a, const fpr *restrict b, unsigned logn);

/*
 * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
 * (also in FFT representation). Since the result is auto-adjoint, all its
 * coordinates in FFT representation are real; as such, only the first N/2
 * values of d[] are filled (the imaginary parts are skipped).
 *
 * Array d MUST NOT overlap with either a or b.
 */
void Zf(poly_invnorm2_fft)(fpr *restrict d,
	const fpr *restrict a, const fpr *restrict b, unsigned logn);

/*
 * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
 * (also in FFT representation). Destination d MUST NOT overlap with
 * any of the source arrays.
 */
void Zf(poly_add_muladj_fft)(fpr *restrict d,
	const fpr *restrict F, const fpr *restrict G,
	const fpr *restrict f, const fpr *restrict g, unsigned logn);

/*
 * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
 * a and b are in FFT representation. Since b is autoadjoint, all its
 * FFT coefficients are real, and the array b contains only N/2 elements.
 * a and b MUST NOT overlap.
 */
void Zf(poly_mul_autoadj_fft)(fpr *restrict a,
	const fpr *restrict b, unsigned logn);

/*
 * Divide polynomial a by polynomial b, where b is autoadjoint. Both
 * a and b are in FFT representation. Since b is autoadjoint, all its
 * FFT coefficients are real, and the array b contains only N/2 elements.
 * a and b MUST NOT overlap.
 */
void Zf(poly_div_autoadj_fft)(fpr *restrict a,
	const fpr *restrict b, unsigned logn);

/*
 * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
 * representation. On input, g00, g01 and g11 are provided (where the
 * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
 * and d11 values are written in g00, g01 and g11, respectively
 * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
 * (In fact, d00 = g00, so the g00 operand is left unmodified.)
 */
void Zf(poly_LDL_fft)(const fpr *restrict g00,
	fpr *restrict g01, fpr *restrict g11, unsigned logn);

/*
 * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
 * representation. This is identical to poly_LDL_fft() except that
 * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
 * in two other separate buffers provided as extra parameters.
 */
void Zf(poly_LDLmv_fft)(fpr *restrict d11, fpr *restrict l10,
	const fpr *restrict g00, const fpr *restrict g01,
	const fpr *restrict g11, unsigned logn);

/*
 * Apply "split" operation on a polynomial in FFT representation:
 * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
 * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
 */
void Zf(poly_split_fft)(fpr *restrict f0, fpr *restrict f1,
	const fpr *restrict f, unsigned logn);

/*
 * Apply "merge" operation on two polynomials in FFT representation:
 * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
 * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
 * f MUST NOT overlap with either f0 or f1.
 */
void Zf(poly_merge_fft)(fpr *restrict f,
	const fpr *restrict f0, const fpr *restrict f1, unsigned logn);

/* ==================================================================== */
/*
 * Key pair generation.
 */

/*
 * Required sizes of the temporary buffer (in bytes).
 *
 * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
 * or 2) where it is slightly greater.
 */
#define FALCON_KEYGEN_TEMP_1 136
#define FALCON_KEYGEN_TEMP_2 272
#define FALCON_KEYGEN_TEMP_3 224
#define FALCON_KEYGEN_TEMP_4 448
#define FALCON_KEYGEN_TEMP_5 896
#define FALCON_KEYGEN_TEMP_6 1792
#define FALCON_KEYGEN_TEMP_7 3584
#define FALCON_KEYGEN_TEMP_8 7168
#define FALCON_KEYGEN_TEMP_9 14336
#define FALCON_KEYGEN_TEMP_10 28672

/*
 * Generate a new key pair. Randomness is extracted from the provided
 * SHAKE256 context, which must have already been seeded and flipped.
 * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
 * macros) and be aligned for the uint32_t, uint64_t and fpr types.
 *
 * The private key elements are written in f, g, F and G, and the
 * public key is written in h. Either or both of G and h may be NULL,
 * in which case the corresponding element is not returned (they can
 * be recomputed from f, g and F).
 *
 * tmp[] must have 64-bit alignment.
 * This function uses floating-point rounding (see set_fpu_cw()).
 */
void Zf(keygen)(inner_shake256_context *rng,
	int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
	unsigned logn, uint8_t *tmp);
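
/*
 * Usage sketch for degree N = 512 (logn = 9); the SHAKE256 context sc
 * is assumed to be seeded and flipped, and all names are hypothetical.
 * The union provides the required 64-bit alignment.
 *
 *      union { uint8_t b[FALCON_KEYGEN_TEMP_9]; uint64_t align; } tmp;
 *      int8_t f[512], g[512], F[512], G[512];
 *      uint16_t h[512];
 *      unsigned oldcw;
 *
 *      oldcw = set_fpu_cw(2);
 *      Zf(keygen)(&sc, f, g, F, G, h, 9, tmp.b);
 *      set_fpu_cw(oldcw);
 */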

/* ==================================================================== */
/*
 * Signature generation.
 */

/*
 * Expand a private key into the B0 matrix in FFT representation and
 * the LDL tree. All the values are written in 'expanded_key', for
 * a total of (8*logn+40)*2^logn bytes.
 *
 * The tmp[] array must have room for at least 48*2^logn bytes.
 *
 * tmp[] must have 64-bit alignment.
 * This function uses floating-point rounding (see set_fpu_cw()).
 */
void Zf(expand_privkey)(fpr *restrict expanded_key,
	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
	unsigned logn, uint8_t *restrict tmp);

/*
 * Compute a signature over the provided hashed message (hm); the
 * signature value is one short vector. This function uses an
 * expanded key (as generated by Zf(expand_privkey)()).
 *
 * The sig[] and hm[] buffers may overlap.
 *
 * On successful output, the start of the tmp[] buffer contains the s1
 * vector (as int16_t elements).
 *
 * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
 *
 * tmp[] must have 64-bit alignment.
 * This function uses floating-point rounding (see set_fpu_cw()).
 */
void Zf(sign_tree)(int16_t *sig, inner_shake256_context *rng,
	const fpr *restrict expanded_key,
	const uint16_t *hm, unsigned logn, uint8_t *tmp);
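
/*
 * Sketch: expand the key once, then produce signatures repeatedly
 * (buffer sizes follow the comments above; all names are hypothetical).
 * The same set_fpu_cw() bracketing as for Zf(sign_dyn) applies.
 *
 *      Zf(expand_privkey)(expanded_key, f, g, F, G, logn, tmp);
 *      (then, for each message, with hm the hashed nonce+message:)
 *      Zf(sign_tree)(sig, &rng, expanded_key, hm, logn, tmp);
 */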

/*
 * Compute a signature over the provided hashed message (hm); the
 * signature value is one short vector. This function uses a raw
 * key and dynamically recomputes the B0 matrix and LDL tree; this
 * saves RAM since there is no need for an expanded key, but
 * increases the signature cost.
 *
 * The sig[] and hm[] buffers may overlap.
 *
 * On successful output, the start of the tmp[] buffer contains the s1
 * vector (as int16_t elements).
 *
 * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
 *
 * tmp[] must have 64-bit alignment.
 * This function uses floating-point rounding (see set_fpu_cw()).
 */
void Zf(sign_dyn)(int16_t *sig, inner_shake256_context *rng,
	const int8_t *restrict f, const int8_t *restrict g,
	const int8_t *restrict F, const int8_t *restrict G,
	const uint16_t *hm, unsigned logn, uint8_t *tmp);

/*
 * Internal sampler engine. Exported for tests.
 *
 * sampler_context wraps around a source of random numbers (PRNG) and
 * the sigma_min value (nominally dependent on the degree).
 *
 * sampler() takes as parameters:
 *   ctx      pointer to the sampler_context structure
 *   mu       center for the distribution
 *   isigma   inverse of the distribution standard deviation
 * It returns an integer sampled along the Gaussian distribution centered
 * on mu and of standard deviation sigma = 1/isigma.
 *
 * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
 * returns an integer sampled along a half-Gaussian with standard
 * deviation sigma0 = 1.8205 (center is 0, returned value is
 * nonnegative).
 */

typedef struct {
	prng p;
	fpr sigma_min;
} sampler_context;

TARGET_AVX2
int Zf(sampler)(void *ctx, fpr mu, fpr isigma);

TARGET_AVX2
int Zf(gaussian0_sampler)(prng *p);
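
/*
 * Test-oriented usage sketch (mu, isigma and the seeded, flipped SHAKE
 * context sc are hypothetical): the embedded PRNG must be initialized
 * first, and sigma_min set to the value matching the degree.
 *
 *      sampler_context spc;
 *      int z;
 *
 *      Zf(prng_init)(&spc.p, &sc);
 *      spc.sigma_min = fpr_sigma_min[logn];
 *      z = Zf(sampler)(&spc, mu, isigma);
 */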

/* ==================================================================== */

#endif