Falcon source files (reference implementation)


falcon-sign.c

    1 /*
    2  * Falcon signature generation.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.trust>
   30  */
   31 
   32 #include "internal.h"
   33 
   34 /*
   35  * If CLEANSE is non-zero, then temporary areas obtained with malloc()
   36  * and used to contain secret values are explicitly cleared before
   37  * deallocation with free(). This is the default behaviour; use
   38  * -DCLEANSE=0 to disable cleansing.
   39  */
   40 #ifndef CLEANSE
   41 #define CLEANSE   1
   42 #endif
   43 
   44 /*
   45  * If SAMPLER_CODF is non-zero, then the discrete Gaussian sampler will
   46  * use a CoDF table and a variable number of PRNG invocations; use
   47  * -DSAMPLER_CODF (or -DSAMPLER_CODF=1) to enable this code.
   48  */
   49 #ifndef SAMPLER_CODF
   50 #define SAMPLER_CODF   0
   51 #endif
   52 
   53 /*
   54  * If SAMPLER_CDF is non-zero, then the discrete Gaussian sampler will
   55  * use a tabulated distribution with 128 bits of precision and a constant 
   56  * number of PRNG invocations; use -DSAMPLER_CDF (or -DSAMPLER_CDF=1)
   57  * to enable this code. The default Gaussian sampler uses a tabulated 
   58  * distribution with 136 bits of precision and a variable number of PRNG
   59  * invocations.
   60  */
   61 #ifndef SAMPLER_CDF
   62 #define SAMPLER_CDF   0
   63 #endif
   64 
   65 /*
   66  * If CT_BEREXP is non-zero, then the rejection phase on the discrete
   67  * Gaussian sampler will use an algorithm with a constant number of PRNG
   68  * invocations (which should help with achieving constant-time code);
   69  * use -DCT_BEREXP (or -DCT_BEREXP=1) to enable this code. The default
   70  * rejection algorithm uses a lazy method with a variable number of PRNG
 * invocations, which helps reduce the overall PRNG cost.
   72  */
   73 #ifndef CT_BEREXP
   74 #define CT_BEREXP   0
   75 #endif
   76 
   77 /* =================================================================== */
   78 
   79 /*
   80  * Compute degree N from logarithm 'logn', and ternary flag 'ter'.
   81  * 'ter' MUST be 0 or 1.
   82  */
   83 #define MKN(logn, ter)   ((size_t)(1 + ((ter) << 1)) << ((logn) - (ter)))
   84 
   85 /* =================================================================== */
   86 /*
   87  * Binary case:
   88  *   N = 2^logn
   89  *   phi = X^N+1
   90  */
   91 
   92 /*
   93  * Compute LDL decomposition on an auto-adjoint 2x2 matrix G. Matrix
   94  * elements are real polynomials modulo X^n+1:
   95  *   g00   top-left element
   96  *   g01   top-right element
   97  *   g10   bottom-left element
   98  *   g11   bottom-right element
   99  *
  100  * The matrix being auto-adjoint means that G = G*. Thus, adj(g00) == g00,
  101  * adj(g11) == g11, adj(g10) == g01, and adj(g01) == g10.
  102  * Since g10 and g01 are redundant, g10 is not provided as parameter.
  103  *
  104  * LDL decomposition is:
  105  *   G = L·D·L*
  106  * where L is lower-triangular with 1 on the diagonal, and D is diagonal.
  107  * The top-left element of D is equal to g00, so it is not returned;
  108  * outputs are thus d11 (lower-right element of D) and l10 (lower-left
  109  * element of L).
  110  *
  111  * The tmp[] buffer must be able to hold at least one polynomial.
  112  *
  113  * All operands are in FFT representation. No overlap allowed, except
  114  * between the constant inputs (g00, g01 and g11).
  115  */
static void
LDL_fft(fpr *restrict d11, fpr *restrict l10,
        const fpr *restrict g00, const fpr *restrict g01,
        const fpr *restrict g11, unsigned logn, fpr *restrict tmp)
{
        size_t n;

        /*
         * Binary case: degree is n = 2^logn.
         */
        n = MKN(logn, 0);

        /* Let tmp = mu = G[0,1] / G[0,0]. */
        memcpy(tmp, g01, n * sizeof *g01);
        falcon_poly_div_fft(tmp, g00, logn);

        /* Let L[1,0] = adj(mu) and tmp = aux = mu * adj(mu). */
        memcpy(l10, tmp, n * sizeof *tmp);
        falcon_poly_adj_fft(l10, logn);
        falcon_poly_mul_fft(tmp, l10, logn);

        /*
         * D[1,1] = G[1,1] - aux * G[0][0]
         *        = g11 - mu*adj(mu)*g00.
         */
        falcon_poly_mul_fft(tmp, g00, logn);
        memcpy(d11, g11, n * sizeof *g11);
        falcon_poly_sub_fft(d11, tmp, logn);
}
  139 
  140 /*
  141  * Special case of LDL when G is quasicyclic, i.e. g11 == g00.
  142  */
static inline void
LDLqc_fft(fpr *restrict d11, fpr *restrict l10,
        const fpr *restrict g00, const fpr *restrict g01, unsigned logn,
        fpr *restrict tmp)
{
        /* Quasicyclic case: g11 == g00, so g00 is passed twice. */
        LDL_fft(d11, l10, g00, g01, g00, logn, tmp);
}
  150 
  151 /*
  152  * Get the size of the LDL tree for an input with polynomials of size
  153  * 2^logn. The size is expressed in the number of elements.
  154  */
  155 static inline unsigned
  156 ffLDL_treesize(unsigned logn)
  157 {
  158         /*
  159          * For logn = 0 (polynomials are constant), the "tree" is a
  160          * single element. Otherwise, the tree node has size 2^logn, and
  161          * has two child trees for size logn-1 each. Thus, treesize s()
  162          * must fulfill these two relations:
  163          *
  164          *   s(0) = 1
  165          *   s(logn) = (2^logn) + 2*s(logn-1)
  166          */
  167         return (logn + 1) << logn;
  168 }
  169 
  170 /*
  171  * Inner function for ffLDL_fft(). It expects the matrix to be both
  172  * auto-adjoint and quasicyclic; also, it uses the source operands
  173  * as modifiable temporaries.
  174  *
  175  * tmp[] must have room for at least two polynomials.
  176  */
static void
ffLDL_fft_inner(fpr *restrict tree,
        fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp)
{
        size_t n, hn;

        n = MKN(logn, 0);
        if (n == 1) {
                /* Leaf: a single (real) element, equal to d00 = g0. */
                tree[0] = g0[0];
                return;
        }
        hn = n >> 1;

        /*
         * The LDL decomposition yields L (which is written in the tree)
         * and the diagonal of D. Since d00 = g0, we just write d11
         * into tmp.
         */
        LDLqc_fft(tmp, tree, g0, g1, logn, tmp + n);

        /*
         * Split d00 (currently in g0) and d11 (currently in tmp). We
         * reuse g0 and g1 as temporary storage spaces:
         *   d00 splits into g1, g1+hn
         *   d11 splits into g0, g0+hn
         * Note: the input operands are destroyed here, as documented.
         */
        falcon_poly_split_fft(g1, g1 + hn, g0, logn);
        falcon_poly_split_fft(g0, g0 + hn, tmp, logn);

        /*
         * Each split result is the first row of a new auto-adjoint
         * quasicyclic matrix for the next recursive step. Children
         * are stored right after this node's n slots (the l10
         * polynomial), the second child after the full first subtree.
         */
        ffLDL_fft_inner(tree + n,
                g1, g1 + hn, logn - 1, tmp);
        ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
                g0, g0 + hn, logn - 1, tmp);
}
  215 
  216 /*
  217  * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
  218  * is provided as three polynomials (FFT representation).
  219  *
  220  * The "tree" array is filled with the computed tree, of size
  221  * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
  222  *
  223  * Input arrays MUST NOT overlap, except possibly the three unmodified
  224  * arrays g00, g01 and g11. tmp[] should have room for at least four
  225  * polynomials of 2^logn elements each.
  226  */
static void
ffLDL_fft(fpr *restrict tree, const fpr *restrict g00,
        const fpr *restrict g01, const fpr *restrict g11,
        unsigned logn, fpr *restrict tmp)
{
        size_t n, hn;
        fpr *d00, *d11;

        n = MKN(logn, 0);
        if (n == 1) {
                /* Degenerate tree: a single element. */
                tree[0] = g00[0];
                return;
        }
        hn = n >> 1;

        /*
         * tmp[] layout: d00 and d11 take one polynomial each; the
         * remaining room (at least two polynomials, per the contract)
         * is passed down as scratch space.
         */
        d00 = tmp;
        d11 = tmp + n;
        tmp += n << 1;

        /* d00 = g00; LDL writes l10 into the tree root and d11 here. */
        memcpy(d00, g00, n * sizeof *g00);
        LDL_fft(d11, tree, g00, g01, g11, logn, tmp);

        /*
         * Split d00 into tmp, then d11 into the d00 buffer, then move
         * the d00 halves from tmp into the d11 buffer: after this,
         * the d11 buffer holds the two halves of d00, and the d00
         * buffer holds the two halves of d11, each pair contiguous.
         */
        falcon_poly_split_fft(tmp, tmp + hn, d00, logn);
        falcon_poly_split_fft(d00, d00 + hn, d11, logn);
        memcpy(d11, tmp, n * sizeof *tmp);
        ffLDL_fft_inner(tree + n,
                d11, d11 + hn, logn - 1, tmp);
        ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
                d00, d00 + hn, logn - 1, tmp);
}
  256 
  257 /*
  258  * Normalize an ffLDL tree: each leaf of value x is replaced with
  259  * sigma / sqrt(x).
  260  */
  261 static void
  262 ffLDL_binary_normalize(fpr *tree, fpr sigma, unsigned logn)
  263 {
  264         /*
  265          * TODO: make an iterative version.
  266          */
  267         size_t n;
  268 
  269         n = MKN(logn, 0);
  270         if (n == 1) {
  271                 tree[0] = fpr_div(sigma, fpr_sqrt(tree[0]));
  272         } else {
  273                 ffLDL_binary_normalize(tree + n,
  274                         sigma, logn - 1);
  275                 ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
  276                         sigma, logn - 1);
  277         }
  278 }
  279 
  280 /* =================================================================== */
  281 /*
  282  * Ternary case:
  283  *   N = 1.5*2^logn
  284  *   phi = X^N-X^(N/2)+1
  285  *
  286  * When doing operations along the splitting tree, the "top" operation
  287  * divides the degree by 3, while "deep" operations halve the degree.
  288  *
  289  * At the top-level, we perform a trisection:
  290  *
  291  *  - Input 2x2 Gram matrix is decomposed into its LDL representation:
  292  *    G = L·D·L*, where D is diagonal and L is low-triangular with
  293  *    only ones on its diagonal. Thus, there is one non-trivial element
  294  *    in L, and two non-trivial elements in D.
  295  *
  296  *  - Elements of D are each split into three elements of degree N/3.
  297  *
  298  *  - Two recursive invocations on 3x3 Gram matrices are performed.
  299  *
  300  * At the level immediately below:
  301  *
  302  *  - Input 3x3 Gram matrix is decomposed into LDL. We get three non-trivial
  303  *    elements in D (the diagonal), and three others in L (the lower
  304  *    triangle, excluding the diagonal). From these, we performs splits
  305  *    (that halve the degree) and build three 2x2 matrices for the recursive
  306  *    invocation.
  307  *
  308  * Lower levels receive a 2x2 Gram matrix, and perform 2-way splits.
  309  *
  310  * At the lowest level, polynomials are modulo X^2-X+1.
  311  */
  312 
  313 /*
  314  * Perform LDL decomposition of a 2x2 Gram matrix.
  315  *
  316  * Input: matrix G = [[g00, g01], [g10, g11]] such that G* = G
  317  * (hence: adj(g00) = g00, adj(g11) = g11, adj(g01) = g10).
  318  *
  319  * Output: L = [[1, 0], [l10, 1]] and D = [[d00, 0], [0, d11]] such
  320  * that G = L·D·L*.
  321  *
  322  * We have d00 = g00, thus that value is omitted from the outputs.
  323  *
  324  * All inputs and outputs are in FFT3 representation.
  325  * Overlap allowed only between the constant inputs (g00, g10, g11).
  326  */
static void
LDL_dim2_fft3(fpr *restrict d11, fpr *restrict l10,
        const fpr *restrict g00, const fpr *restrict g10,
        const fpr *restrict g11, unsigned logn, unsigned full)
{
        size_t n;

        n = MKN(logn, full);

        /*
         * Set l10 = g10/g00.
         * Since g00 = adj(g00), FFT representation of g00 contains only
         * real numbers.
         */
        memcpy(l10, g10, n * sizeof *g10);
        falcon_poly_div_autoadj_fft3(l10, g00, logn, full);

        /*
         * Set d11 = g11 - g10*adj(g10/g00),
         * computed as d11 = -(g10*adj(l10)) + g11.
         * TODO: g11 = adj(g11), so it should be half-sized (only real
         * numbers in FFT representation).
         */
        memcpy(d11, g10, n * sizeof *g10);
        falcon_poly_muladj_fft3(d11, l10, logn, full);
        falcon_poly_neg_fft3(d11, logn, full);
        falcon_poly_add_fft3(d11, g11, logn, full);
}
  354 
  355 /*
  356  * Perform LDL decomposition of a Gram 3x3 matrix.
  357  *
  358  * Input: matrix G = [[g00, g01, g02], [g10, g11, g12], [g20, g21, g22]]
  359  * such that G* = G.
  360  *
  361  * Output: L = [[1, 0, 0], [l10, 1, 0], [l20, l21, 1]], and
  362  * D = [[d00, 0, 0], [0, d11, 0], [0, 0, d22]] such that G = L·D·L*.
  363  *
  364  * We have d00 = g00, thus that value is omitted from the outputs.
  365  *
  366  * All inputs and outputs are in FFT3 representation.
  367  * Overlap allowed only between the constant inputs (g00, g10, g11).
  368  * tmp[] must have room for one polynomial.
  369  */
static void
LDL_dim3_fft3(fpr *restrict d11, fpr *restrict d22,
        fpr *restrict l10, fpr *restrict l20, fpr *restrict l21,
        const fpr *restrict g00, const fpr *restrict g10,
        const fpr *restrict g11, const fpr *restrict g20,
        const fpr *restrict g21, const fpr *restrict g22,
        unsigned logn, unsigned full, fpr *restrict tmp)
{
        size_t n;

        n = MKN(logn, full);

        /*
         * l10 = g10/g00
         * d11 = g11 - g10*adj(g10/g00)
         * These are the same formulas as the LDL decomposition of a 2x2
         * matrix.
         */
        LDL_dim2_fft3(d11, l10, g00, g10, g11, logn, full);

        /*
         * l20 = g20/g00
         */
        memcpy(l20, g20, n * sizeof *g20);
        falcon_poly_div_autoadj_fft3(l20, g00, logn, full);

        /*
         * l21 = (g21 - g20*adj(g10)/g00) / d11
         * Note that d11 = adj(d11)
         */
        memcpy(l21, g20, n * sizeof *g20);
        falcon_poly_muladj_fft3(l21, g10, logn, full);
        falcon_poly_div_autoadj_fft3(l21, g00, logn, full);
        falcon_poly_neg_fft3(l21, logn, full);
        falcon_poly_add_fft3(l21, g21, logn, full);
        falcon_poly_div_autoadj_fft3(l21, d11, logn, full);

        /*
         * d22 = g22 - l20*adj(g20) - l21*adj(l21)*d11
         * (The last term is |l21|^2 * d11; the code below multiplies
         * the auto-adjoint tmp = l21*adj(l21) by d11 before the
         * subtraction.)
         */
        memcpy(d22, l20, n * sizeof *l20);
        falcon_poly_muladj_fft3(d22, g20, logn, full);
        falcon_poly_neg_fft3(d22, logn, full);
        falcon_poly_add_fft3(d22, g22, logn, full);
        memcpy(tmp, l21, n * sizeof *l21);
        falcon_poly_mulselfadj_fft3(tmp, logn, full);
        falcon_poly_mul_autoadj_fft3(tmp, d11, logn, full);
        falcon_poly_sub_fft3(d22, tmp, logn, full);
}
  419 
static size_t
ffLDL_inner_fft3(fpr *restrict tree, const fpr *restrict g00,
        const fpr *restrict g10, const fpr *restrict g11,
        unsigned logn, fpr *restrict tmp)
{
        size_t n, hn, s;
        fpr *t0, *t1, *t2;

        n = (size_t)1 << logn;
        hn = n >> 1;

        if (logn == 1) {
                /*
                 * When N = 2, diagonal elements (of D in the LDL
                 * decomposition) are real numbers (since they are
                 * auto-adjoint), and there is no need for further
                 * recursion.
                 *
                 * LDL_dim2_fft3() returns d11 (in tmp) and l10
                 * (two slots, in tree). It will be followed by two
                 * leaves, corresponding to d00 (which is g00) and d11.
                 * The two leaves are real numbers (one slot each).
                 */
                LDL_dim2_fft3(tmp, tree, g00, g10, g11, logn, 0);

                tree[2] = g00[0];
                tree[3] = tmp[0];
                return 4;
        }

        /*
         * Do the LDL, split diagonal elements, and recurse.
         * Since d00 = g00, we can do the first recursion
         * before the LDL.
         */
        s = n;          /* this node's l10 uses n slots */
        t0 = tmp;
        t1 = tmp + hn;
        t2 = t1 + hn;

        /*
         * Split d00 (= g00); the halves (t0, t1) become the next
         * quasicyclic matrix, with t1 adjointed to serve as its g10.
         */
        falcon_poly_split_deep_fft3(t0, t1, g00, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        s += ffLDL_inner_fft3(tree + s, t0, t1, t0, logn - 1, t2);

        /* LDL of this node: l10 into the tree, d11 into t2. */
        LDL_dim2_fft3(t2, tree, g00, g10, g11, logn, 0);

        /* Split d11 and recurse into the second child subtree. */
        falcon_poly_split_deep_fft3(t0, t1, t2, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        s += ffLDL_inner_fft3(tree + s, t0, t1, t0, logn - 1, t2);

        /* Return the total number of tree slots written. */
        return s;
}
  472 
static size_t
ffLDL_depth1_fft3(fpr *restrict tree, const fpr *restrict g00,
        const fpr *restrict g10, const fpr *restrict g11,
        const fpr *restrict g20, const fpr *restrict g21,
        const fpr *restrict g22, unsigned logn, fpr *restrict tmp)
{
        /*
         * At depth 1, we perform a bisection on the elements of the
         * input 3x3 matrix.
         */

        size_t n, hn, s;
        fpr *l10, *l20, *l21, *d11, *d22;
        fpr *t0, *t1, *t2;

        n = (size_t)1 << logn;
        hn = n >> 1;
        /* The node stores three polynomials: l10, l20, l21 (n slots each). */
        l10 = tree;
        l20 = l10 + n;
        l21 = l20 + n;
        /* tmp[] layout: d11, d22, then split halves t0/t1 and scratch t2. */
        d11 = tmp;
        d22 = d11 + n;
        t0 = d22 + n;
        t1 = t0 + hn;
        t2 = t1 + hn;
        s = 3 * n;

        /*
         * LDL decomposition.
         */
        LDL_dim3_fft3(d11, d22, l10, l20, l21,
                g00, g10, g11, g20, g21, g22, logn, 0, t2);

        /*
         * Splits and recursive invocations, one per diagonal element
         * (d00 = g00, then d11, then d22).
         *
         * TODO: for N = 6, this would need special code. Right now,
         * we simply refuse to handle it, because N = 6 is way too weak
         * to have any value anyway.
         */
        falcon_poly_split_deep_fft3(t0, t1, g00, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        s += ffLDL_inner_fft3(tree + s, t0, t1, t0, logn - 1, t2);

        falcon_poly_split_deep_fft3(t0, t1, d11, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        s += ffLDL_inner_fft3(tree + s, t0, t1, t0, logn - 1, t2);

        falcon_poly_split_deep_fft3(t0, t1, d22, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        s += ffLDL_inner_fft3(tree + s, t0, t1, t0, logn - 1, t2);

        return s;
}
  527 
static size_t
ffLDL_fft3(fpr *restrict tree, const fpr *restrict g00,
        const fpr *restrict g10, const fpr *restrict g11,
        unsigned logn, fpr *restrict tmp)
{
        size_t n, tn, s;
        fpr *l10, *d11, *t0, *t1, *t2, *t3;

        /* Ternary case: full degree N = 1.5*2^logn; each trisection
           third has tn = 2^(logn-1) slots. */
        n = (size_t)3 << (logn - 1);
        tn = (size_t)1 << (logn - 1);
        l10 = tree;
        s = n;
        t0 = tmp;
        t1 = t0 + tn;
        t2 = t1 + tn;
        t3 = t2 + tn;

        /*
         * Formally, we perform the LDL decomposition, _then_ do
         * the recursion on split versions of the diagonal elements.
         * However, d00 = g00, so we can perform the first recursion
         * _before_ computing the LDL; this saves RAM.
         */

        /*
         * Trisection of d00 for first recursion. The three thirds
         * (t0, t1, t2) form the 3x3 Gram matrix of the child, with
         * t1 and t2 adjointed to provide the lower-triangle elements.
         */
        falcon_poly_split_top_fft3(t0, t1, t2, g00, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        falcon_poly_adj_fft3(t2, logn - 1, 0);
        s += ffLDL_depth1_fft3(tree + s, t0, t1, t0, t2, t1, t0, logn - 1, t3);

        /*
         * Compute LDL decomposition of the top matrix.
         * d11 reuses the t3 scratch area; this is fine because t3 is
         * only needed again after d11 has been split below.
         */
        d11 = t3;
        LDL_dim2_fft3(d11, l10, g00, g10, g11, logn, 1);

        /*
         * Trisection of d11 for second recursion.
         */
        falcon_poly_split_top_fft3(t0, t1, t2, d11, logn);
        falcon_poly_adj_fft3(t1, logn - 1, 0);
        falcon_poly_adj_fft3(t2, logn - 1, 0);
        s += ffLDL_depth1_fft3(tree + s, t0, t1, t0, t2, t1, t0, logn - 1, t3);

        return s;
}
  576 
/*
 * Get the size of the LDL tree for the ternary case, i.e. for an
 * input with polynomials of N = 1.5*2^logn coefficients. The size
 * is expressed in the number of elements.
 */
  581 static inline unsigned
  582 ffLDL_ternary_treesize(unsigned logn)
  583 {
  584         return 3 * ((logn + 2) << (logn - 1));
  585 }
  586 
static size_t
ffLDL_ternary_normalize_inner(fpr *tree, fpr sigma, unsigned logn)
{
        size_t s;

        if (logn == 1) {
                /*
                 * At logn = 1, tree consists in three polynomials,
                 * one parent node and two leaves. We normalize the
                 * leaves: each leaf value x becomes sigma / sqrt(x).
                 */
                tree[2] = fpr_div(sigma, fpr_sqrt(tree[2]));
                tree[3] = fpr_div(sigma, fpr_sqrt(tree[3]));
                return 4;
        }

        /*
         * Skip this node's polynomial (2^logn slots), then walk the
         * two child subtrees; s accumulates the total size consumed.
         */
        s = (size_t)1 << logn;
        s += ffLDL_ternary_normalize_inner(tree + s, sigma, logn - 1);
        s += ffLDL_ternary_normalize_inner(tree + s, sigma, logn - 1);
        return s;
}
  608 
  609 static size_t
  610 ffLDL_ternary_normalize_depth1(fpr *tree, fpr sigma, unsigned logn)
  611 {
  612         size_t s;
  613 
  614         s = (size_t)3 << logn;
  615         s += ffLDL_ternary_normalize_inner(tree + s, sigma, logn - 1);
  616         s += ffLDL_ternary_normalize_inner(tree + s, sigma, logn - 1);
  617         s += ffLDL_ternary_normalize_inner(tree + s, sigma, logn - 1);
  618         return s;
  619 }
  620 
static size_t
ffLDL_ternary_normalize(fpr *tree, fpr sigma, unsigned logn)
{
        size_t s;

        /*
         * The top node stores one full-degree polynomial
         * (3*2^(logn-1) slots), followed by two depth-1 subtrees.
         * Returns the total number of slots walked.
         */
        s = (size_t)3 << (logn - 1);
        s += ffLDL_ternary_normalize_depth1(tree + s, sigma, logn - 1);
        s += ffLDL_ternary_normalize_depth1(tree + s, sigma, logn - 1);
        return s;
}
  631 
  632 /* =================================================================== */
  633 
  634 /*
  635  * Convert an integer polynomial (with small values) into the
  636  * representation with complex numbers.
  637  */
  638 static void
  639 smallints_to_fpr(fpr *r, const int16_t *t, unsigned logn, unsigned ter)
  640 {
  641         size_t n, u;
  642 
  643         n = MKN(logn, ter);
  644         for (u = 0; u < n; u ++) {
  645                 r[u] = fpr_of(t[u]);
  646         }
  647 }
  648 
  649 /*
  650  * The expanded private key contains:
  651  *  - The B0 matrix (four elements)
  652  *  - The ffLDL tree
  653  */
  654 
static inline size_t
skoff_b00(unsigned logn, unsigned ter)
{
        /* b00 is stored first in the expanded key: offset 0. */
        (void)logn;
        (void)ter;
        return 0;
}
  662 
  663 static inline size_t
  664 skoff_b01(unsigned logn, unsigned ter)
  665 {
  666         return MKN(logn, ter);
  667 }
  668 
  669 static inline size_t
  670 skoff_b10(unsigned logn, unsigned ter)
  671 {
  672         return 2 * MKN(logn, ter);
  673 }
  674 
  675 static inline size_t
  676 skoff_b11(unsigned logn, unsigned ter)
  677 {
  678         return 3 * MKN(logn, ter);
  679 }
  680 
  681 static inline size_t
  682 skoff_tree(unsigned logn, unsigned ter)
  683 {
  684         return 4 * MKN(logn, ter);
  685 }
  686 
  687 /*
  688  * Load a private key and perform precomputations for signing.
  689  *
  690  * Number of elements in sk[]:
  691  *
  692  *  - binary case: (logn+5) * 2^logn
  693  *
  694  *  - ternary case: 3*(logn+6) * 2^(logn-1)
  695  *
  696  * tmp[] must have room for at least seven polynomials.
  697  */
static void
load_skey(fpr *restrict sk, unsigned q,
        const int16_t *f_src, const int16_t *g_src,
        const int16_t *F_src, const int16_t *G_src,
        unsigned logn, unsigned ter, fpr *restrict tmp)
{
        size_t n;
        fpr *f, *g, *F, *G;
        fpr *b00, *b01, *b10, *b11;
        fpr *tree;
        fpr sigma;

        /* Locate the components inside the expanded key sk[]. */
        n = MKN(logn, ter);
        b00 = sk + skoff_b00(logn, ter);
        b01 = sk + skoff_b01(logn, ter);
        b10 = sk + skoff_b10(logn, ter);
        b11 = sk + skoff_b11(logn, ter);
        tree = sk + skoff_tree(logn, ter);

        /*
         * We load the private key elements directly into the B0 matrix,
         * since B0 = [[g, -f], [G, -F]].
         */
        f = b01;
        g = b00;
        F = b11;
        G = b10;

        smallints_to_fpr(f, f_src, logn, ter);
        smallints_to_fpr(g, g_src, logn, ter);
        smallints_to_fpr(F, F_src, logn, ter);
        smallints_to_fpr(G, G_src, logn, ter);

        /*
         * Compute the FFT for the key elements, and negate f and F
         * (in-place, so b01 = -f and b11 = -F afterwards).
         */
        if (ter) {
                falcon_FFT3(f, logn, 1);
                falcon_FFT3(g, logn, 1);
                falcon_FFT3(F, logn, 1);
                falcon_FFT3(G, logn, 1);
                falcon_poly_neg_fft3(f, logn, 1);
                falcon_poly_neg_fft3(F, logn, 1);
        } else {
                falcon_FFT(f, logn);
                falcon_FFT(g, logn);
                falcon_FFT(F, logn);
                falcon_FFT(G, logn);
                falcon_poly_neg_fft(f, logn);
                falcon_poly_neg_fft(F, logn);
        }

        /*
         * The Gram matrix is G = B·B*. Formulas are:
         *   g00 = b00*adj(b00) + b01*adj(b01)
         *   g01 = b00*adj(b10) + b01*adj(b11)
         *   g10 = b10*adj(b00) + b11*adj(b01)
         *   g11 = b10*adj(b10) + b11*adj(b11)
         */
        if (ter) {
                /*
                 * Ternary case works on the lower triangle
                 * (g00, g10, g11); gxx is a scratch polynomial.
                 */
                fpr *g00, *g10, *g11, *gxx;

                g00 = tmp;
                g10 = g00 + n;
                g11 = g10 + n;
                gxx = g11 + n;

                memcpy(g00, b00, n * sizeof *b00);
                falcon_poly_mulselfadj_fft3(g00, logn, 1);
                memcpy(gxx, b01, n * sizeof *b01);
                falcon_poly_mulselfadj_fft3(gxx, logn, 1);
                falcon_poly_add_fft3(g00, gxx, logn, 1);

                memcpy(g10, b10, n * sizeof *b10);
                falcon_poly_muladj_fft3(g10, b00, logn, 1);
                memcpy(gxx, b11, n * sizeof *b11);
                falcon_poly_muladj_fft3(gxx, b01, logn, 1);
                falcon_poly_add_fft3(g10, gxx, logn, 1);

                memcpy(g11, b10, n * sizeof *b10);
                falcon_poly_mulselfadj_fft3(g11, logn, 1);
                memcpy(gxx, b11, n * sizeof *b11);
                falcon_poly_mulselfadj_fft3(gxx, logn, 1);
                falcon_poly_add_fft3(g11, gxx, logn, 1);

                /*
                 * Compute the Falcon tree.
                 */
                ffLDL_fft3(tree, g00, g10, g11, logn, gxx);

                /*
                 * Tree normalization:
                 *   sigma = 1.32 * sqrt(q/sqrt(2))
                 */
                sigma = fpr_mul(
                        fpr_sqrt(fpr_mul(fpr_of(q), fpr_sqrt(fpr_of(2)))),
                        fpr_div(fpr_of(132), fpr_of(100)));

                /*
                 * Normalize tree with sigma.
                 */
                ffLDL_ternary_normalize(tree, sigma, logn);
        } else {
                /*
                 * For historical reasons, this implementation uses
                 * g00, g01 and g11 (upper triangle).
                 */
                fpr *g00, *g01, *g11, *gxx;

                g00 = tmp;
                g01 = g00 + n;
                g11 = g01 + n;
                gxx = g11 + n;

                memcpy(g00, b00, n * sizeof *b00);
                falcon_poly_mulselfadj_fft(g00, logn);
                memcpy(gxx, b01, n * sizeof *b01);
                falcon_poly_mulselfadj_fft(gxx, logn);
                falcon_poly_add_fft(g00, gxx, logn);

                memcpy(g01, b00, n * sizeof *b00);
                falcon_poly_muladj_fft(g01, b10, logn);
                memcpy(gxx, b01, n * sizeof *b01);
                falcon_poly_muladj_fft(gxx, b11, logn);
                falcon_poly_add_fft(g01, gxx, logn);

                memcpy(g11, b10, n * sizeof *b10);
                falcon_poly_mulselfadj_fft(g11, logn);
                memcpy(gxx, b11, n * sizeof *b11);
                falcon_poly_mulselfadj_fft(gxx, logn);
                falcon_poly_add_fft(g11, gxx, logn);

                /*
                 * Compute the Falcon tree.
                 */
                ffLDL_fft(tree, g00, g01, g11, logn, gxx);

                /*
                 * Tree normalization:
                 *   sigma = 1.55 * sqrt(q)
                 */
                sigma = fpr_mul(fpr_sqrt(fpr_of(q)),
                        fpr_div(fpr_of(155), fpr_of(100)));

                /*
                 * Normalize tree with sigma.
                 */
                ffLDL_binary_normalize(tree, sigma, logn);
        }
}
  848 
/*
 * Type of an integer Gaussian sampler function: using the opaque
 * context ctx, it returns an integer sampled around center mu with
 * standard deviation sigma (exact semantics are defined by the
 * provided implementation).
 */
typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
  850 
  851 /*
  852  * Perform Fast Fourier Sampling for target vector t and LDL tree T.
  853  * tmp[] must have size for at least two polynomials of size 2^logn.
  854  */
  855 static void
  856 ffSampling_fft(samplerZ samp, void *samp_ctx,
  857         fpr *restrict z0, fpr *restrict z1,
  858         const fpr *restrict tree,
  859         const fpr *restrict t0, const fpr *restrict t1, unsigned logn,
  860         fpr *restrict tmp)
  861 {
  862         size_t n, hn;
  863         const fpr *tree0, *tree1;
  864 
  865         n = (size_t)1 << logn;
  866         if (n == 1) {
  867                 fpr x0, x1, sigma;
  868 
  869                 x0 = t0[0];
  870                 x1 = t1[0];
  871                 sigma = tree[0];
  872                 z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
  873                 z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
  874                 return;
  875         }
  876 
  877         hn = n >> 1;
  878         tree0 = tree + n;
  879         tree1 = tree + n + ffLDL_treesize(logn - 1);
  880 
  881         /*
  882          * We split t1 into z1 (reused as temporary storage), then do
  883          * the recursive invocation, with output in tmp. We finally
  884          * merge back into z1.
  885          */
  886         falcon_poly_split_fft(z1, z1 + hn, t1, logn);
  887         ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
  888                 tree1, z1, z1 + hn, logn - 1, tmp + n);
  889         falcon_poly_merge_fft(z1, tmp, tmp + hn, logn);
  890 
  891         /*
  892          * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
  893          */
  894         memcpy(tmp, t1, n * sizeof *t1);
  895         falcon_poly_sub_fft(tmp, z1, logn);
  896         falcon_poly_mul_fft(tmp, tree, logn);
  897         falcon_poly_add_fft(tmp, t0, logn);
  898 
  899         /*
  900          * Second recursive invocation.
  901          */
  902         falcon_poly_split_fft(z0, z0 + hn, tmp, logn);
  903         ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
  904                 tree0, z0, z0 + hn, logn - 1, tmp + n);
  905         falcon_poly_merge_fft(z0, tmp, tmp + hn, logn);
  906 }
  907 
  908 static void
  909 ffSampling_inner_fft3(samplerZ samp, void *samp_ctx,
  910         fpr *restrict z0, fpr *restrict z1,
  911         const fpr *restrict tree,
  912         const fpr *restrict t0, const fpr *restrict t1,
  913         unsigned logn, fpr *restrict tmp)
  914 {
  915         size_t n, hn;
  916         fpr *x0, *x1, *y0, *y1;
  917         const fpr *tree0, *tree1;
  918 
  919         /*
  920          * For tree construction, recursion stopped at n = 2, but it
  921          * produced a tree which also covers n = 1. For sampling, we use
  922          * the fact that the split() and merge() function
  923          * implementations actually supports logn = 1.
  924          */
  925         if (logn == 0) {
  926                 fpr r0, r1, rx;
  927                 fpr sigma;
  928 
  929                 sigma = tree[0];
  930                 r1 = *t1;
  931                 r0 = *t0;
  932                 r1 = fpr_sub(r1, fpr_of(
  933                         samp(samp_ctx, r1, fpr_mul(fpr_IW1I, sigma))));
  934                 rx = fpr_half(r1);
  935                 r0 = fpr_add(r0, rx);
  936                 r0 = fpr_sub(r0, fpr_of(
  937                         samp(samp_ctx, r0, sigma)));
  938                 r0 = fpr_sub(r0, rx);
  939                 *z0 = r0;
  940                 *z1 = r1;
  941                 return;
  942         }
  943 
  944         n = (size_t)1 << logn;
  945         hn = n >> 1;
  946         y0 = tmp;
  947         y1 = y0 + hn;
  948 
  949         tree0 = tree + n;
  950         tree1 = tree + n + (logn << (logn - 1));
  951 
  952         /*
  953          * Split t1, recurse, merge into z1.
  954          */
  955         x0 = z1;
  956         x1 = x0 + hn;
  957         falcon_poly_split_deep_fft3(x0, x1, t1, logn);
  958         ffSampling_inner_fft3(samp, samp_ctx,
  959                 y0, y1, tree1, x0, x1, logn - 1, tmp + n);
  960         falcon_poly_merge_deep_fft3(z1, y0, y1, logn);
  961 
  962         /*
  963          * Compute t0b = t0 + z1 * l10 (into tmp[]).
  964          * FIXME: save z1 * l10 instead of recomputing it later on.
  965          */
  966         memcpy(tmp, z1, n * sizeof *t1);
  967         falcon_poly_mul_fft3(tmp, tree, logn, 0);
  968         falcon_poly_add_fft3(tmp, t0, logn, 0);
  969 
  970         /*
  971          * Split t0b, recurse, merge into z0.
  972          */
  973         x0 = z0;
  974         x1 = x0 + hn;
  975         falcon_poly_split_deep_fft3(x0, x1, tmp, logn);
  976         ffSampling_inner_fft3(samp, samp_ctx,
  977                 y0, y1, tree0, x0, x1, logn - 1, tmp + n);
  978         falcon_poly_merge_deep_fft3(z0, y0, y1, logn);
  979 
  980         /*
  981          * Subtract z1 * l10 from z0.
  982          */
  983         memcpy(tmp, z1, n * sizeof *z1);
  984         falcon_poly_mul_fft3(tmp, tree, logn, 0);
  985         falcon_poly_sub_fft3(z0, tmp, logn, 0);
  986 }
  987 
  988 static void
  989 ffSampling_depth1_fft3(samplerZ samp, void *samp_ctx,
  990         fpr *restrict z0, fpr *restrict z1, fpr *restrict z2,
  991         const fpr *restrict tree,
  992         const fpr *restrict t0, const fpr *restrict t1, const fpr *restrict t2,
  993         unsigned logn, fpr *restrict tmp)
  994 {
  995         size_t n, hn;
  996         fpr *x0, *x1, *y0, *y1;
  997         const fpr *tree0, *tree1, *tree2;
  998 
  999         n = (size_t)1 << logn;
 1000         hn = n >> 1;
 1001 
 1002         tree0 = tree + 3 * n;
 1003         tree1 = tree0 + (logn << (logn - 1));
 1004         tree2 = tree1 + (logn << (logn - 1));
 1005         y0 = tmp;
 1006         y1 = y0 + hn;
 1007 
 1008         /*
 1009          * Split t2, recurse, merge into z2.
 1010          */
 1011         x0 = z2;
 1012         x1 = x0 + hn;
 1013         falcon_poly_split_deep_fft3(x0, x1, t2, logn);
 1014         ffSampling_inner_fft3(samp, samp_ctx,
 1015                 y0, y1, tree2, x0, x1, logn - 1, tmp + n);
 1016         falcon_poly_merge_deep_fft3(z2, y0, y1, logn);
 1017 
 1018         /*
 1019          * Compute t1b = t1 + z2 * l21 (into tmp[]).
 1020          * FIXME: save z2 * l21 instead of recomputing it later on.
 1021          */
 1022         memcpy(tmp, z2, n * sizeof *z2);
 1023         falcon_poly_mul_fft3(tmp, tree + 2 * n, logn, 0);
 1024         falcon_poly_add_fft3(tmp, t1, logn, 0);
 1025 
 1026         /*
 1027          * Split t1b, recurse, merge into z1, and subtract z2 * l21.
 1028          */
 1029         x0 = z1;
 1030         x1 = x0 + hn;
 1031         falcon_poly_split_deep_fft3(x0, x1, tmp, logn);
 1032         ffSampling_inner_fft3(samp, samp_ctx,
 1033                 y0, y1, tree1, x0, x1, logn - 1, tmp + n);
 1034         falcon_poly_merge_deep_fft3(z1, y0, y1, logn);
 1035         memcpy(tmp, z2, n * sizeof *z2);
 1036         falcon_poly_mul_fft3(tmp, tree + 2 * n, logn, 0);
 1037         falcon_poly_sub_fft3(z1, tmp, logn, 0);
 1038 
 1039         /*
 1040          * Compute t0b = t0 + z1 * l10 + z2 * l20 (into z0).
 1041          * We use z0 as extra temporary.
 1042          * FIXME: save z1 * l10 + z2 * l20 instead of recomputing it later on.
 1043          */
 1044         memcpy(z0, t0, n * sizeof *t0);
 1045         memcpy(tmp, z1, n * sizeof *z1);
 1046         falcon_poly_mul_fft3(tmp, tree, logn, 0);
 1047         falcon_poly_add_fft3(z0, tmp, logn, 0);
 1048         memcpy(tmp, z2, n * sizeof *z1);
 1049         falcon_poly_mul_fft3(tmp, tree + n, logn, 0);
 1050         falcon_poly_add_fft3(z0, tmp, logn, 0);
 1051 
 1052         /*
 1053          * Split t0b, recurse, merge into z0.
 1054          */
 1055         x0 = z0;
 1056         x1 = x0 + hn;
 1057         memcpy(tmp, z0, n * sizeof *z0);
 1058         falcon_poly_split_deep_fft3(x0, x1, tmp, logn);
 1059         ffSampling_inner_fft3(samp, samp_ctx,
 1060                 y0, y1, tree0, x0, x1, logn - 1, tmp + n);
 1061         falcon_poly_merge_deep_fft3(z0, y0, y1, logn);
 1062 
 1063         /*
 1064          * Subtract z1 * l10 and z2 * l20 from z0.
 1065          */
 1066         memcpy(tmp, z1, n * sizeof *z1);
 1067         falcon_poly_mul_fft3(tmp, tree, logn, 0);
 1068         falcon_poly_sub_fft3(z0, tmp, logn, 0);
 1069         memcpy(tmp, z2, n * sizeof *z1);
 1070         falcon_poly_mul_fft3(tmp, tree + n, logn, 0);
 1071         falcon_poly_sub_fft3(z0, tmp, logn, 0);
 1072 }
 1073 
/*
 * Perform Fast Fourier Sampling, ternary case, top level. At this
 * level the target splits three ways, hence the three-way split/merge
 * and the recursion into ffSampling_depth1_fft3(). The sampled vector
 * is written into (z0,z1); inputs and outputs are in FFT
 * representation.
 *
 *   samp, samp_ctx   integer Gaussian sampler and its opaque context
 *   z0, z1           receive the sampled vector
 *   tree             LDL tree: n values (L), then the two subtrees
 *   t0, t1           target vector
 *   logn             degree parameter; n = 3*2^(logn-1) coefficients
 *   tmp              scratch buffer
 */
static void
ffSampling_fft3(samplerZ samp, void *samp_ctx,
	fpr *restrict z0, fpr *restrict z1,
	const fpr *restrict tree,
	const fpr *restrict t0, const fpr *restrict t1, unsigned logn,
	fpr *restrict tmp)
{
	size_t n, tn;
	fpr *x0, *x1, *x2, *y0, *y1, *y2;
	const fpr *tree0, *tree1;

	n = (size_t)3 << (logn - 1);
	tn = (size_t)1 << (logn - 1);

	/*
	 * The two subtrees follow the n values of this node; tree1 is
	 * offset from tree0 by the size of a depth-1 subtree.
	 * NOTE(review): offset formula assumed consistent with the
	 * tree builder's layout — confirm against ffLDL construction.
	 */
	tree0 = tree + n;
	tree1 = tree0 + 3 * ((logn + 1) << (logn - 2));
	y0 = tmp;
	y1 = y0 + tn;
	y2 = y1 + tn;

	/*
	 * We split t1 three ways for recursive invocation. We use
	 * z0 and z1 as temporary storage areas; final merge yields z1.
	 */
	x0 = z1;
	x1 = x0 + tn;
	x2 = x1 + tn;
	falcon_poly_split_top_fft3(x0, x1, x2, t1, logn);
	ffSampling_depth1_fft3(samp, samp_ctx,
		y0, y1, y2, tree1, x0, x1, x2, logn - 1, tmp + n);
	falcon_poly_merge_top_fft3(z1, y0, y1, y2, logn);

	/*
	 * Compute t0b = t0 + z1 * L (in tmp[]).
	 * FIXME: save z1 * L instead of recomputing it later on.
	 */
	memcpy(tmp, z1, n * sizeof *z1);
	falcon_poly_mul_fft3(tmp, tree, logn, 1);
	falcon_poly_add_fft3(tmp, t0, logn, 1);

	/*
	 * Split t0b, recurse, and merge into z0.
	 */
	x0 = z0;
	x1 = x0 + tn;
	x2 = x1 + tn;
	falcon_poly_split_top_fft3(x0, x1, x2, tmp, logn);
	ffSampling_depth1_fft3(samp, samp_ctx,
		y0, y1, y2, tree0, x0, x1, x2, logn - 1, tmp + n);
	falcon_poly_merge_top_fft3(z0, y0, y1, y2, logn);

	/*
	 * Subtract z1 * L from z0.
	 */
	memcpy(tmp, z1, n * sizeof *z1);
	falcon_poly_mul_fft3(tmp, tree, logn, 1);
	falcon_poly_sub_fft3(z0, tmp, logn, 1);
}
 1132 
 1133 /*
 1134  * Compute a signature: the signature contains two vectors, s1 and s2;
 1135  * the caller must still check that they comply with the signature bound,
 1136  * and try again if that is not the case.
 1137  *
 1138  * tmp[] must have room for at least six polynomials.
 1139  */
 1140 static void
 1141 do_sign(samplerZ samp, void *samp_ctx,
 1142         int16_t *restrict s1, int16_t *restrict s2,
 1143         unsigned q, const fpr *restrict sk, const uint16_t *restrict hm,
 1144         unsigned logn, unsigned ter, fpr *restrict tmp)
 1145 {
 1146         size_t n, u;
 1147         fpr *t0, *t1, *tx, *ty, *tz;
 1148         const fpr *b00, *b01, *b10, *b11, *tree;
 1149         fpr ni;
 1150 
 1151         n = MKN(logn, ter);
 1152         t0 = tmp;
 1153         t1 = t0 + n;
 1154         tx = t1 + n;
 1155         ty = tx + n;
 1156         tz = ty + n;
 1157         b00 = sk + skoff_b00(logn, ter);
 1158         b01 = sk + skoff_b01(logn, ter);
 1159         b10 = sk + skoff_b10(logn, ter);
 1160         b11 = sk + skoff_b11(logn, ter);
 1161         tree = sk + skoff_tree(logn, ter);
 1162 
 1163         /*
 1164          * Set the target vector to [hm, 0] (hm is the hashed message).
 1165          */
 1166         for (u = 0; u < n; u ++) {
 1167                 t0[u] = fpr_of(hm[u]);
 1168                 /* This is implicit.
 1169                 t1[u] = fpr_of(0);
 1170                 */
 1171         }
 1172 
 1173         if (ter) {
 1174                 /*
 1175                  * Apply the lattice basis to obtain the real target
 1176                  * vector (after normalization with regards to modulus).
 1177                  */
 1178                 falcon_FFT3(t0, logn, 1);
 1179                 ni = fpr_inverse_of(q);
 1180                 memcpy(t1, t0, n * sizeof *t0);
 1181                 falcon_poly_mul_fft3(t1, b01, logn, 1);
 1182                 falcon_poly_mulconst_fft3(t1, fpr_neg(ni), logn, 1);
 1183                 falcon_poly_mul_fft3(t0, b11, logn, 1);
 1184                 falcon_poly_mulconst_fft3(t0, ni, logn, 1);
 1185 
 1186                 /*
 1187                  * Apply sampling. Output is written back in [tx, ty].
 1188                  */
 1189                 ffSampling_fft3(samp, samp_ctx, tx, ty, tree, t0, t1, logn, tz);
 1190 
 1191                 /*
 1192                  * Get the lattice point corresponding to that tiny vector.
 1193                  */
 1194                 memcpy(t0, tx, n * sizeof *tx);
 1195                 memcpy(t1, ty, n * sizeof *ty);
 1196                 falcon_poly_mul_fft3(tx, b00, logn, 1);
 1197                 falcon_poly_mul_fft3(ty, b10, logn, 1);
 1198                 falcon_poly_add_fft3(tx, ty, logn, 1);
 1199                 memcpy(ty, t0, n * sizeof *t0);
 1200                 falcon_poly_mul_fft3(ty, b01, logn, 1);
 1201 
 1202                 memcpy(t0, tx, n * sizeof *tx);
 1203                 falcon_poly_mul_fft3(t1, b11, logn, 1);
 1204                 falcon_poly_add_fft3(t1, ty, logn, 1);
 1205 
 1206                 falcon_iFFT3(t0, logn, 1);
 1207                 falcon_iFFT3(t1, logn, 1);
 1208 
 1209                 /*
 1210                  * Compute the signature.
 1211                  */
 1212                 for (u = 0; u < n; u ++) {
 1213                         s1[u] = (int16_t)fpr_rint(t0[u]);
 1214                         s2[u] = (int16_t)fpr_rint(t1[u]);
 1215                 }
 1216         } else {
 1217                 /*
 1218                  * Apply the lattice basis to obtain the real target
 1219                  * vector (after normalization with regards to modulus).
 1220                  */
 1221                 falcon_FFT(t0, logn);
 1222                 ni = fpr_inverse_of(q);
 1223                 memcpy(t1, t0, n * sizeof *t0);
 1224                 falcon_poly_mul_fft(t1, b01, logn);
 1225                 falcon_poly_mulconst_fft(t1, fpr_neg(ni), logn);
 1226                 falcon_poly_mul_fft(t0, b11, logn);
 1227                 falcon_poly_mulconst_fft(t0, ni, logn);
 1228 
 1229                 /*
 1230                  * Apply sampling. Output is written back in [tx, ty].
 1231                  */
 1232                 ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, tz);
 1233 
 1234                 /*
 1235                  * Get the lattice point corresponding to that tiny vector.
 1236                  */
 1237                 memcpy(t0, tx, n * sizeof *tx);
 1238                 memcpy(t1, ty, n * sizeof *ty);
 1239                 falcon_poly_mul_fft(tx, b00, logn);
 1240                 falcon_poly_mul_fft(ty, b10, logn);
 1241                 falcon_poly_add(tx, ty, logn);
 1242                 memcpy(ty, t0, n * sizeof *t0);
 1243                 falcon_poly_mul_fft(ty, b01, logn);
 1244 
 1245                 memcpy(t0, tx, n * sizeof *tx);
 1246                 falcon_poly_mul_fft(t1, b11, logn);
 1247                 falcon_poly_add_fft(t1, ty, logn);
 1248 
 1249                 falcon_iFFT(t0, logn);
 1250                 falcon_iFFT(t1, logn);
 1251 
 1252                 /*
 1253                  * Compute the signature.
 1254                  */
 1255                 for (u = 0; u < n; u ++) {
 1256                         s1[u] = (int16_t)(hm[u] - fpr_rint(t0[u]));
 1257                         s2[u] = (int16_t)-fpr_rint(t1[u]);
 1258                 }
 1259         }
 1260 }
 1261 
 1262 /*
 1263  * We have here three versions of gaussian0_sampler().
 1264  *
 1265  * The CoDF version maintains the to-be-returned value z as an integer,
 1266  * and makes repeated samplings of 64-bit values; each of them is checked
 1267  * against a tabulated value to decide whether z should be returned as
 1268  * is, or z should be incremented and the process repeated.
 1269  *
 1270  * The CDF version obtains a random 128-bit value and finds it in the
 1271  * tabulated distribution to directly return the sampled value.
 1272  *
 1273  * Since we follow a discrete Gaussian of standard deviation sigma0 = 2,
 1274  * the average number of PRNG invocations in the CoDF variant should be
 1275  * 2. The CDF variant always makes exactly 2 invocations of the PRNG.
 1276  * Thus, the two variants should offer about the same performance.
 1277  * Neither is constant-time, as implemented here, but the CDF variant
 1278  * _could_ be made constant-time relatively easily by reading up the
 1279  * whole table systematically.
 1280  *
 1281  * The third version (which is the default) is a CDF variant with a
 1282  * fast test on the top bytes; this reduces the average number of bytes
 1283  * obtained from the RNG.
 1284  */
 1285 
 1286 #if SAMPLER_CODF
 1287 
 1288 /*
 1289  * Precomputed CoDF table, scaled to 2^64.
 1290  *
 1291  * We use distribution D(z) = exp(-(z^2)/(2*sigma0^2)), for integer z >= 0
 1292  * and sigma0 = 2. CoDF is defined as:
 1293  *   CoDF[z] = D(z) / (\sum_{i>=z} D(i))
 1294  * i.e. CoDF[z] is the probability of getting exactly z, conditioned by
 1295  * the probability of getting _at least_ z.
 1296  *
 1297  * Values below have been computed with 200 bits of precision, then
 1298  * scaled up to 2^64, and rounded to the nearest integer. An extra
 1299  * terminator value has been added with value 2^64-1, to ensure that the
 1300  * sampling will always terminate (but it is unreachable with
 1301  * overwhelming probability anyway).
 1302  */
 1303 static const uint64_t CoDF[] = {
 1304          6135359076264090556u,  8112710616924306458u,  9953032297470027236u,
 1305         11570271901782068036u, 12938678630255467417u, 14067899748132476476u,
 1306         14984234773976257854u, 15719358179957714060u, 16304436317268820589u,
 1307         16767483091484223253u, 17132471096555539072u, 17419317572418304948u,
 1308         17644260465667274210u, 17820371633146694049u, 17958082469163820764u,
 1309         18065665744948529650u, 18149652895292277433u, 18215183554218485224u,
 1310         18266292212831004768u, 18306140009815638184u, 18337200327237292561u,
 1311         18361406363042628418u, 18380267875334172737u, 18394963192522594010u,
 1312         18406411526561969749u, 18415329685754092522u, 18422276481254688480u,
 1313         18427687455017083046u, 18431902013094434230u, 18435184609813683248u,
 1314         18446744073709551615u
 1315 };
 1316 
 1317 /*
 1318  * This function samples a positive integer z along the distribution
 1319  * D(z) = exp(-(z^2)/(2*sigma0^2)) (with sigma0 = 2).
 1320  */
 1321 static int
 1322 gaussian0_sampler(prng *p)
 1323 {
 1324         int z;
 1325 
 1326         for (z = 0;; z ++) {
 1327                 if (falcon_prng_get_u64(p) <= CoDF[z]) {
 1328                         return z;
 1329                 }
 1330         }
 1331 }
 1332 
 1333 /*
 1334  * The "large" Gaussian sampler, using sigma0 = sqrt(5) instead of
 1335  * sigma0 = 2. See above for comments.
 1336  */
 1337 
 1338 static const uint64_t CoDF_large[] = {
 1339          5585698290684357146u,  7249223068825568181u,  8847100402462867882u,
 1340         10311416391296380421u, 11610658709007993603u, 12738082895937970315u,
 1341         13701420522128960375u, 14515655832074621060u, 15198539954879572289u,
 1342         15768038078814656243u, 16241001905848789539u, 16632570846605441455u,
 1343         16955985738791465744u, 17222623757559529836u, 17442142477961602124u,
 1344         17622669281608141903u, 17771000949527216889u, 17892795030360415835u,
 1345         17992744215115961471u, 18074730391283377394u, 18141958023865435089u,
 1346         18197068047683742685u, 18242234163582632318u, 18279243671444027479u,
 1347         18309564958002955235u, 18334403612118750997u, 18354748936728452469u,
 1348         18371412406174455800u, 18385059402287341093u, 18396235363810085460u,
 1349         18446744073709551615u
 1350 };
 1351 
 1352 static int
 1353 gaussian0_sampler_large(prng *p)
 1354 {
 1355         int z;
 1356 
 1357         for (z = 0;; z ++) {
 1358                 if (falcon_prng_get_u64(p) <= CoDF_large[z]) {
 1359                         return z;
 1360                 }
 1361         }
 1362 }
 1363 
 1364 #elif SAMPLER_CDF
 1365 
 1366 /*
 1367  * Precomputed CDF table, scaled to 2^128.
 1368  *
 1369  * We use distribution D(z) = exp(-(z^2)/(2*sigma0^2)), for integer z >= 0
 1370  * and sigma0 = 2. CDF is defined as:
 1371  *   CDF[z - 1] = (\sum_{i>=z} D(i)) / (\sum_{i>=0} D(i))
 1372  * i.e. CDF[z - 1] is the probability of getting at least z. Since
 1373  * probability of getting at least 0 is 1, that value is omitted (which
 1374  * is why we use 'CDF[z - 1]' and not 'CDF[z]').
 1375  *
 1376  * Values below have been computed with 250 bits of precision, then
 1377  * scaled up to 2^128, and rounded to the nearest integer.
 1378  */
 1379 typedef struct {
 1380         uint64_t hi, lo;
 1381 } z128;
 1382 
 1383 static const z128 CDF[] = {
 1384         { 12311384997445461060u,  1164166488924346079u },
 1385         {  6896949616398116699u, 12764548512970285063u },
 1386         {  3175666228297764653u,  3821523381717377694u },
 1387         {  1183806766059182243u,  9659191409195894253u },
 1388         {   353476207714659138u, 11568890547115288740u },
 1389         {    83907343225073545u,  8467089641975205687u },
 1390         {    15749660485982248u,  2564667606477951368u },
 1391         {     2328616999791799u,  9676146345336721991u },
 1392         {      270433320942720u,  2935538500327714828u },
 1393         {       24618334939657u, 15160816743013190466u },
 1394         {        1753979576256u, 10774725962424838880u },
 1395         {          97691228987u,   260573317676966082u },
 1396         {           4249834528u, 17953183039089978228u },
 1397         {            144306182u, 18293121091792460987u },
 1398         {              3822728u,  5847176767326442802u },
 1399         {                78971u,  1092095764737208072u },
 1400         {                 1271u, 15793321684841757645u },
 1401         {                   15u, 17810511485592413461u },
 1402         {                    0u,  2881005946479451579u },
 1403         {                    0u,    21959492827510209u },
 1404         {                    0u,      130403777780196u },
 1405         {                    0u,         603269596717u },
 1406         {                    0u,           2173991748u },
 1407         {                    0u,              6102497u },
 1408         {                    0u,                13343u },
 1409         {                    0u,                   23u },
 1410         {                    0u,                    0u }
 1411 };
 1412 
 1413 /*
 1414  * This function samples a positive integer z along the distribution
 1415  * D(z) = exp(-(z^2)/(2*sigma0^2)) (with sigma0 = 2).
 1416  */
 1417 static int
 1418 gaussian0_sampler(prng *p)
 1419 {
 1420         uint64_t hi, lo;
 1421         int z;
 1422 
 1423         hi = falcon_prng_get_u64(p);
 1424         lo = falcon_prng_get_u64(p);
 1425 
 1426         /*
 1427          * Loop below MUST exit, since the last CDF[] table entry is 0.
 1428          */
 1429         for (z = 0;; z ++) {
 1430                 if (hi > CDF[z].hi || (hi == CDF[z].hi && lo >= CDF[z].lo)) {
 1431                         return z;
 1432                 }
 1433         }
 1434 }
 1435 
 1436 /*
 1437  * The "large" Gaussian sampler, using sigma0 = sqrt(5) instead of
 1438  * sigma0 = 2. See above for comments.
 1439  */
 1440 
 1441 static const z128 CDF_large[] = {
 1442         { 12861045783025194470u,  4172822031082392599u },
 1443         {  7806896963754487971u,  1791502523474411737u },
 1444         {  4062691428401757936u,  7604423452952674944u },
 1445         {  1791715974944572781u, 12746882011603970427u },
 1446         {   663982939486702512u, 10691794681225833799u },
 1447         {   205480902982363203u, 16583804506623933557u },
 1448         {    52858833213387347u,  3579657461630629701u },
 1449         {    11264466882686187u,  5640638197337281079u },
 1450         {     1983509262044379u, 16761989531132089487u },
 1451         {      288031217321451u,  6230729019263372079u },
 1452         {       34440907249949u, 15311621317364317869u },
 1453         {        3387143639027u, 10510168041118337409u },
 1454         {         273729206155u,  4159623830376585760u },
 1455         {          18164586717u,  5026128773602959958u },
 1456         {            989235429u, 14864651885815683859u },
 1457         {             44192296u,  6814956854290206119u },
 1458         {              1618856u, 18089287580937416157u },
 1459         {                48613u, 12706079520459332296u },
 1460         {                 1196u,  8301927825924258072u },
 1461         {                   24u,  2373930578664405075u },
 1462         {                    0u,  7354088428325342096u },
 1463         {                    0u,    99537325746473565u },
 1464         {                    0u,     1103521004104852u },
 1465         {                    0u,       10020207975859u },
 1466         {                    0u,          74515224141u },
 1467         {                    0u,            453796867u },
 1468         {                    0u,              2263115u },
 1469         {                    0u,                 9242u },
 1470         {                    0u,                   31u },
 1471         {                    0u,                    0u }
 1472 };
 1473 
 1474 static int
 1475 gaussian0_sampler_large(prng *p)
 1476 {
 1477         uint64_t hi, lo;
 1478         int z;
 1479 
 1480         hi = falcon_prng_get_u64(p);
 1481         lo = falcon_prng_get_u64(p);
 1482         for (z = 0;; z ++) {
 1483                 if (hi > CDF_large[z].hi
 1484                         || (hi == CDF_large[z].hi && lo >= CDF_large[z].lo))
 1485                 {
 1486                         return z;
 1487                 }
 1488         }
 1489 }
 1490 
 1491 #else
 1492 
 1493 /*
 1494  * Partial precomputed CDF tables:
 1495  *  - CDF8 holds the 8 MSBs of each CDF image not starting with 0x00;
 1496  *  - CDFs holds the next 128 MSBs of the same CDF images as CDF8;
 1497  *  - CDF0 holds the next 128 MSBs of each CDF image starting with 0x00.
 1498  *
 1499  * We use the same distribution and indexation as above (i.e. CDF8[z] and 
 1500  * CDFs[z] correspond to CDF[z]) exept that D(z) is scaled to 2^136 and CDF0 
 1501  * is offset by 6 slots (i.e. CDF0[z] corresponds to CDF[z + 6]).
 1502  *
 1503  * Values below have been computed with 256 bits of precision, then
 1504  * scaled up to 2^136, and rounded to the nearest integer.
 1505  */
 1506 typedef struct {
 1507         uint64_t hi, lo;
 1508 } z128;
 1509 
 1510 static const uint8_t CDF8[] = {
 1511         170u, 95u, 44u, 16u, 4u, 1u
 1512 };
 1513 
 1514 static const z128 CDFs[] = {
 1515         { 15768066815414256656u,  2878715985279770247u },
 1516         { 13178414795510471601u,  2650718273802340096u },
 1517         {  1313815201007480117u,   632549813042453946u },
 1518         {  7906626931797828486u,   889294877069012273u },
 1519         { 16702932880114533024u, 10156928267985658938u },
 1520         {  3033535791909276021u,  9305891721635116763u }
 1521 };
 1522 
 1523 static const z128 CDF0[] = {
 1524         {  4031913084411455523u, 10918864678521243583u },
 1525         {   596125951946700678u,  5229758529120913067u },
 1526         {    69230930161336360u, 13628093135512931395u },
 1527         {     6302293744552402u,  7352830732370919967u },
 1528         {      449018771521685u,  9764979398035562428u },
 1529         {       25008954620675u, 11366537104174662165u },
 1530         {        1087957639417u,  2775583653356073882u },
 1531         {          36942382845u, 16012748850353453704u },
 1532         {            978618449u,  2690982465095676317u },
 1533         {             20216591u,  2875354667081992134u },
 1534         {               325595u,  3253399177098153241u },
 1535         {                 4087u,  3145154105398596933u },
 1536         {                   39u, 18114503424067091158u },
 1537         {                    0u,  5621630163842613476u },
 1538         {                    0u,    33383367111730198u },
 1539         {                    0u,      154437016759436u },
 1540         {                    0u,         556541887369u },
 1541         {                    0u,           1562239343u },
 1542         {                    0u,              3415730u },
 1543         {                    0u,                 5817u },
 1544         {                    0u,                    8u },
 1545         {                    0u,                    0u }
 1546 };
 1547 
 1548 /*
 1549  * This function samples a positive integer z along the distribution
 1550  * D(z) = exp(-(z^2)/(2*sigma0^2)) (with sigma0 = 2).
 1551  */
 1552 static int
 1553 gaussian0_sampler(prng *p)
 1554 {
 1555         uint8_t msb;
 1556         uint64_t hi, lo;
 1557         int z;
 1558 
 1559         msb = falcon_prng_get_u8(p);
 1560         if (msb != 0x00) {
 1561                 /*
 1562                  * The loop below return the sample when the byte drawn is 
 1563                  * equal to the MSBs of at most one CDF image (i.e. msb != 0x00
 1564                  * by construction of CDF8 and CDFs).
 1565                  */
 1566                 for (z = 0; z < (int)sizeof CDF8; z ++) {
 1567                         /*
 1568                          * We conclude directly when the byte drawn differs
 1569                          * from all CDF images, since 8 bits are enough to 
 1570                          * compare it to any CDF image.
 1571                          */
 1572                         if (msb > CDF8[z]) {
 1573                                 return z;
 1574                         }
 1575 
 1576                         /*
 1577                          * We draw 128 bits more when the byte drawn is equal
 1578                          * to the MSBs of a CDF image to compare these two.
 1579                          */
 1580                         if (msb == CDF8[z]) {
 1581                                 hi = falcon_prng_get_u64(p);
 1582                                 lo = falcon_prng_get_u64(p);
 1583                                 if (hi > CDFs[z].hi
 1584                                         || (hi == CDFs[z].hi
 1585                                         && lo >= CDFs[z].lo))
 1586                                 {
 1587                                         return z;
 1588                                 }
 1589                                 return z + 1;
 1590                         }
 1591                 }
 1592         }
 1593 
 1594         /*
 1595          * Otherwise (case msb == 0x00), we draw 128 bits more and compare 
 1596          * it with the CDF images whose the 8 MSBs are equal to 0x00.
 1597          */
 1598         hi = falcon_prng_get_u64(p);
 1599         lo = falcon_prng_get_u64(p);
 1600         for (z = 0;; z ++) {
 1601                 if (hi > CDF0[z].hi || (hi == CDF0[z].hi && lo >= CDF0[z].lo)) {
 1602                         return z + (int)sizeof CDF8;
 1603                 }
 1604         }
 1605 }
 1606 
 1607 /*
 1608  * The "large" Gaussian sampler, using sigma0 = sqrt(5) instead of
 1609  * sigma0 = 2. See above for comments.
 1610  */
 1611 
/*
 * Most significant byte of each CDF image whose top byte is non-zero
 * (large sampler, sigma0 = sqrt(5)); entry z is compared against a
 * random byte in gaussian0_sampler_large() below.
 */
static const uint8_t CDF8_large[] = {
	178u, 108u, 56u, 24u, 9u, 2u
};
 1615 
/*
 * Next 128 bits (as two 64-bit halves, hi then lo) of the CDF images
 * whose most significant byte is listed in CDF8_large[]; used to break
 * the tie when the random byte equals CDF8_large[z].
 */
static const z128 CDFs_large[] = {
	{  8907275334149596729u, 16778027755648063129u },
	{  6317262760517346072u, 15902788240420165876u },
	{  7031337543115141225u,  9824276216381865951u },
	{ 15957431816781393328u, 16574837997735344921u },
	{  3958935845209878676u,  6981315484799813327u },
	{ 15709623016065876966u,  2702816742530118898u }
};
 1624 
/*
 * 128-bit tails (hi, lo) of the CDF images whose most significant byte
 * is 0x00. The final all-zero entry guarantees that the lookup loop in
 * gaussian0_sampler_large() terminates (lo >= 0 always holds there).
 */
static const z128 CDF0_large[] = {
	{ 13531861302627160881u, 12501850565673174323u },
	{  2883703521967663950u,  5157340768998930255u },
	{   507778371083361256u, 11424694869198933797u },
	{    73735991634291542u,  8646638592401813304u },
	{     8816872255987156u,  9065313618840431804u },
	{      867108771591057u, 15825127838409392498u },
	{       70074676775737u, 13399288374961512449u },
	{        4650134199621u, 13863624956398687773u },
	{         253244270030u,  5321603584647435131u },
	{          11313227870u, 10635011769594914448u },
	{            414427387u,   724858218881080488u },
	{             12445104u,  6129400264707983377u },
	{               306291u,  3917954960011630607u },
	{                 6176u, 17430417779382047418u },
	{                  102u,  1078742132913311815u },
	{                    1u,  7034811317387681053u },
	{                    0u,   282501377050842205u },
	{                    0u,     2565173241819955u },
	{                    0u,       19075897380102u },
	{                    0u,         116171998071u },
	{                    0u,            579357465u },
	{                    0u,              2365944u },
	{                    0u,                 7912u },
	{                    0u,                   22u },
	{                    0u,                    0u }
};
 1652 
 1653 static int
 1654 gaussian0_sampler_large(prng *p)
 1655 {
 1656         uint8_t msb;
 1657         uint64_t hi, lo;
 1658         int z;
 1659 
 1660         msb = falcon_prng_get_u8(p);
 1661         if (msb != 0x00) {
 1662                 for (z = 0; z < (int)sizeof CDF8_large; z ++) {
 1663                         if (msb > CDF8_large[z]) {
 1664                                 return z;
 1665                         }
 1666                         if (msb == CDF8_large[z]) {
 1667                                 hi = falcon_prng_get_u64(p);
 1668                                 lo = falcon_prng_get_u64(p);
 1669                                 if (hi > CDFs_large[z].hi
 1670                                         || (hi == CDFs_large[z].hi
 1671                                         && lo >= CDFs_large[z].lo))
 1672                                 {
 1673                                         return z;
 1674                                 }
 1675                                 return z + 1;
 1676                         }
 1677                 }
 1678         }
 1679 
 1680         hi = falcon_prng_get_u64(p);
 1681         lo = falcon_prng_get_u64(p);
 1682         for (z = 0;; z ++) {
 1683                 if (hi > CDF0_large[z].hi
 1684                         || (hi == CDF0_large[z].hi && lo >= CDF0_large[z].lo))
 1685                 {
 1686                         return z + (int)sizeof CDF8_large;
 1687                 }
 1688         }
 1689 }
 1690 
 1691 #endif
 1692 
 1693 #if CT_BEREXP
 1694 
/*
 * Sample a bit with probability exp(-x) for some x >= 0.
 * This is the CT_BEREXP variant: it always consumes the same amount of
 * PRNG output (two 64-bit words) and follows the same operation
 * sequence regardless of x.
 */
static int
BerExp(prng *p, fpr x)
{
	int s;
	fpr r;
	uint64_t w, z;
	int b;
	uint32_t sw;

	/*
	 * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
	 * and 0 <= r < log(2).
	 */
	s = fpr_floor(fpr_div(x, fpr_log2));
	r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));

	/*
	 * It may happen (quite rarely) that s >= 64; if sigma = 1.2
	 * (the minimum value for sigma), r = 0 and b = 1, then we get
	 * s >= 64 if the half-Gaussian produced a z >= 13, which happens
	 * with probability about 0.000000000230383991, which is
	 * approximatively equal to 2^(-32). In any case, if s >= 64,
	 * then BerExp will be non-zero with probability less than
	 * 2^(-64), so we can simply saturate s at 63.
	 *
	 * The expression below is a branchless clamp: in unsigned
	 * arithmetic, (63 - sw) >> 31 is 1 exactly when sw > 63, so the
	 * mask selects 63 in that case and leaves sw unchanged otherwise.
	 */
	sw = s;
	sw ^= (sw ^ 63) & -((63 - sw) >> 31);
	s = (int)sw;

	/*
	 * Sample a bit with probability 2^(-s):
	 *  - generate a random 64-bit integer
	 *  - keep only s bits
	 *  - bit is 1 if the result is zero
	 */
	w = falcon_prng_get_u64(p);
	w ^= (w >> s) << s;
	b = 1 - (int)((w | -w) >> 63);

	/*
	 * Sample a bit with probability exp(-r). Since |r| < log(2),
	 * we can use fpr_exp_small(). The value is lower than 1; we
	 * scale it to 2^55.
	 * We combine (with AND) that bit with the previous bit, which
	 * yields the expected result (probability 2^(-s) * exp(-r)).
	 */
	z = (uint64_t)fpr_rint(fpr_mul(fpr_exp_small(fpr_neg(r)), fpr_p55));
	w = falcon_prng_get_u64(p);
	w &= ((uint64_t)1 << 55) - 1;
	b &= (int)((w - z) >> 63);

	return b;
}
 1751 
 1752 #else
 1753 
/*
 * Sample a bit with probability exp(-x) for some x >= 0.
 * This is the lazy (non-CT_BEREXP) variant: random bytes are drawn one
 * at a time and the comparison stops at the first differing byte, so
 * PRNG consumption depends on the values being compared.
 */
static int
BerExp(prng *p, fpr x)
{
	int s, i;
	fpr r;
	uint32_t sw;
	uint64_t z, w;

	/*
	 * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
	 * and 0 <= r < log(2).
	 */
	s = fpr_floor(fpr_div(x, fpr_log2));
	r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));

	/*
	 * It may happen (quite rarely) that s >= 64; if sigma = 1.2
	 * (the minimum value for sigma), r = 0 and b = 1, then we get
	 * s >= 64 if the half-Gaussian produced a z >= 13, which happens
	 * with probability about 0.000000000230383991, which is
	 * approximatively equal to 2^(-32). In any case, if s >= 64,
	 * then BerExp will be non-zero with probability less than
	 * 2^(-64), so we can simply saturate s at 63.
	 * (Branchless clamp: the mask is all-ones exactly when sw > 63.)
	 */
	sw = s;
	sw ^= (sw ^ 63) & -((63 - sw) >> 31);
	s = (int)sw;

	/*
	 * Compute exp(-r). Since |r| < log(2), we can use fpr_exp_small().
	 * The value is lower than 1; we scale it to 2^64 and store it shifted
	 * to the right by s bits. The '-1' makes sure that we fit in 64 bits
	 * even if r = 0; the bias is negligible since 'r' itself only has
	 * 53 bits of precision.
	 */
	z = (((uint64_t)fpr_rint(fpr_mul(
		fpr_exp_small(fpr_neg(r)), fpr_p63)) << 1) - 1) >> s;

	/*
	 * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
	 * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
	 * PRNG output to limit its consumption, the sign of the difference
	 * yields the expected result.
	 *
	 * Bytes are compared most-significant first; the loop stops at
	 * the first byte that differs (or after the lowest byte). The
	 * subtraction wraps in uint64_t, so w >> 63 is 1 exactly when
	 * the drawn value is strictly lower than z at the deciding byte.
	 */
	i = 64;
	do {
		i -= 8;
		w = falcon_prng_get_u8(p) - ((z >> i) & (uint64_t)0xFF);
	} while (!w && i > 0);
	return (int)(w >> 63);
}
 1808 
 1809 #endif
 1810 
/*
 * The sampler produces a random integer that follows a discrete Gaussian
 * distribution, centered on mu, and with standard deviation sigma.
 * The value of sigma MUST lie between 1 and 2 (in Falcon, it should
 * always be between 1.2 and 1.9).
 *
 * 'ctx' must point to an initialized prng; this function matches the
 * samplerZ callback shape used by do_sign() (it is installed for the
 * binary case in falcon_sign_generate()).
 */
static int
sampler(void *ctx, fpr mu, fpr sigma)
{
	prng *p;
	int s;
	fpr r, dss;

	p = ctx;

	/*
	 * The bimodal Gaussian used for rejection sampling uses
	 * sigma0 = 2; we need sigma <= sigma0. Normally, sigma
	 * is always between 1.2 and 1.9.
	 */
	/* assert(fpr_lt(sigma, fpr_of(2))); */

	/*
	 * Center is mu. We compute mu = s + r where s is an integer
	 * and 0 <= r < 1.
	 */
	s = fpr_floor(mu);
	r = fpr_sub(mu, fpr_of(s));

	/*
	 * dss = 1/(2*sigma^2).
	 */
	dss = fpr_inv(fpr_mul(fpr_sqr(sigma), fpr_of(2)));

	/*
	 * We now need to sample on center r.
	 */
	for (;;) {
		int z, b;
		fpr x;

		/*
		 * Sample z for a Gaussian distribution. Then get a
		 * random bit b to turn the sampling into a bimodal
		 * distribution: if b = 1, we use z+1, otherwise we
		 * use -z. We thus have two situations:
		 *
		 *  - b = 1: z >= 1 and sampled against a Gaussian
		 *    centered on 1.
		 *  - b = 0: z <= 0 and sampled against a Gaussian
		 *    centered on 0.
		 */
		z = gaussian0_sampler(p);
		b = falcon_prng_get_u8(p) & 1;
		/* Branchless form of: z = b ? z + 1 : -z */
		z = b + ((b << 1) - 1) * z;

		/*
		 * Rejection sampling. We want a Gaussian centered on r;
		 * but we sampled against a Gaussian centered on b (0 or
		 * 1). But we know that z is always in the range where
		 * our sampling distribution is greater than the Gaussian
		 * distribution, so rejection works.
		 *
		 * We got z with distribution:
		 *    G(z) = exp(-((z-b)^2)/(2*sigma0^2))
		 * We target distribution:
		 *    S(z) = exp(-((z-r)^2)/(2*sigma^2))
		 * Rejection sampling works by keeping the value z with
		 * probability S(z)/G(z), and starting again otherwise.
		 * This requires S(z) <= G(z), which is the case here.
		 * Thus, we simply need to keep our z with probability:
		 *    P = exp(-x)
		 * where:
		 *    x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
		 *
		 * Note that z and b are integer, and we set sigma0 = 2
		 * (hence the division by 2*sigma0^2 = 8 below).
		 */
		x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
		x = fpr_sub(x, fpr_div(fpr_of((z - b) * (z - b)), fpr_of(8)));
		if (BerExp(p, x)) {
			/*
			 * Rejection sampling was centered on r, but the
			 * actual center is mu = s + r.
			 */
			return s + z;
		}
	}
}
 1899 
/*
 * This alternate sampler implementation accepts larger standard
 * deviations, up to sqrt(5) = 2.236...
 * It should be used for the ternary case (it is installed as the
 * samplerZ callback in falcon_sign_generate() when fs->ternary is set).
 */
static int
sampler_large(void *ctx, fpr mu, fpr sigma)
{
	/*
	 * See sampler() for comments.
	 * The two only changes here are:
	 *  - gaussian0_sampler_large() instead of gaussian0_sampler()
	 *  - sigma0 = sqrt(5) instead of 2, so 2*sigma0^2 = 10.
	 */
	prng *p;
	int s;
	fpr r, dss;

	p = ctx;

	/* Split the center: mu = s + r with s integer and 0 <= r < 1. */
	s = fpr_floor(mu);
	r = fpr_sub(mu, fpr_of(s));
	/* dss = 1/(2*sigma^2). */
	dss = fpr_inv(fpr_mul(fpr_sqr(sigma), fpr_of(2)));

	for (;;) {
		int z, b;
		fpr x;

		z = gaussian0_sampler_large(p);
		b = falcon_prng_get_u8(p) & 1;
		/* Branchless form of: z = b ? z + 1 : -z */
		z = b + ((b << 1) - 1) * z;
		/* x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2) */
		x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
		x = fpr_sub(x, fpr_div(fpr_of((z - b) * (z - b)), fpr_of(10)));
		if (BerExp(p, x)) {
			return s + z;
		}
	}
}
 1938 
 1939 #if CLEANSE
 1940 /*
 1941  * Cleanse a memory region by overwriting it with zeros.
 1942  */
 1943 static void
 1944 cleanse(void *data, size_t len)
 1945 {
 1946         volatile unsigned char *p;
 1947 
 1948         p = (volatile unsigned char *)data;
 1949         while (len -- > 0) {
 1950                 *p ++ = 0;
 1951         }
 1952 }
 1953 #endif
 1954 
/*
 * Signature generation context (the opaque falcon_sign type exposed in
 * falcon.h). Holds the message hashing state, the internal RNG state,
 * and the expanded private key.
 */
struct falcon_sign_ {
	/* Context for hashing the nonce + message. */
	shake_context sc;

	/* RNG:
	   seeded    non-zero when a 'replace' seed or system RNG was pushed
	   flipped   non-zero when flipped (switched to output mode) */
	shake_context rng;
	int seeded;
	int flipped;

	/* Private key elements (set by falcon_sign_set_private_key()). */
	unsigned q;        /* modulus: 12289 (binary) or 18433 (ternary) */
	unsigned logn;     /* log2 of the polynomial degree */
	unsigned ternary;  /* 1 for a ternary key, 0 for binary */
	fpr *sk;           /* expanded private key (secret, cleansed on release) */
	size_t sk_len;     /* size of sk, in bytes */
	fpr *tmp;          /* scratch buffer for load_skey()/do_sign() */
	size_t tmp_len;    /* size of tmp, in bytes */
};
 1975 
 1976 static void
 1977 clear_private(falcon_sign *fs)
 1978 {
 1979         if (fs->sk != NULL) {
 1980 #if CLEANSE
 1981                 cleanse(fs->sk, fs->sk_len);
 1982 #endif
 1983                 free(fs->sk);
 1984                 fs->sk = NULL;
 1985                 fs->sk_len = 0;
 1986         }
 1987         if (fs->tmp != NULL) {
 1988 #if CLEANSE
 1989                 cleanse(fs->tmp, fs->tmp_len);
 1990 #endif
 1991                 free(fs->tmp);
 1992                 fs->tmp = NULL;
 1993                 fs->tmp_len = 0;
 1994         }
 1995         fs->q = 0;
 1996         fs->logn = 0;
 1997         fs->ternary = 0;
 1998 }
 1999 
 2000 /* see falcon.h */
 2001 falcon_sign *
 2002 falcon_sign_new(void)
 2003 {
 2004         falcon_sign *fs;
 2005 
 2006         fs = malloc(sizeof *fs);
 2007         if (fs == NULL) {
 2008                 return NULL;
 2009         }
 2010         fs->seeded = 0;
 2011         fs->flipped = 0;
 2012         fs->q = 0;
 2013         fs->logn = 0;
 2014         fs->ternary = 0;
 2015         fs->sk = NULL;
 2016         fs->sk_len = 0;
 2017         fs->tmp = NULL;
 2018         fs->tmp_len = 0;
 2019         shake_init(&fs->rng, 512);
 2020         return fs;
 2021 }
 2022 
 2023 /* see falcon.h */
 2024 void
 2025 falcon_sign_free(falcon_sign *fs)
 2026 {
 2027         if (fs != NULL) {
 2028                 clear_private(fs);
 2029                 free(fs);
 2030         }
 2031 }
 2032 
 2033 /* see falcon.h */
 2034 void
 2035 falcon_sign_set_seed(falcon_sign *fs,
 2036         const void *seed, size_t len, int replace)
 2037 {
 2038         if (replace) {
 2039                 shake_init(&fs->rng, 512);
 2040                 shake_inject(&fs->rng, seed, len);
 2041                 fs->seeded = 1;
 2042                 fs->flipped = 0;
 2043                 return;
 2044         }
 2045         if (fs->flipped) {
 2046                 unsigned char tmp[32];
 2047 
 2048                 shake_extract(&fs->rng, tmp, sizeof tmp);
 2049                 shake_init(&fs->rng, 512);
 2050                 shake_inject(&fs->rng, tmp, sizeof tmp);
 2051                 fs->flipped = 0;
 2052         }
 2053         shake_inject(&fs->rng, seed, len);
 2054 }
 2055 
 2056 static int
 2057 rng_ready(falcon_sign *fs)
 2058 {
 2059         if (!fs->seeded) {
 2060                 unsigned char tmp[32];
 2061 
 2062                 if (!falcon_get_seed(tmp, sizeof tmp)) {
 2063                         return 0;
 2064                 }
 2065                 falcon_sign_set_seed(fs, tmp, sizeof tmp, 0);
 2066                 fs->seeded = 1;
 2067         }
 2068         if (!fs->flipped) {
 2069                 shake_flip(&fs->rng);
 2070                 fs->flipped = 1;
 2071         }
 2072         return 1;
 2073 }
 2074 
 2075 /* see falcon.h */
 2076 int
 2077 falcon_sign_set_private_key(falcon_sign *fs,
 2078         const void *skey, size_t len)
 2079 {
 2080         const unsigned char *skey_buf;
 2081         int comp;
 2082         int16_t ske[4][1024];
 2083         int i;
 2084         int fb, has_G;
 2085 
 2086         /*
 2087          * TODO: when reloading a new private key of the same size as
 2088          * the currently allocated one, we could reuse the buffers
 2089          * instead of releasing and reallocating them.
 2090          */
 2091         clear_private(fs);
 2092 
 2093         /*
 2094          * First byte defines modulus, degree and compression:
 2095          *   t cc g dddd
 2096          *   ^ ^^ ^ ^^^^
 2097          *   |  | |   |
 2098          *   |  | |   +----- degree log, over four bits (1 to 10 only)
 2099          *   |  | |
 2100          *   |  | +--------- 0 if G is present, 1 if G is absent
 2101          *   |  |
 2102          *   |  +----------- compression type
 2103          *   |
 2104          *   +-------------- 1 if ternary, 0 if binary
 2105          *
 2106          * Compression is:
 2107          *   00   uncompressed, 16 bits per integer (signed)
 2108          *   01   compressed with static codes (fixed tables)
 2109          * Other compression values are reserved.
 2110          */
 2111         skey_buf = skey;
 2112         fb = *skey_buf ++;
 2113         len --;
 2114         fs->logn = fb & 0x0F;
 2115         has_G = !(fb & 0x10);
 2116         fs->ternary = fb >> 7;
 2117         if (fs->ternary) {
 2118                 fs->q = 18433;
 2119                 if (fs->logn < 3 || fs->logn > 9) {
 2120                         goto bad_skey;
 2121                 }
 2122         } else {
 2123                 fs->q = 12289;
 2124                 if (fs->logn < 1 || fs->logn > 10) {
 2125                         goto bad_skey;
 2126                 }
 2127         }
 2128         comp = (fb >> 5) & 0x03;
 2129 
 2130         /*
 2131          * The f, g, F (and optionally G) short vectors should follow in
 2132          * due order.
 2133          */
 2134         for (i = 0; i < 3 + has_G; i ++) {
 2135                 size_t elen;
 2136 
 2137                 elen = falcon_decode_small(ske[i], fs->logn,
 2138                         comp, fs->q, skey_buf, len);
 2139                 if (elen == 0) {
 2140                         goto bad_skey;
 2141                 }
 2142                 skey_buf += elen;
 2143                 len -= elen;
 2144         }
 2145         if (len != 0) {
 2146                 goto bad_skey;
 2147         }
 2148 
 2149         /*
 2150          * Recompute G if not provided.
 2151          */
 2152         if (!has_G) {
 2153                 if (!falcon_complete_private(ske[3],
 2154                         ske[0], ske[1], ske[2], fs->logn, fs->ternary))
 2155                 {
 2156                         goto bad_skey;
 2157                 }
 2158         }
 2159 
 2160         /*
 2161          * Perform pre-computations on private key.
 2162          */
 2163         if (fs->ternary) {
 2164                 fs->sk_len = ((size_t)(3 * (fs->logn + 6)) << (fs->logn - 1))
 2165                         * sizeof(fpr);
 2166                 fs->tmp_len = ((size_t)21 << (fs->logn - 1)) * sizeof(fpr);
 2167         } else {
 2168                 fs->sk_len = ((size_t)(fs->logn + 5) << fs->logn)
 2169                         * sizeof(fpr);
 2170                 fs->tmp_len = ((size_t)7 << fs->logn) * sizeof(fpr);
 2171         }
 2172         fs->sk = malloc(fs->sk_len);
 2173         if (fs->sk == NULL) {
 2174                 goto bad_skey;
 2175         }
 2176         fs->tmp = malloc(fs->tmp_len);
 2177         if (fs->tmp == NULL) {
 2178                 goto bad_skey;
 2179         }
 2180 
 2181         load_skey(fs->sk, fs->q, ske[0], ske[1], ske[2], ske[3],
 2182                 fs->logn, fs->ternary, fs->tmp);
 2183         return 1;
 2184 
 2185 bad_skey:
 2186         clear_private(fs);
 2187         return 0;
 2188 }
 2189 
 2190 /* see falcon.h */
 2191 int
 2192 falcon_sign_start(falcon_sign *fs, void *r)
 2193 {
 2194         if (!rng_ready(fs)) {
 2195                 return 0;
 2196         }
 2197         shake_extract(&fs->rng, r, 40);
 2198         falcon_sign_start_external_nonce(fs, r, 40);
 2199         return 1;
 2200 }
 2201 
/* see falcon.h */
void
falcon_sign_start_external_nonce(falcon_sign *fs, const void *r, size_t rlen)
{
	/*
	 * Reset the message hashing context and absorb the nonce first,
	 * so that the signed digest covers nonce || message.
	 */
	shake_init(&fs->sc, 512);
	shake_inject(&fs->sc, r, rlen);
}
 2209 
/* see falcon.h */
void
falcon_sign_update(falcon_sign *fs, const void *data, size_t len)
{
	/* Absorb more message bytes into the nonce+message hash. */
	shake_inject(&fs->sc, data, len);
}
 2216 
/* see falcon.h */
size_t
falcon_sign_generate(falcon_sign *fs, void *sig, size_t sig_max_len, int comp)
{
	uint16_t hm[1024];
	int16_t s1[1024], s2[1024];
	unsigned char *sig_buf;
	size_t sig_len;

	/*
	 * A private key must be loaded and the RNG must be usable; the
	 * output buffer must hold at least the one-byte header plus one
	 * byte of encoded signature.
	 */
	if (fs->sk == NULL) {
		return 0;
	}
	if (!rng_ready(fs)) {
		return 0;
	}
	if (sig_max_len < 2) {
		return 0;
	}

	/*
	 * Finalize the nonce+message hash and map it to a polynomial
	 * hm modulo q.
	 */
	shake_flip(&fs->sc);
	falcon_hash_to_point(&fs->sc, fs->q, hm, fs->logn);

	for (;;) {
		/*
		 * Signature produces short vectors s1 and s2. The
		 * signature is acceptable only if the aggregate vector
		 * s1,s2 is short; we must use the same bound as the
		 * verifier.
		 *
		 * If the signature is acceptable, then we return only s2
		 * (the verifier recomputes s1 from s2, the hashed message,
		 * and the public key).
		 */
		prng p;
		samplerZ samp;
		void *samp_ctx;

		/*
		 * Normal sampling. We use a fast PRNG seeded from our
		 * SHAKE context ('rng'). The ternary case uses the
		 * large-sigma sampler.
		 */
		falcon_prng_init(&p, &fs->rng, 0);
		samp = fs->ternary ? sampler_large : sampler;
		samp_ctx = &p;

		/*
		 * Do the actual signature.
		 */
		do_sign(samp, samp_ctx, s1, s2,
			fs->q, fs->sk, hm, fs->logn, fs->ternary, fs->tmp);

		/*
		 * Check that the norm is correct. With our chosen
		 * acceptance bound, this should almost always be true.
		 * With a tighter bound, it may happen sometimes that we
		 * end up with an invalidly large signature, in which
		 * case we just loop.
		 */
		if (falcon_is_short(s1, s2, fs->logn, fs->ternary)) {
			break;
		}
	}

	/*
	 * Encode s2 after a one-byte header that mirrors the private
	 * key header: ternary flag (bit 7), compression type (bits
	 * 5-6), degree log (low four bits).
	 * NOTE(review): 'comp' is written verbatim into the header;
	 * values other than 0 or 1 are presumably rejected by
	 * falcon_encode_small() (returning 0) — confirm.
	 */
	sig_buf = sig;
	sig_len = falcon_encode_small(sig_buf + 1, sig_max_len - 1,
		comp, fs->q, s2, fs->logn);
	if (sig_len == 0) {
		return 0;
	}
	sig_buf[0] = (fs->ternary << 7) | (comp << 5) | fs->logn;
	return sig_len + 1;
}