Falcon source files (reference implementation)


falcon-fft.c

    1 /*
    2  * FFT code.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.trust>
   30  */
   31 
   32 #include "internal.h"
   33 
   34 /*
   35  * Load the table of constants for FFT.
   36  */
   37 #define FPC(re, im)   re, im
   38 #include FPR_IMPL
   39 #undef FPC
   40 
   41 /*
   42  * Rules for complex number macros:
   43  * --------------------------------
   44  *
   45  * Operand order is: destination, source1, source2...
   46  *
   47  * Each operand is a real and an imaginary part.
   48  *
   49  * All overlaps are allowed.
   50  */
   51 
   52 /*
   53  * Addition of two complex numbers (d = a + b).
   54  */
   55 #define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
   56                 fpr fpct_re, fpct_im; \
   57                 fpct_re = fpr_add(a_re, b_re); \
   58                 fpct_im = fpr_add(a_im, b_im); \
   59                 (d_re) = fpct_re; \
   60                 (d_im) = fpct_im; \
   61         } while (0)
   62 
   63 /*
   64  * Subtraction of two complex numbers (d = a - b).
   65  */
   66 #define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
   67                 fpr fpct_re, fpct_im; \
   68                 fpct_re = fpr_sub(a_re, b_re); \
   69                 fpct_im = fpr_sub(a_im, b_im); \
   70                 (d_re) = fpct_re; \
   71                 (d_im) = fpct_im; \
   72         } while (0)
   73 
   74 /*
   75  * Multplication of two complex numbers (d = a * b).
   76  */
   77 #define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
   78                 fpr fpct_a_re, fpct_a_im; \
   79                 fpr fpct_b_re, fpct_b_im; \
   80                 fpr fpct_d_re, fpct_d_im; \
   81                 fpct_a_re = (a_re); \
   82                 fpct_a_im = (a_im); \
   83                 fpct_b_re = (b_re); \
   84                 fpct_b_im = (b_im); \
   85                 fpct_d_re = fpr_sub( \
   86                         fpr_mul(fpct_a_re, fpct_b_re), \
   87                         fpr_mul(fpct_a_im, fpct_b_im)); \
   88                 fpct_d_im = fpr_add( \
   89                         fpr_mul(fpct_a_re, fpct_b_im), \
   90                         fpr_mul(fpct_a_im, fpct_b_re)); \
   91                 (d_re) = fpct_d_re; \
   92                 (d_im) = fpct_d_im; \
   93         } while (0)
   94 
   95 /*
   96  * Squaring of a complex number (d = a * a).
   97  */
   98 #define FPC_SQR(d_re, d_im, a_re, a_im)   do { \
   99                 fpr fpct_a_re, fpct_a_im; \
  100                 fpr fpct_d_re, fpct_d_im; \
  101                 fpct_a_re = (a_re); \
  102                 fpct_a_im = (a_im); \
  103                 fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
  104                 fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
  105                 (d_re) = fpct_d_re; \
  106                 (d_im) = fpct_d_im; \
  107         } while (0)
  108 
  109 /*
  110  * Inversion of a complex number (d = 1 / a).
  111  */
  112 #define FPC_INV(d_re, d_im, a_re, a_im)   do { \
  113                 fpr fpct_a_re, fpct_a_im; \
  114                 fpr fpct_d_re, fpct_d_im; \
  115                 fpr fpct_m; \
  116                 fpct_a_re = (a_re); \
  117                 fpct_a_im = (a_im); \
  118                 fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
  119                 fpct_d_re = fpr_div(fpct_a_re, fpct_m); \
  120                 fpct_d_im = fpr_div(fpr_neg(fpct_a_im), fpct_m); \
  121                 (d_re) = fpct_d_re; \
  122                 (d_im) = fpct_d_im; \
  123         } while (0)
  124 
  125 /*
  126  * Division of complex numbers (d = a / b).
  127  */
  128 #define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
  129                 fpr fpct_a_re, fpct_a_im; \
  130                 fpr fpct_b_re, fpct_b_im; \
  131                 fpr fpct_d_re, fpct_d_im; \
  132                 fpr fpct_m; \
  133                 fpct_a_re = (a_re); \
  134                 fpct_a_im = (a_im); \
  135                 fpct_b_re = (b_re); \
  136                 fpct_b_im = (b_im); \
  137                 fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
  138                 fpct_b_re = fpr_div(fpct_b_re, fpct_m); \
  139                 fpct_b_im = fpr_div(fpr_neg(fpct_b_im), fpct_m); \
  140                 fpct_d_re = fpr_sub( \
  141                         fpr_mul(fpct_a_re, fpct_b_re), \
  142                         fpr_mul(fpct_a_im, fpct_b_im)); \
  143                 fpct_d_im = fpr_add( \
  144                         fpr_mul(fpct_a_re, fpct_b_im), \
  145                         fpr_mul(fpct_a_im, fpct_b_re)); \
  146                 (d_re) = fpct_d_re; \
  147                 (d_im) = fpct_d_im; \
  148         } while (0)
  149 
  150 /*
  151  * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
  152  * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
  153  * of X^N+1 in the field of complex numbers. A crucial property is that
  154  * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
  155  *
  156  * FFT representation of a polynomial f (taken modulo X^N+1) is the
  157  * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
  158  * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
  159  * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
  160  * needed. A consequence is that FFT representation has the same size
  161  * as normal representation: N/2 complex numbers use N real numbers (each
  162  * complex number is the combination of a real and an imaginary part).
  163  *
  164  * We use a specific ordering which makes computations easier. Let rev()
  165  * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
  166  * store the real and imaginary parts of f(w_j) in slots:
  167  *
  168  *    Re(f(w_j)) -> slot rev(j)/2
  169  *    Im(f(w_j)) -> slot rev(j)/2+N/2
  170  *
  171  * (Note that rev(j) is even for j < N/2.)
  172  */
  173 
  174 /* see internal.h */
  175 void
  176 falcon_FFT(fpr *f, unsigned logn)
  177 {
  178         /*
  179          * FFT algorithm in bit-reversal order uses the following
  180          * iterative algorithm:
  181          *
  182          *   t = N
  183          *   for m = 1; m < N; m *= 2:
  184          *       ht = t/2
  185          *       for i1 = 0; i1 < m; i1 ++:
  186          *           j1 = i1 * t
  187          *           s = GM[m + i1]
  188          *           for j = j1; j < (j1 + ht); j ++:
  189          *               x = f[j]
  190          *               y = s * f[j + ht]
  191          *               f[j] = x + y
  192          *               f[j + ht] = x - y
  193          *       t = ht
  194          *
  195          * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
  196          *
  197          * In the description above, f[] is supposed to contain complex
  198          * numbers. In our in-memory representation, the real and
  199          * imaginary parts of f[k] are in array slots k and k+N/2.
  200          *
  201          * We only keep the first half of the complex numbers. We can
  202          * see that after the first iteration, the first and second halves
  203          * of the array of complex numbers have separate lives, so we
  204          * simply ignore the second part.
  205          */
  206 
  207         unsigned u;
  208         size_t t, n, hn, m;
  209 
  210         /*
  211          * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
  212          * (because GM[1] = w^rev(1) = w^(N/2) = i).
  213          * In our chosen representation, this is a no-op: everything is
  214          * already where it should be.
  215          */
  216 
  217         /*
  218          * Subsequent iterations are truncated to use only the first
  219          * half of values.
  220          */
  221         n = (size_t)1 << logn;
  222         hn = n >> 1;
  223         t = hn;
  224         for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
  225                 size_t ht, hm, i1, j1;
  226 
  227                 ht = t >> 1;
  228                 hm = m >> 1;
  229                 for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
  230                         unsigned j, j2;
  231                         fpr s_re, s_im;
  232 
  233                         s_re = fpr_gm_tab[((m + i1) << 1) + 0];
  234                         s_im = fpr_gm_tab[((m + i1) << 1) + 1];
  235                         j2 = j1 + ht;
  236                         for (j = j1; j < j2; j ++) {
  237                                 fpr x_re, x_im, y_re, y_im;
  238 
  239                                 x_re = f[j];
  240                                 x_im = f[j + hn];
  241                                 y_re = f[j + ht];
  242                                 y_im = f[j + ht + hn];
  243                                 FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
  244                                 FPC_ADD(f[j], f[j + hn],
  245                                         x_re, x_im, y_re, y_im);
  246                                 FPC_SUB(f[j + ht], f[j + ht + hn],
  247                                         x_re, x_im, y_re, y_im);
  248                         }
  249                 }
  250                 t = ht;
  251         }
  252 }
  253 
  254 /* see internal.h */
  255 void
  256 falcon_iFFT(fpr *f, unsigned logn)
  257 {
  258         /*
  259          * Inverse FFT algorithm in bit-reversal order uses the following
  260          * iterative algorithm:
  261          *
  262          *   t = 1
  263          *   for m = N; m > 1; m /= 2:
  264          *       hm = m/2
  265          *       dt = t*2
  266          *       for i1 = 0; i1 < hm; i1 ++:
  267          *           j1 = i1 * dt
  268          *           s = iGM[hm + i1]
  269          *           for j = j1; j < (j1 + t); j ++:
  270          *               x = f[j]
  271          *               y = f[j + t]
  272          *               f[j] = x + y
  273          *               f[j + t] = s * (x - y)
  274          *       t = dt
  275          *   for i1 = 0; i1 < N; i1 ++:
  276          *       f[i1] = f[i1] / N
  277          *
  278          * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
  279          * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
  280          *
  281          * In the main loop (not counting the final division loop), in
  282          * all iterations except the last, the first and second half of f[]
  283          * (as an array of complex numbers) are separate. In our chosen
  284          * representation, we do not keep the second half.
  285          *
  286          * The last iteration recombines the recomputed half with the
  287          * implicit half, and should yield only real numbers since the
  288          * target polynomial is real; moreover, s = i at that step.
  289          * Thus, when considering x and y:
  290          *    y = conj(x) since the final f[j] must be real
  291          *    Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
  292          *    filled with 2*Im(x).
  293          * But we already have Re(x) and Im(x) in array slots j and j+t
  294          * in our chosen representation. That last iteration is thus a
  295          * simple doubling of the values in all the array.
  296          *
  297          * We make the last iteration a no-op by tweaking the final
  298          * division into a division by N/2, not N.
  299          */
  300         size_t u, n, hn, t, m;
  301 
  302         n = (size_t)1 << logn;
  303         t = 1;
  304         m = n;
  305         hn = n >> 1;
  306         for (u = logn; u > 1; u --) {
  307                 size_t hm, dt, i1, j1;
  308 
  309                 hm = m >> 1;
  310                 dt = t << 1;
  311                 for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
  312                         size_t j, j2;
  313                         fpr s_re, s_im;
  314 
  315                         j2 = j1 + t;
  316                         s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
  317                         s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
  318                         for (j = j1; j < j2; j ++) {
  319                                 fpr x_re, x_im, y_re, y_im;
  320 
  321                                 x_re = f[j];
  322                                 x_im = f[j + hn];
  323                                 y_re = f[j + t];
  324                                 y_im = f[j + t + hn];
  325                                 FPC_ADD(f[j], f[j + hn],
  326                                         x_re, x_im, y_re, y_im);
  327                                 FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
  328                                 FPC_MUL(f[j + t], f[j + t + hn],
  329                                         x_re, x_im, s_re, s_im);
  330                         }
  331                 }
  332                 t = dt;
  333                 m = hm;
  334         }
  335 
  336         /*
  337          * Last iteration is a no-op, provided that we divide by N/2
  338          * instead of N. We need to make a special case for logn = 0.
  339          */
  340         if (logn > 0) {
  341                 fpr ni;
  342 
  343                 ni = fpr_scaled(2, -(int)logn);
  344                 for (u = 0; u < n; u ++) {
  345                         f[u] = fpr_mul(f[u], ni);
  346                 }
  347         }
  348 }
  349 
  350 /* see internal.h */
  351 void
  352 falcon_poly_add(fpr *restrict a, const fpr *restrict b, unsigned logn)
  353 {
  354         size_t n, u;
  355 
  356         n = (size_t)1 << logn;
  357         for (u = 0; u < n; u ++) {
  358                 a[u] = fpr_add(a[u], b[u]);
  359         }
  360 }
  361 
  362 /* see internal.h */
  363 void
  364 falcon_poly_addconst(fpr *a, fpr x, unsigned logn)
  365 {
  366         (void)logn;
  367         a[0] = fpr_add(a[0], x);
  368 }
  369 
  370 /* see internal.h */
  371 void
  372 falcon_poly_addconst_fft(fpr *a, fpr x, unsigned logn)
  373 {
  374         size_t hn, u;
  375 
  376         hn = (size_t)1 << (logn - 1);
  377         for (u = 0; u < hn; u ++) {
  378                 a[u] = fpr_add(a[u], x);
  379         }
  380 }
  381 
  382 /* see internal.h */
  383 void
  384 falcon_poly_sub(fpr *restrict a, const fpr *restrict b, unsigned logn)
  385 {
  386         size_t n, u;
  387 
  388         n = (size_t)1 << logn;
  389         for (u = 0; u < n; u ++) {
  390                 a[u] = fpr_sub(a[u], b[u]);
  391         }
  392 }
  393 
  394 /* see internal.h */
  395 void
  396 falcon_poly_neg(fpr *a, unsigned logn)
  397 {
  398         size_t n, u;
  399 
  400         n = (size_t)1 << logn;
  401         for (u = 0; u < n; u ++) {
  402                 a[u] = fpr_neg(a[u]);
  403         }
  404 }
  405 
  406 /* see internal.h */
  407 void
  408 falcon_poly_adj(fpr *a, unsigned logn)
  409 {
  410         size_t n, hn, u;
  411 
  412         n = (size_t)1 << logn;
  413         hn = n >> 1;
  414         for (u = 1; u < hn; u ++) {
  415                 fpr t;
  416 
  417                 t = fpr_neg(a[u]);
  418                 a[u] = fpr_neg(a[n - u]);
  419                 a[n - u] = t;
  420         }
  421         a[hn] = fpr_neg(a[hn]);
  422 }
  423 
  424 /* see internal.h */
  425 void
  426 falcon_poly_adj_fft(fpr *a, unsigned logn)
  427 {
  428         size_t n, u;
  429 
  430         n = (size_t)1 << logn;
  431         for (u = (n >> 1); u < n; u ++) {
  432                 a[u] = fpr_neg(a[u]);
  433         }
  434 }
  435 
  436 /* see internal.h */
  437 void
  438 falcon_poly_mul_fft(fpr *restrict a, const fpr *restrict b, unsigned logn)
  439 {
  440         size_t n, hn, u;
  441 
  442         n = (size_t)1 << logn;
  443         hn = n >> 1;
  444         for (u = 0; u < hn; u ++) {
  445                 fpr a_re, a_im, b_re, b_im;
  446 
  447                 a_re = a[u];
  448                 a_im = a[u + hn];
  449                 b_re = b[u];
  450                 b_im = b[u + hn];
  451                 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  452         }
  453 }
  454 
  455 /* see internal.h */
  456 void
  457 falcon_poly_sqr_fft(fpr *a, unsigned logn)
  458 {
  459         size_t n, hn, u;
  460 
  461         n = (size_t)1 << logn;
  462         hn = n >> 1;
  463         for (u = 0; u < hn; u ++) {
  464                 fpr a_re, a_im;
  465 
  466                 a_re = a[u];
  467                 a_im = a[u + hn];
  468                 FPC_SQR(a[u], a[u + hn], a_re, a_im);
  469         }
  470 }
  471 
  472 /* see internal.h */
  473 void
  474 falcon_poly_muladj_fft(fpr *restrict a, const fpr *restrict b, unsigned logn)
  475 {
  476         size_t n, hn, u;
  477 
  478         n = (size_t)1 << logn;
  479         hn = n >> 1;
  480         for (u = 0; u < hn; u ++) {
  481                 fpr a_re, a_im, b_re, b_im;
  482 
  483                 a_re = a[u];
  484                 a_im = a[u + hn];
  485                 b_re = b[u];
  486                 b_im = fpr_neg(b[u + hn]);
  487                 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  488         }
  489 }
  490 
  491 /* see internal.h */
  492 void
  493 falcon_poly_mulselfadj_fft(fpr *a, unsigned logn)
  494 {
  495         /*
  496          * Since each coefficient is multiplied with its own conjugate,
  497          * the result contains only real values.
  498          */
  499         size_t n, hn, u;
  500 
  501         n = (size_t)1 << logn;
  502         hn = n >> 1;
  503         for (u = 0; u < hn; u ++) {
  504                 fpr a_re, a_im;
  505 
  506                 a_re = a[u];
  507                 a_im = a[u + hn];
  508                 a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
  509                 a[u + hn] = fpr_of(0);
  510         }
  511 }
  512 
  513 /* see internal.h */
  514 void
  515 falcon_poly_mulconst(fpr *a, fpr x, unsigned logn)
  516 {
  517         size_t n, u;
  518 
  519         n = (size_t)1 << logn;
  520         for (u = 0; u < n; u ++) {
  521                 a[u] = fpr_mul(a[u], x);
  522         }
  523 }
  524 
  525 /* see internal.h */
  526 void
  527 falcon_poly_inv_fft(fpr *a, unsigned logn)
  528 {
  529         size_t n, hn, u;
  530 
  531         n = (size_t)1 << logn;
  532         hn = n >> 1;
  533         for (u = 0; u < hn; u ++) {
  534                 fpr a_re, a_im;
  535 
  536                 a_re = a[u];
  537                 a_im = a[u + hn];
  538                 FPC_INV(a[u], a[u + hn], a_re, a_im);
  539         }
  540 }
  541 
  542 /* see internal.h */
  543 void
  544 falcon_poly_div_fft(fpr *restrict a, const fpr *restrict b, unsigned logn)
  545 {
  546         size_t n, hn, u;
  547 
  548         n = (size_t)1 << logn;
  549         hn = n >> 1;
  550         for (u = 0; u < hn; u ++) {
  551                 fpr a_re, a_im, b_re, b_im;
  552 
  553                 a_re = a[u];
  554                 a_im = a[u + hn];
  555                 b_re = b[u];
  556                 b_im = b[u + hn];
  557                 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  558         }
  559 }
  560 
  561 /* see internal.h */
  562 void
  563 falcon_poly_divadj_fft(fpr *restrict a, const fpr *restrict b, unsigned logn)
  564 {
  565         size_t n, hn, u;
  566 
  567         n = (size_t)1 << logn;
  568         hn = n >> 1;
  569         for (u = 0; u < hn; u ++) {
  570                 fpr a_re, a_im, b_re, b_im;
  571 
  572                 a_re = a[u];
  573                 a_im = a[u + hn];
  574                 b_re = b[u];
  575                 b_im = fpr_neg(b[u + hn]);
  576                 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  577         }
  578 }
  579 
  580 /* see internal.h */
  581 void
  582 falcon_poly_invnorm2_fft(fpr *restrict d,
  583         const fpr *restrict a, const fpr *restrict b, unsigned logn)
  584 {
  585         size_t n, hn, u;
  586 
  587         n = (size_t)1 << logn;
  588         hn = n >> 1;
  589         for (u = 0; u < hn; u ++) {
  590                 fpr a_re, a_im;
  591                 fpr b_re, b_im;
  592 
  593                 a_re = a[u];
  594                 a_im = a[u + hn];
  595                 b_re = b[u];
  596                 b_im = b[u + hn];
  597                 d[u] = fpr_inv(fpr_add(
  598                         fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
  599                         fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
  600         }
  601 }
  602 
  603 /* see internal.h */
  604 void
  605 falcon_poly_add_muladj_fft(fpr *restrict d,
  606         const fpr *restrict F, const fpr *restrict G,
  607         const fpr *restrict f, const fpr *restrict g, unsigned logn)
  608 {
  609         size_t n, hn, u;
  610 
  611         n = (size_t)1 << logn;
  612         hn = n >> 1;
  613         for (u = 0; u < hn; u ++) {
  614                 fpr F_re, F_im, G_re, G_im;
  615                 fpr f_re, f_im, g_re, g_im;
  616                 fpr a_re, a_im, b_re, b_im;
  617 
  618                 F_re = F[u];
  619                 F_im = F[u + hn];
  620                 G_re = G[u];
  621                 G_im = G[u + hn];
  622                 f_re = f[u];
  623                 f_im = f[u + hn];
  624                 g_re = g[u];
  625                 g_im = g[u + hn];
  626 
  627                 FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
  628                 FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
  629                 d[u] = fpr_add(a_re, b_re);
  630                 d[u + hn] = fpr_add(a_im, b_im);
  631         }
  632 }
  633 
  634 /* see internal.h */
  635 void
  636 falcon_poly_mul_autoadj_fft(fpr *restrict a,
  637         const fpr *restrict b, unsigned logn)
  638 {
  639         size_t n, hn, u;
  640 
  641         n = (size_t)1 << logn;
  642         hn = n >> 1;
  643         for (u = 0; u < hn; u ++) {
  644                 a[u] = fpr_mul(a[u], b[u]);
  645                 a[u + hn] = fpr_mul(a[u + hn], b[u]);
  646         }
  647 }
  648 
  649 /* see internal.h */
  650 void
  651 falcon_poly_div_autoadj_fft(fpr *restrict a,
  652         const fpr *restrict b, unsigned logn)
  653 {
  654         size_t n, hn, u;
  655 
  656         n = (size_t)1 << logn;
  657         hn = n >> 1;
  658         for (u = 0; u < hn; u ++) {
  659                 a[u] = fpr_div(a[u], b[u]);
  660                 a[u + hn] = fpr_div(a[u + hn], b[u]);
  661         }
  662 }
  663 
  664 /* see internal.h */
  665 void
  666 falcon_poly_split_fft(fpr *restrict f0, fpr *restrict f1,
  667         const fpr *restrict f, unsigned logn)
  668 {
  669         /*
  670          * The FFT representation we use is in bit-reversed order
  671          * (element i contains f(w^(rev(i))), where rev() is the
  672          * bit-reversal function over the ring degree. This changes
  673          * indexes with regards to the Falcon specification.
  674          */
  675         size_t n, hn, qn, u;
  676 
  677         n = (size_t)1 << logn;
  678         hn = n >> 1;
  679         qn = hn >> 1;
  680 
  681         /*
  682          * We process complex values by pairs. For logn = 1, there is only
  683          * one complex value (the other one is the implicit conjugate),
  684          * so we add the two lines below because the loop will be
  685          * skipped.
  686          */
  687         f0[0] = f[0];
  688         f1[0] = f[hn];
  689 
  690         for (u = 0; u < qn; u ++) {
  691                 fpr a_re, a_im, b_re, b_im;
  692                 fpr t_re, t_im;
  693 
  694                 a_re = f[(u << 1) + 0];
  695                 a_im = f[(u << 1) + 0 + hn];
  696                 b_re = f[(u << 1) + 1];
  697                 b_im = f[(u << 1) + 1 + hn];
  698 
  699                 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
  700                 f0[u] = fpr_half(t_re);
  701                 f0[u + qn] = fpr_half(t_im);
  702 
  703                 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
  704                 FPC_MUL(t_re, t_im, t_re, t_im,
  705                         fpr_gm_tab[((u + hn) << 1) + 0],
  706                         fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
  707                 f1[u] = fpr_half(t_re);
  708                 f1[u + qn] = fpr_half(t_im);
  709         }
  710 }
  711 
  712 /* see internal.h */
  713 void
  714 falcon_poly_merge_fft(fpr *restrict f,
  715         const fpr *restrict f0, const fpr *restrict f1, unsigned logn)
  716 {
  717         size_t n, hn, qn, u;
  718 
  719         n = (size_t)1 << logn;
  720         hn = n >> 1;
  721         qn = hn >> 1;
  722 
  723         /*
  724          * An extra copy to handle the special case logn = 1.
  725          */
  726         f[0] = f0[0];
  727         f[hn] = f1[0];
  728 
  729         for (u = 0; u < qn; u ++) {
  730                 fpr a_re, a_im, b_re, b_im;
  731                 fpr t_re, t_im;
  732 
  733                 a_re = f0[u];
  734                 a_im = f0[u + qn];
  735                 FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
  736                         fpr_gm_tab[((u + hn) << 1) + 0],
  737                         fpr_gm_tab[((u + hn) << 1) + 1]);
  738                 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
  739                 f[(u << 1) + 0] = t_re;
  740                 f[(u << 1) + 0 + hn] = t_im;
  741                 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
  742                 f[(u << 1) + 1] = t_re;
  743                 f[(u << 1) + 1 + hn] = t_im;
  744         }
  745 }
  746 
  747 /* ==================================================================== */
  748 /*
  749  * Here begins code for FFT3. Modulus is X^N-X^(N/2)+1.
  750  *
  751  * (X^N-X^(N/2)+1)*(X^(N/2)+1) = X^(1.5*N)+1. The roots of X^N-X^(N/2)+1
  752  * are thus the roots of X^(1.5*N)+1 which are not roots of X^(N/2)+1.
  753  */
  754 
  755 #define MKN(logn, full)   ((size_t)(1 + ((full) << 1)) << ((logn) - (full)))
  756 
  757 /* see internal.h */
  758 void
  759 falcon_FFT3(fpr *a, unsigned logn, unsigned full)
  760 {
  761         size_t n, hn, u, t, tmin, m;
  762 
  763         n = MKN(logn, full);
  764         hn = n >> 1;
  765 
  766         /*
  767          * First pass, for sub-polynomials of degree 2, each modulo
  768          * X^2-X+1. If w = exp(i*pi/3), then the roots are w and w^5.
  769          * We keep only a(w) = a0 + a1*w.
  770          */
  771         for (u = 0; u < hn; u ++) {
  772                 fpr a0, a1;
  773 
  774                 a0 = a[u];
  775                 a1 = a[u + hn];
  776                 a[u] = fpr_add(a0, fpr_mul(a1, fpr_W1R));
  777                 a[u + hn] = fpr_mul(a1, fpr_W1I);
  778         }
  779 
  780         /*
  781          * Intermediate steps for doubling the degree.
  782          */
  783         t = hn;
  784         tmin = 1 + (full << 1);
  785         for (m = 2; t > tmin; m <<= 1) {
  786                 size_t ht, hm, u1, v1;
  787 
  788                 ht = t >> 1;
  789                 hm = m >> 1;
  790                 for (u1 = 0, v1 = 0; u1 < hm; u1 ++, v1 += t) {
  791                         size_t v, v2;
  792                         fpr sr, si;
  793 
  794                         sr = fpr_gm3_square[((m + u1) << 1) + 0];
  795                         si = fpr_gm3_square[((m + u1) << 1) + 1];
  796                         v2 = v1 + ht;
  797                         for (v = v1; v < v2; v ++) {
  798                                 fpr a0r, a0i, a1r, a1i;
  799 
  800                                 a0r = a[v];
  801                                 a0i = a[v + hn];
  802                                 a1r = a[v + ht];
  803                                 a1i = a[v + ht + hn];
  804                                 FPC_MUL(a1r, a1i, a1r, a1i, sr, si);
  805                                 FPC_ADD(a[v], a[v + hn],
  806                                         a0r, a0i, a1r, a1i);
  807                                 FPC_SUB(a[v + ht], a[v + ht + hn],
  808                                         a0r, a0i, a1r, a1i);
  809                         }
  810                 }
  811                 t = ht;
  812         }
  813 
  814         /*
  815          * Last step: degree tripling (only if degree is multiple of 3).
  816          */
  817         if (full) {
  818                 size_t v;
  819 
  820                 for (u = 0, v = (size_t)1 << logn; u < hn; u += 3, v += 2) {
  821                         fpr fAr, fAi, fBr, fBi, fCr, fCi;
  822                         fpr xr, xi;
  823                         fpr fB0r, fB0i, fB1r, fB1i, fB2r, fB2i;
  824                         fpr fC0r, fC0i, fC1r, fC1i, fC2r, fC2i;
  825 
  826                         fAr = a[u];
  827                         fAi = a[u + hn];
  828                         fBr = a[u + 1];
  829                         fBi = a[u + 1 + hn];
  830                         fCr = a[u + 2];
  831                         fCi = a[u + 2 + hn];
  832 
  833                         xr = fpr_gm3_cubic[v + 0];
  834                         xi = fpr_gm3_cubic[v + 1];
  835                         FPC_MUL(fB0r, fB0i, fBr, fBi, xr, xi);
  836                         FPC_MUL(fB1r, fB1i, fB0r, fB0i, fpr_W2R, fpr_W2I);
  837                         FPC_MUL(fB2r, fB2i, fB0r, fB0i, fpr_W4R, fpr_W4I);
  838                         FPC_SQR(xr, xi, xr, xi);
  839                         FPC_MUL(fC0r, fC0i, fCr, fCi, xr, xi);
  840                         FPC_MUL(fC1r, fC1i, fC0r, fC0i, fpr_W2R, fpr_W2I);
  841                         FPC_MUL(fC2r, fC2i, fC0r, fC0i, fpr_W4R, fpr_W4I);
  842                         FPC_ADD(fB0r, fB0i, fB0r, fB0i, fC0r, fC0i);
  843                         FPC_ADD(fB1r, fB1i, fB1r, fB1i, fC2r, fC2i);
  844                         FPC_ADD(fB2r, fB2i, fB2r, fB2i, fC1r, fC1i);
  845                         FPC_ADD(a[u + 0], a[u + 0 + hn], fAr, fAi, fB0r, fB0i);
  846                         FPC_ADD(a[u + 1], a[u + 1 + hn], fAr, fAi, fB1r, fB1i);
  847                         FPC_ADD(a[u + 2], a[u + 2 + hn], fAr, fAi, fB2r, fB2i);
  848                 }
  849         }
  850 }
  851 
  852 /* see internal.h */
  853 void
  854 falcon_iFFT3(fpr *a, unsigned logn, unsigned full)
  855 {
  856         size_t n, hn, u, t, m;
  857         fpr ni;
  858 
  859         n = MKN(logn, full);
  860         hn = n >> 1;
  861 
  862         /*
  863          * First step: divide degree by 3.
  864          */
  865         if (full) {
  866                 size_t v;
  867 
  868                 for (u = 0, v = (size_t)1 << logn; u < hn; u += 3, v += 2) {
  869                         fpr f0r, f0i, f1r, f1i, f2r, f2i, xr, xi;
  870                         fpr f11r, f11i, f12r, f12i, f21r, f21i, f22r, f22i;
  871 
  872                         f0r = a[u];
  873                         f0i = a[u + hn];
  874                         f1r = a[u + 1];
  875                         f1i = a[u + 1 + hn];
  876                         f2r = a[u + 2];
  877                         f2i = a[u + 2 + hn];
  878 
  879                         xr = fpr_gm3_cubic[v + 0];
  880                         xi = fpr_neg(fpr_gm3_cubic[v + 1]);
  881                         FPC_MUL(f11r, f11i, f1r, f1i, fpr_W4R, fpr_W4I);
  882                         FPC_MUL(f12r, f12i, f1r, f1i, fpr_W2R, fpr_W2I);
  883                         FPC_MUL(f21r, f21i, f2r, f2i, fpr_W4R, fpr_W4I);
  884                         FPC_MUL(f22r, f22i, f2r, f2i, fpr_W2R, fpr_W2I);
  885 
  886                         FPC_ADD(f1r, f1i, f1r, f1i, f2r, f2i);
  887                         FPC_ADD(a[u], a[u + hn], f0r, f0i, f1r, f1i);
  888 
  889                         FPC_ADD(f11r, f11i, f11r, f11i, f22r, f22i);
  890                         FPC_ADD(f11r, f11i, f11r, f11i, f0r, f0i);
  891                         FPC_MUL(a[u + 1], a[u + 1 + hn], xr, xi, f11r, f11i);
  892 
  893                         FPC_SQR(xr, xi, xr, xi);
  894                         FPC_ADD(f12r, f12i, f12r, f12i, f21r, f21i);
  895                         FPC_ADD(f12r, f12i, f12r, f12i, f0r, f0i);
  896                         FPC_MUL(a[u + 2], a[u + 2 + hn], xr, xi, f12r, f12i);
  897                 }
  898         }
  899 
  900         /*
  901          * Intermediate steps for halving the degree.
  902          */
  903         t = 2 + (full << 2);
  904         for (m = (size_t)1 << (logn - 1 - full); t < n; m >>= 1) {
  905                 size_t ht, hm, u1, v1;
  906 
  907                 ht = t >> 1;
  908                 hm = m >> 1;
  909                 for (u1 = 0, v1 = 0; u1 < hm; u1 ++, v1 += t) {
  910                         size_t v, v2;
  911                         fpr sr, si;
  912 
  913                         sr = fpr_gm3_square[((m + u1) << 1) + 0];
  914                         si = fpr_neg(fpr_gm3_square[((m + u1) << 1) + 1]);
  915                         v2 = v1 + ht;
  916                         for (v = v1; v < v2; v ++) {
  917                                 fpr a0r, a0i, a1r, a1i;
  918 
  919                                 a0r = a[v];
  920                                 a0i = a[v + hn];
  921                                 a1r = a[v + ht];
  922                                 a1i = a[v + ht + hn];
  923                                 FPC_ADD(a[v], a[v + hn],
  924                                         a0r, a0i, a1r, a1i);
  925                                 FPC_SUB(a0r, a0i, a0r, a0i, a1r, a1i);
  926                                 FPC_MUL(a[v + ht], a[v + ht + hn],
  927                                         a0r, a0i, sr, si);
  928                         }
  929                 }
  930                 t <<= 1;
  931         }
  932 
  933         /*
  934          * Last step: modulo X^2-X+1.
  935          *
  936          * For w = exp(i*pi/3), roots of X^2-X+1 are w and w^5. Since
  937          * w^3 = -1, we have w^5 = -w^2 = -w + 1.
  938          *
  939          * The FFT computed a(w) = a0 + a1*w. Since a0 and a1 are
  940          * real, we can recompute a0 and a1:
  941          *
  942          *   a1 = Im(a(w)) / Im(w)
  943          *   a0 = Re(a(w)) - Re(w) * a1
  944          *
  945          * Note that Re(w) = 1/2.
  946          */
  947         for (u = 0; u < hn; u ++) {
  948                 fpr xr, xi, a0, a1;
  949 
  950                 xr = a[u];
  951                 xi = a[u + hn];
  952                 a1 = fpr_mul(xi, fpr_IW1I);
  953                 a0 = fpr_sub(xr, fpr_half(a1));
  954                 a[u] = a0;
  955                 a[u + hn] = a1;
  956         }
  957 
  958         /*
  959          * We have an accumulated N/2 multiplier to remove.
  960          */
  961         ni = fpr_inverse_of(n >> 1);
  962         for (u = 0; u < n; u ++) {
  963                 a[u] = fpr_mul(ni, a[u]);
  964         }
  965 }
  966 
  967 /* see internal.h */
  968 void
  969 falcon_poly_add3(fpr *restrict a, const fpr *restrict b,
  970         unsigned logn, unsigned full)
  971 {
  972         size_t n, u;
  973 
  974         n = MKN(logn, full);
  975         for (u = 0; u < n; u ++) {
  976                 a[u] = fpr_add(a[u], b[u]);
  977         }
  978 }
  979 
  980 /* see internal.h */
  981 void
  982 falcon_poly_addconst3(fpr *restrict a, fpr x, unsigned logn, unsigned full)
  983 {
  984         (void)logn;
  985         (void)full;
  986         a[0] = fpr_add(a[0], x);
  987 }
  988 
  989 /* see internal.h */
  990 void
  991 falcon_poly_addconst_fft3(fpr *restrict a, fpr x, unsigned logn, unsigned full)
  992 {
  993         size_t n, u;
  994 
  995         n = MKN(logn, full);
  996         for (u = 0; u < n; u ++) {
  997                 a[u] = fpr_add(a[u], x);
  998         }
  999 }
 1000 
 1001 /* see internal.h */
 1002 void
 1003 falcon_poly_sub3(fpr *restrict a, const fpr *restrict b,
 1004         unsigned logn, unsigned full)
 1005 {
 1006         size_t n, u;
 1007 
 1008         n = MKN(logn, full);
 1009         for (u = 0; u < n; u ++) {
 1010                 a[u] = fpr_sub(a[u], b[u]);
 1011         }
 1012 }
 1013 
 1014 /* see internal.h */
 1015 void
 1016 falcon_poly_neg3(fpr *restrict a, unsigned logn, unsigned full)
 1017 {
 1018         size_t n, u;
 1019 
 1020         n = MKN(logn, full);
 1021         for (u = 0; u < n; u ++) {
 1022                 a[u] = fpr_neg(a[u]);
 1023         }
 1024 }
 1025 
 1026 /* see internal.h */
 1027 void
 1028 falcon_poly_adj_fft3(fpr *a, unsigned logn, unsigned full)
 1029 {
 1030         size_t n, hn, u;
 1031 
 1032         n = MKN(logn, full);
 1033         hn = n >> 1;
 1034         for (u = hn; u < n; u ++) {
 1035                 a[u] = fpr_neg(a[u]);
 1036         }
 1037 }
 1038 
 1039 /* see internal.h */
 1040 void
 1041 falcon_poly_mul_fft3(fpr *restrict a, const fpr *restrict b,
 1042         unsigned logn, unsigned full)
 1043 {
 1044         size_t n, hn, u;
 1045 
 1046         n = MKN(logn, full);
 1047         hn = n >> 1;
 1048         for (u = 0; u < hn; u ++) {
 1049                 fpr a_re, a_im, b_re, b_im;
 1050 
 1051                 a_re = a[u];
 1052                 a_im = a[u + hn];
 1053                 b_re = b[u];
 1054                 b_im = b[u + hn];
 1055                 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
 1056         }
 1057 }
 1058 
 1059 /* see internal.h */
 1060 void
 1061 falcon_poly_sqr_fft3(fpr *a, unsigned logn, unsigned full)
 1062 {
 1063         size_t n, hn, u;
 1064 
 1065         n = MKN(logn, full);
 1066         hn = n >> 1;
 1067         for (u = 0; u < hn; u ++) {
 1068                 fpr a_re, a_im;
 1069 
 1070                 a_re = a[u];
 1071                 a_im = a[u + hn];
 1072                 FPC_SQR(a[u], a[u + hn], a_re, a_im);
 1073         }
 1074 }
 1075 
 1076 /* see internal.h */
 1077 void
 1078 falcon_poly_muladj_fft3(fpr *restrict a, const fpr *restrict b,
 1079         unsigned logn, unsigned full)
 1080 {
 1081         size_t n, hn, u;
 1082 
 1083         n = MKN(logn, full);
 1084         hn = n >> 1;
 1085         for (u = 0; u < hn; u ++) {
 1086                 fpr a_re, a_im, b_re, b_im;
 1087 
 1088                 a_re = a[u];
 1089                 a_im = a[u + hn];
 1090                 b_re = b[u];
 1091                 b_im = fpr_neg(b[u + hn]);
 1092                 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
 1093         }
 1094 }
 1095 
 1096 /* see internal.h */
 1097 void
 1098 falcon_poly_mulselfadj_fft3(fpr *a, unsigned logn, unsigned full)
 1099 {
 1100         /*
 1101          * Since each coefficient is multiplied with its own conjugate,
 1102          * the result contains only real values.
 1103          */
 1104         size_t n, hn, u;
 1105 
 1106         n = MKN(logn, full);
 1107         hn = n >> 1;
 1108         for (u = 0; u < hn; u ++) {
 1109                 fpr a_re, a_im;
 1110 
 1111                 a_re = a[u];
 1112                 a_im = a[u + hn];
 1113                 a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
 1114                 a[u + hn] = fpr_of(0);
 1115         }
 1116 }
 1117 
 1118 /* see internal.h */
 1119 void
 1120 falcon_poly_mulconst3(fpr *a, fpr x, unsigned logn, unsigned full)
 1121 {
 1122         size_t n, u;
 1123 
 1124         n = MKN(logn, full);
 1125         for (u = 0; u < n; u ++) {
 1126                 a[u] = fpr_mul(a[u], x);
 1127         }
 1128 }
 1129 
 1130 /* see internal.h */
 1131 void
 1132 falcon_poly_inv_fft3(fpr *a, unsigned logn, unsigned full)
 1133 {
 1134         size_t n, hn, u;
 1135 
 1136         n = MKN(logn, full);
 1137         hn = n >> 1;
 1138         for (u = 0; u < hn; u ++) {
 1139                 fpr a_re, a_im;
 1140 
 1141                 a_re = a[u];
 1142                 a_im = a[u + hn];
 1143                 FPC_INV(a[u], a[u + hn], a_re, a_im);
 1144         }
 1145 }
 1146 
 1147 /* see internal.h */
 1148 void
 1149 falcon_poly_div_fft3(fpr *restrict a, const fpr *restrict b,
 1150         unsigned logn, unsigned full)
 1151 {
 1152         size_t n, hn, u;
 1153 
 1154         n = MKN(logn, full);
 1155         hn = n >> 1;
 1156         for (u = 0; u < hn; u ++) {
 1157                 fpr a_re, a_im, b_re, b_im;
 1158 
 1159                 a_re = a[u];
 1160                 a_im = a[u + hn];
 1161                 b_re = b[u];
 1162                 b_im = b[u + hn];
 1163                 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
 1164         }
 1165 }
 1166 
 1167 /* see internal.h */
 1168 void
 1169 falcon_poly_divadj_fft3(fpr *restrict a, const fpr *restrict b,
 1170         unsigned logn, unsigned full)
 1171 {
 1172         size_t n, hn, u;
 1173 
 1174         n = MKN(logn, full);
 1175         hn = n >> 1;
 1176         for (u = 0; u < hn; u ++) {
 1177                 fpr a_re, a_im, b_re, b_im;
 1178 
 1179                 a_re = a[u];
 1180                 a_im = a[u + hn];
 1181                 b_re = b[u];
 1182                 b_im = fpr_neg(b[u + hn]);
 1183                 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
 1184         }
 1185 }
 1186 
 1187 /* see internal.h */
 1188 void
 1189 falcon_poly_invnorm2_fft3(fpr *restrict d,
 1190         const fpr *restrict a, const fpr *restrict b,
 1191         unsigned logn, unsigned full)
 1192 {
 1193         size_t n, hn, u;
 1194 
 1195         n = MKN(logn, full);
 1196         hn = n >> 1;
 1197         for (u = 0; u < hn; u ++) {
 1198                 fpr a_re, a_im;
 1199                 fpr b_re, b_im;
 1200 
 1201                 a_re = a[u];
 1202                 a_im = a[u + hn];
 1203                 b_re = b[u];
 1204                 b_im = b[u + hn];
 1205                 d[u] = fpr_inv(fpr_add(
 1206                         fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
 1207                         fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
 1208         }
 1209 }
 1210 
 1211 /* see internal.h */
 1212 void
 1213 falcon_poly_add_muladj_fft3(fpr *restrict d,
 1214         const fpr *restrict F, const fpr *restrict G,
 1215         const fpr *restrict f, const fpr *restrict g,
 1216         unsigned logn, unsigned full)
 1217 {
 1218         size_t n, hn, u;
 1219 
 1220         n = MKN(logn, full);
 1221         hn = n >> 1;
 1222         for (u = 0; u < hn; u ++) {
 1223                 fpr F_re, F_im, G_re, G_im;
 1224                 fpr f_re, f_im, g_re, g_im;
 1225                 fpr a_re, a_im, b_re, b_im;
 1226 
 1227                 F_re = F[u];
 1228                 F_im = F[u + hn];
 1229                 G_re = G[u];
 1230                 G_im = G[u + hn];
 1231                 f_re = f[u];
 1232                 f_im = f[u + hn];
 1233                 g_re = g[u];
 1234                 g_im = g[u + hn];
 1235 
 1236                 FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
 1237                 FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
 1238                 d[u] = fpr_add(a_re, b_re);
 1239                 d[u + hn] = fpr_add(a_im, b_im);
 1240         }
 1241 }
 1242 
 1243 /* see internal.h */
 1244 void
 1245 falcon_poly_mul_autoadj_fft3(fpr *restrict a,
 1246         const fpr *restrict b, unsigned logn, unsigned full)
 1247 {
 1248         size_t n, hn, u;
 1249 
 1250         n = MKN(logn, full);
 1251         hn = n >> 1;
 1252         for (u = 0; u < hn; u ++) {
 1253                 a[u] = fpr_mul(a[u], b[u]);
 1254                 a[u + hn] = fpr_mul(a[u + hn], b[u]);
 1255         }
 1256 }
 1257 
 1258 /* see internal.h */
 1259 void
 1260 falcon_poly_div_autoadj_fft3(fpr *restrict a,
 1261         const fpr *restrict b, unsigned logn, unsigned full)
 1262 {
 1263         size_t n, hn, u;
 1264 
 1265         n = MKN(logn, full);
 1266         hn = n >> 1;
 1267         for (u = 0; u < hn; u ++) {
 1268                 a[u] = fpr_div(a[u], b[u]);
 1269                 a[u + hn] = fpr_div(a[u + hn], b[u]);
 1270         }
 1271 }
 1272 
 1273 /* see internal.h */
 1274 void
 1275 falcon_poly_split_top_fft3(
 1276         fpr *restrict f0, fpr *restrict f1, fpr *restrict f2,
 1277         const fpr *restrict f, unsigned logn)
 1278 {
 1279         size_t n, hn, qn, u, v;
 1280 
 1281         n = (size_t)3 << (logn - 1);
 1282         hn = n >> 1;
 1283         qn = (size_t)1 << (logn - 2);
 1284         for (u = 0, v = 0; u < hn; u += 3, v ++) {
 1285                 fpr fAr, fAi, fBr, fBi, fCr, fCi, xr, xi;
 1286                 fpr fB1r, fB1i, fB2r, fB2i, fC1r, fC1i, fC2r, fC2i;
 1287                 fpr t0r, t0i, t1r, t1i, t2r, t2i;
 1288 
 1289                 fAr = f[u];
 1290                 fAi = f[u + hn];
 1291                 fBr = f[u + 1];
 1292                 fBi = f[u + 1 + hn];
 1293                 fCr = f[u + 2];
 1294                 fCi = f[u + 2 + hn];
 1295 
 1296                 xr = fpr_gm3_cubic[(v << 1) + ((size_t)1 << logn) + 0];
 1297                 xi = fpr_neg(fpr_gm3_cubic[(v << 1) + ((size_t)1 << logn) + 1]);
 1298                 FPC_MUL(fB1r, fB1i, fBr, fBi, fpr_W4R, fpr_W4I);
 1299                 FPC_MUL(fB2r, fB2i, fBr, fBi, fpr_W2R, fpr_W2I);
 1300                 FPC_MUL(fC1r, fC1i, fCr, fCi, fpr_W4R, fpr_W4I);
 1301                 FPC_MUL(fC2r, fC2i, fCr, fCi, fpr_W2R, fpr_W2I);
 1302 
 1303                 FPC_ADD(fBr, fBi, fBr, fBi, fCr, fCi);
 1304                 FPC_ADD(t0r, t0i, fAr, fAi, fBr, fBi);
 1305 
 1306                 FPC_ADD(fB1r, fB1i, fB1r, fB1i, fC2r, fC2i);
 1307                 FPC_ADD(fB1r, fB1i, fB1r, fB1i, fAr, fAi);
 1308                 FPC_MUL(t1r, t1i, xr, xi, fB1r, fB1i);
 1309 
 1310                 FPC_SQR(xr, xi, xr, xi);
 1311                 FPC_ADD(fB2r, fB2i, fB2r, fB2i, fC1r, fC1i);
 1312                 FPC_ADD(fB2r, fB2i, fB2r, fB2i, fAr, fAi);
 1313                 FPC_MUL(t2r, t2i, xr, xi, fB2r, fB2i);
 1314 
 1315                 f0[v] = fpr_mul(t0r, fpr_inverse_of(3));
 1316                 f0[v + qn] = fpr_mul(t0i, fpr_inverse_of(3));
 1317                 f1[v] = fpr_mul(t1r, fpr_inverse_of(3));
 1318                 f1[v + qn] = fpr_mul(t1i, fpr_inverse_of(3));
 1319                 f2[v] = fpr_mul(t2r, fpr_inverse_of(3));
 1320                 f2[v + qn] = fpr_mul(t2i, fpr_inverse_of(3));
 1321         }
 1322 }
 1323 
 1324 /* see internal.h */
 1325 void
 1326 falcon_poly_split_deep_fft3(fpr *restrict f0, fpr *restrict f1,
 1327         const fpr *restrict f, unsigned logn)
 1328 {
 1329         size_t n, hn, qn, u, m;
 1330 
 1331         /*
 1332          * Special code for n = 2.
 1333          */
 1334         if (logn == 1) {
 1335                 fpr re, im, xx;
 1336 
 1337                 re = f[0];
 1338                 im = f[1];
 1339                 xx = fpr_mul(fpr_IW1I, im);
 1340                 *f1 = xx;
 1341                 *f0 = fpr_sub(re, fpr_half(xx));
 1342                 return;
 1343         }
 1344 
 1345         n = (size_t)1 << logn;
 1346         hn = n >> 1;
 1347         qn = hn >> 1;
 1348 
 1349         m = (size_t)1 << (logn - 1);
 1350         for (u = 0; u < qn; u ++) {
 1351                 fpr a_re, a_im, b_re, b_im;
 1352                 fpr t_re, t_im;
 1353 
 1354                 a_re = f[(u << 1) + 0];
 1355                 a_im = f[(u << 1) + 0 + hn];
 1356                 b_re = f[(u << 1) + 1];
 1357                 b_im = f[(u << 1) + 1 + hn];
 1358 
 1359                 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
 1360                 f0[u] = fpr_half(t_re);
 1361                 f0[u + qn] = fpr_half(t_im);
 1362 
 1363                 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
 1364                 FPC_MUL(t_re, t_im, t_re, t_im,
 1365                         fpr_gm3_square[((u + m) << 1) + 0],
 1366                         fpr_neg(fpr_gm3_square[((u + m) << 1) + 1]));
 1367                 f1[u] = fpr_half(t_re);
 1368                 f1[u + qn] = fpr_half(t_im);
 1369         }
 1370 }
 1371 
 1372 /* see internal.h */
 1373 void
 1374 falcon_poly_merge_top_fft3(fpr *restrict f,
 1375         const fpr *restrict f0, const fpr *restrict f1, const fpr *restrict f2,
 1376         unsigned logn)
 1377 {
 1378         size_t n, hn, qn, u, v;
 1379 
 1380         n = (size_t)3 << (logn - 1);
 1381         hn = n >> 1;
 1382         qn = (size_t)1 << (logn - 2);
 1383         for (u = 0, v = 0; u < hn; u += 3, v ++) {
 1384                 fpr fAr, fAi, fBr, fBi, fCr, fCi;
 1385                 fpr xr, xi;
 1386                 fpr fB0r, fB0i, fB1r, fB1i, fB2r, fB2i;
 1387                 fpr fC0r, fC0i, fC1r, fC1i, fC2r, fC2i;
 1388 
 1389                 fAr = f0[v];
 1390                 fAi = f0[v + qn];
 1391                 fBr = f1[v];
 1392                 fBi = f1[v + qn];
 1393                 fCr = f2[v];
 1394                 fCi = f2[v + qn];
 1395 
 1396                 xr = fpr_gm3_cubic[(v << 1) + ((size_t)1 << logn) + 0];
 1397                 xi = fpr_gm3_cubic[(v << 1) + ((size_t)1 << logn) + 1];
 1398                 FPC_MUL(fB0r, fB0i, fBr, fBi, xr, xi);
 1399                 FPC_MUL(fB1r, fB1i, fB0r, fB0i, fpr_W2R, fpr_W2I);
 1400                 FPC_MUL(fB2r, fB2i, fB0r, fB0i, fpr_W4R, fpr_W4I);
 1401                 FPC_SQR(xr, xi, xr, xi);
 1402                 FPC_MUL(fC0r, fC0i, fCr, fCi, xr, xi);
 1403                 FPC_MUL(fC1r, fC1i, fC0r, fC0i, fpr_W2R, fpr_W2I);
 1404                 FPC_MUL(fC2r, fC2i, fC0r, fC0i, fpr_W4R, fpr_W4I);
 1405                 FPC_ADD(fB0r, fB0i, fB0r, fB0i, fC0r, fC0i);
 1406                 FPC_ADD(fB1r, fB1i, fB1r, fB1i, fC2r, fC2i);
 1407                 FPC_ADD(fB2r, fB2i, fB2r, fB2i, fC1r, fC1i);
 1408                 FPC_ADD(f[u + 0], f[u + 0 + hn], fAr, fAi, fB0r, fB0i);
 1409                 FPC_ADD(f[u + 1], f[u + 1 + hn], fAr, fAi, fB1r, fB1i);
 1410                 FPC_ADD(f[u + 2], f[u + 2 + hn], fAr, fAi, fB2r, fB2i);
 1411         }
 1412 }
 1413 
 1414 /* see internal.h */
 1415 void
 1416 falcon_poly_merge_deep_fft3(fpr *restrict f,
 1417         const fpr *restrict f0, const fpr *restrict f1, unsigned logn)
 1418 {
 1419         size_t n, hn, qn, u, m;
 1420 
 1421         /*
 1422          * Special code for n = 2.
 1423          */
 1424         if (logn == 1) {
 1425                 fpr x, y;
 1426 
 1427                 x = *f0;
 1428                 y = *f1;
 1429                 f[0] = fpr_add(x, fpr_mul(y, fpr_W1R));
 1430                 f[1] = fpr_mul(y, fpr_W1I);
 1431                 return;
 1432         }
 1433 
 1434         n = (size_t)1 << logn;
 1435         hn = n >> 1;
 1436         qn = hn >> 1;
 1437 
 1438         m = (size_t)1 << (logn - 1);
 1439         for (u = 0; u < qn; u ++) {
 1440                 fpr a_re, a_im, b_re, b_im;
 1441                 fpr t_re, t_im;
 1442 
 1443                 a_re = f0[u];
 1444                 a_im = f0[u + qn];
 1445                 b_re = f1[u];
 1446                 b_im = f1[u + qn];
 1447                 FPC_MUL(t_re, t_im, b_re, b_im,
 1448                         fpr_gm3_square[((u + m) << 1) + 0],
 1449                         fpr_gm3_square[((u + m) << 1) + 1]);
 1450                 FPC_ADD(f[(u << 1) + 0], f[(u << 1) + 0 + hn],
 1451                         a_re, a_im, t_re, t_im);
 1452                 FPC_SUB(f[(u << 1) + 1], f[(u << 1) + 1 + hn],
 1453                         a_re, a_im, t_re, t_im);
 1454         }
 1455 }