Falcon source files (reference implementation)


fft.c

    1 /*
    2  * FFT code.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017-2019  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
   30  */
   31 
   32 #include "inner.h"
   33 
   34 /*
   35  * Rules for complex number macros:
   36  * --------------------------------
   37  *
   38  * Operand order is: destination, source1, source2...
   39  *
   40  * Each operand is a real and an imaginary part.
   41  *
   42  * All overlaps are allowed.
   43  */
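
/*
 * Example: since every macro below copies its source operands into
 * local temporaries before writing the destination, in-place updates
 * such as
 *
 *    FPC_MUL(x_re, x_im, x_re, x_im, s_re, s_im)
 *
 * are valid; the FFT loops further down rely on this.
 */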
   44 
   45 /*
   46  * Addition of two complex numbers (d = a + b).
   47  */
   48 #define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
   49                 fpr fpct_re, fpct_im; \
   50                 fpct_re = fpr_add(a_re, b_re); \
   51                 fpct_im = fpr_add(a_im, b_im); \
   52                 (d_re) = fpct_re; \
   53                 (d_im) = fpct_im; \
   54         } while (0)
   55 
   56 /*
   57  * Subtraction of two complex numbers (d = a - b).
   58  */
   59 #define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
   60                 fpr fpct_re, fpct_im; \
   61                 fpct_re = fpr_sub(a_re, b_re); \
   62                 fpct_im = fpr_sub(a_im, b_im); \
   63                 (d_re) = fpct_re; \
   64                 (d_im) = fpct_im; \
   65         } while (0)
   66 
   67 /*
   68  * Multiplication of two complex numbers (d = a * b).
   69  */
   70 #define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
   71                 fpr fpct_a_re, fpct_a_im; \
   72                 fpr fpct_b_re, fpct_b_im; \
   73                 fpr fpct_d_re, fpct_d_im; \
   74                 fpct_a_re = (a_re); \
   75                 fpct_a_im = (a_im); \
   76                 fpct_b_re = (b_re); \
   77                 fpct_b_im = (b_im); \
   78                 fpct_d_re = fpr_sub( \
   79                         fpr_mul(fpct_a_re, fpct_b_re), \
   80                         fpr_mul(fpct_a_im, fpct_b_im)); \
   81                 fpct_d_im = fpr_add( \
   82                         fpr_mul(fpct_a_re, fpct_b_im), \
   83                         fpr_mul(fpct_a_im, fpct_b_re)); \
   84                 (d_re) = fpct_d_re; \
   85                 (d_im) = fpct_d_im; \
   86         } while (0)
   87 
   88 /*
   89  * Squaring of a complex number (d = a * a).
   90  */
   91 #define FPC_SQR(d_re, d_im, a_re, a_im)   do { \
   92                 fpr fpct_a_re, fpct_a_im; \
   93                 fpr fpct_d_re, fpct_d_im; \
   94                 fpct_a_re = (a_re); \
   95                 fpct_a_im = (a_im); \
   96                 fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
   97                 fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
   98                 (d_re) = fpct_d_re; \
   99                 (d_im) = fpct_d_im; \
  100         } while (0)
  101 
  102 /*
  103  * Inversion of a complex number (d = 1 / a).
  104  */
  105 #define FPC_INV(d_re, d_im, a_re, a_im)   do { \
  106                 fpr fpct_a_re, fpct_a_im; \
  107                 fpr fpct_d_re, fpct_d_im; \
  108                 fpr fpct_m; \
  109                 fpct_a_re = (a_re); \
  110                 fpct_a_im = (a_im); \
  111                 fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
  112                 fpct_m = fpr_inv(fpct_m); \
  113                 fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
  114                 fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
  115                 (d_re) = fpct_d_re; \
  116                 (d_im) = fpct_d_im; \
  117         } while (0)
  118 
  119 /*
  120  * Division of complex numbers (d = a / b).
  121  */
  122 #define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im)   do { \
  123                 fpr fpct_a_re, fpct_a_im; \
  124                 fpr fpct_b_re, fpct_b_im; \
  125                 fpr fpct_d_re, fpct_d_im; \
  126                 fpr fpct_m; \
  127                 fpct_a_re = (a_re); \
  128                 fpct_a_im = (a_im); \
  129                 fpct_b_re = (b_re); \
  130                 fpct_b_im = (b_im); \
  131                 fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
  132                 fpct_m = fpr_inv(fpct_m); \
  133                 fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
  134                 fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
  135                 fpct_d_re = fpr_sub( \
  136                         fpr_mul(fpct_a_re, fpct_b_re), \
  137                         fpr_mul(fpct_a_im, fpct_b_im)); \
  138                 fpct_d_im = fpr_add( \
  139                         fpr_mul(fpct_a_re, fpct_b_im), \
  140                         fpr_mul(fpct_a_im, fpct_b_re)); \
  141                 (d_re) = fpct_d_re; \
  142                 (d_im) = fpct_d_im; \
  143         } while (0)
  144 
  145 /*
  146  * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
  147  * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
  148  * of X^N+1 in the field of complex numbers. A crucial property is that
  149  * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
  150  *
  151  * FFT representation of a polynomial f (taken modulo X^N+1) is the
  152  * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
  153  * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
  154  * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
  155  * needed. A consequence is that FFT representation has the same size
  156  * as normal representation: N/2 complex numbers use N real numbers (each
  157  * complex number is the combination of a real and an imaginary part).
  158  *
  159  * We use a specific ordering which makes computations easier. Let rev()
  160  * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
  161  * store the real and imaginary parts of f(w_j) in slots:
  162  *
  163  *    Re(f(w_j)) -> slot rev(j)/2
  164  *    Im(f(w_j)) -> slot rev(j)/2+N/2
  165  *
  166  * (Note that rev(j) is even for j < N/2.)
  167  */
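
/*
 * Example: for logn = 3 (N = 8), rev() operates on 3 bits, so
 * rev(0), rev(1), rev(2), rev(3) = 0, 4, 2, 6, and the N/2 = 4
 * retained values are stored as:
 *
 *    f(w_0): Re -> slot 0    Im -> slot 4
 *    f(w_1): Re -> slot 2    Im -> slot 6
 *    f(w_2): Re -> slot 1    Im -> slot 5
 *    f(w_3): Re -> slot 3    Im -> slot 7
 */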
  168 
  169 /* see inner.h */
  170 TARGET_AVX2
  171 void
  172 Zf(FFT)(fpr *f, unsigned logn)
  173 {
  174         /*
  175          * FFT algorithm in bit-reversal order uses the following
  176          * iterative algorithm:
  177          *
  178          *   t = N
  179          *   for m = 1; m < N; m *= 2:
  180          *       ht = t/2
  181          *       for i1 = 0; i1 < m; i1 ++:
  182          *           j1 = i1 * t
  183          *           s = GM[m + i1]
  184          *           for j = j1; j < (j1 + ht); j ++:
  185          *               x = f[j]
  186          *               y = s * f[j + ht]
  187          *               f[j] = x + y
  188          *               f[j + ht] = x - y
  189          *       t = ht
  190          *
  191          * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
  192          *
  193          * In the description above, f[] is supposed to contain complex
  194          * numbers. In our in-memory representation, the real and
  195          * imaginary parts of f[k] are in array slots k and k+N/2.
  196          *
  197          * We only keep the first half of the complex numbers. We can
  198          * see that after the first iteration, the first and second halves
  199          * of the array of complex numbers have separate lives, so we
  200          * simply ignore the second part.
  201          */
  202 
  203         unsigned u;
  204         size_t t, n, hn, m;
  205 
  206         /*
  207          * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
  208          * (because GM[1] = w^rev(1) = w^(N/2) = i).
  209          * In our chosen representation, this is a no-op: everything is
  210          * already where it should be.
  211          */
  212 
  213         /*
  214          * Subsequent iterations are truncated to use only the first
  215          * half of values.
  216          */
  217         n = (size_t)1 << logn;
  218         hn = n >> 1;
  219         t = hn;
  220         for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
  221                 size_t ht, hm, i1, j1;
  222 
  223                 ht = t >> 1;
  224                 hm = m >> 1;
  225                 for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
  226                         size_t j, j2;
  227 
  228                         j2 = j1 + ht;
  229 #if FALCON_AVX2 // yyyAVX2+1
  230                         if (ht >= 4) {
  231                                 __m256d s_re, s_im;
  232 
  233                                 s_re = _mm256_set1_pd(
  234                                         fpr_gm_tab[((m + i1) << 1) + 0].v);
  235                                 s_im = _mm256_set1_pd(
  236                                         fpr_gm_tab[((m + i1) << 1) + 1].v);
  237                                 for (j = j1; j < j2; j += 4) {
  238                                         __m256d x_re, x_im, y_re, y_im;
  239                                         __m256d z_re, z_im;
  240 
  241                                         x_re = _mm256_loadu_pd(&f[j].v);
  242                                         x_im = _mm256_loadu_pd(&f[j + hn].v);
  243                                         z_re = _mm256_loadu_pd(&f[j+ht].v);
  244                                         z_im = _mm256_loadu_pd(&f[j+ht + hn].v);
  245                                         y_re = FMSUB(z_re, s_re,
  246                                                 _mm256_mul_pd(z_im, s_im));
  247                                         y_im = FMADD(z_re, s_im,
  248                                                 _mm256_mul_pd(z_im, s_re));
  249                                         _mm256_storeu_pd(&f[j].v,
  250                                                 _mm256_add_pd(x_re, y_re));
  251                                         _mm256_storeu_pd(&f[j + hn].v,
  252                                                 _mm256_add_pd(x_im, y_im));
  253                                         _mm256_storeu_pd(&f[j + ht].v,
  254                                                 _mm256_sub_pd(x_re, y_re));
  255                                         _mm256_storeu_pd(&f[j + ht + hn].v,
  256                                                 _mm256_sub_pd(x_im, y_im));
  257                                 }
  258                         } else {
  259                                 fpr s_re, s_im;
  260 
  261                                 s_re = fpr_gm_tab[((m + i1) << 1) + 0];
  262                                 s_im = fpr_gm_tab[((m + i1) << 1) + 1];
  263                                 for (j = j1; j < j2; j ++) {
  264                                         fpr x_re, x_im, y_re, y_im;
  265 
  266                                         x_re = f[j];
  267                                         x_im = f[j + hn];
  268                                         y_re = f[j + ht];
  269                                         y_im = f[j + ht + hn];
  270                                         FPC_MUL(y_re, y_im,
  271                                                 y_re, y_im, s_re, s_im);
  272                                         FPC_ADD(f[j], f[j + hn],
  273                                                 x_re, x_im, y_re, y_im);
  274                                         FPC_SUB(f[j + ht], f[j + ht + hn],
  275                                                 x_re, x_im, y_re, y_im);
  276                                 }
  277                         }
  278 #else // yyyAVX2+0
  279                         fpr s_re, s_im;
  280 
  281                         s_re = fpr_gm_tab[((m + i1) << 1) + 0];
  282                         s_im = fpr_gm_tab[((m + i1) << 1) + 1];
  283                         for (j = j1; j < j2; j ++) {
  284                                 fpr x_re, x_im, y_re, y_im;
  285 
  286                                 x_re = f[j];
  287                                 x_im = f[j + hn];
  288                                 y_re = f[j + ht];
  289                                 y_im = f[j + ht + hn];
  290                                 FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
  291                                 FPC_ADD(f[j], f[j + hn],
  292                                         x_re, x_im, y_re, y_im);
  293                                 FPC_SUB(f[j + ht], f[j + ht + hn],
  294                                         x_re, x_im, y_re, y_im);
  295                         }
  296 #endif // yyyAVX2-
  297                 }
  298                 t = ht;
  299         }
  300 }
  301 
  302 /* see inner.h */
  303 TARGET_AVX2
  304 void
  305 Zf(iFFT)(fpr *f, unsigned logn)
  306 {
  307         /*
  308          * Inverse FFT algorithm in bit-reversal order uses the following
  309          * iterative algorithm:
  310          *
  311          *   t = 1
  312          *   for m = N; m > 1; m /= 2:
  313          *       hm = m/2
  314          *       dt = t*2
  315          *       for i1 = 0; i1 < hm; i1 ++:
  316          *           j1 = i1 * dt
  317          *           s = iGM[hm + i1]
  318          *           for j = j1; j < (j1 + t); j ++:
  319          *               x = f[j]
  320          *               y = f[j + t]
  321          *               f[j] = x + y
  322          *               f[j + t] = s * (x - y)
  323          *       t = dt
  324          *   for i1 = 0; i1 < N; i1 ++:
  325          *       f[i1] = f[i1] / N
  326          *
  327          * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
  328          * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
  329          *
  330          * In the main loop (not counting the final division loop), in
  331          * all iterations except the last, the first and second half of f[]
  332          * (as an array of complex numbers) are separate. In our chosen
  333          * representation, we do not keep the second half.
  334          *
  335          * The last iteration recombines the recomputed half with the
  336          * implicit half, and should yield only real numbers since the
  337          * target polynomial is real; moreover, s = iGM[1] = conj(i) = -i
  338          * at that step. Thus, when considering x and y:
  339          *    y = conj(x) since the final f[j] must be real
  340          *    Therefore, f[j] = x + y = 2*Re(x), and f[j + t] =
  341          *    s*(x - y) = -i*(2*i*Im(x)) = 2*Im(x).
  342          * But we already have Re(x) and Im(x) in array slots j and j+t
  343          * in our chosen representation. That last iteration is thus a
  344          * simple doubling of all the values in the array.
  345          *
  346          * We make the last iteration a no-op by tweaking the final
  347          * division into a division by N/2, not N.
  348          */
  349         size_t u, n, hn, t, m;
  350 
  351         n = (size_t)1 << logn;
  352         t = 1;
  353         m = n;
  354         hn = n >> 1;
  355         for (u = logn; u > 1; u --) {
  356                 size_t hm, dt, i1, j1;
  357 
  358                 hm = m >> 1;
  359                 dt = t << 1;
  360                 for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
  361                         size_t j, j2;
  362 
  363                         j2 = j1 + t;
  364 #if FALCON_AVX2 // yyyAVX2+1
  365                         if (t >= 4) {
  366                                 __m256d s_re, s_im;
  367 
  368                                 s_re = _mm256_set1_pd(
  369                                         fpr_gm_tab[((hm + i1) << 1) + 0].v);
  370                                 s_im = _mm256_set1_pd(
  371                                         fpr_gm_tab[((hm + i1) << 1) + 1].v);
  372                                 for (j = j1; j < j2; j += 4) {
  373                                         __m256d x_re, x_im, y_re, y_im;
  374                                         __m256d z_re, z_im;
  375 
  376                                         x_re = _mm256_loadu_pd(&f[j].v);
  377                                         x_im = _mm256_loadu_pd(&f[j + hn].v);
  378                                         y_re = _mm256_loadu_pd(&f[j+t].v);
  379                                         y_im = _mm256_loadu_pd(&f[j+t + hn].v);
  380                                         _mm256_storeu_pd(&f[j].v,
  381                                                 _mm256_add_pd(x_re, y_re));
  382                                         _mm256_storeu_pd(&f[j + hn].v,
  383                                                 _mm256_add_pd(x_im, y_im));
  384                                         x_re = _mm256_sub_pd(y_re, x_re);
  385                                         x_im = _mm256_sub_pd(x_im, y_im);
  386                                         z_re = FMSUB(x_im, s_im,
  387                                                 _mm256_mul_pd(x_re, s_re));
  388                                         z_im = FMADD(x_re, s_im,
  389                                                 _mm256_mul_pd(x_im, s_re));
  390                                         _mm256_storeu_pd(&f[j+t].v, z_re);
  391                                         _mm256_storeu_pd(&f[j+t + hn].v, z_im);
  392                                 }
  393                         } else {
  394                                 fpr s_re, s_im;
  395 
  396                                 s_re = fpr_gm_tab[((hm + i1) << 1)+0];
  397                                 s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]);
  398                                 for (j = j1; j < j2; j ++) {
  399                                         fpr x_re, x_im, y_re, y_im;
  400 
  401                                         x_re = f[j];
  402                                         x_im = f[j + hn];
  403                                         y_re = f[j + t];
  404                                         y_im = f[j + t + hn];
  405                                         FPC_ADD(f[j], f[j + hn],
  406                                                 x_re, x_im, y_re, y_im);
  407                                         FPC_SUB(x_re, x_im,
  408                                                 x_re, x_im, y_re, y_im);
  409                                         FPC_MUL(f[j + t], f[j + t + hn],
  410                                                 x_re, x_im, s_re, s_im);
  411                                 }
  412                         }
  413 #else // yyyAVX2+0
  414                         fpr s_re, s_im;
  415 
  416                         s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
  417                         s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
  418                         for (j = j1; j < j2; j ++) {
  419                                 fpr x_re, x_im, y_re, y_im;
  420 
  421                                 x_re = f[j];
  422                                 x_im = f[j + hn];
  423                                 y_re = f[j + t];
  424                                 y_im = f[j + t + hn];
  425                                 FPC_ADD(f[j], f[j + hn],
  426                                         x_re, x_im, y_re, y_im);
  427                                 FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
  428                                 FPC_MUL(f[j + t], f[j + t + hn],
  429                                         x_re, x_im, s_re, s_im);
  430                         }
  431 #endif // yyyAVX2-
  432                 }
  433                 t = dt;
  434                 m = hm;
  435         }
  436 
  437         /*
  438          * Last iteration is a no-op, provided that we divide by N/2
  439          * instead of N. We need to make a special case for logn = 0.
  440          */
  441         if (logn > 0) {
  442                 fpr ni;
  443 
  444                 ni = fpr_p2_tab[logn];
  445                 for (u = 0; u < n; u ++) {
  446                         f[u] = fpr_mul(f[u], ni);
  447                 }
  448         }
  449 }
  450 
  451 /* see inner.h */
  452 TARGET_AVX2
  453 void
  454 Zf(poly_add)(
  455         fpr *restrict a, const fpr *restrict b, unsigned logn)
  456 {
  457         size_t n, u;
  458 
  459         n = (size_t)1 << logn;
  460 #if FALCON_AVX2 // yyyAVX2+1
  461         if (n >= 4) {
  462                 for (u = 0; u < n; u += 4) {
  463                         _mm256_storeu_pd(&a[u].v,
  464                                 _mm256_add_pd(
  465                                         _mm256_loadu_pd(&a[u].v),
  466                                         _mm256_loadu_pd(&b[u].v)));
  467                 }
  468         } else {
  469                 for (u = 0; u < n; u ++) {
  470                         a[u] = fpr_add(a[u], b[u]);
  471                 }
  472         }
  473 #else // yyyAVX2+0
  474         for (u = 0; u < n; u ++) {
  475                 a[u] = fpr_add(a[u], b[u]);
  476         }
  477 #endif // yyyAVX2-
  478 }
  479 
  480 /* see inner.h */
  481 TARGET_AVX2
  482 void
  483 Zf(poly_sub)(
  484         fpr *restrict a, const fpr *restrict b, unsigned logn)
  485 {
  486         size_t n, u;
  487 
  488         n = (size_t)1 << logn;
  489 #if FALCON_AVX2 // yyyAVX2+1
  490         if (n >= 4) {
  491                 for (u = 0; u < n; u += 4) {
  492                         _mm256_storeu_pd(&a[u].v,
  493                                 _mm256_sub_pd(
  494                                         _mm256_loadu_pd(&a[u].v),
  495                                         _mm256_loadu_pd(&b[u].v)));
  496                 }
  497         } else {
  498                 for (u = 0; u < n; u ++) {
  499                         a[u] = fpr_sub(a[u], b[u]);
  500                 }
  501         }
  502 #else // yyyAVX2+0
  503         for (u = 0; u < n; u ++) {
  504                 a[u] = fpr_sub(a[u], b[u]);
  505         }
  506 #endif // yyyAVX2-
  507 }
  508 
  509 /* see inner.h */
  510 TARGET_AVX2
  511 void
  512 Zf(poly_neg)(fpr *a, unsigned logn)
  513 {
  514         size_t n, u;
  515 
  516         n = (size_t)1 << logn;
  517 #if FALCON_AVX2 // yyyAVX2+1
  518         if (n >= 4) {
  519                 __m256d s;
  520 
  521                 s = _mm256_set1_pd(-0.0);
  522                 for (u = 0; u < n; u += 4) {
  523                         _mm256_storeu_pd(&a[u].v,
  524                                 _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
  525                 }
  526         } else {
  527                 for (u = 0; u < n; u ++) {
  528                         a[u] = fpr_neg(a[u]);
  529                 }
  530         }
  531 #else // yyyAVX2+0
  532         for (u = 0; u < n; u ++) {
  533                 a[u] = fpr_neg(a[u]);
  534         }
  535 #endif // yyyAVX2-
  536 }
  537 
  538 /* see inner.h */
  539 TARGET_AVX2
  540 void
  541 Zf(poly_adj_fft)(fpr *a, unsigned logn)
  542 {
  543         size_t n, u;
  544 
  545         n = (size_t)1 << logn;
  546 #if FALCON_AVX2 // yyyAVX2+1
  547         if (n >= 8) {
  548                 __m256d s;
  549 
  550                 s = _mm256_set1_pd(-0.0);
  551                 for (u = (n >> 1); u < n; u += 4) {
  552                         _mm256_storeu_pd(&a[u].v,
  553                                 _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
  554                 }
  555         } else {
  556                 for (u = (n >> 1); u < n; u ++) {
  557                         a[u] = fpr_neg(a[u]);
  558                 }
  559         }
  560 #else // yyyAVX2+0
  561         for (u = (n >> 1); u < n; u ++) {
  562                 a[u] = fpr_neg(a[u]);
  563         }
  564 #endif // yyyAVX2-
  565 }
  566 
  567 /* see inner.h */
  568 TARGET_AVX2
  569 void
  570 Zf(poly_mul_fft)(
  571         fpr *restrict a, const fpr *restrict b, unsigned logn)
  572 {
  573         size_t n, hn, u;
  574 
  575         n = (size_t)1 << logn;
  576         hn = n >> 1;
  577 #if FALCON_AVX2 // yyyAVX2+1
  578         if (n >= 8) {
  579                 for (u = 0; u < hn; u += 4) {
  580                         __m256d a_re, a_im, b_re, b_im, c_re, c_im;
  581 
  582                         a_re = _mm256_loadu_pd(&a[u].v);
  583                         a_im = _mm256_loadu_pd(&a[u + hn].v);
  584                         b_re = _mm256_loadu_pd(&b[u].v);
  585                         b_im = _mm256_loadu_pd(&b[u + hn].v);
  586                         c_re = FMSUB(
  587                                 a_re, b_re, _mm256_mul_pd(a_im, b_im));
  588                         c_im = FMADD(
  589                                 a_re, b_im, _mm256_mul_pd(a_im, b_re));
  590                         _mm256_storeu_pd(&a[u].v, c_re);
  591                         _mm256_storeu_pd(&a[u + hn].v, c_im);
  592                 }
  593         } else {
  594                 for (u = 0; u < hn; u ++) {
  595                         fpr a_re, a_im, b_re, b_im;
  596 
  597                         a_re = a[u];
  598                         a_im = a[u + hn];
  599                         b_re = b[u];
  600                         b_im = b[u + hn];
  601                         FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  602                 }
  603         }
  604 #else // yyyAVX2+0
  605         for (u = 0; u < hn; u ++) {
  606                 fpr a_re, a_im, b_re, b_im;
  607 
  608                 a_re = a[u];
  609                 a_im = a[u + hn];
  610                 b_re = b[u];
  611                 b_im = b[u + hn];
  612                 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  613         }
  614 #endif // yyyAVX2-
  615 }
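
/*
 * Illustrative sketch (hypothetical helper, not taken from the
 * reference code): how Zf(FFT), Zf(poly_mul_fft) and Zf(iFFT)
 * combine to multiply two polynomials modulo X^N+1, with N = 2^logn.
 * Both inputs are in normal (coefficient) representation on entry;
 * the product is written over a.
 */
static void
example_poly_mul(fpr *a, fpr *b, unsigned logn)
{
        Zf(FFT)(a, logn);               /* a <- FFT(a) */
        Zf(FFT)(b, logn);               /* b <- FFT(b) */
        Zf(poly_mul_fft)(a, b, logn);   /* pointwise product in FFT domain */
        Zf(iFFT)(a, logn);              /* back to normal representation */
}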
  616 
  617 /* see inner.h */
  618 TARGET_AVX2
  619 void
  620 Zf(poly_muladj_fft)(
  621         fpr *restrict a, const fpr *restrict b, unsigned logn)
  622 {
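        /*
         * The adjoint adj(b) has FFT coefficients conj(b(w_j)), so
         * this computes a <- a*adj(b) by multiplying each slot of a
         * with the conjugate of the corresponding slot of b.
         */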
  623         size_t n, hn, u;
  624 
  625         n = (size_t)1 << logn;
  626         hn = n >> 1;
  627 #if FALCON_AVX2 // yyyAVX2+1
  628         if (n >= 8) {
  629                 for (u = 0; u < hn; u += 4) {
  630                         __m256d a_re, a_im, b_re, b_im, c_re, c_im;
  631 
  632                         a_re = _mm256_loadu_pd(&a[u].v);
  633                         a_im = _mm256_loadu_pd(&a[u + hn].v);
  634                         b_re = _mm256_loadu_pd(&b[u].v);
  635                         b_im = _mm256_loadu_pd(&b[u + hn].v);
  636                         c_re = FMADD(
  637                                 a_re, b_re, _mm256_mul_pd(a_im, b_im));
  638                         c_im = FMSUB(
  639                                 a_im, b_re, _mm256_mul_pd(a_re, b_im));
  640                         _mm256_storeu_pd(&a[u].v, c_re);
  641                         _mm256_storeu_pd(&a[u + hn].v, c_im);
  642                 }
  643         } else {
  644                 for (u = 0; u < hn; u ++) {
  645                         fpr a_re, a_im, b_re, b_im;
  646 
  647                         a_re = a[u];
  648                         a_im = a[u + hn];
  649                         b_re = b[u];
  650                         b_im = fpr_neg(b[u + hn]);
  651                         FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  652                 }
  653         }
  654 #else // yyyAVX2+0
  655         for (u = 0; u < hn; u ++) {
  656                 fpr a_re, a_im, b_re, b_im;
  657 
  658                 a_re = a[u];
  659                 a_im = a[u + hn];
  660                 b_re = b[u];
  661                 b_im = fpr_neg(b[u + hn]);
  662                 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  663         }
  664 #endif // yyyAVX2-
  665 }
  666 
  667 /* see inner.h */
  668 TARGET_AVX2
  669 void
  670 Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn)
  671 {
  672         /*
  673          * Since each coefficient is multiplied with its own conjugate,
  674          * the result contains only real values.
  675          */
  676         size_t n, hn, u;
  677 
  678         n = (size_t)1 << logn;
  679         hn = n >> 1;
  680 #if FALCON_AVX2 // yyyAVX2+1
  681         if (n >= 8) {
  682                 __m256d zero;
  683 
  684                 zero = _mm256_setzero_pd();
  685                 for (u = 0; u < hn; u += 4) {
  686                         __m256d a_re, a_im;
  687 
  688                         a_re = _mm256_loadu_pd(&a[u].v);
  689                         a_im = _mm256_loadu_pd(&a[u + hn].v);
  690                         _mm256_storeu_pd(&a[u].v,
  691                                 FMADD(a_re, a_re,
  692                                         _mm256_mul_pd(a_im, a_im)));
  693                         _mm256_storeu_pd(&a[u + hn].v, zero);
  694                 }
  695         } else {
  696                 for (u = 0; u < hn; u ++) {
  697                         fpr a_re, a_im;
  698 
  699                         a_re = a[u];
  700                         a_im = a[u + hn];
  701                         a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
  702                         a[u + hn] = fpr_zero;
  703                 }
  704         }
  705 #else // yyyAVX2+0
  706         for (u = 0; u < hn; u ++) {
  707                 fpr a_re, a_im;
  708 
  709                 a_re = a[u];
  710                 a_im = a[u + hn];
  711                 a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
  712                 a[u + hn] = fpr_zero;
  713         }
  714 #endif // yyyAVX2-
  715 }
  716 
  717 /* see inner.h */
  718 TARGET_AVX2
  719 void
  720 Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn)
  721 {
  722         size_t n, u;
  723 
  724         n = (size_t)1 << logn;
  725 #if FALCON_AVX2 // yyyAVX2+1
  726         if (n >= 4) {
  727                 __m256d x4;
  728 
  729                 x4 = _mm256_set1_pd(x.v);
  730                 for (u = 0; u < n; u += 4) {
  731                         _mm256_storeu_pd(&a[u].v,
  732                                 _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
  733                 }
  734         } else {
  735                 for (u = 0; u < n; u ++) {
  736                         a[u] = fpr_mul(a[u], x);
  737                 }
  738         }
  739 #else // yyyAVX2+0
  740         for (u = 0; u < n; u ++) {
  741                 a[u] = fpr_mul(a[u], x);
  742         }
  743 #endif // yyyAVX2-
  744 }
  745 
  746 /* see inner.h */
  747 TARGET_AVX2
  748 void
  749 Zf(poly_div_fft)(
  750         fpr *restrict a, const fpr *restrict b, unsigned logn)
  751 {
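        /*
         * Each slot computes a/b as a*conj(b) / (|b|^2); the divisor b
         * is assumed to have no zero coefficient in FFT representation.
         */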
  752         size_t n, hn, u;
  753 
  754         n = (size_t)1 << logn;
  755         hn = n >> 1;
  756 #if FALCON_AVX2 // yyyAVX2+1
  757         if (n >= 8) {
  758                 __m256d one;
  759 
  760                 one = _mm256_set1_pd(1.0);
  761                 for (u = 0; u < hn; u += 4) {
  762                         __m256d a_re, a_im, b_re, b_im, c_re, c_im, t;
  763 
  764                         a_re = _mm256_loadu_pd(&a[u].v);
  765                         a_im = _mm256_loadu_pd(&a[u + hn].v);
  766                         b_re = _mm256_loadu_pd(&b[u].v);
  767                         b_im = _mm256_loadu_pd(&b[u + hn].v);
  768                         t = _mm256_div_pd(one,
  769                                 FMADD(b_re, b_re,
  770                                         _mm256_mul_pd(b_im, b_im)));
  771                         b_re = _mm256_mul_pd(b_re, t);
  772                         b_im = _mm256_mul_pd(b_im, t);
  773                         c_re = FMADD(
  774                                 a_re, b_re, _mm256_mul_pd(a_im, b_im));
  775                         c_im = FMSUB(
  776                                 a_im, b_re, _mm256_mul_pd(a_re, b_im));
  777                         _mm256_storeu_pd(&a[u].v, c_re);
  778                         _mm256_storeu_pd(&a[u + hn].v, c_im);
  779                 }
  780         } else {
  781                 for (u = 0; u < hn; u ++) {
  782                         fpr a_re, a_im, b_re, b_im;
  783 
  784                         a_re = a[u];
  785                         a_im = a[u + hn];
  786                         b_re = b[u];
  787                         b_im = b[u + hn];
  788                         FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  789                 }
  790         }
  791 #else // yyyAVX2+0
  792         for (u = 0; u < hn; u ++) {
  793                 fpr a_re, a_im, b_re, b_im;
  794 
  795                 a_re = a[u];
  796                 a_im = a[u + hn];
  797                 b_re = b[u];
  798                 b_im = b[u + hn];
  799                 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
  800         }
  801 #endif // yyyAVX2-
  802 }
  803 
  804 /* see inner.h */
  805 TARGET_AVX2
  806 void
  807 Zf(poly_invnorm2_fft)(fpr *restrict d,
  808         const fpr *restrict a, const fpr *restrict b, unsigned logn)
  809 {
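        /*
         * d[u] <- 1 / (|a(w_u)|^2 + |b(w_u)|^2). The result is
         * auto-adjoint (all FFT coefficients are real), so only the
         * first N/2 slots of d are written.
         */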
  810         size_t n, hn, u;
  811 
  812         n = (size_t)1 << logn;
  813         hn = n >> 1;
  814 #if FALCON_AVX2 // yyyAVX2+1
  815         if (n >= 8) {
  816                 __m256d one;
  817 
  818                 one = _mm256_set1_pd(1.0);
  819                 for (u = 0; u < hn; u += 4) {
  820                         __m256d a_re, a_im, b_re, b_im, dv;
  821 
  822                         a_re = _mm256_loadu_pd(&a[u].v);
  823                         a_im = _mm256_loadu_pd(&a[u + hn].v);
  824                         b_re = _mm256_loadu_pd(&b[u].v);
  825                         b_im = _mm256_loadu_pd(&b[u + hn].v);
  826                         dv = _mm256_div_pd(one,
  827                                 _mm256_add_pd(
  828                                         FMADD(a_re, a_re,
  829                                                 _mm256_mul_pd(a_im, a_im)),
  830                                         FMADD(b_re, b_re,
  831                                                 _mm256_mul_pd(b_im, b_im))));
  832                         _mm256_storeu_pd(&d[u].v, dv);
  833                 }
  834         } else {
  835                 for (u = 0; u < hn; u ++) {
  836                         fpr a_re, a_im;
  837                         fpr b_re, b_im;
  838 
  839                         a_re = a[u];
  840                         a_im = a[u + hn];
  841                         b_re = b[u];
  842                         b_im = b[u + hn];
  843                         d[u] = fpr_inv(fpr_add(
  844                                 fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
  845                                 fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
  846                 }
  847         }
  848 #else // yyyAVX2+0
  849         for (u = 0; u < hn; u ++) {
  850                 fpr a_re, a_im;
  851                 fpr b_re, b_im;
  852 
  853                 a_re = a[u];
  854                 a_im = a[u + hn];
  855                 b_re = b[u];
  856                 b_im = b[u + hn];
  857                 d[u] = fpr_inv(fpr_add(
  858                         fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
  859                         fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
  860         }
  861 #endif // yyyAVX2-
  862 }
  863 
  864 /* see inner.h */
  865 TARGET_AVX2
  866 void
  867 Zf(poly_add_muladj_fft)(fpr *restrict d,
  868         const fpr *restrict F, const fpr *restrict G,
  869         const fpr *restrict f, const fpr *restrict g, unsigned logn)
  870 {
  871         size_t n, hn, u;
  872 
  873         n = (size_t)1 << logn;
  874         hn = n >> 1;
  875 #if FALCON_AVX2 // yyyAVX2+1
  876         if (n >= 8) {
  877                 for (u = 0; u < hn; u += 4) {
  878                         __m256d F_re, F_im, G_re, G_im;
  879                         __m256d f_re, f_im, g_re, g_im;
  880                         __m256d a_re, a_im, b_re, b_im;
  881 
  882                         F_re = _mm256_loadu_pd(&F[u].v);
  883                         F_im = _mm256_loadu_pd(&F[u + hn].v);
  884                         G_re = _mm256_loadu_pd(&G[u].v);
  885                         G_im = _mm256_loadu_pd(&G[u + hn].v);
  886                         f_re = _mm256_loadu_pd(&f[u].v);
  887                         f_im = _mm256_loadu_pd(&f[u + hn].v);
  888                         g_re = _mm256_loadu_pd(&g[u].v);
  889                         g_im = _mm256_loadu_pd(&g[u + hn].v);
  890 
  891                         a_re = FMADD(F_re, f_re,
  892                                 _mm256_mul_pd(F_im, f_im));
  893                         a_im = FMSUB(F_im, f_re,
  894                                 _mm256_mul_pd(F_re, f_im));
  895                         b_re = FMADD(G_re, g_re,
  896                                 _mm256_mul_pd(G_im, g_im));
  897                         b_im = FMSUB(G_im, g_re,
  898                                 _mm256_mul_pd(G_re, g_im));
  899                         _mm256_storeu_pd(&d[u].v,
  900                                 _mm256_add_pd(a_re, b_re));
  901                         _mm256_storeu_pd(&d[u + hn].v,
  902                                 _mm256_add_pd(a_im, b_im));
  903                 }
  904         } else {
  905                 for (u = 0; u < hn; u ++) {
  906                         fpr F_re, F_im, G_re, G_im;
  907                         fpr f_re, f_im, g_re, g_im;
  908                         fpr a_re, a_im, b_re, b_im;
  909 
  910                         F_re = F[u];
  911                         F_im = F[u + hn];
  912                         G_re = G[u];
  913                         G_im = G[u + hn];
  914                         f_re = f[u];
  915                         f_im = f[u + hn];
  916                         g_re = g[u];
  917                         g_im = g[u + hn];
  918 
  919                         FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
  920                         FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
  921                         d[u] = fpr_add(a_re, b_re);
  922                         d[u + hn] = fpr_add(a_im, b_im);
  923                 }
  924         }
  925 #else // yyyAVX2+0
  926         for (u = 0; u < hn; u ++) {
  927                 fpr F_re, F_im, G_re, G_im;
  928                 fpr f_re, f_im, g_re, g_im;
  929                 fpr a_re, a_im, b_re, b_im;
  930 
  931                 F_re = F[u];
  932                 F_im = F[u + hn];
  933                 G_re = G[u];
  934                 G_im = G[u + hn];
  935                 f_re = f[u];
  936                 f_im = f[u + hn];
  937                 g_re = g[u];
  938                 g_im = g[u + hn];
  939 
  940                 FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
  941                 FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
  942                 d[u] = fpr_add(a_re, b_re);
  943                 d[u + hn] = fpr_add(a_im, b_im);
  944         }
  945 #endif // yyyAVX2-
  946 }
  947 
  948 /* see inner.h */
  949 TARGET_AVX2
  950 void
  951 Zf(poly_mul_autoadj_fft)(
  952         fpr *restrict a, const fpr *restrict b, unsigned logn)
  953 {
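        /*
         * b is auto-adjoint: its FFT coefficients are real, and only
         * its first N/2 slots are used. Each complex slot of a is thus
         * simply scaled by the real value b[u].
         */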
  954         size_t n, hn, u;
  955 
  956         n = (size_t)1 << logn;
  957         hn = n >> 1;
  958 #if FALCON_AVX2 // yyyAVX2+1
  959         if (n >= 8) {
  960                 for (u = 0; u < hn; u += 4) {
  961                         __m256d a_re, a_im, bv;
  962 
  963                         a_re = _mm256_loadu_pd(&a[u].v);
  964                         a_im = _mm256_loadu_pd(&a[u + hn].v);
  965                         bv = _mm256_loadu_pd(&b[u].v);
  966                         _mm256_storeu_pd(&a[u].v,
  967                                 _mm256_mul_pd(a_re, bv));
  968                         _mm256_storeu_pd(&a[u + hn].v,
  969                                 _mm256_mul_pd(a_im, bv));
  970                 }
  971         } else {
  972                 for (u = 0; u < hn; u ++) {
  973                         a[u] = fpr_mul(a[u], b[u]);
  974                         a[u + hn] = fpr_mul(a[u + hn], b[u]);
  975                 }
  976         }
  977 #else // yyyAVX2+0
  978         for (u = 0; u < hn; u ++) {
  979                 a[u] = fpr_mul(a[u], b[u]);
  980                 a[u + hn] = fpr_mul(a[u + hn], b[u]);
  981         }
  982 #endif // yyyAVX2-
  983 }
  984 
  985 /* see inner.h */
  986 TARGET_AVX2
  987 void
  988 Zf(poly_div_autoadj_fft)(
  989         fpr *restrict a, const fpr *restrict b, unsigned logn)
  990 {
  991         size_t n, hn, u;
  992 
  993         n = (size_t)1 << logn;
  994         hn = n >> 1;
  995 #if FALCON_AVX2 // yyyAVX2+1
  996         if (n >= 8) {
  997                 __m256d one;
  998 
  999                 one = _mm256_set1_pd(1.0);
 1000                 for (u = 0; u < hn; u += 4) {
 1001                         __m256d ib, a_re, a_im;
 1002 
 1003                         ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
 1004                         a_re = _mm256_loadu_pd(&a[u].v);
 1005                         a_im = _mm256_loadu_pd(&a[u + hn].v);
 1006                         _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
 1007                         _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
 1008                 }
 1009         } else {
 1010                 for (u = 0; u < hn; u ++) {
 1011                         fpr ib;
 1012 
 1013                         ib = fpr_inv(b[u]);
 1014                         a[u] = fpr_mul(a[u], ib);
 1015                         a[u + hn] = fpr_mul(a[u + hn], ib);
 1016                 }
 1017         }
 1018 #else // yyyAVX2+0
 1019         for (u = 0; u < hn; u ++) {
 1020                 fpr ib;
 1021 
 1022                 ib = fpr_inv(b[u]);
 1023                 a[u] = fpr_mul(a[u], ib);
 1024                 a[u + hn] = fpr_mul(a[u + hn], ib);
 1025         }
 1026 #endif // yyyAVX2-
 1027 }
 1028 
 1029 /* see inner.h */
 1030 TARGET_AVX2
 1031 void
 1032 Zf(poly_LDL_fft)(
 1033         const fpr *restrict g00,
 1034         fpr *restrict g01, fpr *restrict g11, unsigned logn)
 1035 {
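        /*
         * For each slot: mu = g01/g00, then g11 <- g11 - mu*conj(g01)
         * and g01 <- conj(mu). This is the LDL decomposition of the
         * auto-adjoint matrix [[g00, g01], [adj(g01), g11]]: g01
         * receives l10 and g11 receives d11, while g00 is unchanged.
         */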
 1036         size_t n, hn, u;
 1037 
 1038         n = (size_t)1 << logn;
 1039         hn = n >> 1;
 1040 #if FALCON_AVX2 // yyyAVX2+1
 1041         if (n >= 8) {
 1042                 __m256d one;
 1043 
 1044                 one = _mm256_set1_pd(1.0);
 1045                 for (u = 0; u < hn; u += 4) {
 1046                         __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
 1047                         __m256d t, mu_re, mu_im, xi_re, xi_im;
 1048 
 1049                         g00_re = _mm256_loadu_pd(&g00[u].v);
 1050                         g00_im = _mm256_loadu_pd(&g00[u + hn].v);
 1051                         g01_re = _mm256_loadu_pd(&g01[u].v);
 1052                         g01_im = _mm256_loadu_pd(&g01[u + hn].v);
 1053                         g11_re = _mm256_loadu_pd(&g11[u].v);
 1054                         g11_im = _mm256_loadu_pd(&g11[u + hn].v);
 1055 
 1056                         t = _mm256_div_pd(one,
 1057                                 FMADD(g00_re, g00_re,
 1058                                         _mm256_mul_pd(g00_im, g00_im)));
 1059                         g00_re = _mm256_mul_pd(g00_re, t);
 1060                         g00_im = _mm256_mul_pd(g00_im, t);
 1061                         mu_re = FMADD(g01_re, g00_re,
 1062                                 _mm256_mul_pd(g01_im, g00_im));
 1063                         mu_im = FMSUB(g01_re, g00_im,
 1064                                 _mm256_mul_pd(g01_im, g00_re));
 1065                         xi_re = FMSUB(mu_re, g01_re,
 1066                                 _mm256_mul_pd(mu_im, g01_im));
 1067                         xi_im = FMADD(mu_im, g01_re,
 1068                                 _mm256_mul_pd(mu_re, g01_im));
 1069                         _mm256_storeu_pd(&g11[u].v,
 1070                                 _mm256_sub_pd(g11_re, xi_re));
 1071                         _mm256_storeu_pd(&g11[u + hn].v,
 1072                                 _mm256_add_pd(g11_im, xi_im));
 1073                         _mm256_storeu_pd(&g01[u].v, mu_re);
 1074                         _mm256_storeu_pd(&g01[u + hn].v, mu_im);
 1075                 }
 1076         } else {
 1077                 for (u = 0; u < hn; u ++) {
 1078                         fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
 1079                         fpr mu_re, mu_im;
 1080 
 1081                         g00_re = g00[u];
 1082                         g00_im = g00[u + hn];
 1083                         g01_re = g01[u];
 1084                         g01_im = g01[u + hn];
 1085                         g11_re = g11[u];
 1086                         g11_im = g11[u + hn];
 1087                         FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
 1088                         FPC_MUL(g01_re, g01_im,
 1089                                 mu_re, mu_im, g01_re, fpr_neg(g01_im));
 1090                         FPC_SUB(g11[u], g11[u + hn],
 1091                                 g11_re, g11_im, g01_re, g01_im);
 1092                         g01[u] = mu_re;
 1093                         g01[u + hn] = fpr_neg(mu_im);
 1094                 }
 1095         }
 1096 #else // yyyAVX2+0
 1097         for (u = 0; u < hn; u ++) {
 1098                 fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
 1099                 fpr mu_re, mu_im;
 1100 
 1101                 g00_re = g00[u];
 1102                 g00_im = g00[u + hn];
 1103                 g01_re = g01[u];
 1104                 g01_im = g01[u + hn];
 1105                 g11_re = g11[u];
 1106                 g11_im = g11[u + hn];
 1107                 FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
 1108                 FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
 1109                 FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
 1110                 g01[u] = mu_re;
 1111                 g01[u + hn] = fpr_neg(mu_im);
 1112         }
 1113 #endif // yyyAVX2-
 1114 }
 1115 
 1116 /* see inner.h */
 1117 TARGET_AVX2
 1118 void
 1119 Zf(poly_LDLmv_fft)(
 1120         fpr *restrict d11, fpr *restrict l10,
 1121         const fpr *restrict g00, const fpr *restrict g01,
 1122         const fpr *restrict g11, unsigned logn)
 1123 {
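        /*
         * Same computation as Zf(poly_LDL_fft) above, except that d11
         * and l10 are written to their own output buffers instead of
         * overwriting g11 and g01.
         */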
 1124         size_t n, hn, u;
 1125 
 1126         n = (size_t)1 << logn;
 1127         hn = n >> 1;
 1128 #if FALCON_AVX2 // yyyAVX2+1
 1129         if (n >= 8) {
 1130                 __m256d one;
 1131 
 1132                 one = _mm256_set1_pd(1.0);
 1133                 for (u = 0; u < hn; u += 4) {
 1134                         __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
 1135                         __m256d t, mu_re, mu_im, xi_re, xi_im;
 1136 
 1137                         g00_re = _mm256_loadu_pd(&g00[u].v);
 1138                         g00_im = _mm256_loadu_pd(&g00[u + hn].v);
 1139                         g01_re = _mm256_loadu_pd(&g01[u].v);
 1140                         g01_im = _mm256_loadu_pd(&g01[u + hn].v);
 1141                         g11_re = _mm256_loadu_pd(&g11[u].v);
 1142                         g11_im = _mm256_loadu_pd(&g11[u + hn].v);
 1143 
 1144                         t = _mm256_div_pd(one,
 1145                                 FMADD(g00_re, g00_re,
 1146                                         _mm256_mul_pd(g00_im, g00_im)));
 1147                         g00_re = _mm256_mul_pd(g00_re, t);
 1148                         g00_im = _mm256_mul_pd(g00_im, t);
 1149                         mu_re = FMADD(g01_re, g00_re,
 1150                                 _mm256_mul_pd(g01_im, g00_im));
 1151                         mu_im = FMSUB(g01_re, g00_im,
 1152                                 _mm256_mul_pd(g01_im, g00_re));
 1153                         xi_re = FMSUB(mu_re, g01_re,
 1154                                 _mm256_mul_pd(mu_im, g01_im));
 1155                         xi_im = FMADD(mu_im, g01_re,
 1156                                 _mm256_mul_pd(mu_re, g01_im));
 1157                         _mm256_storeu_pd(&d11[u].v,
 1158                                 _mm256_sub_pd(g11_re, xi_re));
 1159                         _mm256_storeu_pd(&d11[u + hn].v,
 1160                                 _mm256_add_pd(g11_im, xi_im));
 1161                         _mm256_storeu_pd(&l10[u].v, mu_re);
 1162                         _mm256_storeu_pd(&l10[u + hn].v, mu_im);
 1163                 }
 1164         } else {
 1165                 for (u = 0; u < hn; u ++) {
 1166                         fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
 1167                         fpr mu_re, mu_im;
 1168 
 1169                         g00_re = g00[u];
 1170                         g00_im = g00[u + hn];
 1171                         g01_re = g01[u];
 1172                         g01_im = g01[u + hn];
 1173                         g11_re = g11[u];
 1174                         g11_im = g11[u + hn];
 1175                         FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
 1176                         FPC_MUL(g01_re, g01_im,
 1177                                 mu_re, mu_im, g01_re, fpr_neg(g01_im));
 1178                         FPC_SUB(d11[u], d11[u + hn],
 1179                                 g11_re, g11_im, g01_re, g01_im);
 1180                         l10[u] = mu_re;
 1181                         l10[u + hn] = fpr_neg(mu_im);
 1182                 }
 1183         }
 1184 #else // yyyAVX2+0
 1185         for (u = 0; u < hn; u ++) {
 1186                 fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
 1187                 fpr mu_re, mu_im;
 1188 
 1189                 g00_re = g00[u];
 1190                 g00_im = g00[u + hn];
 1191                 g01_re = g01[u];
 1192                 g01_im = g01[u + hn];
 1193                 g11_re = g11[u];
 1194                 g11_im = g11[u + hn];
 1195                 FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
 1196                 FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
 1197                 FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
 1198                 l10[u] = mu_re;
 1199                 l10[u + hn] = fpr_neg(mu_im);
 1200         }
 1201 #endif // yyyAVX2-
 1202 }
 1203 
 1204 /* see inner.h */
 1205 TARGET_AVX2
 1206 void
 1207 Zf(poly_split_fft)(
 1208         fpr *restrict f0, fpr *restrict f1,
 1209         const fpr *restrict f, unsigned logn)
 1210 {
 1211         /*
 1212          * The FFT representation we use is in bit-reversed order
 1213          * (element i contains f(w^(rev(i))), where rev() is the
 1214          * bit-reversal function over the ring degree). This changes
 1215          * the indexing relative to the Falcon specification.
 1216          */
 1217         size_t n, hn, qn, u;
 1218 
 1219         n = (size_t)1 << logn;
 1220         hn = n >> 1;
 1221         qn = hn >> 1;
 1222 
 1223 #if FALCON_AVX2 // yyyAVX2+1
 1224         if (n >= 8) {
 1225                 __m256d half, sv;
 1226 
 1227                 half = _mm256_set1_pd(0.5);
 1228                 sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
 1229                 for (u = 0; u < qn; u += 2) {
 1230                         __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;
 1231 
 1232                         ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
 1233                         ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
 1234                         ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
 1235                         ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
 1236                         _mm_storeu_pd(&f0[u].v,
 1237                                 _mm256_extractf128_pd(ff0, 0));
 1238                         _mm_storeu_pd(&f0[u + qn].v,
 1239                                 _mm256_extractf128_pd(ff0, 1));
 1240 
 1241                         ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
 1242                         gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
 1243                         ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
 1244                         ff3 = _mm256_hadd_pd(
 1245                                 _mm256_mul_pd(ff1, gmt),
 1246                                 _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
 1247                         ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
 1248                         _mm_storeu_pd(&f1[u].v,
 1249                                 _mm256_extractf128_pd(ff3, 0));
 1250                         _mm_storeu_pd(&f1[u + qn].v,
 1251                                 _mm256_extractf128_pd(ff3, 1));
 1252                 }
 1253         } else {
 1254                 f0[0] = f[0];
 1255                 f1[0] = f[hn];
 1256 
 1257                 for (u = 0; u < qn; u ++) {
 1258                         fpr a_re, a_im, b_re, b_im;
 1259                         fpr t_re, t_im;
 1260 
 1261                         a_re = f[(u << 1) + 0];
 1262                         a_im = f[(u << 1) + 0 + hn];
 1263                         b_re = f[(u << 1) + 1];
 1264                         b_im = f[(u << 1) + 1 + hn];
 1265 
 1266                         FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
 1267                         f0[u] = fpr_half(t_re);
 1268                         f0[u + qn] = fpr_half(t_im);
 1269 
 1270                         FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
 1271                         FPC_MUL(t_re, t_im, t_re, t_im,
 1272                                 fpr_gm_tab[((u + hn) << 1) + 0],
 1273                                 fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
 1274                         f1[u] = fpr_half(t_re);
 1275                         f1[u + qn] = fpr_half(t_im);
 1276                 }
 1277         }
 1278 #else // yyyAVX2+0
 1279         /*
 1280          * We process complex values by pairs. For logn = 1, there is only
 1281          * one complex value (the other one is the implicit conjugate),
 1282          * so we add the two lines below because the loop will be
 1283          * skipped.
 1284          */
 1285         f0[0] = f[0];
 1286         f1[0] = f[hn];
 1287 
 1288         for (u = 0; u < qn; u ++) {
 1289                 fpr a_re, a_im, b_re, b_im;
 1290                 fpr t_re, t_im;
 1291 
 1292                 a_re = f[(u << 1) + 0];
 1293                 a_im = f[(u << 1) + 0 + hn];
 1294                 b_re = f[(u << 1) + 1];
 1295                 b_im = f[(u << 1) + 1 + hn];
 1296 
 1297                 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
 1298                 f0[u] = fpr_half(t_re);
 1299                 f0[u + qn] = fpr_half(t_im);
 1300 
 1301                 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
 1302                 FPC_MUL(t_re, t_im, t_re, t_im,
 1303                         fpr_gm_tab[((u + hn) << 1) + 0],
 1304                         fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
 1305                 f1[u] = fpr_half(t_re);
 1306                 f1[u + qn] = fpr_half(t_im);
 1307         }
 1308 #endif // yyyAVX2-
 1309 }
 1310 
 1311 /* see inner.h */
 1312 TARGET_AVX2
 1313 void
 1314 Zf(poly_merge_fft)(
 1315         fpr *restrict f,
 1316         const fpr *restrict f0, const fpr *restrict f1, unsigned logn)
 1317 {
 1318         size_t n, hn, qn, u;
 1319 
 1320         n = (size_t)1 << logn;
 1321         hn = n >> 1;
 1322         qn = hn >> 1;
 1323 
 1324 #if FALCON_AVX2 // yyyAVX2+1
 1325         if (n >= 16) {
 1326                 for (u = 0; u < qn; u += 4) {
 1327                         __m256d a_re, a_im, b_re, b_im, c_re, c_im;
 1328                         __m256d gm1, gm2, g_re, g_im;
 1329                         __m256d t_re, t_im, u_re, u_im;
 1330                         __m256d tu1_re, tu2_re, tu1_im, tu2_im;
 1331 
 1332                         a_re = _mm256_loadu_pd(&f0[u].v);
 1333                         a_im = _mm256_loadu_pd(&f0[u + qn].v);
 1334                         c_re = _mm256_loadu_pd(&f1[u].v);
 1335                         c_im = _mm256_loadu_pd(&f1[u + qn].v);
 1336 
 1337                         gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
 1338                         gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
 1339                         g_re = _mm256_unpacklo_pd(gm1, gm2);
 1340                         g_im = _mm256_unpackhi_pd(gm1, gm2);
 1341                         g_re = _mm256_permute4x64_pd(g_re, 0xD8);
 1342                         g_im = _mm256_permute4x64_pd(g_im, 0xD8);
 1343 
 1344                         b_re = FMSUB(
 1345                                 c_re, g_re, _mm256_mul_pd(c_im, g_im));
 1346                         b_im = FMADD(
 1347                                 c_re, g_im, _mm256_mul_pd(c_im, g_re));
 1348 
 1349                         t_re = _mm256_add_pd(a_re, b_re);
 1350                         t_im = _mm256_add_pd(a_im, b_im);
 1351                         u_re = _mm256_sub_pd(a_re, b_re);
 1352                         u_im = _mm256_sub_pd(a_im, b_im);
 1353 
 1354                         tu1_re = _mm256_unpacklo_pd(t_re, u_re);
 1355                         tu2_re = _mm256_unpackhi_pd(t_re, u_re);
 1356                         tu1_im = _mm256_unpacklo_pd(t_im, u_im);
 1357                         tu2_im = _mm256_unpackhi_pd(t_im, u_im);
 1358                         _mm256_storeu_pd(&f[(u << 1)].v,
 1359                                 _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
 1360                         _mm256_storeu_pd(&f[(u << 1) + 4].v,
 1361                                 _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
 1362                         _mm256_storeu_pd(&f[(u << 1) + hn].v,
 1363                                 _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
 1364                         _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
 1365                                 _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
 1366                 }
 1367         } else {
 1368                 f[0] = f0[0];
 1369                 f[hn] = f1[0];
 1370 
 1371                 for (u = 0; u < qn; u ++) {
 1372                         fpr a_re, a_im, b_re, b_im;
 1373                         fpr t_re, t_im;
 1374 
 1375                         a_re = f0[u];
 1376                         a_im = f0[u + qn];
 1377                         FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
 1378                                 fpr_gm_tab[((u + hn) << 1) + 0],
 1379                                 fpr_gm_tab[((u + hn) << 1) + 1]);
 1380                         FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
 1381                         f[(u << 1) + 0] = t_re;
 1382                         f[(u << 1) + 0 + hn] = t_im;
 1383                         FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
 1384                         f[(u << 1) + 1] = t_re;
 1385                         f[(u << 1) + 1 + hn] = t_im;
 1386                 }
 1387         }
 1388 #else // yyyAVX2+0
 1389         /*
 1390          * An extra copy to handle the special case logn = 1.
 1391          */
 1392         f[0] = f0[0];
 1393         f[hn] = f1[0];
 1394 
 1395         for (u = 0; u < qn; u ++) {
 1396                 fpr a_re, a_im, b_re, b_im;
 1397                 fpr t_re, t_im;
 1398 
 1399                 a_re = f0[u];
 1400                 a_im = f0[u + qn];
 1401                 FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
 1402                         fpr_gm_tab[((u + hn) << 1) + 0],
 1403                         fpr_gm_tab[((u + hn) << 1) + 1]);
 1404                 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
 1405                 f[(u << 1) + 0] = t_re;
 1406                 f[(u << 1) + 0 + hn] = t_im;
 1407                 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
 1408                 f[(u << 1) + 1] = t_re;
 1409                 f[(u << 1) + 1 + hn] = t_im;
 1410         }
 1411 #endif // yyyAVX2-
 1412 }
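
/*
 * Illustrative sketch (hypothetical helper, not taken from the
 * reference code): Zf(poly_split_fft) and Zf(poly_merge_fft) are
 * inverses of each other. t0 and t1 are scratch buffers of
 * 2^(logn-1) slots each; f, in FFT representation, is left unchanged
 * (up to floating-point rounding) by the two calls.
 */
static void
example_split_merge(fpr *f, fpr *t0, fpr *t1, unsigned logn)
{
        Zf(poly_split_fft)(t0, t1, f, logn);
        Zf(poly_merge_fft)(f, t0, t1, logn);
}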