fft.c
1 /*
2 * FFT code.
3 *
4 * ==========================(LICENSE BEGIN)============================
5 *
6 * Copyright (c) 2017-2019 Falcon Project
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining
9 * a copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be
17 * included in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * ===========================(LICENSE END)=============================
28 *
29 * @author Thomas Pornin <thomas.pornin@nccgroup.com>
30 */
31
32 #include "inner.h"
33
34 /*
35 * Rules for complex number macros:
36 * --------------------------------
37 *
38 * Operand order is: destination, source1, source2...
39 *
40 * Each operand is a real and an imaginary part.
41 *
42 * All overlaps are allowed.
43 */
44
45 /*
46 * Addition of two complex numbers (d = a + b).
47 */
48 #define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
49 fpr fpct_re, fpct_im; \
50 fpct_re = fpr_add(a_re, b_re); \
51 fpct_im = fpr_add(a_im, b_im); \
52 (d_re) = fpct_re; \
53 (d_im) = fpct_im; \
54 } while (0)
55
56 /*
57 * Subtraction of two complex numbers (d = a - b).
58 */
59 #define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
60 fpr fpct_re, fpct_im; \
61 fpct_re = fpr_sub(a_re, b_re); \
62 fpct_im = fpr_sub(a_im, b_im); \
63 (d_re) = fpct_re; \
64 (d_im) = fpct_im; \
65 } while (0)
66
67 /*
68 * Multiplication of two complex numbers (d = a * b).
69 */
70 #define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
71 fpr fpct_a_re, fpct_a_im; \
72 fpr fpct_b_re, fpct_b_im; \
73 fpr fpct_d_re, fpct_d_im; \
74 fpct_a_re = (a_re); \
75 fpct_a_im = (a_im); \
76 fpct_b_re = (b_re); \
77 fpct_b_im = (b_im); \
78 fpct_d_re = fpr_sub( \
79 fpr_mul(fpct_a_re, fpct_b_re), \
80 fpr_mul(fpct_a_im, fpct_b_im)); \
81 fpct_d_im = fpr_add( \
82 fpr_mul(fpct_a_re, fpct_b_im), \
83 fpr_mul(fpct_a_im, fpct_b_re)); \
84 (d_re) = fpct_d_re; \
85 (d_im) = fpct_d_im; \
86 } while (0)
87
88 /*
89 * Squaring of a complex number (d = a * a).
90 */
91 #define FPC_SQR(d_re, d_im, a_re, a_im) do { \
92 fpr fpct_a_re, fpct_a_im; \
93 fpr fpct_d_re, fpct_d_im; \
94 fpct_a_re = (a_re); \
95 fpct_a_im = (a_im); \
96 fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
97 fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
98 (d_re) = fpct_d_re; \
99 (d_im) = fpct_d_im; \
100 } while (0)
101
102 /*
103 * Inversion of a complex number (d = 1 / a).
104 */
105 #define FPC_INV(d_re, d_im, a_re, a_im) do { \
106 fpr fpct_a_re, fpct_a_im; \
107 fpr fpct_d_re, fpct_d_im; \
108 fpr fpct_m; \
109 fpct_a_re = (a_re); \
110 fpct_a_im = (a_im); \
111 fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
112 fpct_m = fpr_inv(fpct_m); \
113 fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
114 fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
115 (d_re) = fpct_d_re; \
116 (d_im) = fpct_d_im; \
117 } while (0)
118
119 /*
120 * Division of complex numbers (d = a / b).
121 */
122 #define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
123 fpr fpct_a_re, fpct_a_im; \
124 fpr fpct_b_re, fpct_b_im; \
125 fpr fpct_d_re, fpct_d_im; \
126 fpr fpct_m; \
127 fpct_a_re = (a_re); \
128 fpct_a_im = (a_im); \
129 fpct_b_re = (b_re); \
130 fpct_b_im = (b_im); \
131 fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
132 fpct_m = fpr_inv(fpct_m); \
133 fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
134 fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
135 fpct_d_re = fpr_sub( \
136 fpr_mul(fpct_a_re, fpct_b_re), \
137 fpr_mul(fpct_a_im, fpct_b_im)); \
138 fpct_d_im = fpr_add( \
139 fpr_mul(fpct_a_re, fpct_b_im), \
140 fpr_mul(fpct_a_im, fpct_b_re)); \
141 (d_re) = fpct_d_re; \
142 (d_im) = fpct_d_im; \
143 } while (0)
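
/*
 * Example: with a = 1 + 2i and b = 3 + 4i, FPC_MUL yields
 * d = -5 + 10i, and FPC_DIV yields d = (11 + 2i)/25. Since all
 * operands are copied into local temporaries before the destination
 * is written, fully overlapping calls such as
 * FPC_MUL(x_re, x_im, x_re, x_im, s_re, s_im) are safe.
 */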
144
145 /*
146 * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
147 * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
148 * of X^N+1 in the field of complex numbers. A crucial property is that
149 * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
150 *
151 * FFT representation of a polynomial f (taken modulo X^N+1) is the
152 * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
153 * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
154 * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
155 * needed. A consequence is that FFT representation has the same size
156 * as normal representation: N/2 complex numbers use N real numbers (each
157 * complex number is the combination of a real and an imaginary part).
158 *
159 * We use a specific ordering which makes computations easier. Let rev()
160 * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
161 * store the real and imaginary parts of f(w_j) in slots:
162 *
163 * Re(f(w_j)) -> slot rev(j)/2
164 * Im(f(w_j)) -> slot rev(j)/2+N/2
165 *
166 * (Note that rev(j) is even for j < N/2.)
167 */
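
/*
 * Example for N = 8 (logn = 3): rev() reverses three bits, so for
 * j = 0, 1, 2, 3 we have rev(j) = 0, 4, 2, 6. The real parts of
 * f(w_0), f(w_1), f(w_2), f(w_3) therefore go to slots 0, 2, 1, 3,
 * and the corresponding imaginary parts to slots 4, 6, 5, 7.
 */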
168
169 /* see inner.h */
170 TARGET_AVX2
171 void
172 Zf(FFT)(fpr *f, unsigned logn)
173 {
174 /*
175 * FFT algorithm in bit-reversal order uses the following
176 * iterative algorithm:
177 *
178 * t = N
179 * for m = 1; m < N; m *= 2:
180 * ht = t/2
181 * for i1 = 0; i1 < m; i1 ++:
182 * j1 = i1 * t
183 * s = GM[m + i1]
184 * for j = j1; j < (j1 + ht); j ++:
185 * x = f[j]
186 * y = s * f[j + ht]
187 * f[j] = x + y
188 * f[j + ht] = x - y
189 * t = ht
190 *
191 * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
192 *
193 * In the description above, f[] is supposed to contain complex
194 * numbers. In our in-memory representation, the real and
195 * imaginary parts of f[k] are in array slots k and k+N/2.
196 *
197 * We only keep the first half of the complex numbers. We can
198 * see that after the first iteration, the first and second halves
199 * of the array of complex numbers have separate lives, so we
200 * simply ignore the second part.
201 */
202
203 unsigned u;
204 size_t t, n, hn, m;
205
206 /*
207 * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
208 * (because GM[1] = w^rev(1) = w^(N/2) = i).
209 * In our chosen representation, this is a no-op: everything is
210 * already where it should be.
211 */
212
213 /*
214 * Subsequent iterations are truncated to use only the first
215 * half of values.
216 */
217 n = (size_t)1 << logn;
218 hn = n >> 1;
219 t = hn;
220 for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
221 size_t ht, hm, i1, j1;
222
223 ht = t >> 1;
224 hm = m >> 1;
225 for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
226 size_t j, j2;
227
228 j2 = j1 + ht;
229 #if FALCON_AVX2 // yyyAVX2+1
230 if (ht >= 4) {
231 __m256d s_re, s_im;
232
233 s_re = _mm256_set1_pd(
234 fpr_gm_tab[((m + i1) << 1) + 0].v);
235 s_im = _mm256_set1_pd(
236 fpr_gm_tab[((m + i1) << 1) + 1].v);
237 for (j = j1; j < j2; j += 4) {
238 __m256d x_re, x_im, y_re, y_im;
239 __m256d z_re, z_im;
240
241 x_re = _mm256_loadu_pd(&f[j].v);
242 x_im = _mm256_loadu_pd(&f[j + hn].v);
243 z_re = _mm256_loadu_pd(&f[j+ht].v);
244 z_im = _mm256_loadu_pd(&f[j+ht + hn].v);
245 y_re = FMSUB(z_re, s_re,
246 _mm256_mul_pd(z_im, s_im));
247 y_im = FMADD(z_re, s_im,
248 _mm256_mul_pd(z_im, s_re));
249 _mm256_storeu_pd(&f[j].v,
250 _mm256_add_pd(x_re, y_re));
251 _mm256_storeu_pd(&f[j + hn].v,
252 _mm256_add_pd(x_im, y_im));
253 _mm256_storeu_pd(&f[j + ht].v,
254 _mm256_sub_pd(x_re, y_re));
255 _mm256_storeu_pd(&f[j + ht + hn].v,
256 _mm256_sub_pd(x_im, y_im));
257 }
258 } else {
259 fpr s_re, s_im;
260
261 s_re = fpr_gm_tab[((m + i1) << 1) + 0];
262 s_im = fpr_gm_tab[((m + i1) << 1) + 1];
263 for (j = j1; j < j2; j ++) {
264 fpr x_re, x_im, y_re, y_im;
265
266 x_re = f[j];
267 x_im = f[j + hn];
268 y_re = f[j + ht];
269 y_im = f[j + ht + hn];
270 FPC_MUL(y_re, y_im,
271 y_re, y_im, s_re, s_im);
272 FPC_ADD(f[j], f[j + hn],
273 x_re, x_im, y_re, y_im);
274 FPC_SUB(f[j + ht], f[j + ht + hn],
275 x_re, x_im, y_re, y_im);
276 }
277 }
278 #else // yyyAVX2+0
279 fpr s_re, s_im;
280
281 s_re = fpr_gm_tab[((m + i1) << 1) + 0];
282 s_im = fpr_gm_tab[((m + i1) << 1) + 1];
283 for (j = j1; j < j2; j ++) {
284 fpr x_re, x_im, y_re, y_im;
285
286 x_re = f[j];
287 x_im = f[j + hn];
288 y_re = f[j + ht];
289 y_im = f[j + ht + hn];
290 FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
291 FPC_ADD(f[j], f[j + hn],
292 x_re, x_im, y_re, y_im);
293 FPC_SUB(f[j + ht], f[j + ht + hn],
294 x_re, x_im, y_re, y_im);
295 }
296 #endif // yyyAVX2-
297 }
298 t = ht;
299 }
300 }
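
/*
 * For tests, the output of Zf(FFT) can be cross-checked against a
 * naive evaluation of f at the roots w_j, taking into account the
 * bit-reversed ordering described above. A rough sketch of such an
 * oracle, using plain double and C99 complex arithmetic instead of
 * the constant-time fpr type (naive_eval is a hypothetical helper,
 * not part of this file; M_PI is assumed to be available):
 *
 *   #include <complex.h>
 *   #include <math.h>
 *   #include <stddef.h>
 *
 *   // Evaluate the real polynomial f (N coefficients) at w^(2*j+1),
 *   // with w = exp(i*pi/N), using Horner's rule.
 *   static double complex
 *   naive_eval(const double *f, size_t N, size_t j)
 *   {
 *           double complex w, x;
 *           size_t k;
 *
 *           w = cexp(I * M_PI * (double)(2 * j + 1) / (double)N);
 *           x = 0.0;
 *           for (k = N; k > 0; k --) {
 *                   x = x * w + f[k - 1];
 *           }
 *           return x;
 *   }
 */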
301
302 /* see inner.h */
303 TARGET_AVX2
304 void
305 Zf(iFFT)(fpr *f, unsigned logn)
306 {
307 /*
308 * Inverse FFT algorithm in bit-reversal order uses the following
309 * iterative algorithm:
310 *
311 * t = 1
312 * for m = N; m > 1; m /= 2:
313 * hm = m/2
314 * dt = t*2
315 * for i1 = 0; i1 < hm; i1 ++:
316 * j1 = i1 * dt
317 * s = iGM[hm + i1]
318 * for j = j1; j < (j1 + t); j ++:
319 * x = f[j]
320 * y = f[j + t]
321 * f[j] = x + y
322 * f[j + t] = s * (x - y)
323 * t = dt
324 * for i1 = 0; i1 < N; i1 ++:
325 * f[i1] = f[i1] / N
326 *
327 * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
328 * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
329 *
330 * In the main loop (not counting the final division loop), in
331 * all iterations except the last, the first and second half of f[]
332 * (as an array of complex numbers) are separate. In our chosen
333 * representation, we do not keep the second half.
334 *
335 * The last iteration recombines the recomputed half with the
336 * implicit half, and should yield only real numbers since the
337 * target polynomial is real; moreover, s = -i at that step.
338 * Thus, when considering x and y:
339 * y = conj(x) since the final f[j] must be real
340 * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
341 * filled with 2*Im(x).
342 * But we already have Re(x) and Im(x) in array slots j and j+t
343 * in our chosen representation. That last iteration is thus a
344 * simple doubling of the values in all the array.
345 *
346 * We therefore skip that last iteration; its doubling effect is
347 * obtained by dividing by N/2 instead of N at the end.
348 */
349 size_t u, n, hn, t, m;
350
351 n = (size_t)1 << logn;
352 t = 1;
353 m = n;
354 hn = n >> 1;
355 for (u = logn; u > 1; u --) {
356 size_t hm, dt, i1, j1;
357
358 hm = m >> 1;
359 dt = t << 1;
360 for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
361 size_t j, j2;
362
363 j2 = j1 + t;
364 #if FALCON_AVX2 // yyyAVX2+1
365 if (t >= 4) {
366 __m256d s_re, s_im;
367
368 s_re = _mm256_set1_pd(
369 fpr_gm_tab[((hm + i1) << 1) + 0].v);
370 s_im = _mm256_set1_pd(
371 fpr_gm_tab[((hm + i1) << 1) + 1].v);
372 for (j = j1; j < j2; j += 4) {
373 __m256d x_re, x_im, y_re, y_im;
374 __m256d z_re, z_im;
375
376 x_re = _mm256_loadu_pd(&f[j].v);
377 x_im = _mm256_loadu_pd(&f[j + hn].v);
378 y_re = _mm256_loadu_pd(&f[j+t].v);
379 y_im = _mm256_loadu_pd(&f[j+t + hn].v);
380 _mm256_storeu_pd(&f[j].v,
381 _mm256_add_pd(x_re, y_re));
382 _mm256_storeu_pd(&f[j + hn].v,
383 _mm256_add_pd(x_im, y_im));
384 x_re = _mm256_sub_pd(y_re, x_re);
385 x_im = _mm256_sub_pd(x_im, y_im);
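/*
 * At this point x_re holds -(x - y)_re while x_im holds
 * (x - y)_im; the FMSUB/FMADD below thus compute
 * (x - y) * conj(s) without explicitly negating s_im.
 */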
386 z_re = FMSUB(x_im, s_im,
387 _mm256_mul_pd(x_re, s_re));
388 z_im = FMADD(x_re, s_im,
389 _mm256_mul_pd(x_im, s_re));
390 _mm256_storeu_pd(&f[j+t].v, z_re);
391 _mm256_storeu_pd(&f[j+t + hn].v, z_im);
392 }
393 } else {
394 fpr s_re, s_im;
395
396 s_re = fpr_gm_tab[((hm + i1) << 1)+0];
397 s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1)+1]);
398 for (j = j1; j < j2; j ++) {
399 fpr x_re, x_im, y_re, y_im;
400
401 x_re = f[j];
402 x_im = f[j + hn];
403 y_re = f[j + t];
404 y_im = f[j + t + hn];
405 FPC_ADD(f[j], f[j + hn],
406 x_re, x_im, y_re, y_im);
407 FPC_SUB(x_re, x_im,
408 x_re, x_im, y_re, y_im);
409 FPC_MUL(f[j + t], f[j + t + hn],
410 x_re, x_im, s_re, s_im);
411 }
412 }
413 #else // yyyAVX2+0
414 fpr s_re, s_im;
415
416 s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
417 s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
418 for (j = j1; j < j2; j ++) {
419 fpr x_re, x_im, y_re, y_im;
420
421 x_re = f[j];
422 x_im = f[j + hn];
423 y_re = f[j + t];
424 y_im = f[j + t + hn];
425 FPC_ADD(f[j], f[j + hn],
426 x_re, x_im, y_re, y_im);
427 FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
428 FPC_MUL(f[j + t], f[j + t + hn],
429 x_re, x_im, s_re, s_im);
430 }
431 #endif // yyyAVX2-
432 }
433 t = dt;
434 m = hm;
435 }
436
437 /*
438 * Last iteration is a no-op, provided that we divide by N/2
439 * instead of N. We need to make a special case for logn = 0.
440 */
441 if (logn > 0) {
442 fpr ni;
443
444 ni = fpr_p2_tab[logn];
445 for (u = 0; u < n; u ++) {
446 f[u] = fpr_mul(f[u], ni);
447 }
448 }
449 }
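
/*
 * Usage note: Zf(iFFT) is the inverse of Zf(FFT): applying
 *
 *   Zf(FFT)(f, logn);
 *   Zf(iFFT)(f, logn);
 *
 * returns the original coefficients of f, up to floating-point
 * rounding. Operations on polynomials are typically performed
 * between the two calls, in FFT representation.
 */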
450
451 /* see inner.h */
452 TARGET_AVX2
453 void
454 Zf(poly_add)(
455 fpr *restrict a, const fpr *restrict b, unsigned logn)
456 {
457 size_t n, u;
458
459 n = (size_t)1 << logn;
460 #if FALCON_AVX2 // yyyAVX2+1
461 if (n >= 4) {
462 for (u = 0; u < n; u += 4) {
463 _mm256_storeu_pd(&a[u].v,
464 _mm256_add_pd(
465 _mm256_loadu_pd(&a[u].v),
466 _mm256_loadu_pd(&b[u].v)));
467 }
468 } else {
469 for (u = 0; u < n; u ++) {
470 a[u] = fpr_add(a[u], b[u]);
471 }
472 }
473 #else // yyyAVX2+0
474 for (u = 0; u < n; u ++) {
475 a[u] = fpr_add(a[u], b[u]);
476 }
477 #endif // yyyAVX2-
478 }
479
480 /* see inner.h */
481 TARGET_AVX2
482 void
483 Zf(poly_sub)(
484 fpr *restrict a, const fpr *restrict b, unsigned logn)
485 {
486 size_t n, u;
487
488 n = (size_t)1 << logn;
489 #if FALCON_AVX2 // yyyAVX2+1
490 if (n >= 4) {
491 for (u = 0; u < n; u += 4) {
492 _mm256_storeu_pd(&a[u].v,
493 _mm256_sub_pd(
494 _mm256_loadu_pd(&a[u].v),
495 _mm256_loadu_pd(&b[u].v)));
496 }
497 } else {
498 for (u = 0; u < n; u ++) {
499 a[u] = fpr_sub(a[u], b[u]);
500 }
501 }
502 #else // yyyAVX2+0
503 for (u = 0; u < n; u ++) {
504 a[u] = fpr_sub(a[u], b[u]);
505 }
506 #endif // yyyAVX2-
507 }
508
509 /* see inner.h */
510 TARGET_AVX2
511 void
512 Zf(poly_neg)(fpr *a, unsigned logn)
513 {
514 size_t n, u;
515
516 n = (size_t)1 << logn;
517 #if FALCON_AVX2 // yyyAVX2+1
518 if (n >= 4) {
519 __m256d s;
520
521 s = _mm256_set1_pd(-0.0);
522 for (u = 0; u < n; u += 4) {
523 _mm256_storeu_pd(&a[u].v,
524 _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
525 }
526 } else {
527 for (u = 0; u < n; u ++) {
528 a[u] = fpr_neg(a[u]);
529 }
530 }
531 #else // yyyAVX2+0
532 for (u = 0; u < n; u ++) {
533 a[u] = fpr_neg(a[u]);
534 }
535 #endif // yyyAVX2-
536 }
537
538 /* see inner.h */
539 TARGET_AVX2
540 void
541 Zf(poly_adj_fft)(fpr *a, unsigned logn)
542 {
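/*
 * The adjoint satisfies adj(a)(w_j) = conj(a(w_j)); conjugating every
 * FFT coefficient amounts, in our split representation, to negating
 * the imaginary halves, i.e. slots n/2 to n-1.
 */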
543 size_t n, u;
544
545 n = (size_t)1 << logn;
546 #if FALCON_AVX2 // yyyAVX2+1
547 if (n >= 8) {
548 __m256d s;
549
550 s = _mm256_set1_pd(-0.0);
551 for (u = (n >> 1); u < n; u += 4) {
552 _mm256_storeu_pd(&a[u].v,
553 _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
554 }
555 } else {
556 for (u = (n >> 1); u < n; u ++) {
557 a[u] = fpr_neg(a[u]);
558 }
559 }
560 #else // yyyAVX2+0
561 for (u = (n >> 1); u < n; u ++) {
562 a[u] = fpr_neg(a[u]);
563 }
564 #endif // yyyAVX2-
565 }
566
567 /* see inner.h */
568 TARGET_AVX2
569 void
570 Zf(poly_mul_fft)(
571 fpr *restrict a, const fpr *restrict b, unsigned logn)
572 {
573 size_t n, hn, u;
574
575 n = (size_t)1 << logn;
576 hn = n >> 1;
577 #if FALCON_AVX2 // yyyAVX2+1
578 if (n >= 8) {
579 for (u = 0; u < hn; u += 4) {
580 __m256d a_re, a_im, b_re, b_im, c_re, c_im;
581
582 a_re = _mm256_loadu_pd(&a[u].v);
583 a_im = _mm256_loadu_pd(&a[u + hn].v);
584 b_re = _mm256_loadu_pd(&b[u].v);
585 b_im = _mm256_loadu_pd(&b[u + hn].v);
586 c_re = FMSUB(
587 a_re, b_re, _mm256_mul_pd(a_im, b_im));
588 c_im = FMADD(
589 a_re, b_im, _mm256_mul_pd(a_im, b_re));
590 _mm256_storeu_pd(&a[u].v, c_re);
591 _mm256_storeu_pd(&a[u + hn].v, c_im);
592 }
593 } else {
594 for (u = 0; u < hn; u ++) {
595 fpr a_re, a_im, b_re, b_im;
596
597 a_re = a[u];
598 a_im = a[u + hn];
599 b_re = b[u];
600 b_im = b[u + hn];
601 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
602 }
603 }
604 #else // yyyAVX2+0
605 for (u = 0; u < hn; u ++) {
606 fpr a_re, a_im, b_re, b_im;
607
608 a_re = a[u];
609 a_im = a[u + hn];
610 b_re = b[u];
611 b_im = b[u + hn];
612 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
613 }
614 #endif // yyyAVX2-
615 }
616
617 /* see inner.h */
618 TARGET_AVX2
619 void
620 Zf(poly_muladj_fft)(
621 fpr *restrict a, const fpr *restrict b, unsigned logn)
622 {
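/*
 * Compute a <- a*adj(b): coefficient-wise multiplication of a by the
 * conjugate of b, both in FFT representation.
 */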
623 size_t n, hn, u;
624
625 n = (size_t)1 << logn;
626 hn = n >> 1;
627 #if FALCON_AVX2 // yyyAVX2+1
628 if (n >= 8) {
629 for (u = 0; u < hn; u += 4) {
630 __m256d a_re, a_im, b_re, b_im, c_re, c_im;
631
632 a_re = _mm256_loadu_pd(&a[u].v);
633 a_im = _mm256_loadu_pd(&a[u + hn].v);
634 b_re = _mm256_loadu_pd(&b[u].v);
635 b_im = _mm256_loadu_pd(&b[u + hn].v);
636 c_re = FMADD(
637 a_re, b_re, _mm256_mul_pd(a_im, b_im));
638 c_im = FMSUB(
639 a_im, b_re, _mm256_mul_pd(a_re, b_im));
640 _mm256_storeu_pd(&a[u].v, c_re);
641 _mm256_storeu_pd(&a[u + hn].v, c_im);
642 }
643 } else {
644 for (u = 0; u < hn; u ++) {
645 fpr a_re, a_im, b_re, b_im;
646
647 a_re = a[u];
648 a_im = a[u + hn];
649 b_re = b[u];
650 b_im = fpr_neg(b[u + hn]);
651 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
652 }
653 }
654 #else // yyyAVX2+0
655 for (u = 0; u < hn; u ++) {
656 fpr a_re, a_im, b_re, b_im;
657
658 a_re = a[u];
659 a_im = a[u + hn];
660 b_re = b[u];
661 b_im = fpr_neg(b[u + hn]);
662 FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
663 }
664 #endif // yyyAVX2-
665 }
666
667 /* see inner.h */
668 TARGET_AVX2
669 void
670 Zf(poly_mulselfadj_fft)(fpr *a, unsigned logn)
671 {
672 /*
673 * Since each coefficient is multiplied by its own conjugate,
674 * the result contains only real values.
675 */
676 size_t n, hn, u;
677
678 n = (size_t)1 << logn;
679 hn = n >> 1;
680 #if FALCON_AVX2 // yyyAVX2+1
681 if (n >= 8) {
682 __m256d zero;
683
684 zero = _mm256_setzero_pd();
685 for (u = 0; u < hn; u += 4) {
686 __m256d a_re, a_im;
687
688 a_re = _mm256_loadu_pd(&a[u].v);
689 a_im = _mm256_loadu_pd(&a[u + hn].v);
690 _mm256_storeu_pd(&a[u].v,
691 FMADD(a_re, a_re,
692 _mm256_mul_pd(a_im, a_im)));
693 _mm256_storeu_pd(&a[u + hn].v, zero);
694 }
695 } else {
696 for (u = 0; u < hn; u ++) {
697 fpr a_re, a_im;
698
699 a_re = a[u];
700 a_im = a[u + hn];
701 a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
702 a[u + hn] = fpr_zero;
703 }
704 }
705 #else // yyyAVX2+0
706 for (u = 0; u < hn; u ++) {
707 fpr a_re, a_im;
708
709 a_re = a[u];
710 a_im = a[u + hn];
711 a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
712 a[u + hn] = fpr_zero;
713 }
714 #endif // yyyAVX2-
715 }
716
717 /* see inner.h */
718 TARGET_AVX2
719 void
720 Zf(poly_mulconst)(fpr *a, fpr x, unsigned logn)
721 {
722 size_t n, u;
723
724 n = (size_t)1 << logn;
725 #if FALCON_AVX2 // yyyAVX2+1
726 if (n >= 4) {
727 __m256d x4;
728
729 x4 = _mm256_set1_pd(x.v);
730 for (u = 0; u < n; u += 4) {
731 _mm256_storeu_pd(&a[u].v,
732 _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
733 }
734 } else {
735 for (u = 0; u < n; u ++) {
736 a[u] = fpr_mul(a[u], x);
737 }
738 }
739 #else // yyyAVX2+0
740 for (u = 0; u < n; u ++) {
741 a[u] = fpr_mul(a[u], x);
742 }
743 #endif // yyyAVX2-
744 }
745
746 /* see inner.h */
747 TARGET_AVX2
748 void
749 Zf(poly_div_fft)(
750 fpr *restrict a, const fpr *restrict b, unsigned logn)
751 {
752 size_t n, hn, u;
753
754 n = (size_t)1 << logn;
755 hn = n >> 1;
756 #if FALCON_AVX2 // yyyAVX2+1
757 if (n >= 8) {
758 __m256d one;
759
760 one = _mm256_set1_pd(1.0);
761 for (u = 0; u < hn; u += 4) {
762 __m256d a_re, a_im, b_re, b_im, c_re, c_im, t;
763
764 a_re = _mm256_loadu_pd(&a[u].v);
765 a_im = _mm256_loadu_pd(&a[u + hn].v);
766 b_re = _mm256_loadu_pd(&b[u].v);
767 b_im = _mm256_loadu_pd(&b[u + hn].v);
768 t = _mm256_div_pd(one,
769 FMADD(b_re, b_re,
770 _mm256_mul_pd(b_im, b_im)));
771 b_re = _mm256_mul_pd(b_re, t);
772 b_im = _mm256_mul_pd(b_im, t);
773 c_re = FMADD(
774 a_re, b_re, _mm256_mul_pd(a_im, b_im));
775 c_im = FMSUB(
776 a_im, b_re, _mm256_mul_pd(a_re, b_im));
777 _mm256_storeu_pd(&a[u].v, c_re);
778 _mm256_storeu_pd(&a[u + hn].v, c_im);
779 }
780 } else {
781 for (u = 0; u < hn; u ++) {
782 fpr a_re, a_im, b_re, b_im;
783
784 a_re = a[u];
785 a_im = a[u + hn];
786 b_re = b[u];
787 b_im = b[u + hn];
788 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
789 }
790 }
791 #else // yyyAVX2+0
792 for (u = 0; u < hn; u ++) {
793 fpr a_re, a_im, b_re, b_im;
794
795 a_re = a[u];
796 a_im = a[u + hn];
797 b_re = b[u];
798 b_im = b[u + hn];
799 FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
800 }
801 #endif // yyyAVX2-
802 }
803
804 /* see inner.h */
805 TARGET_AVX2
806 void
807 Zf(poly_invnorm2_fft)(fpr *restrict d,
808 const fpr *restrict a, const fpr *restrict b, unsigned logn)
809 {
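/*
 * For each FFT slot, compute d = 1/(|a|^2 + |b|^2). The result is
 * self-adjoint (purely real in FFT representation), so only the
 * first n/2 slots of d are written.
 */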
810 size_t n, hn, u;
811
812 n = (size_t)1 << logn;
813 hn = n >> 1;
814 #if FALCON_AVX2 // yyyAVX2+1
815 if (n >= 8) {
816 __m256d one;
817
818 one = _mm256_set1_pd(1.0);
819 for (u = 0; u < hn; u += 4) {
820 __m256d a_re, a_im, b_re, b_im, dv;
821
822 a_re = _mm256_loadu_pd(&a[u].v);
823 a_im = _mm256_loadu_pd(&a[u + hn].v);
824 b_re = _mm256_loadu_pd(&b[u].v);
825 b_im = _mm256_loadu_pd(&b[u + hn].v);
826 dv = _mm256_div_pd(one,
827 _mm256_add_pd(
828 FMADD(a_re, a_re,
829 _mm256_mul_pd(a_im, a_im)),
830 FMADD(b_re, b_re,
831 _mm256_mul_pd(b_im, b_im))));
832 _mm256_storeu_pd(&d[u].v, dv);
833 }
834 } else {
835 for (u = 0; u < hn; u ++) {
836 fpr a_re, a_im;
837 fpr b_re, b_im;
838
839 a_re = a[u];
840 a_im = a[u + hn];
841 b_re = b[u];
842 b_im = b[u + hn];
843 d[u] = fpr_inv(fpr_add(
844 fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
845 fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
846 }
847 }
848 #else // yyyAVX2+0
849 for (u = 0; u < hn; u ++) {
850 fpr a_re, a_im;
851 fpr b_re, b_im;
852
853 a_re = a[u];
854 a_im = a[u + hn];
855 b_re = b[u];
856 b_im = b[u + hn];
857 d[u] = fpr_inv(fpr_add(
858 fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
859 fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
860 }
861 #endif // yyyAVX2-
862 }
863
864 /* see inner.h */
865 TARGET_AVX2
866 void
867 Zf(poly_add_muladj_fft)(fpr *restrict d,
868 const fpr *restrict F, const fpr *restrict G,
869 const fpr *restrict f, const fpr *restrict g, unsigned logn)
870 {
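/*
 * Compute d <- F*adj(f) + G*adj(g), all operands being in FFT
 * representation.
 */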
871 size_t n, hn, u;
872
873 n = (size_t)1 << logn;
874 hn = n >> 1;
875 #if FALCON_AVX2 // yyyAVX2+1
876 if (n >= 8) {
877 for (u = 0; u < hn; u += 4) {
878 __m256d F_re, F_im, G_re, G_im;
879 __m256d f_re, f_im, g_re, g_im;
880 __m256d a_re, a_im, b_re, b_im;
881
882 F_re = _mm256_loadu_pd(&F[u].v);
883 F_im = _mm256_loadu_pd(&F[u + hn].v);
884 G_re = _mm256_loadu_pd(&G[u].v);
885 G_im = _mm256_loadu_pd(&G[u + hn].v);
886 f_re = _mm256_loadu_pd(&f[u].v);
887 f_im = _mm256_loadu_pd(&f[u + hn].v);
888 g_re = _mm256_loadu_pd(&g[u].v);
889 g_im = _mm256_loadu_pd(&g[u + hn].v);
890
891 a_re = FMADD(F_re, f_re,
892 _mm256_mul_pd(F_im, f_im));
893 a_im = FMSUB(F_im, f_re,
894 _mm256_mul_pd(F_re, f_im));
895 b_re = FMADD(G_re, g_re,
896 _mm256_mul_pd(G_im, g_im));
897 b_im = FMSUB(G_im, g_re,
898 _mm256_mul_pd(G_re, g_im));
899 _mm256_storeu_pd(&d[u].v,
900 _mm256_add_pd(a_re, b_re));
901 _mm256_storeu_pd(&d[u + hn].v,
902 _mm256_add_pd(a_im, b_im));
903 }
904 } else {
905 for (u = 0; u < hn; u ++) {
906 fpr F_re, F_im, G_re, G_im;
907 fpr f_re, f_im, g_re, g_im;
908 fpr a_re, a_im, b_re, b_im;
909
910 F_re = F[u];
911 F_im = F[u + hn];
912 G_re = G[u];
913 G_im = G[u + hn];
914 f_re = f[u];
915 f_im = f[u + hn];
916 g_re = g[u];
917 g_im = g[u + hn];
918
919 FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
920 FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
921 d[u] = fpr_add(a_re, b_re);
922 d[u + hn] = fpr_add(a_im, b_im);
923 }
924 }
925 #else // yyyAVX2+0
926 for (u = 0; u < hn; u ++) {
927 fpr F_re, F_im, G_re, G_im;
928 fpr f_re, f_im, g_re, g_im;
929 fpr a_re, a_im, b_re, b_im;
930
931 F_re = F[u];
932 F_im = F[u + hn];
933 G_re = G[u];
934 G_im = G[u + hn];
935 f_re = f[u];
936 f_im = f[u + hn];
937 g_re = g[u];
938 g_im = g[u + hn];
939
940 FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
941 FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
942 d[u] = fpr_add(a_re, b_re);
943 d[u + hn] = fpr_add(a_im, b_im);
944 }
945 #endif // yyyAVX2-
946 }
947
948 /* see inner.h */
949 TARGET_AVX2
950 void
951 Zf(poly_mul_autoadj_fft)(
952 fpr *restrict a, const fpr *restrict b, unsigned logn)
953 {
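/*
 * b is self-adjoint: its FFT coefficients are real, so only its
 * first n/2 slots are read and each complex multiplication reduces
 * to two real multiplications.
 */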
954 size_t n, hn, u;
955
956 n = (size_t)1 << logn;
957 hn = n >> 1;
958 #if FALCON_AVX2 // yyyAVX2+1
959 if (n >= 8) {
960 for (u = 0; u < hn; u += 4) {
961 __m256d a_re, a_im, bv;
962
963 a_re = _mm256_loadu_pd(&a[u].v);
964 a_im = _mm256_loadu_pd(&a[u + hn].v);
965 bv = _mm256_loadu_pd(&b[u].v);
966 _mm256_storeu_pd(&a[u].v,
967 _mm256_mul_pd(a_re, bv));
968 _mm256_storeu_pd(&a[u + hn].v,
969 _mm256_mul_pd(a_im, bv));
970 }
971 } else {
972 for (u = 0; u < hn; u ++) {
973 a[u] = fpr_mul(a[u], b[u]);
974 a[u + hn] = fpr_mul(a[u + hn], b[u]);
975 }
976 }
977 #else // yyyAVX2+0
978 for (u = 0; u < hn; u ++) {
979 a[u] = fpr_mul(a[u], b[u]);
980 a[u + hn] = fpr_mul(a[u + hn], b[u]);
981 }
982 #endif // yyyAVX2-
983 }
984
985 /* see inner.h */
986 TARGET_AVX2
987 void
988 Zf(poly_div_autoadj_fft)(
989 fpr *restrict a, const fpr *restrict b, unsigned logn)
990 {
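/*
 * Same as Zf(poly_mul_autoadj_fft), except that a is divided by the
 * (purely real) FFT coefficients of the self-adjoint polynomial b.
 */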
991 size_t n, hn, u;
992
993 n = (size_t)1 << logn;
994 hn = n >> 1;
995 #if FALCON_AVX2 // yyyAVX2+1
996 if (n >= 8) {
997 __m256d one;
998
999 one = _mm256_set1_pd(1.0);
1000 for (u = 0; u < hn; u += 4) {
1001 __m256d ib, a_re, a_im;
1002
1003 ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
1004 a_re = _mm256_loadu_pd(&a[u].v);
1005 a_im = _mm256_loadu_pd(&a[u + hn].v);
1006 _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
1007 _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
1008 }
1009 } else {
1010 for (u = 0; u < hn; u ++) {
1011 fpr ib;
1012
1013 ib = fpr_inv(b[u]);
1014 a[u] = fpr_mul(a[u], ib);
1015 a[u + hn] = fpr_mul(a[u + hn], ib);
1016 }
1017 }
1018 #else // yyyAVX2+0
1019 for (u = 0; u < hn; u ++) {
1020 fpr ib;
1021
1022 ib = fpr_inv(b[u]);
1023 a[u] = fpr_mul(a[u], ib);
1024 a[u + hn] = fpr_mul(a[u + hn], ib);
1025 }
1026 #endif // yyyAVX2-
1027 }
1028
1029 /* see inner.h */
1030 TARGET_AVX2
1031 void
1032 Zf(poly_LDL_fft)(
1033 const fpr *restrict g00,
1034 fpr *restrict g01, fpr *restrict g11, unsigned logn)
1035 {
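/*
 * LDL decomposition of the self-adjoint matrix
 * [[g00, g01], [adj(g01), g11]]: d00 = g00 is left untouched,
 * l10 = adj(g01)/g00 overwrites g01, and d11 = g11 - l10*adj(l10)*g00
 * overwrites g11.
 */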
1036 size_t n, hn, u;
1037
1038 n = (size_t)1 << logn;
1039 hn = n >> 1;
1040 #if FALCON_AVX2 // yyyAVX2+1
1041 if (n >= 8) {
1042 __m256d one;
1043
1044 one = _mm256_set1_pd(1.0);
1045 for (u = 0; u < hn; u += 4) {
1046 __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
1047 __m256d t, mu_re, mu_im, xi_re, xi_im;
1048
1049 g00_re = _mm256_loadu_pd(&g00[u].v);
1050 g00_im = _mm256_loadu_pd(&g00[u + hn].v);
1051 g01_re = _mm256_loadu_pd(&g01[u].v);
1052 g01_im = _mm256_loadu_pd(&g01[u + hn].v);
1053 g11_re = _mm256_loadu_pd(&g11[u].v);
1054 g11_im = _mm256_loadu_pd(&g11[u + hn].v);
1055
1056 t = _mm256_div_pd(one,
1057 FMADD(g00_re, g00_re,
1058 _mm256_mul_pd(g00_im, g00_im)));
1059 g00_re = _mm256_mul_pd(g00_re, t);
1060 g00_im = _mm256_mul_pd(g00_im, t);
1061 mu_re = FMADD(g01_re, g00_re,
1062 _mm256_mul_pd(g01_im, g00_im));
1063 mu_im = FMSUB(g01_re, g00_im,
1064 _mm256_mul_pd(g01_im, g00_re));
1065 xi_re = FMSUB(mu_re, g01_re,
1066 _mm256_mul_pd(mu_im, g01_im));
1067 xi_im = FMADD(mu_im, g01_re,
1068 _mm256_mul_pd(mu_re, g01_im));
1069 _mm256_storeu_pd(&g11[u].v,
1070 _mm256_sub_pd(g11_re, xi_re));
1071 _mm256_storeu_pd(&g11[u + hn].v,
1072 _mm256_add_pd(g11_im, xi_im));
1073 _mm256_storeu_pd(&g01[u].v, mu_re);
1074 _mm256_storeu_pd(&g01[u + hn].v, mu_im);
1075 }
1076 } else {
1077 for (u = 0; u < hn; u ++) {
1078 fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
1079 fpr mu_re, mu_im;
1080
1081 g00_re = g00[u];
1082 g00_im = g00[u + hn];
1083 g01_re = g01[u];
1084 g01_im = g01[u + hn];
1085 g11_re = g11[u];
1086 g11_im = g11[u + hn];
1087 FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
1088 FPC_MUL(g01_re, g01_im,
1089 mu_re, mu_im, g01_re, fpr_neg(g01_im));
1090 FPC_SUB(g11[u], g11[u + hn],
1091 g11_re, g11_im, g01_re, g01_im);
1092 g01[u] = mu_re;
1093 g01[u + hn] = fpr_neg(mu_im);
1094 }
1095 }
1096 #else // yyyAVX2+0
1097 for (u = 0; u < hn; u ++) {
1098 fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
1099 fpr mu_re, mu_im;
1100
1101 g00_re = g00[u];
1102 g00_im = g00[u + hn];
1103 g01_re = g01[u];
1104 g01_im = g01[u + hn];
1105 g11_re = g11[u];
1106 g11_im = g11[u + hn];
1107 FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
1108 FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
1109 FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
1110 g01[u] = mu_re;
1111 g01[u + hn] = fpr_neg(mu_im);
1112 }
1113 #endif // yyyAVX2-
1114 }
1115
1116 /* see inner.h */
1117 TARGET_AVX2
1118 void
1119 Zf(poly_LDLmv_fft)(
1120 fpr *restrict d11, fpr *restrict l10,
1121 const fpr *restrict g00, const fpr *restrict g01,
1122 const fpr *restrict g11, unsigned logn)
1123 {
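/*
 * Same LDL decomposition as Zf(poly_LDL_fft), except that d11 and
 * l10 are written to separate output buffers instead of overwriting
 * g11 and g01.
 */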
1124 size_t n, hn, u;
1125
1126 n = (size_t)1 << logn;
1127 hn = n >> 1;
1128 #if FALCON_AVX2 // yyyAVX2+1
1129 if (n >= 8) {
1130 __m256d one;
1131
1132 one = _mm256_set1_pd(1.0);
1133 for (u = 0; u < hn; u += 4) {
1134 __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
1135 __m256d t, mu_re, mu_im, xi_re, xi_im;
1136
1137 g00_re = _mm256_loadu_pd(&g00[u].v);
1138 g00_im = _mm256_loadu_pd(&g00[u + hn].v);
1139 g01_re = _mm256_loadu_pd(&g01[u].v);
1140 g01_im = _mm256_loadu_pd(&g01[u + hn].v);
1141 g11_re = _mm256_loadu_pd(&g11[u].v);
1142 g11_im = _mm256_loadu_pd(&g11[u + hn].v);
1143
1144 t = _mm256_div_pd(one,
1145 FMADD(g00_re, g00_re,
1146 _mm256_mul_pd(g00_im, g00_im)));
1147 g00_re = _mm256_mul_pd(g00_re, t);
1148 g00_im = _mm256_mul_pd(g00_im, t);
1149 mu_re = FMADD(g01_re, g00_re,
1150 _mm256_mul_pd(g01_im, g00_im));
1151 mu_im = FMSUB(g01_re, g00_im,
1152 _mm256_mul_pd(g01_im, g00_re));
1153 xi_re = FMSUB(mu_re, g01_re,
1154 _mm256_mul_pd(mu_im, g01_im));
1155 xi_im = FMADD(mu_im, g01_re,
1156 _mm256_mul_pd(mu_re, g01_im));
1157 _mm256_storeu_pd(&d11[u].v,
1158 _mm256_sub_pd(g11_re, xi_re));
1159 _mm256_storeu_pd(&d11[u + hn].v,
1160 _mm256_add_pd(g11_im, xi_im));
1161 _mm256_storeu_pd(&l10[u].v, mu_re);
1162 _mm256_storeu_pd(&l10[u + hn].v, mu_im);
1163 }
1164 } else {
1165 for (u = 0; u < hn; u ++) {
1166 fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
1167 fpr mu_re, mu_im;
1168
1169 g00_re = g00[u];
1170 g00_im = g00[u + hn];
1171 g01_re = g01[u];
1172 g01_im = g01[u + hn];
1173 g11_re = g11[u];
1174 g11_im = g11[u + hn];
1175 FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
1176 FPC_MUL(g01_re, g01_im,
1177 mu_re, mu_im, g01_re, fpr_neg(g01_im));
1178 FPC_SUB(d11[u], d11[u + hn],
1179 g11_re, g11_im, g01_re, g01_im);
1180 l10[u] = mu_re;
1181 l10[u + hn] = fpr_neg(mu_im);
1182 }
1183 }
1184 #else // yyyAVX2+0
1185 for (u = 0; u < hn; u ++) {
1186 fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
1187 fpr mu_re, mu_im;
1188
1189 g00_re = g00[u];
1190 g00_im = g00[u + hn];
1191 g01_re = g01[u];
1192 g01_im = g01[u + hn];
1193 g11_re = g11[u];
1194 g11_im = g11[u + hn];
1195 FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
1196 FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
1197 FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
1198 l10[u] = mu_re;
1199 l10[u + hn] = fpr_neg(mu_im);
1200 }
1201 #endif // yyyAVX2-
1202 }
1203
1204 /* see inner.h */
1205 TARGET_AVX2
1206 void
1207 Zf(poly_split_fft)(
1208 fpr *restrict f0, fpr *restrict f1,
1209 const fpr *restrict f, unsigned logn)
1210 {
1211 /*
1212 * The FFT representation we use is in bit-reversed order
1213 * (element i contains f(w^(rev(i))), where rev() is the
1214 * bit-reversal function over log(N) bits). This changes
1215 * indexes with regard to the Falcon specification.
1216 */
1217 size_t n, hn, qn, u;
1218
1219 n = (size_t)1 << logn;
1220 hn = n >> 1;
1221 qn = hn >> 1;
1222
1223 #if FALCON_AVX2 // yyyAVX2+1
1224 if (n >= 8) {
1225 __m256d half, sv;
1226
1227 half = _mm256_set1_pd(0.5);
1228 sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
1229 for (u = 0; u < qn; u += 2) {
1230 __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;
1231
1232 ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
1233 ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
1234 ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
1235 ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
1236 _mm_storeu_pd(&f0[u].v,
1237 _mm256_extractf128_pd(ff0, 0));
1238 _mm_storeu_pd(&f0[u + qn].v,
1239 _mm256_extractf128_pd(ff0, 1));
1240
1241 ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
1242 gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
1243 ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
1244 ff3 = _mm256_hadd_pd(
1245 _mm256_mul_pd(ff1, gmt),
1246 _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
1247 ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
1248 _mm_storeu_pd(&f1[u].v,
1249 _mm256_extractf128_pd(ff3, 0));
1250 _mm_storeu_pd(&f1[u + qn].v,
1251 _mm256_extractf128_pd(ff3, 1));
1252 }
1253 } else {
1254 f0[0] = f[0];
1255 f1[0] = f[hn];
1256
1257 for (u = 0; u < qn; u ++) {
1258 fpr a_re, a_im, b_re, b_im;
1259 fpr t_re, t_im;
1260
1261 a_re = f[(u << 1) + 0];
1262 a_im = f[(u << 1) + 0 + hn];
1263 b_re = f[(u << 1) + 1];
1264 b_im = f[(u << 1) + 1 + hn];
1265
1266 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
1267 f0[u] = fpr_half(t_re);
1268 f0[u + qn] = fpr_half(t_im);
1269
1270 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
1271 FPC_MUL(t_re, t_im, t_re, t_im,
1272 fpr_gm_tab[((u + hn) << 1) + 0],
1273 fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
1274 f1[u] = fpr_half(t_re);
1275 f1[u + qn] = fpr_half(t_im);
1276 }
1277 }
1278 #else // yyyAVX2+0
1279 /*
1280 * We process complex values by pairs. For logn = 1, there is only
1281 * one complex value (the other one is the implicit conjugate),
1282 * so we add the two lines below because the loop will be
1283 * skipped.
1284 */
1285 f0[0] = f[0];
1286 f1[0] = f[hn];
1287
1288 for (u = 0; u < qn; u ++) {
1289 fpr a_re, a_im, b_re, b_im;
1290 fpr t_re, t_im;
1291
1292 a_re = f[(u << 1) + 0];
1293 a_im = f[(u << 1) + 0 + hn];
1294 b_re = f[(u << 1) + 1];
1295 b_im = f[(u << 1) + 1 + hn];
1296
1297 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
1298 f0[u] = fpr_half(t_re);
1299 f0[u + qn] = fpr_half(t_im);
1300
1301 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
1302 FPC_MUL(t_re, t_im, t_re, t_im,
1303 fpr_gm_tab[((u + hn) << 1) + 0],
1304 fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
1305 f1[u] = fpr_half(t_re);
1306 f1[u + qn] = fpr_half(t_im);
1307 }
1308 #endif // yyyAVX2-
1309 }
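
/*
 * Usage note: Zf(poly_split_fft) and Zf(poly_merge_fft) are inverses
 * of each other; they implement the relation f(x) = f0(x^2) + x*f1(x^2),
 * with f taken modulo X^N+1 and f0, f1 taken modulo X^(N/2)+1, all in
 * FFT representation.
 */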
1310
1311 /* see inner.h */
1312 TARGET_AVX2
1313 void
1314 Zf(poly_merge_fft)(
1315 fpr *restrict f,
1316 const fpr *restrict f0, const fpr *restrict f1, unsigned logn)
1317 {
1318 size_t n, hn, qn, u;
1319
1320 n = (size_t)1 << logn;
1321 hn = n >> 1;
1322 qn = hn >> 1;
1323
1324 #if FALCON_AVX2 // yyyAVX2+1
1325 if (n >= 16) {
1326 for (u = 0; u < qn; u += 4) {
1327 __m256d a_re, a_im, b_re, b_im, c_re, c_im;
1328 __m256d gm1, gm2, g_re, g_im;
1329 __m256d t_re, t_im, u_re, u_im;
1330 __m256d tu1_re, tu2_re, tu1_im, tu2_im;
1331
1332 a_re = _mm256_loadu_pd(&f0[u].v);
1333 a_im = _mm256_loadu_pd(&f0[u + qn].v);
1334 c_re = _mm256_loadu_pd(&f1[u].v);
1335 c_im = _mm256_loadu_pd(&f1[u + qn].v);
1336
1337 gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
1338 gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
1339 g_re = _mm256_unpacklo_pd(gm1, gm2);
1340 g_im = _mm256_unpackhi_pd(gm1, gm2);
1341 g_re = _mm256_permute4x64_pd(g_re, 0xD8);
1342 g_im = _mm256_permute4x64_pd(g_im, 0xD8);
1343
1344 b_re = FMSUB(
1345 c_re, g_re, _mm256_mul_pd(c_im, g_im));
1346 b_im = FMADD(
1347 c_re, g_im, _mm256_mul_pd(c_im, g_re));
1348
1349 t_re = _mm256_add_pd(a_re, b_re);
1350 t_im = _mm256_add_pd(a_im, b_im);
1351 u_re = _mm256_sub_pd(a_re, b_re);
1352 u_im = _mm256_sub_pd(a_im, b_im);
1353
1354 tu1_re = _mm256_unpacklo_pd(t_re, u_re);
1355 tu2_re = _mm256_unpackhi_pd(t_re, u_re);
1356 tu1_im = _mm256_unpacklo_pd(t_im, u_im);
1357 tu2_im = _mm256_unpackhi_pd(t_im, u_im);
1358 _mm256_storeu_pd(&f[(u << 1)].v,
1359 _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
1360 _mm256_storeu_pd(&f[(u << 1) + 4].v,
1361 _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
1362 _mm256_storeu_pd(&f[(u << 1) + hn].v,
1363 _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
1364 _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
1365 _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
1366 }
1367 } else {
1368 f[0] = f0[0];
1369 f[hn] = f1[0];
1370
1371 for (u = 0; u < qn; u ++) {
1372 fpr a_re, a_im, b_re, b_im;
1373 fpr t_re, t_im;
1374
1375 a_re = f0[u];
1376 a_im = f0[u + qn];
1377 FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
1378 fpr_gm_tab[((u + hn) << 1) + 0],
1379 fpr_gm_tab[((u + hn) << 1) + 1]);
1380 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
1381 f[(u << 1) + 0] = t_re;
1382 f[(u << 1) + 0 + hn] = t_im;
1383 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
1384 f[(u << 1) + 1] = t_re;
1385 f[(u << 1) + 1 + hn] = t_im;
1386 }
1387 }
1388 #else // yyyAVX2+0
1389 /*
1390 * An extra copy to handle the special case logn = 1.
1391 */
1392 f[0] = f0[0];
1393 f[hn] = f1[0];
1394
1395 for (u = 0; u < qn; u ++) {
1396 fpr a_re, a_im, b_re, b_im;
1397 fpr t_re, t_im;
1398
1399 a_re = f0[u];
1400 a_im = f0[u + qn];
1401 FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
1402 fpr_gm_tab[((u + hn) << 1) + 0],
1403 fpr_gm_tab[((u + hn) << 1) + 1]);
1404 FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
1405 f[(u << 1) + 0] = t_re;
1406 f[(u << 1) + 0 + hn] = t_im;
1407 FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
1408 f[(u << 1) + 1] = t_re;
1409 f[(u << 1) + 1 + hn] = t_im;
1410 }
1411 #endif // yyyAVX2-
1412 }