fpr.h
/*
 * Floating-point operations.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2017-2019 Falcon Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */

#if FALCON_FPEMU // yyyFPEMU+1 yyyFPNATIVE+0

/* ====================================================================== */
/*
 * Custom floating-point implementation with integer arithmetics. We
 * use IEEE-754 "binary64" format, with some simplifications:
 *
 *   - Top bit is s = 1 for negative, 0 for positive.
 *
 *   - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
 *
 *   - Mantissa m uses the 52 low bits.
 *
 * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
 * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
 * less than 1.0), but the top bit (equal to 1 by definition) is omitted
 * in the encoding.
 *
 * In IEEE-754, there are some special values:
 *
 *   - If e = 2047, then the value is either an infinity (m = 0) or
 *     a NaN (m != 0).
 *
 *   - If e = 0, then the value is either a zero (m = 0) or a subnormal,
 *     aka "denormalized number" (m != 0).
 *
 * Of these, we only need the zeros. The caller is responsible for not
 * providing operands that would lead to infinities, NaNs or subnormals.
 * If inputs are such that values go out of range, then indeterminate
 * values are returned (the result is still deterministic, but no
 * specific value may be relied upon).
 *
 * At the C level, the three parts are stored in a 64-bit unsigned
 * word.
 *
 * One may note that a property of the IEEE-754 format is that order
 * is preserved for positive values: if two positive floating-point
 * values x and y are such that x < y, then their respective encodings
 * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
 * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
 * y < 0, and x < y, then i64(x) > i64(y).
 *
 * IMPORTANT ASSUMPTIONS:
 * ======================
 *
 * For proper computations, and constant-time behaviour, we assume the
 * following:
 *
 *   - 32x32->64 multiplication (unsigned) has an execution time that
 *     is independent of its operands. This is true of most modern
 *     x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
 *     and M3 (in the M0 and M0+, this is done in software, so it depends
 *     on that routine), and the PowerPC cores from the G3/G4 lines.
 *     For more info, see: https://www.bearssl.org/ctmul.html
 *
 *   - Left-shifts and right-shifts of 32-bit values have an execution
 *     time which does not depend on the shifted value nor on the
 *     shift count. A historical exception is the Pentium IV, but most
 *     modern CPUs have barrel shifters. Some small microcontrollers
 *     might have varying-time shifts (not the ARM Cortex M*, though).
 *
 *   - Right-shift of a signed negative value performs a sign extension.
 *     As per the C standard, this operation returns an
 *     implementation-defined result (this is NOT an "undefined
 *     behaviour"). On most/all systems, an arithmetic shift is
 *     performed, because this is what makes most sense.
 */

/*
 * Normally we should declare the 'fpr' type to be a struct or union
 * around the internal 64-bit value; however, we want to use the
 * direct 64-bit integer type to enable a lighter call convention on
 * ARM platforms. This means that direct (invalid) use of operators
 * such as '*' or '+' will not be caught by the compiler. We rely on
 * the "normal" (non-emulated) code to detect such instances.
 */
typedef uint64_t fpr;

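/*
 * Illustrative sketch (added for this write-up; not part of the original
 * header): extracting the three encoded fields, following the layout
 * described above (sign in bit 63, biased exponent in bits 52..62,
 * mantissa in bits 0..51).
 */
static inline void
fpr_example_decompose(fpr x, int *s, int *e, uint64_t *m)
{
	*s = (int)(x >> 63);                  /* sign bit */
	*e = (int)((x >> 52) & 0x7FF);        /* biased exponent */
	*m = x & (((uint64_t)1 << 52) - 1);   /* 52-bit mantissa field */
}
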
/*
 * For computations, we split values into an integral mantissa in the
 * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
 * "sticky" (it is set to 1 if any of the bits below it is 1); when
 * re-encoding, the low two bits are dropped, but may induce an
 * increment in the value for proper rounding.
 */

/*
 * Right-shift a 64-bit unsigned value by a possibly secret shift count.
 * We assumed that the underlying architecture had a barrel shifter for
 * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will
 * typically invoke a software routine that is not necessarily
 * constant-time; hence the function below.
 *
 * Shift count n MUST be in the 0..63 range.
 */
static inline uint64_t
fpr_ursh(uint64_t x, int n)
{
	x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
	return x >> (n & 31);
}

/*
 * Right-shift a 64-bit signed value by a possibly secret shift count
 * (see fpr_ursh() for the rationale).
 *
 * Shift count n MUST be in the 0..63 range.
 */
static inline int64_t
fpr_irsh(int64_t x, int n)
{
	x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
	return x >> (n & 31);
}

/*
 * Left-shift a 64-bit unsigned value by a possibly secret shift count
 * (see fpr_ursh() for the rationale).
 *
 * Shift count n MUST be in the 0..63 range.
 */
static inline uint64_t
fpr_ulsh(uint64_t x, int n)
{
	x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
	return x << (n & 31);
}
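
/*
 * Sanity-check sketch (added for illustration; not part of the original
 * header): for every count in 0..63, fpr_ursh() and fpr_ulsh() compute
 * the same result as the plain C shifts; only the timing behaviour is
 * meant to differ.
 */
static inline int
fpr_example_shifts(void)
{
	uint64_t x;
	int n;

	x = 13970166044099084288U;
	for (n = 0; n < 64; n ++) {
		if (fpr_ursh(x, n) != (x >> n) || fpr_ulsh(x, n) != (x << n)) {
			return 0;
		}
	}
	return 1;
}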

/*
 * Expectations:
 *   s = 0 or 1
 *   exponent e is "arbitrary" and unbiased
 *   2^54 <= m < 2^55
 * Numerical value is (-1)^s * m * 2^e
 *
 * Exponents which are too low lead to value zero. If the exponent is
 * too large, the returned value is indeterminate.
 *
 * If m = 0, then a zero is returned (using the provided sign).
 * If e < -1076, then a zero is returned (regardless of the value of m).
 * If e >= -1076 and m != 0, m must be within the expected range
 * (2^54 to 2^55-1).
 */
static inline fpr
FPR(int s, int e, uint64_t m)
{
	fpr x;
	uint32_t t;
	unsigned f;

	/*
	 * If e >= -1076, then the value is "normal"; otherwise, it
	 * should be a subnormal, which we clamp down to zero.
	 */
	e += 1076;
	t = (uint32_t)e >> 31;
	m &= (uint64_t)t - 1;

	/*
	 * If m = 0 then we want a zero; make e = 0 too, but conserve
	 * the sign.
	 */
	t = (uint32_t)(m >> 54);
	e &= -(int)t;

	/*
	 * The 52 mantissa bits come from m. Value m has its top bit set
	 * (unless it is a zero); we leave it "as is": the top bit will
	 * increment the exponent by 1, except when m = 0, which is
	 * exactly what we want.
	 */
	x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);

	/*
	 * Rounding: if the low three bits of m are 011, 110 or 111,
	 * then the value should be incremented to get the next
	 * representable value. This implements the usual
	 * round-to-nearest rule (with preference to even values in case
	 * of a tie). Note that the increment may make a carry spill
	 * into the exponent field, which is again exactly what we want
	 * in that case.
	 */
	f = (unsigned)m & 7U;
	x += (0xC8U >> f) & 1;
	return x;
}
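
/*
 * Worked example (added for illustration; not part of the original
 * header): encoding 1.0 with FPR(). With m = 2^54 and e = -54, the
 * value is 2^54 * 2^(-54) = 1.0, and the returned encoding is
 * 0x3FF0000000000000, i.e. the fpr_one constant defined below.
 */
static inline fpr
fpr_example_one(void)
{
	return FPR(0, -54, (uint64_t)1 << 54);
}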

#define fpr_scaled Zf(fpr_scaled)
fpr fpr_scaled(int64_t i, int sc);

static inline fpr
fpr_of(int64_t i)
{
	return fpr_scaled(i, 0);
}

static const fpr fpr_q = 4667981563525332992;
static const fpr fpr_inverse_of_q = 4545632735260551042;
static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
static const fpr fpr_inv_sigma[] = {
	0, /* unused */
	4574611497772390042,
	4574501679055810265,
	4574396282908341804,
	4574245855758572086,
	4574103865040221165,
	4573969550563515544,
	4573842244705920822,
	4573721358406441454,
	4573606369665796042,
	4573496814039276259
};
static const fpr fpr_sigma_min[] = {
	0, /* unused */
	4607707126469777035,
	4607777455861499430,
	4607846828256951418,
	4607949175006100261,
	4608049571757433526,
	4608148125896792003,
	4608244935301382692,
	4608340089478362016,
	4608433670533905013,
	4608525754002622308
};
static const fpr fpr_log2 = 4604418534313441775;
static const fpr fpr_inv_log2 = 4609176140021203710;
static const fpr fpr_bnorm_max = 4670353323383631276;
static const fpr fpr_zero = 0;
static const fpr fpr_one = 4607182418800017408;
static const fpr fpr_two = 4611686018427387904;
static const fpr fpr_onehalf = 4602678819172646912;
static const fpr fpr_invsqrt2 = 4604544271217802189;
static const fpr fpr_invsqrt8 = 4600040671590431693;
static const fpr fpr_ptwo31 = 4746794007248502784;
static const fpr fpr_ptwo31m1 = 4746794007244308480;
static const fpr fpr_mtwo31m1 = 13970166044099084288U;
static const fpr fpr_ptwo63m1 = 4890909195324358656;
static const fpr fpr_mtwo63m1 = 14114281232179134464U;
static const fpr fpr_ptwo63 = 4890909195324358656;

static inline int64_t
fpr_rint(fpr x)
{
	uint64_t m, d;
	int e;
	uint32_t s, dd, f;

	/*
	 * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
	 * thus extract the mantissa as a 63-bit integer, then right-shift
	 * it as needed.
	 */
	m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
	e = 1085 - ((int)(x >> 52) & 0x7FF);

	/*
	 * If a shift of more than 63 bits is needed, then simply set m
	 * to zero. This also covers the case of an input operand equal
	 * to zero.
	 */
	m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
	e &= 63;

	/*
	 * Right-shift m as needed. Shift count is e. Proper rounding
	 * mandates that:
	 *   - If the highest dropped bit is zero, then round low.
	 *   - If the highest dropped bit is one, and at least one of the
	 *     other dropped bits is one, then round up.
	 *   - If the highest dropped bit is one, and all other dropped
	 *     bits are zero, then round up if the lowest kept bit is 1,
	 *     or low otherwise (i.e. ties are broken by "rounding to even").
	 *
	 * We thus first extract a word consisting of all the dropped bits
	 * AND the lowest kept bit; then we shrink it down to three bits,
	 * the lowest being "sticky".
	 */
	d = fpr_ulsh(m, 63 - e);
	dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
	f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
	m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);

	/*
	 * Apply the sign bit.
	 */
	s = (uint32_t)(x >> 63);
	return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
}
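
/*
 * Worked example (added for illustration; not part of the original
 * header): 2.5 is encoded as 0x4004000000000000 and 3.5 as
 * 0x400C000000000000; fpr_rint() maps them to 2 and 4 respectively,
 * since ties are broken toward the even integer.
 */
static inline int
fpr_example_rint_ties(void)
{
	return fpr_rint((fpr)0x4004000000000000) == 2
		&& fpr_rint((fpr)0x400C000000000000) == 4;
}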

static inline int64_t
fpr_floor(fpr x)
{
	uint64_t t;
	int64_t xi;
	int e, cc;

	/*
	 * We extract the integer as a _signed_ 64-bit integer with
	 * a scaling factor. Since we assume that the value fits
	 * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
	 * absolute value to make it in the 2^62..2^63-1 range: we
	 * will only need a right-shift afterwards.
	 */
	e = (int)(x >> 52) & 0x7FF;
	t = x >> 63;
	xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
		& (((uint64_t)1 << 63) - 1));
	xi = (xi ^ -(int64_t)t) + (int64_t)t;
	cc = 1085 - e;

	/*
	 * We perform an arithmetic right-shift on the value. This
	 * applies floor() semantics on both positive and negative values
	 * (rounding toward minus infinity).
	 */
	xi = fpr_irsh(xi, cc & 63);

	/*
	 * If the true shift count was 64 or more, then we should instead
	 * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
	 * case: -0 will be floored to -1, not 0 (whether this is correct
	 * is debatable; in any case, the other functions normalize zero
	 * to +0).
	 *
	 * For an input of zero, the non-shifted xi was incorrect (we used
	 * a top implicit bit of value 1, not 0), but this does not matter
	 * since this operation will clamp it down.
	 */
	xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
	return xi;
}
364
365 static inline int64_t
366 fpr_trunc(fpr x)
367 {
368 uint64_t t, xu;
369 int e, cc;
370
371 /*
372 * Extract the absolute value. Since we assume that the value
373 * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
374 * the absolute value into the 2^62..2^63-1 range, and then
375 * do a right shift afterwards.
376 */
377 e = (int)(x >> 52) & 0x7FF;
378 xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
379 cc = 1085 - e;
380 xu = fpr_ursh(xu, cc & 63);
381
382 /*
383 * If the exponent is too low (cc > 63), then the shift was wrong
384 * and we must clamp the value to 0. This also covers the case
385 * of an input equal to zero.
386 */
387 xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
388
389 /*
390 * Apply back the sign, if the source value is negative.
391 */
392 t = x >> 63;
393 xu = (xu ^ -t) + t;
394 return *(int64_t *)&xu;
395 }
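
/*
 * Illustrative comparison (added for this write-up; not part of the
 * original header): for x = -2.5 (encoding 0xC004000000000000),
 * fpr_rint(x) is -2 (tie broken to even), fpr_floor(x) is -3 (toward
 * minus infinity) and fpr_trunc(x) is -2 (toward zero).
 */
static inline int
fpr_example_rounding_modes(void)
{
	fpr x;

	x = (fpr)0xC004000000000000;
	return fpr_rint(x) == -2 && fpr_floor(x) == -3 && fpr_trunc(x) == -2;
}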

#define fpr_add Zf(fpr_add)
fpr fpr_add(fpr x, fpr y);

static inline fpr
fpr_sub(fpr x, fpr y)
{
	y ^= (uint64_t)1 << 63;
	return fpr_add(x, y);
}

static inline fpr
fpr_neg(fpr x)
{
	x ^= (uint64_t)1 << 63;
	return x;
}

static inline fpr
fpr_half(fpr x)
{
	/*
	 * To divide a value by 2, we just have to subtract 1 from its
	 * exponent, but we have to take care of zero.
	 */
	uint32_t t;

	x -= (uint64_t)1 << 52;
	t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
	x &= (uint64_t)t - 1;
	return x;
}
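
/*
 * Worked example (added for illustration; not part of the original
 * header): halving 1.0 yields 0.5, and halving +0.0 keeps the +0.0
 * encoding.
 */
static inline int
fpr_example_half(void)
{
	return fpr_half(fpr_one) == fpr_onehalf
		&& fpr_half(fpr_zero) == fpr_zero;
}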

static inline fpr
fpr_double(fpr x)
{
	/*
	 * To double a value, we just increment its exponent by one. We
	 * don't care about infinities or NaNs; however, 0 is a
	 * special case.
	 */
	x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
	return x;
}

#define fpr_mul Zf(fpr_mul)
fpr fpr_mul(fpr x, fpr y);

static inline fpr
fpr_sqr(fpr x)
{
	return fpr_mul(x, x);
}

#define fpr_div Zf(fpr_div)
fpr fpr_div(fpr x, fpr y);

static inline fpr
fpr_inv(fpr x)
{
	return fpr_div(4607182418800017408u, x);
}

#define fpr_sqrt Zf(fpr_sqrt)
fpr fpr_sqrt(fpr x);

static inline int
fpr_lt(fpr x, fpr y)
{
	/*
	 * If x >= 0 or y >= 0, a signed comparison yields the proper
	 * result:
	 *   - For positive values, the order is preserved.
	 *   - The sign bit is at the same place as in integers, so
	 *     sign is preserved.
	 *
	 * If both x and y are negative, then the order is reversed.
	 * We cannot simply invert the comparison result in that case
	 * because it would not handle the edge case x = y properly.
	 */
	int cc0, cc1;

	cc0 = *(int64_t *)&x < *(int64_t *)&y;
	cc1 = *(int64_t *)&x > *(int64_t *)&y;
	return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
}
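
/*
 * Illustrative check (added for this write-up; not part of the original
 * header): ordering is handled correctly for negative operands, where
 * the raw signed-integer order of the encodings is reversed.
 */
static inline int
fpr_example_lt(void)
{
	return fpr_lt(fpr_neg(fpr_two), fpr_neg(fpr_one)) == 1
		&& fpr_lt(fpr_neg(fpr_one), fpr_one) == 1;
}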

/*
 * Compute ccs * exp(-x) * 2^63, truncated to an integer, for x such that
 * |x| <= ln 2. We want a precision of 50 bits or so on exp(-x).
 */
#define fpr_expm_p63 Zf(fpr_expm_p63)
uint64_t fpr_expm_p63(fpr x, fpr ccs);
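
/*
 * Usage sketch (added for illustration; not part of the original API):
 * with x = ln 2 and ccs = 1, the returned value is close to
 * 2^63 * 1 * exp(-ln 2) = 2^62.
 */
static inline uint64_t
fpr_example_expm_p63(void)
{
	return fpr_expm_p63(fpr_log2, fpr_one);
}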

#define fpr_gm_tab Zf(fpr_gm_tab)
extern const fpr fpr_gm_tab[];

#define fpr_p2_tab Zf(fpr_p2_tab)
extern const fpr fpr_p2_tab[];

/* ====================================================================== */

#elif FALCON_FPNATIVE // yyyFPEMU+0 yyyFPNATIVE+1

/* ====================================================================== */

#include <math.h>

/*
 * We wrap the native 'double' type into a structure so that the C compiler
 * complains if we inadvertently use raw arithmetic operators on the 'fpr'
 * type instead of using the inline functions below. This should have no
 * extra runtime cost, since all the functions below are 'inline'.
 */
typedef struct { double v; } fpr;

static inline fpr
FPR(double v)
{
	fpr x;

	x.v = v;
	return x;
}

static inline fpr
fpr_of(int64_t i)
{
	return FPR((double)i);
}

static const fpr fpr_q = { 12289.0 };
static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
static const fpr fpr_inv_2sqrsigma0 = { 0.150865048875372721532312163019 };
static const fpr fpr_inv_sigma[] = {
	{ 0.0 }, /* unused */
	{ 0.0069054793295940891952143765991630516 },
	{ 0.0068102267767177975961393730687908629 },
	{ 0.0067188101910722710707826117910434131 },
	{ 0.0065883354370073665545865037227681924 },
	{ 0.0064651781207602900738053897763485516 },
	{ 0.0063486788828078995327741182928037856 },
	{ 0.0062382586529084374473367528433697537 },
	{ 0.0061334065020930261548984001431770281 },
	{ 0.0060336696681577241031668062510953022 },
	{ 0.0059386453095331159950250124336477482 }
};
static const fpr fpr_sigma_min[] = {
	{ 0.0 }, /* unused */
	{ 1.1165085072329102588881898380334015 },
	{ 1.1321247692325272405718031785357108 },
	{ 1.1475285353733668684571123112513188 },
	{ 1.1702540788534828939713084716509250 },
	{ 1.1925466358390344011122170489094133 },
	{ 1.2144300507766139921088487776957699 },
	{ 1.2359260567719808790104525941706723 },
	{ 1.2570545284063214162779743112075080 },
	{ 1.2778336969128335860256340575729042 },
	{ 1.2982803343442918539708792538826807 }
};
static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
static const fpr fpr_bnorm_max = { 16822.4121 };
static const fpr fpr_zero = { 0.0 };
static const fpr fpr_one = { 1.0 };
static const fpr fpr_two = { 2.0 };
static const fpr fpr_onehalf = { 0.5 };
static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
static const fpr fpr_ptwo31 = { 2147483648.0 };
static const fpr fpr_ptwo31m1 = { 2147483647.0 };
static const fpr fpr_mtwo31m1 = { -2147483647.0 };
static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
static const fpr fpr_ptwo63 = { 9223372036854775808.0 };

static inline int64_t
fpr_rint(fpr x)
{
	/*
	 * We do not want to use llrint() since it might not be
	 * constant-time.
	 *
	 * Suppose that x >= 0. If x >= 2^52, then it is already an
	 * integer. Otherwise, if x < 2^52, then computing x+2^52 will
	 * yield a value that will be rounded to the nearest integer
	 * with exactly the right rules (round-to-nearest-even).
	 *
	 * In order to have constant-time processing, we must do the
	 * computation for both x >= 0 and x < 0 cases, and use a
	 * cast to an integer to access the sign and select the proper
	 * value. Such casts also allow us to find out if |x| < 2^52.
	 */
	int64_t sx, tx, rp, rn, m;
	uint32_t ub;

	sx = (int64_t)(x.v - 1.0);
	tx = (int64_t)x.v;
	rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
	rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;

	/*
	 * If tx >= 2^52 or tx < -2^52, then result is tx.
	 * Otherwise, if sx >= 0, then result is rp.
	 * Otherwise, result is rn. We use the fact that when x is
	 * close to 0 (|x| <= 0.25) then both rp and rn are correct;
	 * and if x is not close to 0, then trunc(x-1.0) yields the
	 * appropriate sign.
	 */

	/*
	 * Clamp rp to zero if tx < 0.
	 * Clamp rn to zero if tx >= 0.
	 */
	m = sx >> 63;
	rn &= m;
	rp &= ~m;

	/*
	 * Get the 12 upper bits of tx; if they are not all zeros or
	 * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
	 * rp and rn to zero. Otherwise, we clamp tx to zero.
	 */
	ub = (uint32_t)((uint64_t)tx >> 52);
	m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
	rp &= m;
	rn &= m;
	tx &= ~m;

	/*
	 * Only one of tx, rn or rp (at most) can be non-zero at this
	 * point.
	 */
	return tx | rn | rp;
}
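
/*
 * Worked example (added for illustration; assumes the round-to-nearest-even
 * FPU mode that the technique above already requires): for x = 2.5,
 * x + 2^52 rounds to 4503599627370498.0, so rp = 2 and fpr_rint() returns
 * 2, matching the emulated implementation.
 */
static inline int64_t
fpr_example_rint(void)
{
	return fpr_rint(FPR(2.5)); /* == 2 */
}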

static inline int64_t
fpr_floor(fpr x)
{
	int64_t r;

	/*
	 * The cast performs a trunc() (rounding toward 0) and thus is
	 * wrong by 1 for most negative values. The correction below is
	 * constant-time as long as the compiler turns the
	 * floating-point conversion result into a 0/1 integer without a
	 * conditional branch or another non-constant-time construction.
	 * This should hold on all modern architectures with an FPU (and
	 * if it is false on a given arch, then chances are that the FPU
	 * itself is not constant-time, making the point moot).
	 */
	r = (int64_t)x.v;
	return r - (x.v < (double)r);
}

static inline int64_t
fpr_trunc(fpr x)
{
	return (int64_t)x.v;
}

static inline fpr
fpr_add(fpr x, fpr y)
{
	return FPR(x.v + y.v);
}

static inline fpr
fpr_sub(fpr x, fpr y)
{
	return FPR(x.v - y.v);
}

static inline fpr
fpr_neg(fpr x)
{
	return FPR(-x.v);
}

static inline fpr
fpr_half(fpr x)
{
	return FPR(x.v * 0.5);
}

static inline fpr
fpr_double(fpr x)
{
	return FPR(x.v + x.v);
}

static inline fpr
fpr_mul(fpr x, fpr y)
{
	return FPR(x.v * y.v);
}

static inline fpr
fpr_sqr(fpr x)
{
	return FPR(x.v * x.v);
}

static inline fpr
fpr_inv(fpr x)
{
	return FPR(1.0 / x.v);
}

static inline fpr
fpr_div(fpr x, fpr y)
{
	return FPR(x.v / y.v);
}

#if FALCON_AVX2 // yyyAVX2+1
TARGET_AVX2
static inline void
fpr_sqrt_avx2(double *t)
{
	__m128d x;

	x = _mm_load1_pd(t);
	x = _mm_sqrt_pd(x);
	_mm_storel_pd(t, x);
}
#endif // yyyAVX2-

static inline fpr
fpr_sqrt(fpr x)
{
	/*
	 * We prefer not to have a dependency on libm when it can be
	 * avoided. On x86, calling the sqrt() libm function inlines
	 * the relevant opcode (fsqrt or sqrtsd, depending on whether
	 * the 387 FPU or SSE2 is used for floating-point operations)
	 * but then makes an optional call to the library function
	 * for proper error handling, in case the operand is negative.
	 *
	 * To avoid this dependency, we use intrinsics or inline assembly
	 * on recognized platforms:
	 *
	 *  - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
	 *
	 *  - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
	 *
	 *  - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
	 *    to call the 387 FPU fsqrt opcode.
	 *
	 *  - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
	 *    the fsqrt opcode (Clang needs a special hack).
	 *
	 *  - On GCC/Clang on ARM with hardware floating-point, we use
	 *    inline assembly to call the vsqrt.f64 opcode. Due to a
	 *    complex ecosystem of compilers and assembly syntaxes, we
	 *    have to call it "fsqrt" or "fsqrtd", depending on case.
	 *
	 * If the platform is not recognized, a call to the system
	 * library function sqrt() is performed. On some compilers, this
	 * may actually inline the relevant opcode, and call the library
	 * function only when the input is invalid (e.g. negative);
	 * Falcon never actually calls sqrt() on a negative value, but
	 * the dependency on libm will still be there.
	 */

#if FALCON_AVX2 // yyyAVX2+1
	fpr_sqrt_avx2(&x.v);
	return x;
#else // yyyAVX2+0
#if defined __GNUC__ && defined __SSE2_MATH__
	return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v))));
#elif defined __GNUC__ && defined __i386__
	__asm__ __volatile__ (
		"fldl %0\n\t"
		"fsqrt\n\t"
		"fstpl %0\n\t"
		: "+m" (x.v) : : );
	return x;
#elif defined _M_IX86
	__asm {
		fld x.v
		fsqrt
		fstp x.v
	}
	return x;
#elif defined __PPC__ && defined __GNUC__
	fpr y;

#if defined __clang__
	/*
	 * Normally we should use a 'd' constraint (register that contains
	 * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use
	 * an 'f' constraint, counting on the fact that 'float' values
	 * are managed in double-precision registers anyway, and the
	 * compiler will not add extra rounding steps.
	 */
	__asm__ ( "fsqrt %0, %1" : "=f" (y.v) : "f" (x.v) : );
#else
	__asm__ ( "fsqrt %0, %1" : "=d" (y.v) : "d" (x.v) : );
#endif
	return y;
#elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \
	|| (!defined __ARM_FP && defined __ARM_VFPV2__)
	/*
	 * On ARM, assembly syntaxes are a bit of a mess, depending on
	 * whether GCC or Clang is used, and the binutils version, and
	 * whether this is 32-bit or 64-bit mode. The code below appears
	 * to work on:
	 *    32-bit   GCC-4.9.2   Clang-3.5   Binutils-2.25
	 *    64-bit   GCC-6.3.0   Clang-3.9   Binutils-2.28
	 */
#if defined __aarch64__ && __aarch64__
	__asm__ ( "fsqrt %d0, %d0" : "+w" (x.v) : : );
#else
	__asm__ ( "fsqrtd %P0, %P0" : "+w" (x.v) : : );
#endif
	return x;
#else
	return FPR(sqrt(x.v));
#endif
#endif // yyyAVX2-
}
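
/*
 * Usage sketch (added for illustration; not part of the original header):
 * sqrt(4.0) is exactly representable, so the result is exactly 2.0 on
 * every backend selected above.
 */
static inline int
fpr_example_sqrt(void)
{
	return fpr_sqrt(FPR(4.0)).v == 2.0;
}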

static inline int
fpr_lt(fpr x, fpr y)
{
	return x.v < y.v;
}

TARGET_AVX2
static inline uint64_t
fpr_expm_p63(fpr x, fpr ccs)
{
	/*
	 * Polynomial approximation of exp(-x) is taken from FACCT:
	 *    https://eprint.iacr.org/2018/1234
	 * Specifically, values are extracted from the implementation
	 * referenced from the FACCT article, and available at:
	 *    https://github.com/raykzhao/gaussian
	 * Tests over more than 24 billion random inputs in the
	 * 0..log(2) range have never shown a deviation larger than
	 * 2^(-50) from the true mathematical value.
	 */

#if FALCON_AVX2 // yyyAVX2+1

	/*
	 * AVX2 implementation uses more operations than Horner's method,
	 * but with a lower expression tree depth. This helps because
	 * additions and multiplications have a latency of 4 cycles on
	 * a Skylake, but the CPU can issue two of them per cycle.
	 */

	static const union {
		double d[12];
		__m256d v[3];
	} c = {
		{
			0.999999999999994892974086724280,
			0.500000000000019206858326015208,
			0.166666666666984014666397229121,
			0.041666666666110491190622155955,
			0.008333333327800835146903501993,
			0.001388888894063186997887560103,
			0.000198412739277311890541063977,
			0.000024801566833585381209939524,
			0.000002755586350219122514855659,
			0.000000275607356160477811864927,
			0.000000025299506379442070029551,
			0.000000002073772366009083061987
		}
	};

	double d1, d2, d4, d8, y;
	__m256d d14, d58, d9c;

	d1 = -x.v;
	d2 = d1 * d1;
	d4 = d2 * d2;
	d8 = d4 * d4;
	d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
	d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
	d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
	d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
	d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
	d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
	d9c = _mm256_hadd_pd(d9c, d9c);
	y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
		+ _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
	y *= ccs.v;

	/*
	 * Final conversion goes through int64_t first, because that's what
	 * the underlying opcode (vcvttsd2si) will do, and we know that the
	 * result will fit, since x >= 0 and ccs < 1. If we did the
	 * conversion directly to uint64_t, then the compiler would add some
	 * extra code to cover the case of a source value of 2^63 or more,
	 * and though the alternate path would never be exercised, the
	 * extra comparison would cost us some cycles.
	 */
	return (uint64_t)(int64_t)(y * fpr_ptwo63.v);

#else // yyyAVX2+0

	/*
	 * Normal implementation uses Horner's method, which minimizes
	 * the number of operations.
	 */

	double d, y;

	d = x.v;
	y = 0.000000002073772366009083061987;
	y = 0.000000025299506379442070029551 - y * d;
	y = 0.000000275607356160477811864927 - y * d;
	y = 0.000002755586350219122514855659 - y * d;
	y = 0.000024801566833585381209939524 - y * d;
	y = 0.000198412739277311890541063977 - y * d;
	y = 0.001388888894063186997887560103 - y * d;
	y = 0.008333333327800835146903501993 - y * d;
	y = 0.041666666666110491190622155955 - y * d;
	y = 0.166666666666984014666397229121 - y * d;
	y = 0.500000000000019206858326015208 - y * d;
	y = 0.999999999999994892974086724280 - y * d;
	y = 1.000000000000000000000000000000 - y * d;
	y *= ccs.v;
	return (uint64_t)(y * fpr_ptwo63.v);

#endif // yyyAVX2-
}

#define fpr_gm_tab Zf(fpr_gm_tab)
extern const fpr fpr_gm_tab[];

#define fpr_p2_tab Zf(fpr_p2_tab)
extern const fpr fpr_p2_tab[];

/* ====================================================================== */

#else // yyyFPEMU+0 yyyFPNATIVE+0

#error No FP implementation selected

#endif // yyyFPEMU- yyyFPNATIVE-