Falcon source files (reference implementation)


fpr.h

    1 /*
    2  * Floating-point operations.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017-2019  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
   30  */
   31 
   32 #if FALCON_FPEMU  // yyyFPEMU+1 yyyFPNATIVE+0
   33 
   34 /* ====================================================================== */
   35 /*
   36  * Custom floating-point implementation with integer arithmetics. We
   37  * use IEEE-754 "binary64" format, with some simplifications:
   38  *
   39  *   - Top bit is s = 1 for negative, 0 for positive.
   40  *
   41  *   - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
   42  *
   43  *   - Mantissa m uses the 52 low bits.
   44  *
   45  * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
   46  * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
   47  * less than 1.0), but the top bit (equal to 1 by definition) is omitted
   48  * in the encoding.
   49  *
   50  * In IEEE-754, there are some special values:
   51  *
   52  *   - If e = 2047, then the value is either an infinity (m = 0) or
   53  *     a NaN (m != 0).
   54  *
   55  *   - If e = 0, then the value is either a zero (m = 0) or a subnormal,
   56  *     aka "denormalized number" (m != 0).
   57  *
   58  * Of these, we only need the zeros. The caller is responsible for not
   59  * providing operands that would lead to infinities, NaNs or subnormals.
   60  * If inputs are such that values go out of range, then indeterminate
   61  * values are returned (it would still be deterministic, but no specific
   62  * value may be relied upon).
   63  *
   64  * At the C level, the three parts are stored in a 64-bit unsigned
   65  * word.
   66  *
   67  * One may note that a property of the IEEE-754 format is that order
   68  * is preserved for positive values: if two positive floating-point
   69  * values x and y are such that x < y, then their respective encodings
   70  * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
   71  * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
   72  * y < 0, and x < y, then i64(x) > i64(y).
   73  *
   74  * IMPORTANT ASSUMPTIONS:
   75  * ======================
   76  *
   77  * For proper computations, and constant-time behaviour, we assume the
   78  * following:
   79  *
   80  *   - 32x32->64 multiplication (unsigned) has an execution time that
   81  *     is independent of its operands. This is true of most modern
   82  *     x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
   83  *     and M3 (in the M0 and M0+, this is done in software, so it depends
   84  *     on that routine), and the PowerPC cores from the G3/G4 lines.
   85  *     For more info, see: https://www.bearssl.org/ctmul.html
   86  *
   87  *   - Left-shifts and right-shifts of 32-bit values have an execution
   88  *     time which does not depend on the shifted value nor on the
   89  *     shift count. A historical exception is the Pentium IV, but most
   90  *     modern CPUs have barrel shifters. Some small microcontrollers
   91  *     might have varying-time shifts (not the ARM Cortex M*, though).
   92  *
   93  *   - Right-shift of a signed negative value performs a sign extension.
   94  *     As per the C standard, this operation returns an
   95  *     implementation-defined result (this is NOT an "undefined
   96  *     behaviour"). On most/all systems, an arithmetic shift is
   97  *     performed, because this is what makes most sense.
   98  */
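
As an aside for readers checking the layout by hand, here is a minimal,
self-contained decode sketch (an illustration only, not part of fpr.h);
it splits a 64-bit pattern into the s, e and m fields described above:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint64_t x = 4607182418800017408u;   /* encoding of 1.0 */
            int s = (int)(x >> 63);
            int e = (int)((x >> 52) & 0x7FF);
            uint64_t m = x & (((uint64_t)1 << 52) - 1);

            /* Value is (-1)^s * 2^(e-1023) * (1 + m*2^(-52)). */
            printf("s=%d e-1023=%d m=%llu\n",
                    s, e - 1023, (unsigned long long)m);
            return 0;
    }

This should print "s=0 e-1023=0 m=0", i.e. +1.0: all 52 stored mantissa
bits are zero, and the implicit top bit provides the leading 1.
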
   99 
  100 /*
  101  * Normally we should declare the 'fpr' type to be a struct or union
  102  * around the internal 64-bit value; however, we want to use the
  103  * direct 64-bit integer type to enable a lighter call convention on
  104  * ARM platforms. This means that direct (invalid) use of operators
  105  * such as '*' or '+' will not be caught by the compiler. We rely on
  106  * the "normal" (non-emulated) code to detect such instances.
  107  */
  108 typedef uint64_t fpr;
  109 
  110 /*
  111  * For computations, we split values into an integral mantissa in the
  112  * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
  113  * "sticky" (it is set to 1 if any of the bits below it is 1); when
  114  * re-encoding, the low two bits are dropped, but may induce an
  115  * increment in the value for proper rounding.
  116  */
  117 
  118 /*
  119  * Right-shift a 64-bit unsigned value by a possibly secret shift count.
  120  * We assume that the underlying architecture has a barrel shifter for
  121  * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will
  122  * typically invoke a software routine that is not necessarily
  123  * constant-time; hence the function below.
  124  *
  125  * Shift count n MUST be in the 0..63 range.
  126  */
  127 static inline uint64_t
  128 fpr_ursh(uint64_t x, int n)
  129 {
  130         x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
  131         return x >> (n & 31);
  132 }
  133 
  134 /*
  135  * Right-shift a 64-bit signed value by a possibly secret shift count
  136  * (see fpr_ursh() for the rationale).
  137  *
  138  * Shift count n MUST be in the 0..63 range.
  139  */
  140 static inline int64_t
  141 fpr_irsh(int64_t x, int n)
  142 {
  143         x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
  144         return x >> (n & 31);
  145 }
  146 
  147 /*
  148  * Left-shift a 64-bit unsigned value by a possibly secret shift count
  149  * (see fpr_ursh() for the rationale).
  150  *
  151  * Shift count n MUST be in the 0..63 range.
  152  */
  153 static inline uint64_t
  154 fpr_ulsh(uint64_t x, int n)
  155 {
  156         x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
  157         return x << (n & 31);
  158 }
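
A quick way to convince oneself that these helpers match plain shifts
is the following standalone harness (an illustrative sketch, not part
of the Falcon sources); it compares fpr_ursh() against '>>' for all 64
shift counts:

    #include <stdio.h>
    #include <stdint.h>

    static inline uint64_t
    fpr_ursh(uint64_t x, int n)
    {
            x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
            return x >> (n & 31);
    }

    int
    main(void)
    {
            uint64_t x = 0x0123456789ABCDEFu;
            int n;

            for (n = 0; n < 64; n ++) {
                    if (fpr_ursh(x, n) != x >> n) {
                            printf("mismatch at n=%d\n", n);
                            return 1;
                    }
            }
            printf("all 64 shift counts agree\n");
            return 0;
    }

The trick is that the first line conditionally pre-shifts by 32 (when
bit 5 of n is set) using a mask instead of a branch; only a 0..31 shift
remains, which the assumed 32-bit barrel shifter handles in constant
time.
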
  159 
  160 /*
  161  * Expectations:
  162  *   s = 0 or 1
  163  *   exponent e is "arbitrary" and unbiased
  164  *   2^54 <= m < 2^55
  165  * Numerical value is (-1)^s * m * 2^e
  166  *
  167  * Exponents which are too low lead to value zero. If the exponent is
  168  * too large, the returned value is indeterminate.
  169  *
  170  * If m = 0, then a zero is returned (using the provided sign).
  171  * If e < -1076, then a zero is returned (regardless of the value of m).
  172  * If e >= -1076 and m != 0, then m must be within the expected range
  173  * (2^54 to 2^55-1).
  174  */
  175 static inline fpr
  176 FPR(int s, int e, uint64_t m)
  177 {
  178         fpr x;
  179         uint32_t t;
  180         unsigned f;
  181 
  182         /*
  183          * If e >= -1076, then the value is "normal"; otherwise, it
  184          * should be a subnormal, which we clamp down to zero.
  185          */
  186         e += 1076;
  187         t = (uint32_t)e >> 31;
  188         m &= (uint64_t)t - 1;
  189 
  190         /*
  191          * If m = 0 then we want a zero; make e = 0 too, but conserve
  192          * the sign.
  193          */
  194         t = (uint32_t)(m >> 54);
  195         e &= -(int)t;
  196 
  197         /*
  198          * The 52 mantissa bits come from m. Value m has its top bit set
  199          * (unless it is a zero); we leave it "as is": the top bit will
  200          * increment the exponent by 1, except when m = 0, which is
  201          * exactly what we want.
  202          */
  203         x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
  204 
  205         /*
  206          * Rounding: if the low three bits of m are 011, 110 or 111,
  207          * then the value should be incremented to get the next
  208          * representable value. This implements the usual
  209          * round-to-nearest rule (with preference to even values in case
  210          * of a tie). Note that the increment may make a carry spill
  211          * into the exponent field, which is again exactly what we want
  212          * in that case.
  213          */
  214         f = (unsigned)m & 7U;
  215         x += (0xC8U >> f) & 1;
  216         return x;
  217 }
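
The 0xC8 constant acts as an 8-entry lookup table indexed by the low
three bits of m (bit 2 is the lowest kept bit, bit 1 the highest
dropped bit, bit 0 the sticky bit). A tiny standalone sketch (not part
of fpr.h) prints which patterns trigger an increment:

    #include <stdio.h>

    int
    main(void)
    {
            unsigned f;

            for (f = 0; f < 8; f ++) {
                    printf("f=%u%u%u -> +%u\n",
                            (f >> 2) & 1, (f >> 1) & 1, f & 1,
                            (0xC8U >> f) & 1U);
            }
            return 0;
    }

Only f = 011, 110 and 111 yield an increment: round up when the dropped
bits exceed half an ulp, and on an exact tie (f = x10) only when the
kept bit is odd, which is exactly round-to-nearest-even.
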
  218 
  219 #define fpr_scaled   Zf(fpr_scaled)
  220 fpr fpr_scaled(int64_t i, int sc);
  221 
  222 static inline fpr
  223 fpr_of(int64_t i)
  224 {
  225         return fpr_scaled(i, 0);
  226 }
  227 
  228 static const fpr fpr_q = 4667981563525332992;
  229 static const fpr fpr_inverse_of_q = 4545632735260551042;
  230 static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
  231 static const fpr fpr_inv_sigma[] = {
  232         0,  /* unused */
  233         4574611497772390042,
  234         4574501679055810265,
  235         4574396282908341804,
  236         4574245855758572086,
  237         4574103865040221165,
  238         4573969550563515544,
  239         4573842244705920822,
  240         4573721358406441454,
  241         4573606369665796042,
  242         4573496814039276259
  243 };
  244 static const fpr fpr_sigma_min[] = {
  245         0,  /* unused */
  246         4607707126469777035,
  247         4607777455861499430,
  248         4607846828256951418,
  249         4607949175006100261,
  250         4608049571757433526,
  251         4608148125896792003,
  252         4608244935301382692,
  253         4608340089478362016,
  254         4608433670533905013,
  255         4608525754002622308
  256 };
  257 static const fpr fpr_log2 = 4604418534313441775;
  258 static const fpr fpr_inv_log2 = 4609176140021203710;
  259 static const fpr fpr_bnorm_max = 4670353323383631276;
  260 static const fpr fpr_zero = 0;
  261 static const fpr fpr_one = 4607182418800017408;
  262 static const fpr fpr_two = 4611686018427387904;
  263 static const fpr fpr_onehalf = 4602678819172646912;
  264 static const fpr fpr_invsqrt2 = 4604544271217802189;
  265 static const fpr fpr_invsqrt8 = 4600040671590431693;
  266 static const fpr fpr_ptwo31 = 4746794007248502784;
  267 static const fpr fpr_ptwo31m1 = 4746794007244308480;
  268 static const fpr fpr_mtwo31m1 = 13970166044099084288U;
  269 static const fpr fpr_ptwo63m1 = 4890909195324358656;
  270 static const fpr fpr_mtwo63m1 = 14114281232179134464U;
  271 static const fpr fpr_ptwo63 = 4890909195324358656;
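
These decimal literals are the raw binary64 bit patterns of the same
constants that appear as 'double' values in the FALCON_FPNATIVE branch
below. A standalone cross-check sketch (illustrative, not part of
fpr.h), using memcpy for well-defined type punning:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static uint64_t
    bits_of(double d)
    {
            uint64_t u;

            memcpy(&u, &d, sizeof u);
            return u;
    }

    int
    main(void)
    {
            printf("%d\n", bits_of(12289.0) == 4667981563525332992u);
            printf("%d\n", bits_of(1.0) == 4607182418800017408u);
            printf("%d\n", bits_of(0.5) == 4602678819172646912u);
            return 0;
    }

Each line should print 1 (fpr_q, fpr_one and fpr_onehalf respectively).
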
  272 
  273 static inline int64_t
  274 fpr_rint(fpr x)
  275 {
  276         uint64_t m, d;
  277         int e;
  278         uint32_t s, dd, f;
  279 
  280         /*
  281          * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
  282          * thus extract the mantissa as a 63-bit integer, then right-shift
  283          * it as needed.
  284          */
  285         m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
  286         e = 1085 - ((int)(x >> 52) & 0x7FF);
  287 
  288         /*
  289          * If a shift of more than 63 bits is needed, then simply set m
  290          * to zero. This also covers the case of an input operand equal
  291          * to zero.
  292          */
  293         m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
  294         e &= 63;
  295 
  296         /*
  297          * Right-shift m as needed. Shift count is e. Proper rounding
  298          * mandates that:
  299          *   - If the highest dropped bit is zero, then round low.
  300          *   - If the highest dropped bit is one, and at least one of the
  301          *     other dropped bits is one, then round up.
  302          *   - If the highest dropped bit is one, and all other dropped
  303          *     bits are zero, then round up if the lowest kept bit is 1,
  304          *     or low otherwise (i.e. ties are broken by "rounding to even").
  305          *
  306  * We thus first extract a word consisting of all the dropped bits
  307          * AND the lowest kept bit; then we shrink it down to three bits,
  308          * the lowest being "sticky".
  309          */
  310         d = fpr_ulsh(m, 63 - e);
  311         dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
  312         f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
  313         m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
  314 
  315         /*
  316          * Apply the sign bit.
  317          */
  318         s = (uint32_t)(x >> 63);
  319         return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
  320 }
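
A usage sketch of the rounding behaviour (assuming a FALCON_FPEMU
build, where an fpr is just its 64-bit encoding, so hex literals can be
fed in directly):

    /*
     * 0x4004000000000000 encodes 2.5; 0x400C000000000000 encodes 3.5.
     * fpr_rint() rounds both ties toward the even integer:
     *   fpr_rint(0x4004000000000000) == 2
     *   fpr_rint(0x400C000000000000) == 4
     */
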
  321 
  322 static inline int64_t
  323 fpr_floor(fpr x)
  324 {
  325         uint64_t t;
  326         int64_t xi;
  327         int e, cc;
  328 
  329         /*
  330          * We extract the integer as a _signed_ 64-bit integer with
  331          * a scaling factor. Since we assume that the value fits
  332          * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
  333          * absolute value to make it in the 2^62..2^63-1 range: we
  334          * will only need a right-shift afterwards.
  335          */
  336         e = (int)(x >> 52) & 0x7FF;
  337         t = x >> 63;
  338         xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
  339                 & (((uint64_t)1 << 63) - 1));
  340         xi = (xi ^ -(int64_t)t) + (int64_t)t;
  341         cc = 1085 - e;
  342 
  343         /*
  344          * We perform an arithmetic right-shift on the value. This
  345          * applies floor() semantics on both positive and negative values
  346          * (rounding toward minus infinity).
  347          */
  348         xi = fpr_irsh(xi, cc & 63);
  349 
  350         /*
  351          * If the true shift count was 64 or more, then we should instead
  352          * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
  353          * case: -0 will be floored to -1, not 0 (whether this is correct
  354          * is debatable; in any case, the other functions normalize zero
  355          * to +0).
  356          *
  357          * For an input of zero, the non-shifted xi was incorrect (we used
  358          * a top implicit bit of value 1, not 0), but this does not matter
  359          * since this operation will clamp it down.
  360          */
  361         xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
  362         return xi;
  363 }
  364 
  365 static inline int64_t
  366 fpr_trunc(fpr x)
  367 {
  368         uint64_t t, xu;
  369         int e, cc;
  370 
  371         /*
  372          * Extract the absolute value. Since we assume that the value
  373          * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
  374          * the absolute value into the 2^62..2^63-1 range, and then
  375          * do a right shift afterwards.
  376          */
  377         e = (int)(x >> 52) & 0x7FF;
  378         xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
  379         cc = 1085 - e;
  380         xu = fpr_ursh(xu, cc & 63);
  381 
  382         /*
  383          * If the exponent is too low (cc > 63), then the shift was wrong
  384          * and we must clamp the value to 0. This also covers the case
  385          * of an input equal to zero.
  386          */
  387         xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
  388 
  389         /*
  390          * Apply back the sign, if the source value is negative.
  391          */
  392         t = x >> 63;
  393         xu = (xu ^ -t) + t;
  394         return *(int64_t *)&xu;
  395 }
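
To illustrate the difference between the two integer conversions
(a sketch under the same FALCON_FPEMU assumption as above):

    /*
     * 0xC004000000000000 encodes -2.5:
     *   fpr_floor(0xC004000000000000) == -3  (toward minus infinity)
     *   fpr_trunc(0xC004000000000000) == -2  (toward zero)
     */
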
  396 
  397 #define fpr_add   Zf(fpr_add)
  398 fpr fpr_add(fpr x, fpr y);
  399 
  400 static inline fpr
  401 fpr_sub(fpr x, fpr y)
  402 {
  403         y ^= (uint64_t)1 << 63;
  404         return fpr_add(x, y);
  405 }
  406 
  407 static inline fpr
  408 fpr_neg(fpr x)
  409 {
  410         x ^= (uint64_t)1 << 63;
  411         return x;
  412 }
  413 
  414 static inline fpr
  415 fpr_half(fpr x)
  416 {
  417         /*
  418          * To divide a value by 2, we just have to subtract 1 from its
  419  * exponent, but we must take care of the zero case.
  420          */
  421         uint32_t t;
  422 
  423         x -= (uint64_t)1 << 52;
  424         t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
  425         x &= (uint64_t)t - 1;
  426         return x;
  427 }
  428 
  429 static inline fpr
  430 fpr_double(fpr x)
  431 {
  432         /*
  433  * To double a value, we just increment the exponent by one. We
  434  * don't care about infinities or NaNs; however, 0 is a
  435          * special case.
  436          */
  437         x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
  438         return x;
  439 }
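
The exponent-tweak trick is easy to check on raw encodings: subtracting
1 from the exponent field halves the value. A standalone sketch (not
part of fpr.h):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int
    main(void)
    {
            double d = 2.0, h;
            uint64_t u;

            memcpy(&u, &d, sizeof u);
            u -= (uint64_t)1 << 52;    /* exponent field minus 1 */
            memcpy(&h, &u, sizeof h);
            printf("%.1f\n", h);       /* prints 1.0 */
            return 0;
    }

The extra masking in fpr_half() handles the one input this trick gets
wrong: on a zero input the subtraction wraps and leaves 0x7FF in the
exponent field, which the mask detects and clamps back to zero.
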
  440 
  441 #define fpr_mul   Zf(fpr_mul)
  442 fpr fpr_mul(fpr x, fpr y);
  443 
  444 static inline fpr
  445 fpr_sqr(fpr x)
  446 {
  447         return fpr_mul(x, x);
  448 }
  449 
  450 #define fpr_div   Zf(fpr_div)
  451 fpr fpr_div(fpr x, fpr y);
  452 
  453 static inline fpr
  454 fpr_inv(fpr x)
  455 {
  456         return fpr_div(4607182418800017408u, x);
  457 }
  458 
  459 #define fpr_sqrt   Zf(fpr_sqrt)
  460 fpr fpr_sqrt(fpr x);
  461 
  462 static inline int
  463 fpr_lt(fpr x, fpr y)
  464 {
  465         /*
  466          * If x >= 0 or y >= 0, a signed comparison yields the proper
  467          * result:
  468          *   - For positive values, the order is preserved.
  469          *   - The sign bit is at the same place as in integers, so
  470          *     sign is preserved.
  471          *
  472          * If both x and y are negative, then the order is reversed.
  473          * We cannot simply invert the comparison result in that case
  474          * because it would not handle the edge case x = y properly.
  475          */
  476         int cc0, cc1;
  477 
  478         cc0 = *(int64_t *)&x < *(int64_t *)&y;
  479         cc1 = *(int64_t *)&x > *(int64_t *)&y;
  480         return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
  481 }
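
A simplified standalone sketch of the same comparison logic (plain
casts instead of pointer punning, for illustration only):

    #include <stdio.h>
    #include <stdint.h>

    static int
    lt_demo(uint64_t x, uint64_t y)
    {
            int cc0, cc1;

            cc0 = (int64_t)x < (int64_t)y;
            cc1 = (int64_t)x > (int64_t)y;
            /* If both sign bits are set, the order is reversed. */
            return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
    }

    int
    main(void)
    {
            uint64_t m2 = 0xC000000000000000u;  /* -2.0 */
            uint64_t m1 = 0xBFF0000000000000u;  /* -1.0 */
            uint64_t p1 = 0x3FF0000000000000u;  /* +1.0 */

            printf("%d %d %d\n",
                    lt_demo(m2, m1),   /* 1: -2.0 < -1.0 */
                    lt_demo(m1, p1),   /* 1: -1.0 < +1.0 */
                    lt_demo(m1, m1));  /* 0: equal values */
            return 0;
    }
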
  482 
  483 /*
  484  * Compute ccs*exp(-x)*2^63, rounded to an integer, for x such that
  485  * |x| <= ln 2. We want a precision of 50 bits or so.
  486  */
  487 #define fpr_expm_p63   Zf(fpr_expm_p63)
  488 uint64_t fpr_expm_p63(fpr x, fpr ccs);
  489 
  490 #define fpr_gm_tab   Zf(fpr_gm_tab)
  491 extern const fpr fpr_gm_tab[];
  492 
  493 #define fpr_p2_tab   Zf(fpr_p2_tab)
  494 extern const fpr fpr_p2_tab[];
  495 
  496 /* ====================================================================== */
  497 
  498 #elif FALCON_FPNATIVE  // yyyFPEMU+0 yyyFPNATIVE+1
  499 
  500 /* ====================================================================== */
  501 
  502 #include <math.h>
  503 
  504 /*
  505  * We wrap the native 'double' type into a structure so that the C compiler
  506  * complains if we inadvertently use raw arithmetic operators on the 'fpr'
  507  * type instead of using the inline functions below. This should have no
  508  * extra runtime cost, since all the functions below are 'inline'.
  509  */
  510 typedef struct { double v; } fpr;
  511 
  512 static inline fpr
  513 FPR(double v)
  514 {
  515         fpr x;
  516 
  517         x.v = v;
  518         return x;
  519 }
  520 
  521 static inline fpr
  522 fpr_of(int64_t i)
  523 {
  524         return FPR((double)i);
  525 }
  526 
  527 static const fpr fpr_q = { 12289.0 };
  528 static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
  529 static const fpr fpr_inv_2sqrsigma0 = { 0.150865048875372721532312163019 };
  530 static const fpr fpr_inv_sigma[] = {
  531         { 0.0 }, /* unused */
  532         { 0.0069054793295940891952143765991630516 },
  533         { 0.0068102267767177975961393730687908629 },
  534         { 0.0067188101910722710707826117910434131 },
  535         { 0.0065883354370073665545865037227681924 },
  536         { 0.0064651781207602900738053897763485516 },
  537         { 0.0063486788828078995327741182928037856 },
  538         { 0.0062382586529084374473367528433697537 },
  539         { 0.0061334065020930261548984001431770281 },
  540         { 0.0060336696681577241031668062510953022 },
  541         { 0.0059386453095331159950250124336477482 }
  542 };
  543 static const fpr fpr_sigma_min[] = {
  544         { 0.0 }, /* unused */
  545         { 1.1165085072329102588881898380334015 },
  546         { 1.1321247692325272405718031785357108 },
  547         { 1.1475285353733668684571123112513188 },
  548         { 1.1702540788534828939713084716509250 },
  549         { 1.1925466358390344011122170489094133 },
  550         { 1.2144300507766139921088487776957699 },
  551         { 1.2359260567719808790104525941706723 },
  552         { 1.2570545284063214162779743112075080 },
  553         { 1.2778336969128335860256340575729042 },
  554         { 1.2982803343442918539708792538826807 }
  555 };
  556 static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
  557 static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
  558 static const fpr fpr_bnorm_max = { 16822.4121 };
  559 static const fpr fpr_zero = { 0.0 };
  560 static const fpr fpr_one = { 1.0 };
  561 static const fpr fpr_two = { 2.0 };
  562 static const fpr fpr_onehalf = { 0.5 };
  563 static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
  564 static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
  565 static const fpr fpr_ptwo31 = { 2147483648.0 };
  566 static const fpr fpr_ptwo31m1 = { 2147483647.0 };
  567 static const fpr fpr_mtwo31m1 = { -2147483647.0 };
  568 static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
  569 static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
  570 static const fpr fpr_ptwo63 = { 9223372036854775808.0 };
  571 
  572 static inline int64_t
  573 fpr_rint(fpr x)
  574 {
  575         /*
  576  * We do not want to use llrint() since it might not be
  577          * constant-time.
  578          *
  579          * Suppose that x >= 0. If x >= 2^52, then it is already an
  580          * integer. Otherwise, if x < 2^52, then computing x+2^52 will
  581          * yield a value that will be rounded to the nearest integer
  582          * with exactly the right rules (round-to-nearest-even).
  583          *
  584          * In order to have constant-time processing, we must do the
  585          * computation for both x >= 0 and x < 0 cases, and use a
  586          * cast to an integer to access the sign and select the proper
  587          * value. Such casts also allow us to find out if |x| < 2^52.
  588          */
  589         int64_t sx, tx, rp, rn, m;
  590         uint32_t ub;
  591 
  592         sx = (int64_t)(x.v - 1.0);
  593         tx = (int64_t)x.v;
  594         rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
  595         rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
  596 
  597         /*
  598          * If tx >= 2^52 or tx < -2^52, then result is tx.
  599          * Otherwise, if sx >= 0, then result is rp.
  600          * Otherwise, result is rn. We use the fact that when x is
  601          * close to 0 (|x| <= 0.25) then both rp and rn are correct;
  602          * and if x is not close to 0, then trunc(x-1.0) yields the
  603          * appropriate sign.
  604          */
  605 
  606         /*
  607          * Clamp rp to zero if tx < 0.
  608          * Clamp rn to zero if tx >= 0.
  609          */
  610         m = sx >> 63;
  611         rn &= m;
  612         rp &= ~m;
  613 
  614         /*
  615          * Get the 12 upper bits of tx; if they are not all zeros or
  616          * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
  617          * rp and rn to zero. Otherwise, we clamp tx to zero.
  618          */
  619         ub = (uint32_t)((uint64_t)tx >> 52);
  620         m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
  621         rp &= m;
  622         rn &= m;
  623         tx &= ~m;
  624 
  625         /*
  626          * Only one of tx, rn or rp (at most) can be non-zero at this
  627          * point.
  628          */
  629         return tx | rn | rp;
  630 }
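
The 2^52 trick can be observed in isolation (a sketch assuming strict
binary64 arithmetic with the default rounding mode, e.g. SSE2 rather
than x87 extended precision):

    #include <stdio.h>

    int
    main(void)
    {
            double c = 4503599627370496.0;   /* 2^52 */

            /* Adding then subtracting 2^52 forces the sum to be
               rounded at integer granularity, with the FPU's default
               round-to-nearest-even rule. */
            printf("%.1f\n", (2.5 + c) - c);   /* 2.0 (tie, to even) */
            printf("%.1f\n", (3.5 + c) - c);   /* 4.0 (tie, to even) */
            printf("%.1f\n", (2.7 + c) - c);   /* 3.0 */
            return 0;
    }
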
  631 
  632 static inline int64_t
  633 fpr_floor(fpr x)
  634 {
  635         int64_t r;
  636 
  637         /*
  638          * The cast performs a trunc() (rounding toward 0) and thus is
  639          * wrong by 1 for most negative values. The correction below is
  640          * constant-time as long as the compiler turns the
  641  * floating-point comparison result into a 0/1 integer without a
  642          * conditional branch or another non-constant-time construction.
  643          * This should hold on all modern architectures with an FPU (and
  644          * if it is false on a given arch, then chances are that the FPU
  645          * itself is not constant-time, making the point moot).
  646          */
  647         r = (int64_t)x.v;
  648         return r - (x.v < (double)r);
  649 }
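
The correction can be traced on a concrete value (illustrative sketch):

    /*
     * double x = -2.5;
     * int64_t r = (int64_t)x;    -- trunc: r = -2
     * r -= (x < (double)r);      -- -2.5 < -2.0, so r becomes -3
     */
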
  650 
  651 static inline int64_t
  652 fpr_trunc(fpr x)
  653 {
  654         return (int64_t)x.v;
  655 }
  656 
  657 static inline fpr
  658 fpr_add(fpr x, fpr y)
  659 {
  660         return FPR(x.v + y.v);
  661 }
  662 
  663 static inline fpr
  664 fpr_sub(fpr x, fpr y)
  665 {
  666         return FPR(x.v - y.v);
  667 }
  668 
  669 static inline fpr
  670 fpr_neg(fpr x)
  671 {
  672         return FPR(-x.v);
  673 }
  674 
  675 static inline fpr
  676 fpr_half(fpr x)
  677 {
  678         return FPR(x.v * 0.5);
  679 }
  680 
  681 static inline fpr
  682 fpr_double(fpr x)
  683 {
  684         return FPR(x.v + x.v);
  685 }
  686 
  687 static inline fpr
  688 fpr_mul(fpr x, fpr y)
  689 {
  690         return FPR(x.v * y.v);
  691 }
  692 
  693 static inline fpr
  694 fpr_sqr(fpr x)
  695 {
  696         return FPR(x.v * x.v);
  697 }
  698 
  699 static inline fpr
  700 fpr_inv(fpr x)
  701 {
  702         return FPR(1.0 / x.v);
  703 }
  704 
  705 static inline fpr
  706 fpr_div(fpr x, fpr y)
  707 {
  708         return FPR(x.v / y.v);
  709 }
  710 
  711 #if FALCON_AVX2  // yyyAVX2+1
  712 TARGET_AVX2
  713 static inline void
  714 fpr_sqrt_avx2(double *t)
  715 {
  716         __m128d x;
  717 
  718         x = _mm_load1_pd(t);
  719         x = _mm_sqrt_pd(x);
  720         _mm_storel_pd(t, x);
  721 }
  722 #endif  // yyyAVX2-
  723 
  724 static inline fpr
  725 fpr_sqrt(fpr x)
  726 {
  727         /*
  728          * We prefer not to have a dependency on libm when it can be
  729          * avoided. On x86, calling the sqrt() libm function inlines
  730          * the relevant opcode (fsqrt or sqrtsd, depending on whether
  731          * the 387 FPU or SSE2 is used for floating-point operations)
  732          * but then makes an optional call to the library function
  733          * for proper error handling, in case the operand is negative.
  734          *
  735          * To avoid this dependency, we use intrinsics or inline assembly
  736          * on recognized platforms:
  737          *
  738          *  - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
  739          *
  740          *  - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
  741          *
  742          *  - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
  743          *    to call the 387 FPU fsqrt opcode.
  744          *
  745          *  - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
  746          *    the fsqrt opcode (Clang needs a special hack).
  747          *
  748          *  - On GCC/Clang on ARM with hardware floating-point, we use
  749  *    inline assembly to call the vsqrt.f64 opcode. Due to a
  750          *    complex ecosystem of compilers and assembly syntaxes, we
  751          *    have to call it "fsqrt" or "fsqrtd", depending on case.
  752          *
  753          * If the platform is not recognized, a call to the system
  754          * library function sqrt() is performed. On some compilers, this
  755          * may actually inline the relevant opcode, and call the library
  756          * function only when the input is invalid (e.g. negative);
  757          * Falcon never actually calls sqrt() on a negative value, but
  758  * the dependency on libm will still be there.
  759          */
  760 
  761 #if FALCON_AVX2  // yyyAVX2+1
  762         fpr_sqrt_avx2(&x.v);
  763         return x;
  764 #else  // yyyAVX2+0
  765 #if defined __GNUC__ && defined __SSE2_MATH__
  766         return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v))));
  767 #elif defined __GNUC__ && defined __i386__
  768         __asm__ __volatile__ (
  769                 "fldl   %0\n\t"
  770                 "fsqrt\n\t"
  771                 "fstpl  %0\n\t"
  772                 : "+m" (x.v) : : );
  773         return x;
  774 #elif defined _M_IX86
  775         __asm {
  776                 fld x.v
  777                 fsqrt
  778                 fstp x.v
  779         }
  780         return x;
  781 #elif defined __PPC__ && defined __GNUC__
  782         fpr y;
  783 
  784 #if defined __clang__
  785         /*
  786          * Normally we should use a 'd' constraint (register that contains
  787          * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use
  788          * an 'f' constraint, counting on the fact that 'float' values
  789          * are managed in double-precision registers anyway, and the
  790          * compiler will not add extra rounding steps.
  791          */
  792         __asm__ ( "fsqrt  %0, %1" : "=f" (y.v) : "f" (x.v) : );
  793 #else
  794         __asm__ ( "fsqrt  %0, %1" : "=d" (y.v) : "d" (x.v) : );
  795 #endif
  796         return y;
  797 #elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \
  798         || (!defined __ARM_FP && defined __ARM_VFPV2__)
  799         /*
  800          * On ARM, assembly syntaxes are a bit of a mess, depending on
  801          * whether GCC or Clang is used, and the binutils version, and
  802          * whether this is 32-bit or 64-bit mode. The code below appears
  803          * to work on:
  804          *    32-bit   GCC-4.9.2   Clang-3.5   Binutils-2.25
  805          *    64-bit   GCC-6.3.0   Clang-3.9   Binutils-2.28
  806          */
  807 #if defined __aarch64__ && __aarch64__
  808         __asm__ ( "fsqrt   %d0, %d0" : "+w" (x.v) : : );
  809 #else
  810         __asm__ ( "fsqrtd  %P0, %P0" : "+w" (x.v) : : );
  811 #endif
  812         return x;
  813 #else
  814         return FPR(sqrt(x.v));
  815 #endif
  816 #endif  // yyyAVX2-
  817 }
  818 
  819 static inline int
  820 fpr_lt(fpr x, fpr y)
  821 {
  822         return x.v < y.v;
  823 }
  824 
  825 TARGET_AVX2
  826 static inline uint64_t
  827 fpr_expm_p63(fpr x, fpr ccs)
  828 {
  829         /*
  830          * Polynomial approximation of exp(-x) is taken from FACCT:
  831          *   https://eprint.iacr.org/2018/1234
  832          * Specifically, values are extracted from the implementation
  833          * referenced from the FACCT article, and available at:
  834          *   https://github.com/raykzhao/gaussian
  835  * Tests over more than 24 billion random inputs in the
  836          * 0..log(2) range have never shown a deviation larger than
  837          * 2^(-50) from the true mathematical value.
  838          */
  839 
  840 #if FALCON_AVX2  // yyyAVX2+1
  841 
  842         /*
  843          * AVX2 implementation uses more operations than Horner's method,
  844          * but with a lower expression tree depth. This helps because
  845          * additions and multiplications have a latency of 4 cycles on
  846          * a Skylake, but the CPU can issue two of them per cycle.
  847          */
  848 
  849         static const union {
  850                 double d[12];
  851                 __m256d v[3];
  852         } c = {
  853                 {
  854                         0.999999999999994892974086724280,
  855                         0.500000000000019206858326015208,
  856                         0.166666666666984014666397229121,
  857                         0.041666666666110491190622155955,
  858                         0.008333333327800835146903501993,
  859                         0.001388888894063186997887560103,
  860                         0.000198412739277311890541063977,
  861                         0.000024801566833585381209939524,
  862                         0.000002755586350219122514855659,
  863                         0.000000275607356160477811864927,
  864                         0.000000025299506379442070029551,
  865                         0.000000002073772366009083061987
  866                 }
  867         };
  868 
  869         double d1, d2, d4, d8, y;
  870         __m256d d14, d58, d9c;
  871 
  872         d1 = -x.v;
  873         d2 = d1 * d1;
  874         d4 = d2 * d2;
  875         d8 = d4 * d4;
  876         d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
  877         d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
  878         d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
  879         d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
  880         d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
  881         d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
  882         d9c = _mm256_hadd_pd(d9c, d9c);
  883         y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
  884                 + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
  885         y *= ccs.v;
  886 
  887         /*
  888          * Final conversion goes through int64_t first, because that's what
  889          * the underlying opcode (vcvttsd2si) will do, and we know that the
  890          * result will fit, since x >= 0 and ccs < 1. If we did the
  891          * conversion directly to uint64_t, then the compiler would add some
  892          * extra code to cover the case of a source value of 2^63 or more,
  893          * and though the alternate path would never be exercised, the
  894          * extra comparison would cost us some cycles.
  895          */
  896         return (uint64_t)(int64_t)(y * fpr_ptwo63.v);
  897 
  898 #else  // yyyAVX2+0
  899 
  900         /*
  901          * Normal implementation uses Horner's method, which minimizes
  902          * the number of operations.
  903          */
  904 
  905         double d, y;
  906 
  907         d = x.v;
  908         y = 0.000000002073772366009083061987;
  909         y = 0.000000025299506379442070029551 - y * d;
  910         y = 0.000000275607356160477811864927 - y * d;
  911         y = 0.000002755586350219122514855659 - y * d;
  912         y = 0.000024801566833585381209939524 - y * d;
  913         y = 0.000198412739277311890541063977 - y * d;
  914         y = 0.001388888894063186997887560103 - y * d;
  915         y = 0.008333333327800835146903501993 - y * d;
  916         y = 0.041666666666110491190622155955 - y * d;
  917         y = 0.166666666666984014666397229121 - y * d;
  918         y = 0.500000000000019206858326015208 - y * d;
  919         y = 0.999999999999994892974086724280 - y * d;
  920         y = 1.000000000000000000000000000000 - y * d;
  921         y *= ccs.v;
  922         return (uint64_t)(y * fpr_ptwo63.v);
  923 
  924 #endif  // yyyAVX2-
  925 }
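
For a quick (non-authoritative) check of the polynomial against libm,
the Horner chain above can be lifted into a standalone program and
compared with exp(); compile with -lm:

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
            double d = 0.3, y;

            y = 0.000000002073772366009083061987;
            y = 0.000000025299506379442070029551 - y * d;
            y = 0.000000275607356160477811864927 - y * d;
            y = 0.000002755586350219122514855659 - y * d;
            y = 0.000024801566833585381209939524 - y * d;
            y = 0.000198412739277311890541063977 - y * d;
            y = 0.001388888894063186997887560103 - y * d;
            y = 0.008333333327800835146903501993 - y * d;
            y = 0.041666666666110491190622155955 - y * d;
            y = 0.166666666666984014666397229121 - y * d;
            y = 0.500000000000019206858326015208 - y * d;
            y = 0.999999999999994892974086724280 - y * d;
            y = 1.000000000000000000000000000000 - y * d;
            printf("%g\n", y - exp(-d));   /* tiny, ~1e-16 or so */
            return 0;
    }

The printed deviation should be consistent with the 2^(-50) bound
claimed above (2^(-50) is about 8.9e-16).
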
  926 
  927 #define fpr_gm_tab   Zf(fpr_gm_tab)
  928 extern const fpr fpr_gm_tab[];
  929 
  930 #define fpr_p2_tab   Zf(fpr_p2_tab)
  931 extern const fpr fpr_p2_tab[];
  932 
  933 /* ====================================================================== */
  934 
  935 #else  // yyyFPEMU+0 yyyFPNATIVE+0
  936 
  937 #error No FP implementation selected
  938 
  939 #endif  // yyyFPEMU- yyyFPNATIVE-