Falcon source files (reference implementation)


fpr.h

    1 /*
    2  * Floating-point operations.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017-2019  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
   30  */
   31 
   32 #if FALCON_FPEMU  // yyyFPEMU+1 yyyFPNATIVE+0
   33 
   34 /* ====================================================================== */
   35 /*
   36  * Custom floating-point implementation with integer arithmetics. We
   37  * use IEEE-754 "binary64" format, with some simplifications:
   38  *
    39  *   - Top bit is the sign s: 1 for negative, 0 for positive.
   40  *
   41  *   - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
   42  *
   43  *   - Mantissa m uses the 52 low bits.
   44  *
   45  * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
   46  * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
   47  * less than 1.0), but the top bit (equal to 1 by definition) is omitted
   48  * in the encoding.
   49  *
   50  * In IEEE-754, there are some special values:
   51  *
    52  *   - If e = 2047, then the value is either an infinity (m = 0) or
   53  *     a NaN (m != 0).
   54  *
   55  *   - If e = 0, then the value is either a zero (m = 0) or a subnormal,
   56  *     aka "denormalized number" (m != 0).
   57  *
   58  * Of these, we only need the zeros. The caller is responsible for not
    59  * providing operands that would lead to infinities, NaNs or subnormals.
   60  * If inputs are such that values go out of range, then indeterminate
   61  * values are returned (it would still be deterministic, but no specific
   62  * value may be relied upon).
   63  *
   64  * At the C level, the three parts are stored in a 64-bit unsigned
   65  * word.
   66  *
   67  * One may note that a property of the IEEE-754 format is that order
   68  * is preserved for positive values: if two positive floating-point
   69  * values x and y are such that x < y, then their respective encodings
   70  * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
   71  * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
    72  * y < 0, and x < y, then i64(x) > i64(y).
   73  *
   74  * IMPORTANT ASSUMPTIONS:
   75  * ======================
   76  *
   77  * For proper computations, and constant-time behaviour, we assume the
   78  * following:
   79  *
   80  *   - 32x32->64 multiplication (unsigned) has an execution time that
   81  *     is independent of its operands. This is true of most modern
   82  *     x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
   83  *     and M3 (in the M0 and M0+, this is done in software, so it depends
   84  *     on that routine), and the PowerPC cores from the G3/G4 lines.
   85  *     For more info, see: https://www.bearssl.org/ctmul.html
   86  *
   87  *   - Left-shifts and right-shifts of 32-bit values have an execution
   88  *     time which does not depend on the shifted value nor on the
    89  *     shift count. A historical exception is the Pentium IV, but most
    90  *     modern CPUs have barrel shifters. Some small microcontrollers
   91  *     might have varying-time shifts (not the ARM Cortex M*, though).
   92  *
   93  *   - Right-shift of a signed negative value performs a sign extension.
   94  *     As per the C standard, this operation returns an
   95  *     implementation-defined result (this is NOT an "undefined
   96  *     behaviour"). On most/all systems, an arithmetic shift is
   97  *     performed, because this is what makes most sense.
   98  */
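
/*
 * Illustration (a sketch, not part of the reference code): unpacking
 * the three fields described above from an encoded 64-bit value. For
 * 1.0, encoded as 4607182418800017408 (fpr_one below), this yields
 * s = 0, e = 1023 (i.e. 0 after removing the bias) and m = 0. The
 * helper name is hypothetical.
 */
static inline void
fpr_fields_sketch(uint64_t x, unsigned *s, unsigned *e, uint64_t *m)
{
        *s = (unsigned)(x >> 63);               /* sign bit */
        *e = (unsigned)(x >> 52) & 0x7FF;       /* biased exponent */
        *m = x & (((uint64_t)1 << 52) - 1);     /* 52-bit mantissa */
}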
   99 
  100 /*
  101  * Normally we should declare the 'fpr' type to be a struct or union
  102  * around the internal 64-bit value; however, we want to use the
  103  * direct 64-bit integer type to enable a lighter call convention on
  104  * ARM platforms. This means that direct (invalid) use of operators
  105  * such as '*' or '+' will not be caught by the compiler. We rely on
  106  * the "normal" (non-emulated) code to detect such instances.
  107  */
  108 typedef uint64_t fpr;
  109 
  110 /*
  111  * For computations, we split values into an integral mantissa in the
  112  * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
  113  * "sticky" (it is set to 1 if any of the bits below it is 1); when
  114  * re-encoding, the low two bits are dropped, but may induce an
  115  * increment in the value for proper rounding.
  116  */
  117 
  118 /*
  119  * Right-shift a 64-bit unsigned value by a possibly secret shift count.
   120  * We assume that the underlying architecture has a barrel shifter for
   121  * 32-bit shifts, but a 64-bit shift on a 32-bit system will
  122  * typically invoke a software routine that is not necessarily
  123  * constant-time; hence the function below.
  124  *
  125  * Shift count n MUST be in the 0..63 range.
  126  */
  127 static inline uint64_t
  128 fpr_ursh(uint64_t x, int n)
  129 {
  130         x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
  131         return x >> (n & 31);
  132 }
  133 
  134 /*
  135  * Right-shift a 64-bit signed value by a possibly secret shift count
  136  * (see fpr_ursh() for the rationale).
  137  *
  138  * Shift count n MUST be in the 0..63 range.
  139  */
  140 static inline int64_t
  141 fpr_irsh(int64_t x, int n)
  142 {
  143         x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
  144         return x >> (n & 31);
  145 }
  146 
  147 /*
  148  * Left-shift a 64-bit unsigned value by a possibly secret shift count
  149  * (see fpr_ursh() for the rationale).
  150  *
  151  * Shift count n MUST be in the 0..63 range.
  152  */
  153 static inline uint64_t
  154 fpr_ulsh(uint64_t x, int n)
  155 {
  156         x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
  157         return x << (n & 31);
  158 }
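
/*
 * Note (a sketch, not part of the reference code): the three helpers
 * above share a two-step structure. The first line conditionally
 * pre-shifts by 32 positions, selected by bit 5 of n without a branch;
 * the second line shifts by the remaining n mod 32 positions, which the
 * assumed 32-bit barrel shifter handles in constant time. The
 * hypothetical function below checks them against plain 64-bit shifts
 * (the signed case relies on the arithmetic-shift assumption stated
 * earlier).
 */
static inline int
fpr_shift_selfcheck(uint64_t x)
{
        int n;

        for (n = 0; n < 64; n ++) {
                if (fpr_ursh(x, n) != (x >> n)
                        || fpr_ulsh(x, n) != (x << n)
                        || fpr_irsh((int64_t)x, n) != ((int64_t)x >> n))
                {
                        return 0;
                }
        }
        return 1;
}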
  159 
  160 /*
  161  * Expectations:
  162  *   s = 0 or 1
  163  *   exponent e is "arbitrary" and unbiased
  164  *   2^54 <= m < 2^55
   165  * Numerical value is (-1)^s * m * 2^e
  166  *
  167  * Exponents which are too low lead to value zero. If the exponent is
  168  * too large, the returned value is indeterminate.
  169  *
  170  * If m = 0, then a zero is returned (using the provided sign).
  171  * If e < -1076, then a zero is returned (regardless of the value of m).
   172  * If e >= -1076 and m != 0, m must be within the expected range
  173  * (2^54 to 2^55-1).
  174  */
  175 static inline fpr
  176 FPR(int s, int e, uint64_t m)
  177 {
  178         fpr x;
  179         uint32_t t;
  180         unsigned f;
  181 
  182         /*
  183          * If e >= -1076, then the value is "normal"; otherwise, it
  184          * should be a subnormal, which we clamp down to zero.
  185          */
  186         e += 1076;
  187         t = (uint32_t)e >> 31;
  188         m &= (uint64_t)t - 1;
  189 
  190         /*
   191  * If m = 0 then we want a zero; make e = 0 too, but keep
  192          * the sign.
  193          */
  194         t = (uint32_t)(m >> 54);
  195         e &= -(int)t;
  196 
  197         /*
  198          * The 52 mantissa bits come from m. Value m has its top bit set
  199          * (unless it is a zero); we leave it "as is": the top bit will
  200          * increment the exponent by 1, except when m = 0, which is
  201          * exactly what we want.
  202          */
  203         x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
  204 
  205         /*
  206          * Rounding: if the low three bits of m are 011, 110 or 111,
  207          * then the value should be incremented to get the next
  208          * representable value. This implements the usual
  209          * round-to-nearest rule (with preference to even values in case
  210          * of a tie). Note that the increment may make a carry spill
  211          * into the exponent field, which is again exactly what we want
  212          * in that case.
  213          */
  214         f = (unsigned)m & 7U;
  215         x += (0xC8U >> f) & 1;
  216         return x;
  217 }
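
/*
 * Note (a sketch, not part of the reference code): the constant 0xC8
 * acts as an 8-entry lookup table. Writing f = (lowest kept bit,
 * highest dropped bit, sticky bit), the patterns that must round up
 * are 011, 110 and 111, i.e. f = 3, 6 and 7, and 0xC8 = 11001000 in
 * binary has exactly bits 3, 6 and 7 set. The hypothetical check below
 * verifies the table against the round-to-nearest-even rule spelled
 * out in the comment above.
 */
static inline int
fpr_rounding_selfcheck(void)
{
        unsigned f;

        for (f = 0; f < 8; f ++) {
                unsigned kept = (f >> 2) & 1;
                unsigned high = (f >> 1) & 1;
                unsigned sticky = f & 1;

                if (((0xC8U >> f) & 1) != (high & (sticky | kept))) {
                        return 0;
                }
        }
        return 1;
}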
  218 
  219 #define fpr_scaled   Zf(fpr_scaled)
  220 fpr fpr_scaled(int64_t i, int sc);
  221 
  222 static inline fpr
  223 fpr_of(int64_t i)
  224 {
  225         return fpr_scaled(i, 0);
  226 }
  227 
  228 static const fpr fpr_q = 4667981563525332992;
  229 static const fpr fpr_inverse_of_q = 4545632735260551042;
  230 static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
  231 static const fpr fpr_inv_sigma = 4573359825155195350;
  232 static const fpr fpr_sigma_min_9 = 4608495221497168882;
  233 static const fpr fpr_sigma_min_10 = 4608586345619182117;
  234 static const fpr fpr_log2 = 4604418534313441775;
  235 static const fpr fpr_inv_log2 = 4609176140021203710;
  236 static const fpr fpr_bnorm_max = 4670353323383631276;
  237 static const fpr fpr_zero = 0;
  238 static const fpr fpr_one = 4607182418800017408;
  239 static const fpr fpr_two = 4611686018427387904;
  240 static const fpr fpr_onehalf = 4602678819172646912;
  241 static const fpr fpr_invsqrt2 = 4604544271217802189;
  242 static const fpr fpr_invsqrt8 = 4600040671590431693;
  243 static const fpr fpr_ptwo31 = 4746794007248502784;
  244 static const fpr fpr_ptwo31m1 = 4746794007244308480;
  245 static const fpr fpr_mtwo31m1 = 13970166044099084288U;
  246 static const fpr fpr_ptwo63m1 = 4890909195324358656;
  247 static const fpr fpr_mtwo63m1 = 14114281232179134464U;
  248 static const fpr fpr_ptwo63 = 4890909195324358656;
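
/*
 * Note (not part of the reference code): these decimal literals are raw
 * IEEE-754 binary64 bit patterns. On a platform with a native binary64
 * 'double' they can be audited by type-punning; a minimal sketch, with
 * a hypothetical helper name:
 *
 *     #include <string.h>
 *
 *     static uint64_t
 *     double_bits_sketch(double v)
 *     {
 *             uint64_t x;
 *
 *             memcpy(&x, &v, sizeof x);
 *             return x;
 *     }
 *
 * Then double_bits_sketch(1.0) == 4607182418800017408 (fpr_one) and
 * double_bits_sketch(12289.0) == 4667981563525332992 (fpr_q).
 */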
  249 
  250 static inline int64_t
  251 fpr_rint(fpr x)
  252 {
  253         uint64_t m, d;
  254         int e;
  255         uint32_t s, dd, f;
  256 
  257         /*
  258          * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
  259          * thus extract the mantissa as a 63-bit integer, then right-shift
  260          * it as needed.
  261          */
  262         m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
  263         e = 1085 - ((int)(x >> 52) & 0x7FF);
  264 
  265         /*
  266          * If a shift of more than 63 bits is needed, then simply set m
  267          * to zero. This also covers the case of an input operand equal
  268          * to zero.
  269          */
  270         m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
  271         e &= 63;
  272 
  273         /*
  274          * Right-shift m as needed. Shift count is e. Proper rounding
  275          * mandates that:
  276          *   - If the highest dropped bit is zero, then round low.
  277          *   - If the highest dropped bit is one, and at least one of the
  278          *     other dropped bits is one, then round up.
  279          *   - If the highest dropped bit is one, and all other dropped
  280          *     bits are zero, then round up if the lowest kept bit is 1,
  281          *     or low otherwise (i.e. ties are broken by "rounding to even").
  282          *
   283  * We thus first extract a word consisting of all the dropped bits
  284          * AND the lowest kept bit; then we shrink it down to three bits,
  285          * the lowest being "sticky".
  286          */
  287         d = fpr_ulsh(m, 63 - e);
  288         dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
  289         f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
  290         m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
  291 
  292         /*
  293          * Apply the sign bit.
  294          */
  295         s = (uint32_t)(x >> 63);
  296         return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
  297 }
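
/*
 * Worked example (not part of the reference code): ties round to even.
 * Using fpr_of(), fpr_half() and fpr_neg(), defined further down in
 * this file:
 *
 *     fpr_rint(fpr_half(fpr_of(5)))           ==  2    (2.5 -> 2)
 *     fpr_rint(fpr_half(fpr_of(7)))           ==  4    (3.5 -> 4)
 *     fpr_rint(fpr_neg(fpr_half(fpr_of(5))))  == -2    (-2.5 -> -2)
 */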
  298 
  299 static inline int64_t
  300 fpr_floor(fpr x)
  301 {
  302         uint64_t t;
  303         int64_t xi;
  304         int e, cc;
  305 
  306         /*
  307          * We extract the integer as a _signed_ 64-bit integer with
  308          * a scaling factor. Since we assume that the value fits
  309          * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
  310          * absolute value to make it in the 2^62..2^63-1 range: we
  311          * will only need a right-shift afterwards.
  312          */
  313         e = (int)(x >> 52) & 0x7FF;
  314         t = x >> 63;
  315         xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
  316                 & (((uint64_t)1 << 63) - 1));
  317         xi = (xi ^ -(int64_t)t) + (int64_t)t;
  318         cc = 1085 - e;
  319 
  320         /*
  321          * We perform an arithmetic right-shift on the value. This
  322          * applies floor() semantics on both positive and negative values
  323          * (rounding toward minus infinity).
  324          */
  325         xi = fpr_irsh(xi, cc & 63);
  326 
  327         /*
  328          * If the true shift count was 64 or more, then we should instead
  329          * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
  330          * case: -0 will be floored to -1, not 0 (whether this is correct
  331          * is debatable; in any case, the other functions normalize zero
  332          * to +0).
  333          *
  334          * For an input of zero, the non-shifted xi was incorrect (we used
  335          * a top implicit bit of value 1, not 0), but this does not matter
  336          * since this operation will clamp it down.
  337          */
  338         xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
  339         return xi;
  340 }
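
/*
 * Worked example (not part of the reference code): unlike fpr_rint(),
 * fpr_floor() always rounds toward minus infinity. With the helpers
 * defined further down in this file:
 *
 *     fpr_floor(fpr_half(fpr_of(5)))           ==  2    (2.5 -> 2)
 *     fpr_floor(fpr_neg(fpr_half(fpr_of(5))))  == -3    (-2.5 -> -3)
 */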
  341 
  342 static inline int64_t
  343 fpr_trunc(fpr x)
  344 {
  345         uint64_t t, xu;
  346         int e, cc;
  347 
  348         /*
  349          * Extract the absolute value. Since we assume that the value
  350          * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
  351          * the absolute value into the 2^62..2^63-1 range, and then
  352          * do a right shift afterwards.
  353          */
  354         e = (int)(x >> 52) & 0x7FF;
  355         xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
  356         cc = 1085 - e;
  357         xu = fpr_ursh(xu, cc & 63);
  358 
  359         /*
  360          * If the exponent is too low (cc > 63), then the shift was wrong
  361          * and we must clamp the value to 0. This also covers the case
  362          * of an input equal to zero.
  363          */
  364         xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
  365 
  366         /*
  367          * Apply back the sign, if the source value is negative.
  368          */
  369         t = x >> 63;
  370         xu = (xu ^ -t) + t;
  371         return *(int64_t *)&xu;
  372 }
  373 
  374 #define fpr_add   Zf(fpr_add)
  375 fpr fpr_add(fpr x, fpr y);
  376 
  377 static inline fpr
  378 fpr_sub(fpr x, fpr y)
  379 {
  380         y ^= (uint64_t)1 << 63;
  381         return fpr_add(x, y);
  382 }
  383 
  384 static inline fpr
  385 fpr_neg(fpr x)
  386 {
  387         x ^= (uint64_t)1 << 63;
  388         return x;
  389 }
  390 
  391 static inline fpr
  392 fpr_half(fpr x)
  393 {
  394         /*
  395          * To divide a value by 2, we just have to subtract 1 from its
  396          * exponent, but we have to take care of zero.
  397          */
  398         uint32_t t;
  399 
  400         x -= (uint64_t)1 << 52;
  401         t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
  402         x &= (uint64_t)t - 1;
  403         return x;
  404 }
  405 
  406 static inline fpr
  407 fpr_double(fpr x)
  408 {
  409         /*
   410  * To double a value, we just increment the exponent by one. We
   411  * don't care about infinities or NaNs; however, 0 is a
  412          * special case.
  413          */
  414         x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
  415         return x;
  416 }
  417 
  418 #define fpr_mul   Zf(fpr_mul)
  419 fpr fpr_mul(fpr x, fpr y);
  420 
  421 static inline fpr
  422 fpr_sqr(fpr x)
  423 {
  424         return fpr_mul(x, x);
  425 }
  426 
  427 #define fpr_div   Zf(fpr_div)
  428 fpr fpr_div(fpr x, fpr y);
  429 
  430 static inline fpr
  431 fpr_inv(fpr x)
  432 {
  433         return fpr_div(4607182418800017408u, x);
  434 }
  435 
  436 #define fpr_sqrt   Zf(fpr_sqrt)
  437 fpr fpr_sqrt(fpr x);
  438 
  439 static inline int
  440 fpr_lt(fpr x, fpr y)
  441 {
  442         /*
  443          * If x >= 0 or y >= 0, a signed comparison yields the proper
  444          * result:
  445          *   - For positive values, the order is preserved.
  446          *   - The sign bit is at the same place as in integers, so
  447          *     sign is preserved.
  448          *
  449          * If both x and y are negative, then the order is reversed.
  450          * We cannot simply invert the comparison result in that case
  451          * because it would not handle the edge case x = y properly.
  452          */
  453         int cc0, cc1;
  454 
  455         cc0 = *(int64_t *)&x < *(int64_t *)&y;
  456         cc1 = *(int64_t *)&x > *(int64_t *)&y;
  457         return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
  458 }
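
/*
 * Worked example (not part of the reference code): the final selector
 * (x & y) >> 63 is 1 only when both operands are negative, in which
 * case the reversed comparison cc1 is used instead of cc0:
 *
 *     fpr_lt(fpr_neg(fpr_two), fpr_neg(fpr_one)) == 1    (-2 < -1)
 *     fpr_lt(fpr_neg(fpr_one), fpr_neg(fpr_two)) == 0
 *     fpr_lt(fpr_neg(fpr_two), fpr_one)          == 1    (mixed signs)
 */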
  459 
  460 /*
   461  * Compute 2^63 * exp(-x) for x such that |x| <= ln 2. We want a
   462  * precision of 50 bits or so.
  463  */
  464 #define fpr_expm_p63   Zf(fpr_expm_p63)
  465 uint64_t fpr_expm_p63(fpr x);
  466 
  467 #define fpr_gm_tab   Zf(fpr_gm_tab)
  468 extern const fpr fpr_gm_tab[];
  469 
  470 #define fpr_p2_tab   Zf(fpr_p2_tab)
  471 extern const fpr fpr_p2_tab[];
  472 
  473 /* ====================================================================== */
  474 
  475 #elif FALCON_FPNATIVE  // yyyFPEMU+0 yyyFPNATIVE+1
  476 
  477 /* ====================================================================== */
  478 
  479 #include <math.h>
  480 
  481 /*
  482  * We wrap the native 'double' type into a structure so that the C compiler
  483  * complains if we inadvertently use raw arithmetic operators on the 'fpr'
  484  * type instead of using the inline functions below. This should have no
  485  * extra runtime cost, since all the functions below are 'inline'.
  486  */
  487 typedef struct { double v; } fpr;
  488 
  489 static inline fpr
  490 FPR(double v)
  491 {
  492         fpr x;
  493 
  494         x.v = v;
  495         return x;
  496 }
  497 
  498 static inline fpr
  499 fpr_of(int64_t i)
  500 {
  501         return FPR((double)i);
  502 }
  503 
  504 static const fpr fpr_q = { 12289.0 };
  505 static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
   506 static const fpr fpr_inv_2sqrsigma0 = { 0.150865048875372721532312163019 };
   507 static const fpr fpr_inv_sigma = { 0.005819826392951607426919370871 };
  508 static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 };
  509 static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 };
  510 static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
  511 static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
  512 static const fpr fpr_bnorm_max = { 16822.4121 };
  513 static const fpr fpr_zero = { 0.0 };
  514 static const fpr fpr_one = { 1.0 };
  515 static const fpr fpr_two = { 2.0 };
  516 static const fpr fpr_onehalf = { 0.5 };
  517 static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
  518 static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
  519 static const fpr fpr_ptwo31 = { 2147483648.0 };
  520 static const fpr fpr_ptwo31m1 = { 2147483647.0 };
  521 static const fpr fpr_mtwo31m1 = { -2147483647.0 };
  522 static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
  523 static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
  524 static const fpr fpr_ptwo63 = { 9223372036854775808.0 };
  525 
  526 static inline int64_t
  527 fpr_rint(fpr x)
  528 {
  529         /*
   530  * We do not want to use llrint() since it might not be
  531          * constant-time.
  532          *
  533          * Suppose that x >= 0. If x >= 2^52, then it is already an
  534          * integer. Otherwise, if x < 2^52, then computing x+2^52 will
  535          * yield a value that will be rounded to the nearest integer
  536          * with exactly the right rules (round-to-nearest-even).
  537          *
  538          * In order to have constant-time processing, we must do the
  539          * computation for both x >= 0 and x < 0 cases, and use a
  540          * cast to an integer to access the sign and select the proper
  541          * value. Such casts also allow us to find out if |x| < 2^52.
  542          */
  543         int64_t sx, tx, rp, rn, m;
  544         uint32_t ub;
  545 
  546         sx = (int64_t)(x.v - 1.0);
  547         tx = (int64_t)x.v;
  548         rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
  549         rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
  550 
  551         /*
  552          * If tx >= 2^52 or tx < -2^52, then result is tx.
  553          * Otherwise, if sx >= 0, then result is rp.
  554          * Otherwise, result is rn. We use the fact that when x is
  555          * close to 0 (|x| <= 0.25) then both rp and rn are correct;
  556          * and if x is not close to 0, then trunc(x-1.0) yields the
  557          * appropriate sign.
  558          */
  559 
  560         /*
   561  * Clamp rp to zero if sx < 0.
   562  * Clamp rn to zero if sx >= 0.
  563          */
  564         m = sx >> 63;
  565         rn &= m;
  566         rp &= ~m;
  567 
  568         /*
  569          * Get the 12 upper bits of tx; if they are not all zeros or
  570          * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
  571          * rp and rn to zero. Otherwise, we clamp tx to zero.
  572          */
  573         ub = (uint32_t)((uint64_t)tx >> 52);
  574         m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
  575         rp &= m;
  576         rn &= m;
  577         tx &= ~m;
  578 
  579         /*
  580          * Only one of tx, rn or rp (at most) can be non-zero at this
  581          * point.
  582          */
  583         return tx | rn | rp;
  584 }
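
/*
 * Illustration (a sketch, not part of the reference code): the core
 * trick with plain doubles, assuming IEEE-754 binary64 and
 * round-to-nearest-even mode. For 0 <= v < 2^52, adding 2^52 pushes
 * the fraction bits out of the mantissa, so the rounding happens at
 * the integer position:
 */
static inline int64_t
rint_nonneg_sketch(double v)
{
        return (int64_t)(v + 4503599627370496.0) - 4503599627370496;
}

/* rint_nonneg_sketch(2.5) == 2 and rint_nonneg_sketch(3.5) == 4,
   matching the ties-to-even behaviour of fpr_rint() above. */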
  585 
  586 static inline int64_t
  587 fpr_floor(fpr x)
  588 {
  589         int64_t r;
  590 
  591         /*
  592          * The cast performs a trunc() (rounding toward 0) and thus is
  593          * wrong by 1 for most negative values. The correction below is
  594          * constant-time as long as the compiler turns the
   595  * floating-point comparison result into a 0/1 integer without a
  596          * conditional branch or another non-constant-time construction.
  597          * This should hold on all modern architectures with an FPU (and
  598          * if it is false on a given arch, then chances are that the FPU
  599          * itself is not constant-time, making the point moot).
  600          */
  601         r = (int64_t)x.v;
  602         return r - (x.v < (double)r);
  603 }
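
/*
 * Worked example (not part of the reference code): for x = -2.5 the
 * cast yields r = -2 (truncation); since -2.5 < -2.0 the comparison
 * contributes 1 and the function returns -3. For x = 2.5, r = 2 and
 * the comparison is false, so the truncated value stands.
 */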
  604 
  605 static inline int64_t
  606 fpr_trunc(fpr x)
  607 {
  608         return (int64_t)x.v;
  609 }
  610 
  611 static inline fpr
  612 fpr_add(fpr x, fpr y)
  613 {
  614         return FPR(x.v + y.v);
  615 }
  616 
  617 static inline fpr
  618 fpr_sub(fpr x, fpr y)
  619 {
  620         return FPR(x.v - y.v);
  621 }
  622 
  623 static inline fpr
  624 fpr_neg(fpr x)
  625 {
  626         return FPR(-x.v);
  627 }
  628 
  629 static inline fpr
  630 fpr_half(fpr x)
  631 {
  632         return FPR(x.v * 0.5);
  633 }
  634 
  635 static inline fpr
  636 fpr_double(fpr x)
  637 {
  638         return FPR(x.v + x.v);
  639 }
  640 
  641 static inline fpr
  642 fpr_mul(fpr x, fpr y)
  643 {
  644         return FPR(x.v * y.v);
  645 }
  646 
  647 static inline fpr
  648 fpr_sqr(fpr x)
  649 {
  650         return FPR(x.v * x.v);
  651 }
  652 
  653 static inline fpr
  654 fpr_inv(fpr x)
  655 {
  656         return FPR(1.0 / x.v);
  657 }
  658 
  659 static inline fpr
  660 fpr_div(fpr x, fpr y)
  661 {
  662         return FPR(x.v / y.v);
  663 }
  664 
  665 #if FALCON_AVX2  // yyyAVX2+1
  666 TARGET_AVX2
  667 static inline void
  668 fpr_sqrt_avx2(double *t)
  669 {
  670         __m128d x;
  671 
  672         x = _mm_load1_pd(t);
  673         x = _mm_sqrt_pd(x);
  674         _mm_storel_pd(t, x);
  675 }
  676 #endif  // yyyAVX2-
  677 
  678 static inline fpr
  679 fpr_sqrt(fpr x)
  680 {
  681         /*
  682          * We prefer not to have a dependency on libm when it can be
  683          * avoided. On x86, calling the sqrt() libm function inlines
  684          * the relevant opcode (fsqrt or sqrtsd, depending on whether
  685          * the 387 FPU or SSE2 is used for floating-point operations)
  686          * but then makes an optional call to the library function
  687          * for proper error handling, in case the operand is negative.
  688          *
  689          * To avoid this dependency, we use intrinsics or inline assembly
  690          * on recognized platforms:
  691          *
  692          *  - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
  693          *
  694          *  - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
  695          *
  696          *  - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
  697          *    to call the 387 FPU fsqrt opcode.
  698          *
  699          *  - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
  700          *    the fsqrt opcode (Clang needs a special hack).
  701          *
  702          *  - On GCC/Clang on ARM with hardware floating-point, we use
   703  *    inline assembly to call the vsqrt.f64 opcode. Due to a
  704          *    complex ecosystem of compilers and assembly syntaxes, we
  705          *    have to call it "fsqrt" or "fsqrtd", depending on case.
  706          *
  707          * If the platform is not recognized, a call to the system
  708          * library function sqrt() is performed. On some compilers, this
  709          * may actually inline the relevant opcode, and call the library
  710          * function only when the input is invalid (e.g. negative);
  711          * Falcon never actually calls sqrt() on a negative value, but
   712  * the dependency on libm will still be there.
  713          */
  714 
  715 #if FALCON_AVX2  // yyyAVX2+1
  716         fpr_sqrt_avx2(&x.v);
  717         return x;
  718 #else  // yyyAVX2+0
  719 #if defined __GNUC__ && defined __SSE2_MATH__
  720         return FPR(_mm_cvtsd_f64(_mm_sqrt_pd(_mm_set1_pd(x.v))));
  721 #elif defined __GNUC__ && defined __i386__
  722         __asm__ __volatile__ (
  723                 "fldl   %0\n\t"
  724                 "fsqrt\n\t"
  725                 "fstpl  %0\n\t"
  726                 : "+m" (x.v) : : );
  727         return x;
  728 #elif defined _M_IX86
  729         __asm {
  730                 fld x.v
  731                 fsqrt
  732                 fstp x.v
  733         }
  734         return x;
  735 #elif defined __PPC__ && defined __GNUC__
  736         fpr y;
  737 
  738 #if defined __clang__
  739         /*
  740          * Normally we should use a 'd' constraint (register that contains
  741          * a 'double' value) but Clang 3.8.1 chokes on it. Instead we use
  742          * an 'f' constraint, counting on the fact that 'float' values
  743          * are managed in double-precision registers anyway, and the
  744          * compiler will not add extra rounding steps.
  745          */
  746         __asm__ ( "fsqrt  %0, %1" : "=f" (y.v) : "f" (x.v) : );
  747 #else
  748         __asm__ ( "fsqrt  %0, %1" : "=d" (y.v) : "d" (x.v) : );
  749 #endif
  750         return y;
  751 #elif (defined __ARM_FP && ((__ARM_FP & 0x08) == 0x08)) \
  752         || (!defined __ARM_FP && defined __ARM_VFPV2__)
  753         /*
  754          * On ARM, assembly syntaxes are a bit of a mess, depending on
  755          * whether GCC or Clang is used, and the binutils version, and
  756          * whether this is 32-bit or 64-bit mode. The code below appears
  757          * to work on:
  758          *    32-bit   GCC-4.9.2   Clang-3.5   Binutils-2.25
  759          *    64-bit   GCC-6.3.0   Clang-3.9   Binutils-2.28
  760          */
  761 #if defined __aarch64__ && __aarch64__
  762         __asm__ ( "fsqrt   %d0, %d0" : "+w" (x.v) : : );
  763 #else
  764         __asm__ ( "fsqrtd  %P0, %P0" : "+w" (x.v) : : );
  765 #endif
  766         return x;
  767 #else
  768         return FPR(sqrt(x.v));
  769 #endif
  770 #endif  // yyyAVX2-
  771 }
  772 
  773 static inline int
  774 fpr_lt(fpr x, fpr y)
  775 {
  776         return x.v < y.v;
  777 }
  778 
  779 TARGET_AVX2
  780 static inline uint64_t
  781 fpr_expm_p63(fpr x)
  782 {
  783         /*
  784          * Polynomial approximation of exp(-x) is taken from FACCT:
  785          *   https://eprint.iacr.org/2018/1234
  786          * Specifically, values are extracted from the implementation
  787          * referenced from the FACCT article, and available at:
  788          *   https://github.com/raykzhao/gaussian
   789  * Tests over more than 24 billion random inputs in the
  790          * 0..log(2) range have never shown a deviation larger than
  791          * 2^(-50) from the true mathematical value.
  792          */
  793 
  794 #if FALCON_AVX2  // yyyAVX2+1
  795 
  796         /*
  797          * AVX2 implementation uses more operations than Horner's method,
  798          * but with a lower expression tree depth. This helps because
  799          * additions and multiplications have a latency of 4 cycles on
  800          * a Skylake, but the CPU can issue two of them per cycle.
  801          */
  802 
  803         static const union {
  804                 double d[12];
  805                 __m256d v[3];
  806         } c = {
  807                 {
  808                         0.999999999999994892974086724280,
  809                         0.500000000000019206858326015208,
  810                         0.166666666666984014666397229121,
  811                         0.041666666666110491190622155955,
  812                         0.008333333327800835146903501993,
  813                         0.001388888894063186997887560103,
  814                         0.000198412739277311890541063977,
  815                         0.000024801566833585381209939524,
  816                         0.000002755586350219122514855659,
  817                         0.000000275607356160477811864927,
  818                         0.000000025299506379442070029551,
  819                         0.000000002073772366009083061987
  820                 }
  821         };
  822 
  823         double d1, d2, d4, d8, y;
  824         __m256d d14, d58, d9c;
  825 
  826         d1 = -x.v;
  827         d2 = d1 * d1;
  828         d4 = d2 * d2;
  829         d8 = d4 * d4;
  830         d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
  831         d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
  832         d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
  833         d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
  834         d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
  835         d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
  836         d9c = _mm256_hadd_pd(d9c, d9c);
  837         y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
  838                 + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
  839         return (uint64_t)(y * fpr_ptwo63.v);
  840 
  841 #else  // yyyAVX2+0
  842 
  843         /*
  844          * Normal implementation uses Horner's method, which minimizes
  845          * the number of operations.
  846          */
  847 
  848         double d, y;
  849 
  850         d = x.v;
  851         y = 0.000000002073772366009083061987;
  852         y = 0.000000025299506379442070029551 - y * d;
  853         y = 0.000000275607356160477811864927 - y * d;
  854         y = 0.000002755586350219122514855659 - y * d;
  855         y = 0.000024801566833585381209939524 - y * d;
  856         y = 0.000198412739277311890541063977 - y * d;
  857         y = 0.001388888894063186997887560103 - y * d;
  858         y = 0.008333333327800835146903501993 - y * d;
  859         y = 0.041666666666110491190622155955 - y * d;
  860         y = 0.166666666666984014666397229121 - y * d;
  861         y = 0.500000000000019206858326015208 - y * d;
  862         y = 0.999999999999994892974086724280 - y * d;
  863         y = 1.000000000000000000000000000000 - y * d;
  864         return (uint64_t)(y * fpr_ptwo63.v);
  865 
  866 #endif  // yyyAVX2-
  867 }
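
/*
 * Illustration (a sketch, not part of the reference code): an accuracy
 * check against libm's exp(). The 2^(-50) bound quoted above
 * corresponds to about 2^13 units at the 2^63 output scale; a
 * factor-of-two margin is allowed here. The function name is
 * hypothetical.
 */
static inline int
fpr_expm_p63_selfcheck(double v)        /* expects 0 <= v <= ln 2 */
{
        uint64_t r, ref, diff;

        r = fpr_expm_p63(FPR(v));
        ref = (uint64_t)(exp(-v) * fpr_ptwo63.v);
        diff = (r > ref) ? (r - ref) : (ref - r);
        return diff <= ((uint64_t)1 << 14);
}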
  868 
  869 #define fpr_gm_tab   Zf(fpr_gm_tab)
  870 extern const fpr fpr_gm_tab[];
  871 
  872 #define fpr_p2_tab   Zf(fpr_p2_tab)
  873 extern const fpr fpr_p2_tab[];
  874 
  875 /* ====================================================================== */
  876 
  877 #else  // yyyFPEMU+0 yyyFPNATIVE+0
  878 
  879 #error No FP implementation selected
  880 
  881 #endif  // yyyFPEMU- yyyFPNATIVE-