Falcon source files (reference implementation)


falcon-keygen.c

    1 /*
    2  * Falcon key pair generation.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.trust>
   30  */
   31 
   32 #include <stddef.h>
   33 #include "internal.h"
   34 
   35 /*
   36  * If CLEANSE is non-zero, then temporary areas obtained with malloc()
   37  * and used to contain secret values are explicitly cleared before
   38  * deallocation with free(). This is the default behaviour; use
   39  * -DCLEANSE=0 to disable cleansing.
   40  */
   41 #ifndef CLEANSE
   42 #define CLEANSE   1
   43 #endif
   44 
   45 /*
   46  * If MEMCHECK is non-zero, then a few extra bytes will be allocated
   47  * at the end of the main buffer, and filled with a specific pattern
   48  * which is checked upon exit. This implies a dependency to some extra
   49  * libc facilities (stderr, fprintf, abort). Overflows may occur only
   50  * if the computation in temp_size() is wrong.
   51  */
   52 #ifndef MEMCHECK
   53 #define MEMCHECK   1
   54 #endif
   55 
   56 #if MEMCHECK
   57 #include <stdio.h>
   58 #include <stdlib.h>
   59 #endif
   60 
   61 #define MKN(logn, full)   ((size_t)(1 + ((full) << 1)) << ((logn) - (full)))
   62 
   63 #if CLEANSE
   64 /*
   65  * Cleanse a memory region by overwriting it with zeros.
   66  */
   67 static void
   68 cleanse(void *data, size_t len)
   69 {
   70         volatile unsigned char *p;
   71 
   72         p = (volatile unsigned char *)data;
   73         while (len -- > 0) {
   74                 *p ++ = 0;
   75         }
   76 }
   77 #endif
   78 
   79 /* ==================================================================== */
   80 /*
   81  * Compute resultant of polynomial f with phi, modulo 2. This function
   82  * is for phi = X^N-X(N/2)+1, where N = 1.5*2^logn.
   83  */
   84 static unsigned
   85 mod2_res_ternary(const int16_t *f, unsigned logn)
   86 {
   87         /*
   88          * We lower down the degree to 6, by successive degree halving:
   89          * we replace f with N(f). If:
   90          *   f = f0(x^2) + x*f1(x^2)
   91          * with f0 and f1 taken modulo X^(N/2)-(X^N/4)+1, then:
   92          *   N(f) = f0^2 - x*f1^2
   93          *
   94          * From f expressed as an array of bits:
   95          *   - f0 and f1 are obtained by extracting bits at even and
   96          *     odd indexes, respectively;
   97          *   - the "holes" are squeezed out to get down to degree N/2;
   98          *   - squarings (f0^2 and f1^2) compute the exact opposite of
   99          *     that "squeezing out", so in practice we skip both; but
  100          *     we must reduce modulo X^(N/2)-X^(N/4)+1.
  101          */
  102 
  103         uint32_t b[24];
  104         size_t u, n;
  105 
  106         /*
  107          * Extract bits into an array of 32-bit words.
  108          */
  109         n = MKN(logn, 1);
  110         memset(b, 0, sizeof b);
  111         for (u = 0; u < n; u ++) {
  112                 uint32_t bit;
  113 
  114                 bit = (uint32_t)f[u] & 1;
  115                 b[u >> 5] |= bit << (u & 31);
  116         }
  117 
  118         /*
  119          * Halve the degree repeatedly.
  120          *
  121          * If the input is: a0 || a1 || a2 || a3
  122          * Then we must compute: (a0 XOR a2 XOR a3) || (a1 XOR a2)
  123          */
  124         switch (logn) {
  125         case 9:
  126                 /*
  127                  * a0 = b[0..5]
  128                  * a1 = b[6..11]
  129                  * a2 = b[12..17]
  130                  * a3 = b[18..23]
  131                  */
  132                 b[0] ^= b[12] ^ b[18];
  133                 b[1] ^= b[13] ^ b[19];
  134                 b[2] ^= b[14] ^ b[20];
  135                 b[3] ^= b[15] ^ b[21];
  136                 b[4] ^= b[16] ^ b[22];
  137                 b[5] ^= b[17] ^ b[23];
  138                 b[6] ^= b[12];
  139                 b[7] ^= b[13];
  140                 b[8] ^= b[14];
  141                 b[9] ^= b[15];
  142                 b[10] ^= b[16];
  143                 b[11] ^= b[17];
  144                 /* fall through */
  145         case 8:
  146                 /*
  147                  * a0 = b[0..2]
  148                  * a1 = b[3..5]
  149                  * a2 = b[6..8]
  150                  * a3 = b[9..11]
  151                  */
  152                 b[0] ^= b[6] ^ b[9];
  153                 b[1] ^= b[7] ^ b[10];
  154                 b[2] ^= b[8] ^ b[11];
  155                 b[3] ^= b[6];
  156                 b[4] ^= b[7];
  157                 b[5] ^= b[8];
  158                 /* fall through */
  159         case 7:
  160                 /*
  161                  * a0 = b[0] || b[1](0..15)
  162                  * a1 = b[1](16..31) || b[2]
  163                  * a2 = b[3] || b[4](0..15)
  164                  * a3 = b[4](16..31) || b[5]
  165                  */
  166                 b[0] ^= b[3] ^ (b[4] >> 16) ^ (b[5] << 16);
  167                 b[1] ^= (b[4] & 0xFFFF) ^ (b[5] >> 16);
  168                 b[1] ^= (b[3] << 16);
  169                 b[2] ^= (b[3] >> 16) ^ (b[4] << 16);
  170                 /* fall through */
  171         case 6:
  172                 /*
  173                  * a0 = b[0](0..23)
  174                  * a1 = b[0](24..31) || b[1](0..15)
  175                  * a2 = b[1](16..31) || b[2](0..7)
  176                  * a3 = b[2](8..31)
  177                  */
  178                 b[0] ^= (b[1] >> 16) ^ ((b[2] & 0xFF) << 16) ^ (b[2] >> 8);
  179                 b[0] ^= ((b[1] << 8) & 0xFF000000);
  180                 b[1] ^= (b[1] >> 24) ^ ((b[2] & 0xFF) << 8);
  181                 b[1] &= 0xFFFF;
  182                 /* fall through */
  183         case 5:
  184                 /*
  185                  * a0 = b[0](0..11)
  186                  * a1 = b[0](12..23)
  187                  * a2 = b[0](24..31) || b[1](0..3)
  188                  * a3 = b[1](4..15)
  189                  */
  190                 b[0] ^= (b[0] >> 24) ^ ((b[1] & 0x0F) << 8) ^ (b[1] >> 4);
  191                 b[0] ^= ((b[0] >> 12) & 0xFF000) ^ ((b[1] & 0x0F) << 20);
  192                 b[0] &= 0xFFFFFF;
  193                 /* fall through */
  194         case 4:
  195                 /*
  196                  * a0 = b[0](0..5)
  197                  * a1 = b[0](6..11)
  198                  * a2 = b[0](12..17)
  199                  * a3 = b[0](18..23)
  200                  */
  201                 b[0] ^= ((b[0] >> 12) & 0x3F) ^ ((b[0] >> 18) & 0x3F);
  202                 b[0] ^= ((b[0] >> 6) & 0xFC0);
  203                 b[0] &= 0xFFF;
  204                 /* fall through */
  205         case 3:
  206                 /*
  207                  * a0 = b[0](0..2)
  208                  * a1 = b[0](3..5)
  209                  * a2 = b[0](6..8)
  210                  * a3 = b[0](9..11)
  211                  */
  212                 b[0] ^= ((b[0] >> 6) & 0x7) ^ ((b[0] >> 9) & 0x7);
  213                 b[0] ^= ((b[0] >> 3) & 0x38);
  214                 b[0] &= 0x3F;
  215                 /* fall through */
  216         case 2:
  217                 break;
  218         }
  219 
  220         /*
  221          * When we are done to phi = X^6-X^3+1, we have only 64
  222          * possibilities. It turns out that all of them except 0 yield
  223          * a resultant of value 1 with phi (modulo 2).
  224          */
  225         return b[0] != 0;
  226 }
  227 
  228 /* ==================================================================== */
  229 /*
  230  * Modular arithmetics.
  231  *
  232  * We implement a few functions for computing modulo a small integer p.
  233  *
  234  * All functions require that 2^30 < p < 2^31. Moreover, operands must
  235  * be in the 0..p-1 range.
  236  *
  237  * Modular addition and subtraction work for all such p.
  238  *
  239  * Montgomery multiplication requires that p is odd, and must be provided
  240  * with an additional value p0i = -1/p mod 2^31. See below for some basics
  241  * on Montgomery multiplication.
  242  *
  243  * Division computes an inverse modulo p by an exponentiation (with
  244  * exponent p-2): this works only if p is prime. Multiplication
  245  * requirements also apply, i.e. p must be odd and p0i must be provided.
  246  *
  247  * The NTT and inverse NTT need all of the above, and also that
  248  * p = 1 mod 2048 (binary case) or p = 1 mod 2304 (ternary case).
  249  *
  250  * -----------------------------------------------------------------------
  251  *
  252  * We use Montgomery representation with 31-bit values:
  253  *
  254  *   Let R = 2^31 mod p. When p > 2^30, R = 2^31 - p.
  255  *   Montgomery representation of an integer x modulo p is x*R mod p.
  256  *
  257  *   Montgomery multiplication computes (x*y)/R mod p for
  258  *   operands x and y. Therefore:
  259  *
  260  *    - if operands are x*R and y*R (Montgomery representations of x and y),
  261  *      then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R mod p,
  262  *      which is the Montgomery representation of the product x*y;
  263  *
  264  *    - if operands are x*R and y (or x and y*R), then Montgomery
  265  *      representation returns x*y mod p: mixed-representation
  266  *      multiplications yield results in normal representation.
  267  *
  268  * To convert to Montgomery representation, we multiply by R, which is done
  269  * by Montgomery-multiplying by R^2. Stand-alone conversion back from
  270  * Montgomery representation is Montgomery-multiplication by 1.
  271  */
  272 
  273 /*
  274  * Precomputed small primes. Each element contains the following:
  275  *
  276  *  p   The prime itself.
  277  *
  278  *  g   A primitive root of phi = X^N+1 or phi = X^N-X^(N/2)+1.
  279  *
  280  *  s   The inverse of the product of all previous primes in the array,
  281  *      computed modulo p and in Montgomery representation.
  282  *
  283  * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
  284  * are listed in decreasing order.
  285  */
  286 
  287 typedef struct {
  288         uint32_t p;
  289         uint32_t g;
  290         uint32_t s;
  291 } small_prime;
  292 
  293 static const small_prime PRIMES2[] = {
  294         { 2147473409,  383167813,      10239 },
  295         { 2147389441,  211808905,  471403745 },
  296         { 2147387393,   37672282, 1329335065 },
  297         { 2147377153, 1977035326,  968223422 },
  298         { 2147358721, 1067163706,  132460015 },
  299         { 2147352577, 1606082042,  598693809 },
  300         { 2147346433, 2033915641, 1056257184 },
  301         { 2147338241, 1653770625,  421286710 },
  302         { 2147309569,  631200819, 1111201074 },
  303         { 2147297281, 2038364663, 1042003613 },
  304         { 2147295233, 1962540515,   19440033 },
  305         { 2147239937, 2100082663,  353296760 },
  306         { 2147235841, 1991153006, 1703918027 },
  307         { 2147217409,  516405114, 1258919613 },
  308         { 2147205121,  409347988, 1089726929 },
  309         { 2147196929,  927788991, 1946238668 },
  310         { 2147178497, 1136922411, 1347028164 },
  311         { 2147100673,  868626236,  701164723 },
  312         { 2147082241, 1897279176,  617820870 },
  313         { 2147074049, 1888819123,  158382189 },
  314         { 2147051521,   25006327,  522758543 },
  315         { 2147043329,  327546255,   37227845 },
  316         { 2147039233,  766324424, 1133356428 },
  317         { 2146988033, 1862817362,   73861329 },
  318         { 2146963457,  404622040,  653019435 },
  319         { 2146959361, 1936581214,  995143093 },
  320         { 2146938881, 1559770096,  634921513 },
  321         { 2146908161,  422623708, 1985060172 },
  322         { 2146885633, 1751189170,  298238186 },
  323         { 2146871297,  578919515,  291810829 },
  324         { 2146846721, 1114060353,  915902322 },
  325         { 2146834433, 2069565474,   47859524 },
  326         { 2146818049, 1552824584,  646281055 },
  327         { 2146775041, 1906267847, 1597832891 },
  328         { 2146756609, 1847414714, 1228090888 },
  329         { 2146744321, 1818792070, 1176377637 },
  330         { 2146738177, 1118066398, 1054971214 },
  331         { 2146736129,   52057278,  933422153 },
  332         { 2146713601,  592259376, 1406621510 },
  333         { 2146695169,  263161877, 1514178701 },
  334         { 2146656257,  685363115,  384505091 },
  335         { 2146650113,  927727032,  537575289 },
  336         { 2146646017,   52575506, 1799464037 },
  337         { 2146643969, 1276803876, 1348954416 },
  338         { 2146603009,  814028633, 1521547704 },
  339         { 2146572289, 1846678872, 1310832121 },
  340         { 2146547713,  919368090, 1019041349 },
  341         { 2146508801,  671847612,   38582496 },
  342         { 2146492417,  283911680,  532424562 },
  343         { 2146490369, 1780044827,  896447978 },
  344         { 2146459649,  327980850, 1327906900 },
  345         { 2146447361, 1310561493,  958645253 },
  346         { 2146441217,  412148926,  287271128 },
  347         { 2146437121,  293186449, 2009822534 },
  348         { 2146430977,  179034356, 1359155584 },
  349         { 2146418689, 1517345488, 1790248672 },
  350         { 2146406401, 1615820390, 1584833571 },
  351         { 2146404353,  826651445,  607120498 },
  352         { 2146379777,    3816988, 1897049071 },
  353         { 2146363393, 1221409784, 1986921567 },
  354         { 2146355201, 1388081168,  849968120 },
  355         { 2146336769, 1803473237, 1655544036 },
  356         { 2146312193, 1023484977,  273671831 },
  357         { 2146293761, 1074591448,  467406983 },
  358         { 2146283521,  831604668, 1523950494 },
  359         { 2146203649,  712865423, 1170834574 },
  360         { 2146154497, 1764991362, 1064856763 },
  361         { 2146142209,  627386213, 1406840151 },
  362         { 2146127873, 1638674429, 2088393537 },
  363         { 2146099201, 1516001018,  690673370 },
  364         { 2146093057, 1294931393,  315136610 },
  365         { 2146091009, 1942399533,  973539425 },
  366         { 2146078721, 1843461814, 2132275436 },
  367         { 2146060289, 1098740778,  360423481 },
  368         { 2146048001, 1617213232, 1951981294 },
  369         { 2146041857, 1805783169, 2075683489 },
  370         { 2146019329,  272027909, 1753219918 },
  371         { 2145986561, 1206530344, 2034028118 },
  372         { 2145976321, 1243769360, 1173377644 },
  373         { 2145964033,  887200839, 1281344586 },
  374         { 2145906689, 1651026455,  906178216 },
  375         { 2145875969, 1673238256, 1043521212 },
  376         { 2145871873, 1226591210, 1399796492 },
  377         { 2145841153, 1465353397, 1324527802 },
  378         { 2145832961, 1150638905,  554084759 },
  379         { 2145816577,  221601706,  427340863 },
  380         { 2145785857,  608896761,  316590738 },
  381         { 2145755137, 1712054942, 1684294304 },
  382         { 2145742849, 1302302867,  724873116 },
  383         { 2145728513,  516717693,  431671476 },
  384         { 2145699841,  524575579, 1619722537 },
  385         { 2145691649, 1925625239,  982974435 },
  386         { 2145687553,  463795662, 1293154300 },
  387         { 2145673217,  771716636,  881778029 },
  388         { 2145630209, 1509556977,  837364988 },
  389         { 2145595393,  229091856,  851648427 },
  390         { 2145587201, 1796903241,  635342424 },
  391         { 2145525761,  715310882, 1677228081 },
  392         { 2145495041, 1040930522,  200685896 },
  393         { 2145466369,  949804237, 1809146322 },
  394         { 2145445889, 1673903706,   95316881 },
  395         { 2145390593,  806941852, 1428671135 },
  396         { 2145372161, 1402525292,  159350694 },
  397         { 2145361921, 2124760298, 1589134749 },
  398         { 2145359873, 1217503067, 1561543010 },
  399         { 2145355777,  338341402,   83865711 },
  400         { 2145343489, 1381532164,  641430002 },
  401         { 2145325057, 1883895478, 1528469895 },
  402         { 2145318913, 1335370424,   65809740 },
  403         { 2145312769, 2000008042, 1919775760 },
  404         { 2145300481,  961450962, 1229540578 },
  405         { 2145282049,  910466767, 1964062701 },
  406         { 2145232897,  816527501,  450152063 },
  407         { 2145218561, 1435128058, 1794509700 },
  408         { 2145187841,   33505311, 1272467582 },
  409         { 2145181697,  269767433, 1380363849 },
  410         { 2145175553,   56386299, 1316870546 },
  411         { 2145079297, 2106880293, 1391797340 },
  412         { 2145021953, 1347906152,  720510798 },
  413         { 2145015809,  206769262, 1651459955 },
  414         { 2145003521, 1885513236, 1393381284 },
  415         { 2144960513, 1810381315,   31937275 },
  416         { 2144944129, 1306487838, 2019419520 },
  417         { 2144935937,   37304730, 1841489054 },
  418         { 2144894977, 1601434616,  157985831 },
  419         { 2144888833,   98749330, 2128592228 },
  420         { 2144880641, 1772327002, 2076128344 },
  421         { 2144864257, 1404514762, 2029969964 },
  422         { 2144827393,  801236594,  406627220 },
  423         { 2144806913,  349217443, 1501080290 },
  424         { 2144796673, 1542656776, 2084736519 },
  425         { 2144778241, 1210734884, 1746416203 },
  426         { 2144759809, 1146598851,  716464489 },
  427         { 2144757761,  286328400, 1823728177 },
  428         { 2144729089, 1347555695, 1836644881 },
  429         { 2144727041, 1795703790,  520296412 },
  430         { 2144696321, 1302475157,  852964281 },
  431         { 2144667649, 1075877614,  504992927 },
  432         { 2144573441,  198765808, 1617144982 },
  433         { 2144555009,  321528767,  155821259 },
  434         { 2144550913,  814139516, 1819937644 },
  435         { 2144536577,  571143206,  962942255 },
  436         { 2144524289, 1746733766,    2471321 },
  437         { 2144512001, 1821415077,  124190939 },
  438         { 2144468993,  917871546, 1260072806 },
  439         { 2144458753,  378417981, 1569240563 },
  440         { 2144421889,  175229668, 1825620763 },
  441         { 2144409601, 1699216963,  351648117 },
  442         { 2144370689, 1071885991,  958186029 },
  443         { 2144348161, 1763151227,  540353574 },
  444         { 2144335873, 1060214804,  919598847 },
  445         { 2144329729,  663515846, 1448552668 },
  446         { 2144327681, 1057776305,  590222840 },
  447         { 2144309249, 1705149168, 1459294624 },
  448         { 2144296961,  325823721, 1649016934 },
  449         { 2144290817,  738775789,  447427206 },
  450         { 2144243713,  962347618,  893050215 },
  451         { 2144237569, 1655257077,  900860862 },
  452         { 2144161793,  242206694, 1567868672 },
  453         { 2144155649,  769415308, 1247993134 },
  454         { 2144137217,  320492023,  515841070 },
  455         { 2144120833, 1639388522,  770877302 },
  456         { 2144071681, 1761785233,  964296120 },
  457         { 2144065537,  419817825,  204564472 },
  458         { 2144028673,  666050597, 2091019760 },
  459         { 2144010241, 1413657615, 1518702610 },
  460         { 2143952897, 1238327946,  475672271 },
  461         { 2143940609,  307063413, 1176750846 },
  462         { 2143918081, 2062905559,  786785803 },
  463         { 2143899649, 1338112849, 1562292083 },
  464         { 2143891457,   68149545,   87166451 },
  465         { 2143885313,  921750778,  394460854 },
  466         { 2143854593,  719766593,  133877196 },
  467         { 2143836161, 1149399850, 1861591875 },
  468         { 2143762433, 1848739366, 1335934145 },
  469         { 2143756289, 1326674710,  102999236 },
  470         { 2143713281,  808061791, 1156900308 },
  471         { 2143690753,  388399459, 1926468019 },
  472         { 2143670273, 1427891374, 1756689401 },
  473         { 2143666177, 1912173949,  986629565 },
  474         { 2143645697, 2041160111,  371842865 },
  475         { 2143641601, 1279906897, 2023974350 },
  476         { 2143635457,  720473174, 1389027526 },
  477         { 2143621121, 1298309455, 1732632006 },
  478         { 2143598593, 1548762216, 1825417506 },
  479         { 2143567873,  620475784, 1073787233 },
  480         { 2143561729, 1932954575,  949167309 },
  481         { 2143553537,  354315656, 1652037534 },
  482         { 2143541249,  577424288, 1097027618 },
  483         { 2143531009,  357862822,  478640055 },
  484         { 2143522817, 2017706025, 1550531668 },
  485         { 2143506433, 2078127419, 1824320165 },
  486         { 2143488001,  613475285, 1604011510 },
  487         { 2143469569, 1466594987,  502095196 },
  488         { 2143426561, 1115430331, 1044637111 },
  489         { 2143383553,    9778045, 1902463734 },
  490         { 2143377409, 1557401276, 2056861771 },
  491         { 2143363073,  652036455, 1965915971 },
  492         { 2143260673, 1464581171, 1523257541 },
  493         { 2143246337, 1876119649,  764541916 },
  494         { 2143209473, 1614992673, 1920672844 },
  495         { 2143203329,  981052047, 2049774209 },
  496         { 2143160321, 1847355533,  728535665 },
  497         { 2143129601,  965558457,  603052992 },
  498         { 2143123457, 2140817191,    8348679 },
  499         { 2143100929, 1547263683,  694209023 },
  500         { 2143092737,  643459066, 1979934533 },
  501         { 2143082497,  188603778, 2026175670 },
  502         { 2143062017, 1657329695,  377451099 },
  503         { 2143051777,  114967950,  979255473 },
  504         { 2143025153, 1698431342, 1449196896 },
  505         { 2143006721, 1862741675, 1739650365 },
  506         { 2142996481,  756660457,  996160050 },
  507         { 2142976001,  927864010, 1166847574 },
  508         { 2142965761,  905070557,  661974566 },
  509         { 2142916609,   40932754, 1787161127 },
  510         { 2142892033, 1987985648,  675335382 },
  511         { 2142885889,  797497211, 1323096997 },
  512         { 2142871553, 2068025830, 1411877159 },
  513         { 2142861313, 1217177090, 1438410687 },
  514         { 2142830593,  409906375, 1767860634 },
  515         { 2142803969, 1197788993,  359782919 },
  516         { 2142785537,  643817365,  513932862 },
  517         { 2142779393, 1717046338,  218943121 },
  518         { 2142724097,   89336830,  416687049 },
  519         { 2142707713,    5944581, 1356813523 },
  520         { 2142658561,  887942135, 2074011722 },
  521         { 2142638081,  151851972, 1647339939 },
  522         { 2142564353, 1691505537, 1483107336 },
  523         { 2142533633, 1989920200, 1135938817 },
  524         { 2142529537,  959263126, 1531961857 },
  525         { 2142527489,  453251129, 1725566162 },
  526         { 2142502913, 1536028102,  182053257 },
  527         { 2142498817,  570138730,  701443447 },
  528         { 2142416897,  326965800,  411931819 },
  529         { 2142363649, 1675665410, 1517191733 },
  530         { 2142351361,  968529566, 1575712703 },
  531         { 2142330881, 1384953238, 1769087884 },
  532         { 2142314497, 1977173242, 1833745524 },
  533         { 2142289921,   95082313, 1714775493 },
  534         { 2142283777,  109377615, 1070584533 },
  535         { 2142277633,   16960510,  702157145 },
  536         { 2142263297,  553850819,  431364395 },
  537         { 2142208001,  241466367, 2053967982 },
  538         { 2142164993, 1795661326, 1031836848 },
  539         { 2142097409, 1212530046,  712772031 },
  540         { 2142087169, 1763869720,  822276067 },
  541         { 2142078977,  644065713, 1765268066 },
  542         { 2142074881,  112671944,  643204925 },
  543         { 2142044161, 1387785471, 1297890174 },
  544         { 2142025729,  783885537, 1000425730 },
  545         { 2142011393,  905662232, 1679401033 },
  546         { 2141974529,  799788433,  468119557 },
  547         { 2141943809, 1932544124,  449305555 },
  548         { 2141933569, 1527403256,  841867925 },
  549         { 2141931521, 1247076451,  743823916 },
  550         { 2141902849, 1199660531,  401687910 },
  551         { 2141890561,  150132350, 1720336972 },
  552         { 2141857793, 1287438162,  663880489 },
  553         { 2141833217,  618017731, 1819208266 },
  554         { 2141820929,  999578638, 1403090096 },
  555         { 2141786113,   81834325, 1523542501 },
  556         { 2141771777,  120001928,  463556492 },
  557         { 2141759489,  122455485, 2124928282 },
  558         { 2141749249,  141986041,  940339153 },
  559         { 2141685761,  889088734,  477141499 },
  560         { 2141673473,  324212681, 1122558298 },
  561         { 2141669377, 1175806187, 1373818177 },
  562         { 2141655041, 1113654822,  296887082 },
  563         { 2141587457,  991103258, 1585913875 },
  564         { 2141583361, 1401451409, 1802457360 },
  565         { 2141575169, 1571977166,  712760980 },
  566         { 2141546497, 1107849376, 1250270109 },
  567         { 2141515777,  196544219,  356001130 },
  568         { 2141495297, 1733571506, 1060744866 },
  569         { 2141483009,  321552363, 1168297026 },
  570         { 2141458433,  505818251,  733225819 },
  571         { 2141360129, 1026840098,  948342276 },
  572         { 2141325313,  945133744, 2129965998 },
  573         { 2141317121, 1871100260, 1843844634 },
  574         { 2141286401, 1790639498, 1750465696 },
  575         { 2141267969, 1376858592,  186160720 },
  576         { 2141255681, 2129698296, 1876677959 },
  577         { 2141243393, 2138900688, 1340009628 },
  578         { 2141214721, 1933049835, 1087819477 },
  579         { 2141212673, 1898664939, 1786328049 },
  580         { 2141202433,  990234828,  940682169 },
  581         { 2141175809, 1406392421,  993089586 },
  582         { 2141165569, 1263518371,  289019479 },
  583         { 2141073409, 1485624211,  507864514 },
  584         { 2141052929, 1885134788,  311252465 },
  585         { 2141040641, 1285021247,  280941862 },
  586         { 2141028353, 1527610374,  375035110 },
  587         { 2141011969, 1400626168,  164696620 },
  588         { 2140999681,  632959608,  966175067 },
  589         { 2140997633, 2045628978, 1290889438 },
  590         { 2140993537, 1412755491,  375366253 },
  591         { 2140942337,  719477232,  785367828 },
  592         { 2140925953,   45224252,  836552317 },
  593         { 2140917761, 1157376588, 1001839569 },
  594         { 2140887041,  278480752, 2098732796 },
  595         { 2140837889, 1663139953,  924094810 },
  596         { 2140788737,  802501511, 2045368990 },
  597         { 2140766209, 1820083885, 1800295504 },
  598         { 2140764161, 1169561905, 2106792035 },
  599         { 2140696577,  127781498, 1885987531 },
  600         { 2140684289,   16014477, 1098116827 },
  601         { 2140653569,  665960598, 1796728247 },
  602         { 2140594177, 1043085491,  377310938 },
  603         { 2140579841, 1732838211, 1504505945 },
  604         { 2140569601,  302071939,  358291016 },
  605         { 2140567553,  192393733, 1909137143 },
  606         { 2140557313,  406595731, 1175330270 },
  607         { 2140549121, 1748850918,  525007007 },
  608         { 2140477441,  499436566, 1031159814 },
  609         { 2140469249, 1886004401, 1029951320 },
  610         { 2140426241, 1483168100, 1676273461 },
  611         { 2140420097, 1779917297,  846024476 },
  612         { 2140413953,  522948893, 1816354149 },
  613         { 2140383233, 1931364473, 1296921241 },
  614         { 2140366849, 1917356555,  147196204 },
  615         { 2140354561,   16466177, 1349052107 },
  616         { 2140348417, 1875366972, 1860485634 },
  617         { 2140323841,  456498717, 1790256483 },
  618         { 2140321793, 1629493973,  150031888 },
  619         { 2140315649, 1904063898,  395510935 },
  620         { 2140280833, 1784104328,  831417909 },
  621         { 2140250113,  256087139,  697349101 },
  622         { 2140229633,  388553070,  243875754 },
  623         { 2140223489,  747459608, 1396270850 },
  624         { 2140200961,  507423743, 1895572209 },
  625         { 2140162049,  580106016, 2045297469 },
  626         { 2140149761,  712426444,  785217995 },
  627         { 2140137473, 1441607584,  536866543 },
  628         { 2140119041,  346538902, 1740434653 },
  629         { 2140090369,  282642885,   21051094 },
  630         { 2140076033, 1407456228,  319910029 },
  631         { 2140047361, 1619330500, 1488632070 },
  632         { 2140041217, 2089408064, 2012026134 },
  633         { 2140008449, 1705524800, 1613440760 },
  634         { 2139924481, 1846208233, 1280649481 },
  635         { 2139906049,  989438755, 1185646076 },
  636         { 2139867137, 1522314850,  372783595 },
  637         { 2139842561, 1681587377,  216848235 },
  638         { 2139826177, 2066284988, 1784999464 },
  639         { 2139824129,  480888214, 1513323027 },
  640         { 2139789313,  847937200,  858192859 },
  641         { 2139783169, 1642000434, 1583261448 },
  642         { 2139770881,  940699589,  179702100 },
  643         { 2139768833,  315623242,  964612676 },
  644         { 2139666433,  331649203,  764666914 },
  645         { 2139641857, 2118730799, 1313764644 },
  646         { 2139635713,  519149027,  519212449 },
  647         { 2139598849, 1526413634, 1769667104 },
  648         { 2139574273,  551148610,  820739925 },
  649         { 2139568129, 1386800242,  472447405 },
  650         { 2139549697,  813760130, 1412328531 },
  651         { 2139537409, 1615286260, 1609362979 },
  652         { 2139475969, 1352559299, 1696720421 },
  653         { 2139455489, 1048691649, 1584935400 },
  654         { 2139432961,  836025845,  950121150 },
  655         { 2139424769, 1558281165, 1635486858 },
  656         { 2139406337, 1728402143, 1674423301 },
  657         { 2139396097, 1727715782, 1483470544 },
  658         { 2139383809, 1092853491, 1741699084 },
  659         { 2139369473,  690776899, 1242798709 },
  660         { 2139351041, 1768782380, 2120712049 },
  661         { 2139334657, 1739968247, 1427249225 },
  662         { 2139332609, 1547189119,  623011170 },
  663         { 2139310081, 1346827917, 1605466350 },
  664         { 2139303937,  369317948,  828392831 },
  665         { 2139301889, 1560417239, 1788073219 },
  666         { 2139283457, 1303121623,  595079358 },
  667         { 2139248641, 1354555286,  573424177 },
  668         { 2139240449,   60974056,  885781403 },
  669         { 2139222017,  355573421, 1221054839 },
  670         { 2139215873,  566477826, 1724006500 },
  671         { 2139150337,  871437673, 1609133294 },
  672         { 2139144193, 1478130914, 1137491905 },
  673         { 2139117569, 1854880922,  964728507 },
  674         { 2139076609,  202405335,  756508944 },
  675         { 2139062273, 1399715741,  884826059 },
  676         { 2139045889, 1051045798, 1202295476 },
  677         { 2139033601, 1707715206,  632234634 },
  678         { 2139006977, 2035853139,  231626690 },
  679         { 2138951681,  183867876,  838350879 },
  680         { 2138945537, 1403254661,  404460202 },
  681         { 2138920961,  310865011, 1282911681 },
  682         { 2138910721, 1328496553,  103472415 },
  683         { 2138904577,   78831681,  993513549 },
  684         { 2138902529, 1319697451, 1055904361 },
  685         { 2138816513,  384338872, 1706202469 },
  686         { 2138810369, 1084868275,  405677177 },
  687         { 2138787841,  401181788, 1964773901 },
  688         { 2138775553, 1850532988, 1247087473 },
  689         { 2138767361,  874261901, 1576073565 },
  690         { 2138757121, 1187474742,  993541415 },
  691         { 2138748929, 1782458888, 1043206483 },
  692         { 2138744833, 1221500487,  800141243 },
  693         { 2138738689,  413465368, 1450660558 },
  694         { 2138695681,  739045140,  342611472 },
  695         { 2138658817, 1355845756,  672674190 },
  696         { 2138644481,  608379162, 1538874380 },
  697         { 2138632193, 1444914034,  686911254 },
  698         { 2138607617,  484707818, 1435142134 },
  699         { 2138591233,  539460669, 1290458549 },
  700         { 2138572801, 2093538990, 2011138646 },
  701         { 2138552321, 1149786988, 1076414907 },
  702         { 2138546177,  840688206, 2108985273 },
  703         { 2138533889,  209669619,  198172413 },
  704         { 2138523649, 1975879426, 1277003968 },
  705         { 2138490881, 1351891144, 1976858109 },
  706         { 2138460161, 1817321013, 1979278293 },
  707         { 2138429441, 1950077177,  203441928 },
  708         { 2138400769,  908970113,  628395069 },
  709         { 2138398721,  219890864,  758486760 },
  710         { 2138376193, 1306654379,  977554090 },
  711         { 2138351617,  298822498, 2004708503 },
  712         { 2138337281,  441457816, 1049002108 },
  713         { 2138320897, 1517731724, 1442269609 },
  714         { 2138290177, 1355911197, 1647139103 },
  715         { 2138234881,  531313247, 1746591962 },
  716         { 2138214401, 1899410930,  781416444 },
  717         { 2138202113, 1813477173, 1622508515 },
  718         { 2138191873, 1086458299, 1025408615 },
  719         { 2138183681, 1998800427,  827063290 },
  720         { 2138173441, 1921308898,  749670117 },
  721         { 2138103809, 1620902804, 2126787647 },
  722         { 2138099713,  828647069, 1892961817 },
  723         { 2138085377,  179405355, 1525506535 },
  724         { 2138060801,  615683235, 1259580138 },
  725         { 2138044417, 2030277840, 1731266562 },
  726         { 2138042369, 2087222316, 1627902259 },
  727         { 2138032129,  126388712, 1108640984 },
  728         { 2138011649,  715026550, 1017980050 },
  729         { 2137993217, 1693714349, 1351778704 },
  730         { 2137888769, 1289762259, 1053090405 },
  731         { 2137853953,  199991890, 1254192789 },
  732         { 2137833473,  941421685,  896995556 },
  733         { 2137817089,  750416446, 1251031181 },
  734         { 2137792513,  798075119,  368077456 },
  735         { 2137786369,  878543495, 1035375025 },
  736         { 2137767937,    9351178, 1156563902 },
  737         { 2137755649, 1382297614, 1686559583 },
  738         { 2137724929, 1345472850, 1681096331 },
  739         { 2137704449,  834666929,  630551727 },
  740         { 2137673729, 1646165729, 1892091571 },
  741         { 2137620481,  778943821,   48456461 },
  742         { 2137618433, 1730837875, 1713336725 },
  743         { 2137581569,  805610339, 1378891359 },
  744         { 2137538561,  204342388, 1950165220 },
  745         { 2137526273, 1947629754, 1500789441 },
  746         { 2137516033,  719902645, 1499525372 },
  747         { 2137491457,  230451261,  556382829 },
  748         { 2137440257,  979573541,  412760291 },
  749         { 2137374721,  927841248, 1954137185 },
  750         { 2137362433, 1243778559,  861024672 },
  751         { 2137313281, 1341338501,  980638386 },
  752         { 2137311233,  937415182, 1793212117 },
  753         { 2137255937,  795331324, 1410253405 },
  754         { 2137243649,  150756339, 1966999887 },
  755         { 2137182209,  163346914, 1939301431 },
  756         { 2137171969, 1952552395,  758913141 },
  757         { 2137159681,  570788721,  218668666 },
  758         { 2137147393, 1896656810, 2045670345 },
  759         { 2137141249,  358493842,  518199643 },
  760         { 2137139201, 1505023029,  674695848 },
  761         { 2137133057,   27911103,  830956306 },
  762         { 2137122817,  439771337, 1555268614 },
  763         { 2137116673,  790988579, 1871449599 },
  764         { 2137110529,  432109234,  811805080 },
  765         { 2137102337, 1357900653, 1184997641 },
  766         { 2137098241,  515119035, 1715693095 },
  767         { 2137090049,  408575203, 2085660657 },
  768         { 2137085953, 2097793407, 1349626963 },
  769         { 2137055233, 1556739954, 1449960883 },
  770         { 2137030657, 1545758650, 1369303716 },
  771         { 2136987649,  332602570,  103875114 },
  772         { 2136969217, 1499989506, 1662964115 },
  773         { 2136924161,  857040753,    4738842 },
  774         { 2136895489, 1948872712,  570436091 },
  775         { 2136893441,   58969960, 1568349634 },
  776         { 2136887297, 2127193379,  273612548 },
  777         { 2136850433,  111208983, 1181257116 },
  778         { 2136809473, 1627275942, 1680317971 },
  779         { 2136764417, 1574888217,   14011331 },
  780         { 2136741889,   14011055, 1129154251 },
  781         { 2136727553,   35862563, 1838555253 },
  782         { 2136721409,  310235666, 1363928244 },
  783         { 2136698881, 1612429202, 1560383828 },
  784         { 2136649729, 1138540131,  800014364 },
  785         { 2136606721,  602323503, 1433096652 },
  786         { 2136563713,  182209265, 1919611038 },
  787         { 2136555521,  324156477,  165591039 },
  788         { 2136549377,  195513113,  217165345 },
  789         { 2136526849, 1050768046,  939647887 },
  790         { 2136508417, 1886286237, 1619926572 },
  791         { 2136477697,  609647664,   35065157 },
  792         { 2136471553,  679352216, 1452259468 },
  793         { 2136457217,  128630031,  824816521 },
  794         { 2136422401,   19787464, 1526049830 },
  795         { 2136420353,  698316836, 1530623527 },
  796         { 2136371201, 1651862373, 1804812805 },
  797         { 2136334337,  326596005,  336977082 },
  798         { 2136322049,   63253370, 1904972151 },
  799         { 2136297473,  312176076,  172182411 },
  800         { 2136248321,  381261841,  369032670 },
  801         { 2136242177,  358688773, 1640007994 },
  802         { 2136229889,  512677188,   75585225 },
  803         { 2136219649, 2095003250, 1970086149 },
  804         { 2136207361, 1909650722,  537760675 },
  805         { 2136176641, 1334616195, 1533487619 },
  806         { 2136158209, 2096285632, 1793285210 },
  807         { 2136143873, 1897347517,  293843959 },
  808         { 2136133633,  923586222, 1022655978 },
  809         { 2136096769, 1464868191, 1515074410 },
  810         { 2136094721, 2020679520, 2061636104 },
  811         { 2136076289,  290798503, 1814726809 },
  812         { 2136041473,  156415894, 1250757633 },
  813         { 2135996417,  297459940, 1132158924 },
  814         { 2135955457,  538755304, 1688831340 },
  815         { 0, 0, 0 }
  816 };
  817 
  818 /*
  819  * Prime moduli for the ternary case. Each prime p is such that
  820  * p = 1 mod 2304. Generator g is a primitive 2304-th root of 1.
  821  */
  822 static const small_prime PRIMES3[] = {
  823         { 2147450113, 1822830492,      33535 },
  824         { 2147431681,  424626201, 1193134109 },
  825         { 2147415553, 1454863251, 1663647397 },
  826         { 2147390209, 1762035773, 1387046861 },
  827         { 2147385601, 1517680363, 1106952843 },
  828         { 2147362561, 1025814050,  440476355 },
  829         { 2147355649,  865455738,  854658425 },
  830         { 2147346433, 1321425846,  224103891 },
  831         { 2147339521, 1474981691,  599730472 },
  832         { 2147332609, 1864656042, 1545592206 },
  833         { 2147316481, 1890505928,  421789636 },
  834         { 2147309569,  906570069, 1610828544 },
  835         { 2147304961, 1403818598, 1990856782 },
  836         { 2147281921,   71783863,  956869830 },
  837         { 2147258881, 1036649303,  396683436 },
  838         { 2147242753,  467401799, 1773880773 },
  839         { 2147235841, 1453821700,  461674371 },
  840         { 2147233537,  598421725, 1689622480 },
  841         { 2147217409, 1761254347,  688859791 },
  842         { 2147212801, 1327730595, 1887769026 },
  843         { 2147164417, 1258121641,  366578724 },
  844         { 2147141377, 1777466165,  171390629 },
  845         { 2147139073, 1984146916,  328716798 },
  846         { 2147136769, 1765129553,  767922350 },
  847         { 2147129857, 1514145184, 1081034369 },
  848         { 2147104513, 1951719537,  733036907 },
  849         { 2147095297,  682785034,  373776370 },
  850         { 2147079169, 1820310317, 1807535939 },
  851         { 2147051521,  458986762,  590428790 },
  852         { 2147049217, 1038363684,  877203562 },
  853         { 2147026177, 1326504697,  643505453 },
  854         { 2147003137, 1170185764, 1319639833 },
  855         { 2146966273,  114827812, 1294985554 },
  856         { 2146959361, 1164199326, 1256988252 },
  857         { 2146954753,  486248957,  710359258 },
  858         { 2146947841, 2000149550,  918021771 },
  859         { 2146936321,  942091103,  108081081 },
  860         { 2146906369,  271696604,  608809917 },
  861         { 2146887937, 1511165281,  369139633 },
  862         { 2146885633,  441164610,  926260308 },
  863         { 2146883329, 1085986991,  240773649 },
  864         { 2146864897,  345552901, 1817459006 },
  865         { 2146814209,  793155214,  640578693 },
  866         { 2146775041, 1675390468,  820220886 },
  867         { 2146772737,  730144429,  765352711 },
  868         { 2146756609, 1835554388, 1004443916 },
  869         { 2146738177,  433272441, 2105099367 },
  870         { 2146733569, 1762957593, 1602986233 },
  871         { 2146694401,  399612535,  169309537 },
  872         { 2146687489,  114105700,  780597858 },
  873         { 2146646017, 1434027294,  564442758 },
  874         { 2146622977, 1341692395, 2091021085 },
  875         { 2146613761,  360157916, 1064050988 },
  876         { 2146606849, 1313179470, 1969566727 },
  877         { 2146599937, 1706429999,  652421166 },
  878         { 2146572289, 1732243288, 1454972738 },
  879         { 2146544641,  272853975,  590131110 },
  880         { 2146542337,   36418634, 1056975891 },
  881         { 2146530817, 2007241330, 1580599978 },
  882         { 2146507777, 1995701118, 1014402730 },
  883         { 2146493953, 1114934790, 1032062368 },
  884         { 2146487041, 1862337352, 1231735150 },
  885         { 2146450177,  846834198,  681753242 },
  886         { 2146447873, 1227566685,   12770655 },
  887         { 2146417921, 1671891715,  794746270 },
  888         { 2146406401,  609289185,  630588397 },
  889         { 2146381057, 2109266947,  655092508 },
  890         { 2146378753,  343938960,  966167795 },
  891         { 2146367233,   41811369, 1440184031 },
  892         { 2146353409, 1450579491,  183813399 },
  893         { 2146348801,    9565898,  526104353 },
  894         { 2146330369, 1407087756,  647129879 },
  895         { 2146325761,  957276356,  283440898 },
  896         { 2146291201,  925507446,  985278760 },
  897         { 2146288897,   48605924,  828823151 },
  898         { 2146256641,  974604801, 1426305449 },
  899         { 2146203649,  977374447, 1007788238 },
  900         { 2146192129, 2083411670, 1165914744 },
  901         { 2146187521, 1993961053,  795729927 },
  902         { 2146180609, 1629167227,  730660993 },
  903         { 2146169089,   44701833, 1427754293 },
  904         { 2146106881,   16997996,  530746042 },
  905         { 2146095361,  264082098,  306963294 },
  906         { 2146093057,  662783894, 1341690726 },
  907         { 2146090753, 1656443739,   45600745 },
  908         { 2146072321, 1430706864, 1210417794 },
  909         { 2146067713, 1657990272, 1024407936 },
  910         { 2146030849, 1649603809, 2098081629 },
  911         { 2146019329,  978947895,  491565736 },
  912         { 2146014721,   48551225,   46133079 },
  913         { 2145996289, 1832516941,  325405827 },
  914         { 2145984769, 1046055041, 1409723796 },
  915         { 2145964033,   66156561,  464497063 },
  916         { 2145954817,  859915264, 2092994912 },
  917         { 2145952513, 1758184581,  956837854 },
  918         { 2145943297, 1901395808, 1848403013 },
  919         { 2145915649,   30807819, 1059439378 },
  920         { 2145904129,  647739402, 1973108753 },
  921         { 2145874177, 1869553544,  577659032 },
  922         { 2145871873,  337408396, 1872862727 },
  923         { 2145864961, 1685455768,  667710631 },
  924         { 2145862657, 1532795538, 2070068256 },
  925         { 2145851137,  212501100, 1224511404 },
  926         { 2145837313, 1358121493, 1848966089 },
  927         { 2145825793,  386307300,  449999704 },
  928         { 2145823489, 1129365680, 1001085691 },
  929         { 2145816577,  551493242, 2119183454 },
  930         { 2145807361,  271270756, 1989968022 },
  931         { 2145793537,  852239048, 1309749017 },
  932         { 2145782017, 1284378810,  311082915 },
  933         { 2145777409, 2004177093, 1290706457 },
  934         { 2145742849, 2121468587, 1133755266 },
  935         { 2145733633, 1260159354, 1951998626 },
  936         { 2145722113,  163237044,  594573040 },
  937         { 2145689857,  474757169,  834321078 },
  938         { 2145687553,   67947935,  828266594 },
  939         { 2145664513, 1598442324,  630205829 },
  940         { 2145646081,  250505282,  776273401 },
  941         { 2145623041, 1703821214,  888409131 },
  942         { 2145611521, 1039247338,  228177613 },
  943         { 2145609217, 1668731457, 1185982491 },
  944         { 2145600001,  857636216,  139252527 },
  945         { 2145595393,  400214229,  729082632 },
  946         { 2145593089, 1796005933,  550289398 },
  947         { 2145588481, 2094475478, 1217287696 },
  948         { 2145565441,  426324144, 1312430145 },
  949         { 2145563137,   78773460,  169188458 },
  950         { 2145549313,  936100306,  757760717 },
  951         { 2145542401, 1097930762,  824118991 },
  952         { 2145535489, 1983878618, 1042705853 },
  953         { 2145528577, 1036190039, 1022323836 },
  954         { 2145523969, 1868801802,  416370830 },
  955         { 2145491713,  356121488,  543381348 },
  956         { 2145480193,  199326412, 1731701467 },
  957         { 2145466369, 1830188719,  240239794 },
  958         { 2145445633,  351737879, 1506144677 },
  959         { 2145443329,  379875076,  475878516 },
  960         { 2145434113, 1003996799,  683903694 },
  961         { 2145415681, 1035366258, 1531251342 },
  962         { 2145401857,  126895727, 1548302219 },
  963         { 2145390337,  203720005,  425208191 },
  964         { 2145388033, 1347512002,  117858071 },
  965         { 2145362689, 1169852366, 1431567820 },
  966         { 2145355777, 2110316730, 2086117269 },
  967         { 2145353473, 1774773548,  876852201 },
  968         { 2145339649,  501723795, 1021106097 },
  969         { 2145318913,  418847235, 1160609910 },
  970         { 2145300481, 1023331752,  191383930 },
  971         { 2145286657, 1280971615,  216845844 },
  972         { 2145282049, 1196793073, 1187575821 },
  973         { 2145235969,  923496757, 1307358539 },
  974         { 2145219841,  519785435, 1626462263 },
  975         { 2145185281, 1860630272,  474564844 },
  976         { 2145180673, 1834049770,  586172945 },
  977         { 2145178369, 1501247472, 1096722774 },
  978         { 2145136897,  550810282,   40160384 },
  979         { 2145113857, 1296058651,  977708791 },
  980         { 2145079297,  955545860,  265600380 },
  981         { 2145076993,   79051519, 1039925789 },
  982         { 2145067777,  253166375, 1127232024 },
  983         { 2145065473, 1026024296,  756613594 },
  984         { 2145063169, 1426910002, 1608845610 },
  985         { 2145033217,  970758503, 1555852593 },
  986         { 2145017089,  347430931,  775368992 },
  987         { 2145007873,  914675316, 1297416956 },
  988         { 2144975617, 2015869942, 1431118733 },
  989         { 2144938753, 1762298335, 1193660775 },
  990         { 2144929537, 1449381583, 1771237555 },
  991         { 2144920321,  875265402, 2099494532 },
  992         { 2144915713,  582439339,  817587747 },
  993         { 2144894977,  506160393, 1688289421 },
  994         { 2144809729,  734926996, 1704354178 },
  995         { 2144793601,  155059687,  171692410 },
  996         { 2144777473,  317485600,  874802365 },
  997         { 2144775169,  651843170,  473816143 },
  998         { 2144759041,  681485475,  464197116 },
  999         { 2144733697, 2125180198,   48767965 },
 1000         { 2144729089, 1478112475, 1355118431 },
 1001         { 2144712961, 1279790481,  591304708 },
 1002         { 2144648449, 1335778078, 1098669758 },
 1003         { 2144630017, 1603567267,  160302246 },
 1004         { 2144604673, 1175259039,   78203500 },
 1005         { 2144597761,  620616998, 1515263848 },
 1006         { 2144579329,  193500483, 1559132057 },
 1007         { 2144567809, 1946028673, 1581197529 },
 1008         { 2144558593,  231387691, 1485256545 },
 1009         { 2144551681,  844374396,   30673778 },
 1010         { 2144535553,  166176063,  168263365 },
 1011         { 2144528641,  781790035,  556577063 },
 1012         { 2144500993,  461561124, 1399755614 },
 1013         { 2144459521,    2822303,  752681683 },
 1014         { 2144454913,  274692318,  257668067 },
 1015         { 2144448001,  716371349,  584223158 },
 1016         { 2144418049, 1936863174,  540598834 },
 1017         { 2144388097, 1855586009,  690032843 },
 1018         { 2144383489,  408100025, 1859550157 },
 1019         { 2144348929,  122163029, 2060148799 },
 1020         { 2144337409, 1525008836,  819305252 },
 1021         { 2144330497, 2024998620, 1611547511 },
 1022         { 2144275201, 1947760064, 1759305590 },
 1023         { 2144261377,  468455779, 2046852340 },
 1024         { 2144256769,  243366493,  207179424 },
 1025         { 2144236033,  562111014,  536612673 },
 1026         { 2144229121,  562082296, 1933892350 },
 1027         { 2144210689, 1057111161,  162239457 },
 1028         { 2144192257, 1440567493,  907872966 },
 1029         { 2144160001,  464029916,  319626240 },
 1030         { 2144136961,  853524564,  297716302 },
 1031         { 2144120833,  587037095, 1136871325 },
 1032         { 2144065537,  231183627,  323060764 },
 1033         { 2144028673,  340762973, 1758401946 },
 1034         { 2144010241,  593721336,  481717690 },
 1035         { 2143996417,  743415799,   71802978 },
 1036         { 2143975681,  403083216,  363908257 },
 1037         { 2143950337,  854680705, 1615520152 },
 1038         { 2143948033,  294824006, 1680513832 },
 1039         { 2143922689,  648528897,  390399209 },
 1040         { 2143918081, 1755729599,  293848938 },
 1041         { 2143899649, 1560525715,  670501530 },
 1042         { 2143867393,  733595942, 1533336387 },
 1043         { 2143821313, 1550195321,  726145119 },
 1044         { 2143786753,  856875165,   36866658 },
 1045         { 2143784449, 1222822055, 1205640485 },
 1046         { 2143754497,  991548026,  190649322 },
 1047         { 2143742977, 1756977003,  292977861 },
 1048         { 2143740673, 1342868950,  176825607 },
 1049         { 2143708417, 1236271287,  157693291 },
 1050         { 2143692289, 1746765009, 1168989539 },
 1051         { 2143685377, 1342856351, 1371059722 },
 1052         { 2143680769,  273481162, 1775434577 },
 1053         { 2143641601, 1998453617,  606651047 },
 1054         { 2143567873,  702418292,  536490409 },
 1055         { 2143556353, 1662552442, 1276091018 },
 1056         { 2143535617,  842561199, 1808542507 },
 1057         { 2143533313,  826061287, 1605713154 },
 1058         { 2143531009, 1930038526, 2102909123 },
 1059         { 2143510273,  172073132, 1023337452 },
 1060         { 2143491841, 1212168134, 1114919776 },
 1061         { 2143487233, 1779380185, 1831962347 },
 1062         { 2143480321, 1156632912, 1555759032 },
 1063         { 2143478017,  622095943,  417269653 },
 1064         { 2143461889, 1280303071,  153354823 },
 1065         { 2143431937,  153658480, 1588613191 },
 1066         { 2143383553,  277427922,   38234351 },
 1067         { 2143339777,  589784914, 1573224776 },
 1068         { 2143335169,  922783059,  200661355 },
 1069         { 2143323649,  302291280, 1138833370 },
 1070         { 2143277569,  820797102,  854962129 },
 1071         { 2143213057, 1893415535,  181451477 },
 1072         { 2143203841, 1088582389,  170356944 },
 1073         { 2143196929,  291107059,  728824031 },
 1074         { 2143192321, 1068927089,  373852844 },
 1075         { 2143187713,  985974306,  668372891 },
 1076         { 2143173889,  167922091,   50439572 },
 1077         { 2143130113,  211225918, 1845430847 },
 1078         { 2143127809,   27582853, 1679634064 },
 1079         { 2143097857, 1766916709, 1819896188 },
 1080         { 2143063297,  780824783, 1952357891 },
 1081         { 2143058689, 1378949832,  756913072 },
 1082         { 2143051777,  408317329,  443287483 },
 1083         { 2143047169, 1273910740, 1515393890 },
 1084         { 2143028737,  996305079,  776734402 },
 1085         { 2143003393,  993006570, 1390097560 },
 1086         { 2142996481, 2136513072, 1012671159 },
 1087         { 2142961921,  891029287,  107949307 },
 1088         { 2142948097,  844702524,  757623023 },
 1089         { 2142943489,  783825741,  900340177 },
 1090         { 2142936577,  674435958,  527289896 },
 1091         { 2142934273,  823653457,  327447549 },
 1092         { 2142920449, 1856390383,  273195225 },
 1093         { 2142911233, 1065735230, 1111511492 },
 1094         { 2142899713,  852649554, 1319180783 },
 1095         { 2142897409, 1582470696, 1728311821 },
 1096         { 2142885889,  875959709, 1969188734 },
 1097         { 2142851329, 1668580464, 1442501897 },
 1098         { 2142830593, 1991029339,  713631715 },
 1099         { 2142807553,   46451453,  209687937 },
 1100         { 2142786817, 1066849029, 1022563324 },
 1101         { 2142754561, 1182968060, 1098740072 },
 1102         { 2142749953,  170870492, 1883579850 },
 1103         { 2142736129,  885298220,  134381218 },
 1104         { 2142690049,  257234559,  713466052 },
 1105         { 2142671617,   16153127,  932308898 },
 1106         { 2142639361, 1904439789,  232384311 },
 1107         { 2142623233,  710460071, 1914284724 },
 1108         { 2142616321, 1384373375, 1948960778 },
 1109         { 2142614017,  967648016,  687398138 },
 1110         { 2142579457, 1095246428, 2090495107 },
 1111         { 2142574849, 1641210381,  638943403 },
 1112         { 2142556417,  958201775, 1972069612 },
 1113         { 2142547201,  951709836,  676516709 },
 1114         { 2142540289, 1374042909,  215502433 },
 1115         { 2142533377, 1882816694, 2074030459 },
 1116         { 2142498817, 1504404304, 1972349416 },
 1117         { 2142471169, 1750000645,  813201047 },
 1118         { 2142450433,  295533779, 1565446756 },
 1119         { 2142438913,   10280011,   91646711 },
 1120         { 2142436609, 1274524844,  162069470 },
 1121         { 2142429697, 1092732318, 1096144802 },
 1122         { 2142420481, 1535070609, 1079403268 },
 1123         { 2142402049, 2005480704,  629779334 },
 1124         { 2142381313, 2114923039,  648509010 },
 1125         { 2142379009,  718105805,  536973776 },
 1126         { 2142374401, 2095587425, 1044147117 },
 1127         { 2142351361,  439135167, 1502821682 },
 1128         { 2142349057,  446437191,  530473080 },
 1129         { 2142323713,  100125463,  992682060 },
 1130         { 2142316801,  696384453, 1947571491 },
 1131         { 2142314497, 1453856322, 2001001690 },
 1132         { 2142277633, 1952977383,  221148682 },
 1133         { 2142245377, 1644990598, 1408241725 },
 1134         { 2142229249, 1528562358, 1061142574 },
 1135         { 2142217729,  510200942, 1943431139 },
 1136         { 2142208513,  572850506,  958721727 },
 1137         { 2142178561, 1699015145,  655247881 },
 1138         { 2142144001,  295258268, 1190693208 },
 1139         { 2142109441,   13501310, 1444074262 },
 1140         { 2142097921, 1555745306,  965245540 },
 1141         { 2142095617,  639823805,  124040022 },
 1142         { 2142091009,   78431256, 2072711469 },
 1143         { 2142074881, 1583322983, 1971506113 },
 1144         { 2142067969, 1985460292,  493290576 },
 1145         { 2142044929,  964118073, 1818270494 },
 1146         { 2142040321, 2074619527,   89552074 },
 1147         { 2142024193,  114755492,  442193256 },
 1148         { 2142010369, 1379074915, 1941110794 },
 1149         { 2141932033, 2026935677,  190080015 },
 1150         { 2141920513,   74333441,  805281961 },
 1151         { 2141918209, 1775077654,  236453650 },
 1152         { 2141913601, 1802464945, 1566679036 },
 1153         { 2141911297,  387494635,  563810545 },
 1154         { 2141895169, 1172229060, 1528048235 },
 1155         { 2141890561, 1361767304, 2041867386 },
 1156         { 2141867521,  533045780,  914774350 },
 1157         { 2141842177,  437131558,  183042158 },
 1158         { 2141821441,   39421972, 1331298027 },
 1159         { 2141814529, 1537989736, 1530178830 },
 1160         { 2141803009, 1676671105,  515971140 },
 1161         { 2141791489, 2010187792, 1541172871 },
 1162         { 2141784577, 1386646410, 1982618979 },
 1163         { 2141738497,  712677114,   31915251 },
 1164         { 2141736193, 1274969467,  916301278 },
 1165         { 2141733889,  116533872, 1715534326 },
 1166         { 2141694721, 1454976825, 1509954498 },
 1167         { 2141692417, 1516965569,  642117163 },
 1168         { 2141671681,  750263061,  316436557 },
 1169         { 2141669377, 2072414840,  808860641 },
 1170         { 2141660161,  161141902,  282052692 },
 1171         { 2141653249, 1463156395,  255786615 },
 1172         { 2141620993, 1362041953,  229951180 },
 1173         { 2141591041, 1936223409,  710815623 },
 1174         { 2141563393, 1302310432, 2126420394 },
 1175         { 2141542657, 1197316148,    2471210 },
 1176         { 2141533441, 2086849163, 1253750414 },
 1177         { 2141526529,  392684861,  291575124 },
 1178         { 2141475841, 1392668921, 1135731293 },
 1179         { 2141464321, 1574939096,  682732876 },
 1180         { 2141450497,  714585706, 2017942072 },
 1181         { 2141418241, 1697181785, 1091684538 },
 1182         { 2141406721,  712755841,  661693530 },
 1183         { 2141404417, 2001939869,   46726158 },
 1184         { 2141342209,  400626204, 1436047974 },
 1185         { 2141335297, 1502178162,  413989624 },
 1186         { 2141332993, 1475685643,  178253589 },
 1187         { 2141291521,  304189746, 1048860171 },
 1188         { 2141275393, 2084823014, 1555287725 },
 1189         { 2141273089, 2119184022,  411567434 },
 1190         { 2141261569,  178224725,  298057668 },
 1191         { 2141254657,  677034170, 1139735334 },
 1192         { 2141238529,  616800485, 1401647061 },
 1193         { 2141220097,  779544570,  504667947 },
 1194         { 2141197057,  978744056,  193160486 },
 1195         { 2141148673, 1163962695, 1287367370 },
 1196         { 2141137153,  224770889,  980061580 },
 1197         { 2141123329,  298410621, 1056581872 },
 1198         { 2141111809,  219751128, 1647153596 },
 1199         { 2141100289, 1311425207,  724567430 },
 1200         { 2141088769,  541960942, 1683309546 },
 1201         { 2141077249, 1907117344, 1956090946 },
 1202         { 2141044993, 1762133402, 1997717427 },
 1203         { 2141026561,  929429695, 1693498396 },
 1204         { 2140980481,  202324091, 1791782539 },
 1205         { 2140962049,  847752429,  771999167 },
 1206         { 2140955137,  171581753,  897638410 },
 1207         { 2140934401, 1561197544, 1785721193 },
 1208         { 2140927489, 1304799727,  610189109 },
 1209         { 2140911361,  584407800,  573572391 },
 1210         { 2140872193,  579572504, 1460985498 },
 1211         { 2140865281, 2037311399, 1085114600 },
 1212         { 2140842241, 1464914593,  585074325 },
 1213         { 2140823809, 1476350636,    6041181 },
 1214         { 2140814593, 1547025818,  862668427 },
 1215         { 2140791553,  600414192, 1406967556 },
 1216         { 2140766209, 1778609535, 1403120148 },
 1217         { 2140733953, 1320658482, 1879929384 },
 1218         { 2140713217, 1129185878, 1667839994 },
 1219         { 2140701697,  296171658, 1309665658 },
 1220         { 2140664833,  399603307, 1473037276 },
 1221         { 2140627969, 2072761417,  860933793 },
 1222         { 2140561153,  859021277, 1227879163 },
 1223         { 2140547329, 1716541026, 1290146230 },
 1224         { 2140542721, 1194859500, 1797753332 },
 1225         { 2140535809,  167714346, 1162981484 },
 1226         { 2140503553,  370984759,   46309386 },
 1227         { 2140494337, 1328986356, 1606498728 },
 1228         { 2140480513,   38230879, 2034344207 },
 1229         { 2140466689, 1806259424,  362621291 },
 1230         { 2140427521,  814658758,  825681996 },
 1231         { 2140404481, 1123614881,  456597404 },
 1232         { 2140399873, 1573999347, 2109157497 },
 1233         { 2140365313,  972962485, 1511091337 },
 1234         { 2140351489,  862138970, 1385361132 },
 1235         { 2140323841,  925446313,  315386822 },
 1236         { 2140316929,  327600537, 1432209372 },
 1237         { 2140300801,  206223333, 1824969389 },
 1238         { 2140284673,  154825255,  625926281 },
 1239         { 2140275457, 1268521276, 1039783122 },
 1240         { 2140270849, 1930935020, 2093846199 },
 1241         { 2140250113,  293408394,  446043725 },
 1242         { 2140236289, 1210426371,  927336002 },
 1243         { 2140224769,  132901891,  584957812 },
 1244         { 2140220161,  940671602, 1748241255 },
 1245         { 2140208641,  337534060, 1014808439 },
 1246         { 2140206337,  707821818,  736128861 },
 1247         { 2140197121, 1276000052, 1233695606 },
 1248         { 2140192513, 1586736300, 1734440244 },
 1249         { 2140185601,  823807555,  861050097 },
 1250         { 2140171777,  706329492, 1082375865 },
 1251         { 2140116481, 1812454795, 1369401687 },
 1252         { 2140104961,  264127389, 1819984006 },
 1253         { 2140098049, 1965766156,  383515743 },
 1254         { 2140047361, 1960123302, 1042763142 },
 1255         { 2140042753, 1687337635,  420287792 },
 1256         { 2140031233, 1366749250, 1532940500 },
 1257         { 2139987457, 1912737167, 1404131659 },
 1258         { 2139964417,  693333303,  177486658 },
 1259         { 2139952897,  495455314, 1274422832 },
 1260         { 2139948289,   40363916,  943432746 },
 1261         { 2139927553,  444424273, 1207981519 },
 1262         { 2139902209, 1465807360,   92415256 },
 1263         { 2139872257, 1837154584,  909402842 },
 1264         { 2139867649,   75059817,  804803785 },
 1265         { 2139826177, 2015221154,  881164323 },
 1266         { 2139814657, 1148432315, 1578970409 },
 1267         { 2139800833, 2088034106,  933005359 },
 1268         { 2139789313, 1861311437,  177956065 },
 1269         { 2139770881,  363514308,   25637872 },
 1270         { 2139766273, 2070991768, 1557275280 },
 1271         { 2139713281, 1604789895, 1788183740 },
 1272         { 2139708673, 1845339933,  609946729 },
 1273         { 2139701761, 1430546144, 1989106525 },
 1274         { 2139676417,  605392937,  970641834 },
 1275         { 2139641857, 1165076318,  835985127 },
 1276         { 2139616513,  283116094,  667711089 },
 1277         { 2139614209,  646890502, 1209456877 },
 1278         { 2139598081,  449743376,  718517127 },
 1279         { 2139593473,  497145855,  394220435 },
 1280         { 2139572737,  955015696,  294304456 },
 1281         { 2139568129,  401133955, 1091472607 },
 1282         { 2139563521, 1735308269, 1691190275 },
 1283         { 2139556609, 1826473043, 1043649467 },
 1284         { 2139549697, 1908832672,  377280535 },
 1285         { 2139535873, 1241741506,  651921754 },
 1286         { 2139524353, 1923937400, 1353981614 },
 1287         { 2139492097,  285331406,  802398974 },
 1288         { 2139475969,  576196150,  429222684 },
 1289         { 2139466753, 1780761618,  296308885 },
 1290         { 2139459841,  611500689, 1213639654 },
 1291         { 2139434497,  148656389, 1051625014 },
 1292         { 2139413761,  391073297, 2081064242 },
 1293         { 2139399937, 1065174985,  787712624 },
 1294         { 2139383809,  788093248, 1111802639 },
 1295         { 2139356161,  336361251,  319889191 },
 1296         { 2139333121,  884166060,  509405995 },
 1297         { 2139321601, 1513473632, 1940111538 },
 1298         { 2139310081,  900672962, 2071072080 },
 1299         { 2139303169, 1991620918,  543628217 },
 1300         { 2139287041,  370425359, 1047050467 },
 1301         { 2139275521,  713658400, 1797885684 },
 1302         { 2139268609, 1783527316,  874491612 },
 1303         { 2139261697,  631837513, 1395044429 },
 1304         { 2139259393, 1742874935,  508544171 },
 1305         { 2139252481,  463463038,  737489950 },
 1306         { 2139204097,  538698195,  604942560 },
 1307         { 2139201793, 1435754599,   54362978 },
 1308         { 2139169537, 2012388705,  392083310 },
 1309         { 2139144193, 1155362435, 1319368247 },
 1310         { 2139137281,  921952177,   92928882 },
 1311         { 2139123457, 1506351550,  602798069 },
 1312         { 2139109633, 1527038848,   48844022 },
 1313         { 2139095809, 2071318942, 1524340004 },
 1314         { 2139075073, 1165123429, 1717466457 },
 1315         { 2139056641, 1776112510,  286710183 },
 1316         { 2139033601,  980780564, 2116611843 },
 1317         { 2139028993, 1474450200, 1742934604 },
 1318         { 2138976001,  428486407, 1580417049 },
 1319         { 2138969089, 1396721654, 1511256767 },
 1320         { 2138950657, 1516054630, 2071630098 },
 1321         { 2138939137, 1078360116,  691835055 },
 1322         { 2138927617,  425837073,  756872582 },
 1323         { 2138918401, 1641039620, 1734207081 },
 1324         { 2138916097, 1017601848, 1142652819 },
 1325         { 2138913793,  306828523,  191212034 },
 1326         { 2138904577, 1745075919, 1122876435 },
 1327         { 2138888449, 1503947447,  951342509 },
 1328         { 2138837761, 1210620672,  957215442 },
 1329         { 2138833153,  787109305,  412518979 },
 1330         { 2138823937,   44637017,  408073095 },
 1331         { 2138775553,  199446608,  907722635 },
 1332         { 2138757121, 2081488043, 1042423315 },
 1333         { 2138738689, 1497686426, 2084550241 },
 1334         { 2138729473,  608998372, 1206782464 },
 1335         { 2138717953,  539269185, 2044657934 },
 1336         { 2138681089,  268630505, 1615466641 },
 1337         { 2138660353,  582047226,  449049076 },
 1338         { 2138648833,  926857321, 1056946200 },
 1339         { 2138639617,  195695324, 2012033442 },
 1340         { 2138616577, 2068544160, 1896897903 },
 1341         { 2138605057, 1751671600,  614364881 },
 1342         { 2138600449,  478123986,  288140368 },
 1343         { 2138591233,  783785615,  851589578 },
 1344         { 0, 0, 0 }
 1345 };
 1346 
 1347 /*
 1348  * Reduce a small signed integer modulo a small prime. The source
 1349  * value x MUST be such that -p < x < p.
 1350  */
 1351 static inline uint32_t
 1352 modp_set(int32_t x, uint32_t p)
 1353 {
 1354         uint32_t w;
 1355 
 1356         w = (uint32_t)x;
 1357         w += p & -(w >> 31);
 1358         return w;
 1359 }
 1360 
 1361 /*
 1362  * Normalize a modular integer around 0.
 1363  */
 1364 static inline int32_t
 1365 modp_norm(uint32_t x, uint32_t p)
 1366 {
 1367         return x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1));
 1368 }
 1369 
 1370 /*
 1371  * Compute -1/p mod 2^31. This works for all odd integers p that fit
 1372  * on 32 bits.
 1373  */
 1374 static uint32_t
 1375 modp_ninv31(uint32_t p)
 1376 {
 1377         uint32_t y;
 1378 
 1379         y = 2 - p;
 1380         y *= 2 - p * y;
 1381         y *= 2 - p * y;
 1382         y *= 2 - p * y;
 1383         y *= 2 - p * y;
 1384         return (uint32_t)0x7FFFFFFF & -y;
 1385 }
 1386 
 1387 /*
 1388  * Compute R = 2^31 mod p.
 1389  */
 1390 static inline uint32_t
 1391 modp_R(uint32_t p)
 1392 {
 1393         /*
 1394          * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
 1395          * 2^31 - p.
 1396          */
 1397         return ((uint32_t)1 << 31) - p;
 1398 }
 1399 
 1400 /*
 1401  * Addition modulo p.
 1402  */
 1403 static inline uint32_t
 1404 modp_add(uint32_t a, uint32_t b, uint32_t p)
 1405 {
 1406         uint32_t d;
 1407 
 1408         d = a + b - p;
 1409         d += p & -(d >> 31);
 1410         return d;
 1411 }
 1412 
 1413 /*
 1414  * Subtraction modulo p.
 1415  */
 1416 static inline uint32_t
 1417 modp_sub(uint32_t a, uint32_t b, uint32_t p)
 1418 {
 1419         uint32_t d;
 1420 
 1421         d = a - b;
 1422         d += p & -(d >> 31);
 1423         return d;
 1424 }
 1425 
 1426 /*
 1427  * Halving modulo p.
 1428  */
 1429 static inline uint32_t
 1430 modp_half(uint32_t a, uint32_t p)
 1431 {
 1432         a += p & -(a & 1);
 1433         return a >> 1;
 1434 }
 1435 
 1436 /*
 1437  * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
 1438  * It is required that p is an odd integer.
 1439  */
 1440 static inline uint32_t
 1441 modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
 1442 {
 1443         uint64_t z, w;
 1444         uint32_t d;
 1445 
 1446         z = (uint64_t)a * (uint64_t)b;
 1447         w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
 1448         d = (uint32_t)((z + w) >> 31) - p;
 1449         d += p & -(d >> 31);
 1450         return d;
 1451 }
 1452 
 1453 /*
 1454  * Compute R2 = 2^62 mod p.
 1455  */
 1456 static uint32_t
 1457 modp_R2(uint32_t p, uint32_t p0i)
 1458 {
 1459         uint32_t z;
 1460 
 1461         /*
 1462          * Compute z = 2^31 mod p (this is the value 1 in Montgomery
 1463          * representation), then double it with an addition.
 1464          */
 1465         z = modp_R(p);
 1466         z = modp_add(z, z, p);
 1467 
 1468         /*
 1469          * Square it five times to obtain 2^32 in Montgomery representation
 1470          * (i.e. 2^63 mod p).
 1471          */
 1472         z = modp_montymul(z, z, p, p0i);
 1473         z = modp_montymul(z, z, p, p0i);
 1474         z = modp_montymul(z, z, p, p0i);
 1475         z = modp_montymul(z, z, p, p0i);
 1476         z = modp_montymul(z, z, p, p0i);
 1477 
 1478         /*
 1479          * Halve the value mod p to get 2^62.
 1480          */
 1481         z = (z + (p & -(z & 1))) >> 1;
 1482         return z;
 1483 }
 1484 
 1485 /*
 1486  * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
 1487  * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
 1488  * -1/p mod 2^31.
 1489  */
 1490 static inline uint32_t
 1491 modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2)
 1492 {
 1493         int i;
 1494         uint32_t r, z;
 1495 
 1496         /*
 1497          * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
 1498          * representation of (2^31)^e mod p, where e = x-1.
 1499          * R2 is 2^31 in Montgomery representation.
 1500          */
 1501         x --;
 1502         r = R2;
 1503         z = modp_R(p);
 1504         for (i = 0; (1U << i) <= x; i ++) {
 1505                 if ((x & (1U << i)) != 0) {
 1506                         z = modp_montymul(z, r, p, p0i);
 1507                 }
 1508                 r = modp_montymul(r, r, p, p0i);
 1509         }
 1510         return z;
 1511 }
 1512 
 1513 /*
 1514  * Division modulo p. If the divisor (b) is 0, then 0 is returned.
 1515  * This function computes proper results only when p is prime.
 1516  * Parameters:
 1517  *   a     dividend
 1518  *   b     divisor
 1519  *   p     odd prime modulus
 1520  *   p0i   -1/p mod 2^31
 1521  *   R     2^31 mod R
 1522  */
 1523 static uint32_t
 1524 modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R)
 1525 {
 1526         uint32_t z, e;
 1527         int i;
 1528 
 1529         e = p - 2;
 1530         z = R;
 1531         for (i = 30; i >= 0; i --) {
 1532                 uint32_t z2;
 1533 
 1534                 z = modp_montymul(z, z, p, p0i);
 1535                 z2 = modp_montymul(z, b, p, p0i);
 1536                 z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
 1537         }
 1538 
 1539         /*
 1540          * The loop above just assumed that b was in Montgomery
 1541          * representation, i.e. really contained b*R; under that
 1542          * assumption, it returns 1/b in Montgomery representation,
 1543          * which is R/b. But we gave it b in normal representation,
 1544          * so the loop really returned R/(b/R) = R^2/b.
 1545          *
 1546          * We want a/b, so we need one Montgomery multiplication with a,
 1547          * which also remove one of the R factors, and another such
 1548          * multiplication to remove the second R factor.
 1549          */
 1550         z = modp_montymul(z, 1, p, p0i);
 1551         return modp_montymul(a, z, p, p0i);
 1552 }
 1553 
 1554 /*
 1555  * Bit-reversal index table.
 1556  */
 1557 static const uint16_t REV10[] = {
 1558            0,  512,  256,  768,  128,  640,  384,  896,   64,  576,  320,  832,
 1559          192,  704,  448,  960,   32,  544,  288,  800,  160,  672,  416,  928,
 1560           96,  608,  352,  864,  224,  736,  480,  992,   16,  528,  272,  784,
 1561          144,  656,  400,  912,   80,  592,  336,  848,  208,  720,  464,  976,
 1562           48,  560,  304,  816,  176,  688,  432,  944,  112,  624,  368,  880,
 1563          240,  752,  496, 1008,    8,  520,  264,  776,  136,  648,  392,  904,
 1564           72,  584,  328,  840,  200,  712,  456,  968,   40,  552,  296,  808,
 1565          168,  680,  424,  936,  104,  616,  360,  872,  232,  744,  488, 1000,
 1566           24,  536,  280,  792,  152,  664,  408,  920,   88,  600,  344,  856,
 1567          216,  728,  472,  984,   56,  568,  312,  824,  184,  696,  440,  952,
 1568          120,  632,  376,  888,  248,  760,  504, 1016,    4,  516,  260,  772,
 1569          132,  644,  388,  900,   68,  580,  324,  836,  196,  708,  452,  964,
 1570           36,  548,  292,  804,  164,  676,  420,  932,  100,  612,  356,  868,
 1571          228,  740,  484,  996,   20,  532,  276,  788,  148,  660,  404,  916,
 1572           84,  596,  340,  852,  212,  724,  468,  980,   52,  564,  308,  820,
 1573          180,  692,  436,  948,  116,  628,  372,  884,  244,  756,  500, 1012,
 1574           12,  524,  268,  780,  140,  652,  396,  908,   76,  588,  332,  844,
 1575          204,  716,  460,  972,   44,  556,  300,  812,  172,  684,  428,  940,
 1576          108,  620,  364,  876,  236,  748,  492, 1004,   28,  540,  284,  796,
 1577          156,  668,  412,  924,   92,  604,  348,  860,  220,  732,  476,  988,
 1578           60,  572,  316,  828,  188,  700,  444,  956,  124,  636,  380,  892,
 1579          252,  764,  508, 1020,    2,  514,  258,  770,  130,  642,  386,  898,
 1580           66,  578,  322,  834,  194,  706,  450,  962,   34,  546,  290,  802,
 1581          162,  674,  418,  930,   98,  610,  354,  866,  226,  738,  482,  994,
 1582           18,  530,  274,  786,  146,  658,  402,  914,   82,  594,  338,  850,
 1583          210,  722,  466,  978,   50,  562,  306,  818,  178,  690,  434,  946,
 1584          114,  626,  370,  882,  242,  754,  498, 1010,   10,  522,  266,  778,
 1585          138,  650,  394,  906,   74,  586,  330,  842,  202,  714,  458,  970,
 1586           42,  554,  298,  810,  170,  682,  426,  938,  106,  618,  362,  874,
 1587          234,  746,  490, 1002,   26,  538,  282,  794,  154,  666,  410,  922,
 1588           90,  602,  346,  858,  218,  730,  474,  986,   58,  570,  314,  826,
 1589          186,  698,  442,  954,  122,  634,  378,  890,  250,  762,  506, 1018,
 1590            6,  518,  262,  774,  134,  646,  390,  902,   70,  582,  326,  838,
 1591          198,  710,  454,  966,   38,  550,  294,  806,  166,  678,  422,  934,
 1592          102,  614,  358,  870,  230,  742,  486,  998,   22,  534,  278,  790,
 1593          150,  662,  406,  918,   86,  598,  342,  854,  214,  726,  470,  982,
 1594           54,  566,  310,  822,  182,  694,  438,  950,  118,  630,  374,  886,
 1595          246,  758,  502, 1014,   14,  526,  270,  782,  142,  654,  398,  910,
 1596           78,  590,  334,  846,  206,  718,  462,  974,   46,  558,  302,  814,
 1597          174,  686,  430,  942,  110,  622,  366,  878,  238,  750,  494, 1006,
 1598           30,  542,  286,  798,  158,  670,  414,  926,   94,  606,  350,  862,
 1599          222,  734,  478,  990,   62,  574,  318,  830,  190,  702,  446,  958,
 1600          126,  638,  382,  894,  254,  766,  510, 1022,    1,  513,  257,  769,
 1601          129,  641,  385,  897,   65,  577,  321,  833,  193,  705,  449,  961,
 1602           33,  545,  289,  801,  161,  673,  417,  929,   97,  609,  353,  865,
 1603          225,  737,  481,  993,   17,  529,  273,  785,  145,  657,  401,  913,
 1604           81,  593,  337,  849,  209,  721,  465,  977,   49,  561,  305,  817,
 1605          177,  689,  433,  945,  113,  625,  369,  881,  241,  753,  497, 1009,
 1606            9,  521,  265,  777,  137,  649,  393,  905,   73,  585,  329,  841,
 1607          201,  713,  457,  969,   41,  553,  297,  809,  169,  681,  425,  937,
 1608          105,  617,  361,  873,  233,  745,  489, 1001,   25,  537,  281,  793,
 1609          153,  665,  409,  921,   89,  601,  345,  857,  217,  729,  473,  985,
 1610           57,  569,  313,  825,  185,  697,  441,  953,  121,  633,  377,  889,
 1611          249,  761,  505, 1017,    5,  517,  261,  773,  133,  645,  389,  901,
 1612           69,  581,  325,  837,  197,  709,  453,  965,   37,  549,  293,  805,
 1613          165,  677,  421,  933,  101,  613,  357,  869,  229,  741,  485,  997,
 1614           21,  533,  277,  789,  149,  661,  405,  917,   85,  597,  341,  853,
 1615          213,  725,  469,  981,   53,  565,  309,  821,  181,  693,  437,  949,
 1616          117,  629,  373,  885,  245,  757,  501, 1013,   13,  525,  269,  781,
 1617          141,  653,  397,  909,   77,  589,  333,  845,  205,  717,  461,  973,
 1618           45,  557,  301,  813,  173,  685,  429,  941,  109,  621,  365,  877,
 1619          237,  749,  493, 1005,   29,  541,  285,  797,  157,  669,  413,  925,
 1620           93,  605,  349,  861,  221,  733,  477,  989,   61,  573,  317,  829,
 1621          189,  701,  445,  957,  125,  637,  381,  893,  253,  765,  509, 1021,
 1622            3,  515,  259,  771,  131,  643,  387,  899,   67,  579,  323,  835,
 1623          195,  707,  451,  963,   35,  547,  291,  803,  163,  675,  419,  931,
 1624           99,  611,  355,  867,  227,  739,  483,  995,   19,  531,  275,  787,
 1625          147,  659,  403,  915,   83,  595,  339,  851,  211,  723,  467,  979,
 1626           51,  563,  307,  819,  179,  691,  435,  947,  115,  627,  371,  883,
 1627          243,  755,  499, 1011,   11,  523,  267,  779,  139,  651,  395,  907,
 1628           75,  587,  331,  843,  203,  715,  459,  971,   43,  555,  299,  811,
 1629          171,  683,  427,  939,  107,  619,  363,  875,  235,  747,  491, 1003,
 1630           27,  539,  283,  795,  155,  667,  411,  923,   91,  603,  347,  859,
 1631          219,  731,  475,  987,   59,  571,  315,  827,  187,  699,  443,  955,
 1632          123,  635,  379,  891,  251,  763,  507, 1019,    7,  519,  263,  775,
 1633          135,  647,  391,  903,   71,  583,  327,  839,  199,  711,  455,  967,
 1634           39,  551,  295,  807,  167,  679,  423,  935,  103,  615,  359,  871,
 1635          231,  743,  487,  999,   23,  535,  279,  791,  151,  663,  407,  919,
 1636           87,  599,  343,  855,  215,  727,  471,  983,   55,  567,  311,  823,
 1637          183,  695,  439,  951,  119,  631,  375,  887,  247,  759,  503, 1015,
 1638           15,  527,  271,  783,  143,  655,  399,  911,   79,  591,  335,  847,
 1639          207,  719,  463,  975,   47,  559,  303,  815,  175,  687,  431,  943,
 1640          111,  623,  367,  879,  239,  751,  495, 1007,   31,  543,  287,  799,
 1641          159,  671,  415,  927,   95,  607,  351,  863,  223,  735,  479,  991,
 1642           63,  575,  319,  831,  191,  703,  447,  959,  127,  639,  383,  895,
 1643          255,  767,  511, 1023
 1644 };
 1645 
 1646 /*
 1647  * Compute the roots for NTT and inverse NTT (binary case). Input
 1648  * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
 1649  * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
 1650  *   gm[rev(i)] = g^i mod p
 1651  *   igm[rev(i)] = (1/g)^i mod p
 1652  * where rev() is the "bit reversal" function over 10 bits. It fills
 1653  * the arrays only up to N = 2^logn values.
 1654  *
 1655  * The values stored in gm[] and igm[] are in Montgomery representation.
 1656  *
 1657  * p must be a prime such that p = 1 mod 2048.
 1658  */
 1659 static void
 1660 modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
 1661         uint32_t g, uint32_t p, uint32_t p0i)
 1662 {
 1663         size_t u, n;
 1664         unsigned k;
 1665         uint32_t ig, x1, x2, R2;
 1666 
 1667         n = (size_t)1 << logn;
 1668 
 1669         /*
 1670          * We want g such that g^(2N) = 1 mod p, but the provided
 1671          * generator has order 2048. We must square it a few times.
 1672          */
 1673         R2 = modp_R2(p, p0i);
 1674         g = modp_montymul(g, R2, p, p0i);
 1675         for (k = logn; k < 10; k ++) {
 1676                 g = modp_montymul(g, g, p, p0i);
 1677         }
 1678 
 1679         ig = modp_div(R2, g, p, p0i, modp_R(p));
 1680         k = 10 - logn;
 1681         x1 = x2 = modp_R(p);
 1682         for (u = 0; u < n; u ++) {
 1683                 size_t v;
 1684 
 1685                 v = REV10[u << k];
 1686                 gm[v] = x1;
 1687                 igm[v] = x2;
 1688                 x1 = modp_montymul(x1, g, p, p0i);
 1689                 x2 = modp_montymul(x2, ig, p, p0i);
 1690         }
 1691 }
 1692 
 1693 /*
 1694  * Compute the NTT over a polynomial (binary case). Polynomial elements
 1695  * are a[0], a[stride], a[2 * stride]...
 1696  */
 1697 static void
 1698 modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
 1699         uint32_t p, uint32_t p0i)
 1700 {
 1701         size_t t, m, n;
 1702 
 1703         if (logn == 0) {
 1704                 return;
 1705         }
 1706         n = (size_t)1 << logn;
 1707         t = n;
 1708         for (m = 1; m < n; m <<= 1) {
 1709                 size_t ht, u, v1;
 1710 
 1711                 ht = t >> 1;
 1712                 for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
 1713                         uint32_t s;
 1714                         size_t v;
 1715                         uint32_t *r1, *r2;
 1716 
 1717                         s = gm[m + u];
 1718                         r1 = a + v1 * stride;
 1719                         r2 = r1 + ht * stride;
 1720                         for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
 1721                                 uint32_t x, y;
 1722 
 1723                                 x = *r1;
 1724                                 y = modp_montymul(*r2, s, p, p0i);
 1725                                 *r1 = modp_add(x, y, p);
 1726                                 *r2 = modp_sub(x, y, p);
 1727                         }
 1728                 }
 1729                 t = ht;
 1730         }
 1731 }
 1732 
 1733 /*
 1734  * Compute the inverse NTT over a polynomial (binary case).
 1735  */
 1736 static void
 1737 modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
 1738         uint32_t p, uint32_t p0i)
 1739 {
 1740         size_t t, m, n, k;
 1741         uint32_t ni;
 1742         uint32_t *r;
 1743 
 1744         if (logn == 0) {
 1745                 return;
 1746         }
 1747         n = (size_t)1 << logn;
 1748         t = 1;
 1749         for (m = n; m > 1; m >>= 1) {
 1750                 size_t hm, dt, u, v1;
 1751 
 1752                 hm = m >> 1;
 1753                 dt = t << 1;
 1754                 for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
 1755                         uint32_t s;
 1756                         size_t v;
 1757                         uint32_t *r1, *r2;
 1758 
 1759                         s = igm[hm + u];
 1760                         r1 = a + v1 * stride;
 1761                         r2 = r1 + t * stride;
 1762                         for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
 1763                                 uint32_t x, y;
 1764 
 1765                                 x = *r1;
 1766                                 y = *r2;
 1767                                 *r1 = modp_add(x, y, p);
 1768                                 *r2 = modp_montymul(
 1769                                         modp_sub(x, y, p), s, p, p0i);;
 1770                         }
 1771                 }
 1772                 t = dt;
 1773         }
 1774 
 1775         /*
 1776          * We need 1/n in Montgomery representation, i.e. R/n. Since
 1777          * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p,
 1778          * thus a simple shift will do.
 1779          * a modular reduction.
 1780          */
 1781         ni = (uint32_t)1 << (31 - logn);
 1782         for (k = 0, r = a; k < n; k ++, r += stride) {
 1783                 *r = modp_montymul(*r, ni, p, p0i);
 1784         }
 1785 }
 1786 
 1787 /*
 1788  * Simplified macros for NTT and iNTT (binary case) when the elements
 1789  * are consecutive in RAM.
 1790  */
 1791 #define modp_NTT2(a, gm, logn, p, p0i)   modp_NTT2_ext(a, 1, gm, logn, p, p0i)
 1792 #define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
 1793 
 1794 /*
 1795  * Compute the roots for NTT and inverse NTT (ternary case).
 1796  *
 1797  * Degree is 1.5*2^logn if 'full' is 1; otherwise, 'full' is 0 and the
 1798  * degree is 2^logn.
 1799  *
 1800  * Input parameter g is a primitive 2304-th root of 1 modulo p (i.e.
 1801  * g^1152 = -1 mod p).
 1802  *
 1803  * In the full case, for n = 768:
 1804  *
 1805  *  - Tables gm[] and igm[] have size 512 entries each.
 1806  *
 1807  *  - gm[256..511] contains the values g^k for k = 1 or 5 mod 6 and
 1808  *    k < 768.
 1809  *
 1810  *  - For 1 <= j < 256, gm[j] contains the square of gm[2*j].
 1811  *
 1812  *  - gm[0] is a copy of gm[1].
 1813  *
 1814  *  - For j >= 1, igm[j] is the inverse of gm[j].
 1815  *
 1816  *  - igm[0] is the inverse of 2*gm[1]-1.
 1817  *
 1818  * For smaller degrees, everything is scaled down accordingly, and g
 1819  * is repeatedly squared.
 1820  *
 1821  * In the partial case, tables gm[] and igm[] have size 2^logn, and
 1822  * are the prefixes of the table for the full case for logn+1.
 1823  */
 1824 static void
 1825 modp_mkgm3(uint32_t *restrict gm, uint32_t *restrict igm,
 1826         unsigned logn, unsigned full,
 1827         uint32_t g, uint32_t p, uint32_t p0i)
 1828 {
 1829         size_t u;
 1830         unsigned k;
 1831         uint32_t R, R2, w, ig;
 1832 
 1833         R = modp_R(p);
 1834         R2 = modp_R2(p, p0i);
 1835 
 1836         /*
 1837          * Square g as needed for the requested degree, and convert it
 1838          * to Montgomery representation. Also set ig = 1/g, also in
 1839          * Montgomery representation.
 1840          */
 1841         g = modp_montymul(g, R2, p, p0i);
 1842         k = logn;
 1843         if (!full) {
 1844                 g = modp_montymul(g, modp_montymul(g, g, p, p0i), p, p0i);
 1845                 k ++;
 1846         }
 1847         while (k ++ < 9) {
 1848                 g = modp_montymul(g, g, p, p0i);
 1849         }
 1850         ig = modp_div(R2, g, p, p0i, modp_R(p));
 1851 
 1852         /*
 1853          * Fill the last row, using bit-reversal order.
 1854          */
 1855         if (logn == 1) {
 1856                 gm[1] = g;
 1857                 igm[1] = ig;
 1858         } else {
 1859                 uint32_t x, ix, g2, g4, ig2, ig4;
 1860                 size_t b;
 1861 
 1862                 x = g;
 1863                 ix = ig;
 1864                 g2 = modp_montymul(g, g, p, p0i);
 1865                 g4 = modp_montymul(g2, g2, p, p0i);
 1866                 ig2 = modp_montymul(ig, ig, p, p0i);
 1867                 ig4 = modp_montymul(ig2, ig2, p, p0i);
 1868                 k = 11 - logn;
 1869                 b = (size_t)1 << (logn - 1);
 1870                 for (u = 0; u < b; u += 2) {
 1871                         gm[b + REV10[u << k]] = x;
 1872                         igm[b + REV10[u << k]] = ix;
 1873                         x = modp_montymul(x, g4, p, p0i);
 1874                         ix = modp_montymul(ix, ig4, p, p0i);
 1875                         gm[b + REV10[(u + 1) << k]] = x;
 1876                         igm[b + REV10[(u + 1) << k]] = ix;
 1877                         x = modp_montymul(x, g2, p, p0i);
 1878                         ix = modp_montymul(ix, ig2, p, p0i);
 1879                 }
 1880         }
 1881 
 1882         /*
 1883          * If the table is full, then the next-to-last row contains the
 1884          * cubes of the last row.
 1885          */
 1886         k = logn - 1;
 1887         if (full) {
 1888                 k --;
 1889                 for (u = (size_t)1 << k; u < ((size_t)1 << (k + 1)); u ++) {
 1890                         uint32_t y, z;
 1891 
 1892                         y = gm[u << 1];
 1893                         z = igm[u << 1];
 1894                         y = modp_montymul(y,
 1895                                 modp_montymul(y, y, p, p0i), p, p0i);
 1896                         z = modp_montymul(z,
 1897                                 modp_montymul(z, z, p, p0i), p, p0i);
 1898                         gm[u] = y;
 1899                         igm[u] = z;
 1900                 }
 1901         }
 1902 
 1903         /*
 1904          * Fill other rows, with squares of the next one.
 1905          */
 1906         for (u = ((size_t)1 << k) - 1; u > 0; u --) {
 1907                 size_t v;
 1908 
 1909                 v = u << 1;
 1910                 gm[u] = modp_montymul(gm[v], gm[v], p, p0i);
 1911                 igm[u] = modp_montymul(igm[v], igm[v], p, p0i);
 1912         }
 1913 
 1914         /*
 1915          * Comute top elements.
 1916          */
 1917         gm[0] = gm[1];
 1918         w = gm[1];
 1919         igm[0] = modp_div(R2, modp_sub(modp_add(w, w, p), R, p), p, p0i, R);
 1920 }
 1921 
 1922 /*
 1923  * Compute the NTT over a polynomial (ternary case). If full is 1, then
 1924  * the degree 1.5*2^logn; otherwise, the degree is 2^logn.
 1925  */
 1926 static void
 1927 modp_NTT3_ext(uint32_t *a, size_t stride, const uint32_t *gm,
 1928         unsigned logn, unsigned full, uint32_t p, uint32_t p0i)
 1929 {
 1930         size_t n, hn, u, r, m, t;
 1931         uint32_t w;
 1932         uint32_t *r1, *r2;
 1933 
 1934         if (logn == 0) {
 1935                 return;
 1936         }
 1937         n = MKN(logn, full);
 1938         hn = n >> 1;
 1939 
 1940         /*
 1941          * First pass: NTT over individual degree-1 polynomials modulo
 1942          * X^2-X+1. Roots are w (such that w != -1, and w^3 = -1) and w^5.
 1943          * Note that w^2 - w + 1 = 0, thus:
 1944          *
 1945          *  a(w) = a0 + a1*w
 1946          *  a(w^5) = a(-w^2)
 1947          *         = a0 + a1*(-w + 1)
 1948          *         = a0 + a1 - a1*w
 1949          */
 1950         w = gm[1];
 1951         for (u = 0, r1 = a, r2 = a + hn * stride;
 1952                 u < hn; u ++, r1 += stride, r2 += stride)
 1953         {
 1954                 uint32_t a0, a1, b;
 1955 
 1956                 a0 = *r1;
 1957                 a1 = *r2;
 1958                 b = modp_montymul(a1, w, p, p0i);
 1959                 *r1 = modp_add(a0, b, p);
 1960                 *r2 = modp_sub(modp_add(a0, a1, p), b, p);
 1961         }
 1962 
 1963         /*
 1964          * Intermediate passes, for doubling the degree repeatedly.
 1965          */
 1966         t = hn;
 1967         for (m = 2; t > (1 + (full << 1)); m <<= 1) {
 1968                 size_t ht, u1, v1;
 1969 
 1970                 ht = t >> 1;
 1971                 for (u1 = 0, v1 = 0; u1 < m; u1 ++, v1 += t) {
 1972                         size_t v;
 1973                         uint32_t s;
 1974 
 1975                         s = gm[m + u1];
 1976                         r1 = a + v1 * stride;
 1977                         r2 = r1 + ht * stride;
 1978                         for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
 1979                                 uint32_t x, y;
 1980 
 1981                                 x = *r1;
 1982                                 y = *r2;
 1983                                 y = modp_montymul(y, s, p, p0i);
 1984                                 *r1 = modp_add(x, y, p);
 1985                                 *r2 = modp_sub(x, y, p);
 1986                         }
 1987                 }
 1988                 t = ht;
 1989         }
 1990 
 1991         /*
 1992          * Final pass, to triple the degree.
 1993          */
 1994         if (full) {
 1995                 w = modp_montymul(gm[1], gm[1], p, p0i);
 1996                 for (u = 0, r = (size_t)1 << (logn - 1), r1 = a;
 1997                         u < n; u += 3, r ++, r1 += 3 * stride)
 1998                 {
 1999                         uint32_t fA, fB, fC, fB0, fB1, fB2, fC0, fC1, fC2;
 2000                         uint32_t x, x2;
 2001 
 2002                         x = gm[r];
 2003                         x2 = modp_montymul(x, x, p, p0i);
 2004                         fA = *r1;
 2005                         fB = *(r1 + stride);
 2006                         fC = *(r1 + 2 * stride);
 2007                         fB0 = modp_montymul(fB, x, p, p0i);
 2008                         fB1 = modp_montymul(fB0, w, p, p0i);
 2009                         fB2 = modp_montymul(fB1, w, p, p0i);
 2010                         fC0 = modp_montymul(fC, x2, p, p0i);
 2011                         fC1 = modp_montymul(fC0, w, p, p0i);
 2012                         fC2 = modp_montymul(fC1, w, p, p0i);
 2013                         *r1 = modp_add(fA,
 2014                                 modp_add(fB0, fC0, p), p);
 2015                         *(r1 + stride) = modp_add(fA,
 2016                                 modp_add(fB1, fC2, p), p);
 2017                         *(r1 + 2 * stride) = modp_add(fA,
 2018                                 modp_add(fB2, fC1, p), p);
 2019                 }
 2020         }
 2021 }
 2022 
 2023 /*
 2024  * Compute the inverse NTT over a polynomial (ternary case). If full is
 2025  * 1, then the degree 1.5*2^logn; otherwise, the degree is 2^logn.
 2026  */
 2027 static void
 2028 modp_iNTT3_ext(uint32_t *a, size_t stride, const uint32_t *igm,
 2029         unsigned logn, unsigned full, uint32_t p, uint32_t p0i)
 2030 {
 2031         size_t n, hn, u, r, m, t;
 2032         uint32_t w, ni, R, *r1, *r2;
 2033 
 2034         if (logn == 0) {
 2035                 return;
 2036         }
 2037         n = MKN(logn, full);
 2038         hn = n >> 1;
 2039 
 2040         /*
 2041          * Steps here correspond to the steps in modp_NTT3_ext(), in
 2042          * reverse order. However, computations leave a cumulative
 2043          * multiplicative factor, that must be cancelled out at the end.
 2044          */
 2045 
 2046         /*
 2047          * Divide degree by 3.
 2048          */
 2049         if (full) {
 2050                 w = modp_montymul(igm[1], igm[1], p, p0i);
 2051                 for (u = 0, r = (size_t)1 << (logn - 1), r1 = a;
 2052                         u < n; u += 3, r ++, r1 += 3 * stride)
 2053                 {
 2054                         uint32_t f0, f1, f2, f11, f12, f21, f22;
 2055                         uint32_t x, x2;
 2056 
 2057                         x = igm[r];
 2058                         x2 = modp_montymul(x, x, p, p0i);
 2059                         f0 = *r1;
 2060                         f1 = *(r1 + stride);
 2061                         f2 = *(r1 + 2 * stride);
 2062                         f11 = modp_montymul(f1, w, p, p0i);
 2063                         f12 = modp_montymul(f11, w, p, p0i);
 2064                         f21 = modp_montymul(f2, w, p, p0i);
 2065                         f22 = modp_montymul(f21, w, p, p0i);
 2066                         *r1 = modp_add(f0, modp_add(f1, f2, p), p);
 2067                         *(r1 + stride) = modp_montymul(x,
 2068                                 modp_add(f0, modp_add(f11, f22, p), p), p, p0i);
 2069                         *(r1 + 2 * stride) = modp_montymul(x2,
 2070                                 modp_add(f0, modp_add(f12, f21, p), p), p, p0i);
 2071                 }
 2072         }
 2073 
 2074         /*
 2075          * Intermediate steps. The 't' and 'm' values have the same
 2076          * semantics as in NTT3, except that they are processed in
 2077          * reverse order. Note the invariant: t*m = n.
 2078          */
 2079         t = 2 + (full << 2);
 2080         for (m = (size_t)1 << (logn - 1 - full); t < n; m >>= 1) {
 2081                 size_t ht, u1, v1;
 2082 
 2083                 ht = t >> 1;
 2084                 for (u1 = 0, v1 = 0; u1 < m; u1 ++, v1 += t) {
 2085                         size_t v;
 2086                         uint32_t s;
 2087 
 2088                         s = igm[m + u1];
 2089                         r1 = a + v1 * stride;
 2090                         r2 = r1 + ht * stride;
 2091                         for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
 2092                                 uint32_t x, y;
 2093 
 2094                                 x = *r1;
 2095                                 y = *r2;
 2096                                 *r1 = modp_add(x, y, p);
 2097                                 *r2 = modp_montymul(
 2098                                         modp_sub(x, y, p), s, p, p0i);
 2099                         }
 2100                 }
 2101                 t <<= 1;
 2102         }
 2103 
 2104         /*
 2105          * Final step: inverse NTT for polynomials modulo X^2-X+1.
 2106          */
 2107         w = igm[0];
 2108         for (u = 0, r1 = a, r2 = a + hn * stride;
 2109                 u < hn; u ++, r1 += stride, r2 += stride)
 2110         {
 2111                 uint32_t a0, a1, b, c;
 2112 
 2113                 a0 = *r1;
 2114                 a1 = *r2;
 2115                 b = modp_add(a0, a1, p);
 2116                 c = modp_montymul(w, modp_sub(a0, a1, p), p, p0i);
 2117                 *r1 = modp_sub(b, c, p);
 2118                 *r2 = modp_add(c, c, p);
 2119         }
 2120 
 2121         /*
 2122          * Corrective factor: all values must be divided by n.
 2123          */
 2124         R = modp_R(p);
 2125         ni = modp_div(R, (uint32_t)n, p, p0i, R);
 2126         for (u = 0, r1 = a; u < n; u ++, r1 += stride) {
 2127                 *r1 = modp_montymul(*r1, ni, p, p0i);
 2128         }
 2129 }
 2130 
 2131 /*
 2132  * Simplified macros for NTT and iNTT (ternary case) when the elements
 2133  * are consecutive in RAM.
 2134  */
 2135 #define modp_NTT3(a, gm, logn, full, p, p0i) \
 2136         modp_NTT3_ext(a, 1, gm, logn, full, p, p0i)
 2137 #define modp_iNTT3(a, igm, logn, full, p, p0i) \
 2138         modp_iNTT3_ext(a, 1, igm, logn, full, p, p0i)
 2139 
 2140 /*
 2141  * Given polynomial f in NTT representation modulo p, compute f' of degree
 2142  * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
 2143  * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
 2144  *
 2145  * The new polynomial is written "in place" over the first N/2 elements
 2146  * of f.
 2147  *
 2148  * If applied logn times successively on a given polynomial, the resulting
 2149  * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
 2150  *
 2151  * This function applies only to the binary case; it is invoked from
 2152  * solve_NTRU_binary_depth1().
 2153  */
 2154 static void
 2155 modp_poly_rec_res(uint32_t *f, unsigned logn,
 2156         uint32_t p, uint32_t p0i, uint32_t R2)
 2157 {
 2158         size_t hn, u;
 2159 
 2160         hn = (size_t)1 << (logn - 1);
 2161         for (u = 0; u < hn; u ++) {
 2162                 uint32_t w0, w1;
 2163 
 2164                 w0 = f[(u << 1) + 0];
 2165                 w1 = f[(u << 1) + 1];
 2166                 f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 2167         }
 2168 }
 2169 
 2170 /* ==================================================================== */
 2171 /*
 2172  * Custom bignum implementation.
 2173  *
 2174  * This is a very reduced set of functionalities. We need to do the
 2175  * following operations:
 2176  *
 2177  *  - Rebuild the resultant and the polynomial coefficients from their
 2178  *    values modulo small primes (of length 31 bits each).
 2179  *
 2180  *  - Compute an extended GCD between the two computed resultants.
 2181  *
 2182  *  - Extract top bits and add scaled values during the successive steps
 2183  *    of Babai rounding.
 2184  *
 2185  * When rebuilding values using CRT, we must also recompute the product
 2186  * of the small prime factors. We always do it one small factor at a
 2187  * time, so the "complicated" operations can be done modulo the small
 2188  * prime with the modp_* functions. CRT coefficients (inverses) can be
 2189  * precomputed.
 2190  *
 2191  * All values are positive until the last step: when the polynomial
 2192  * coefficients have been rebuilt, we normalize them around 0. But then,
 2193  * only additions and subtractions on the upper few bits are needed
 2194  * afterwards.
 2195  *
 2196  * We keep big integers as arrays of 31-bit words (in uint32_t values);
 2197  * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
 2198  * makes it easier to track carries. When negative values are used,
 2199  * two's complement is used.
 2200  */
 2201 
 2202 /*
 2203  * Add integer b to integer a. Both integers are supposed to have the
 2204  * same size, and the result should fit in that size as well. The carry
 2205  * is returned. Source arrays a and b MUST be distinct.
 2206  */
 2207 static uint32_t
 2208 zint_add(uint32_t *restrict a, const uint32_t *restrict b, size_t len)
 2209 {
 2210         size_t u;
 2211         uint32_t cc;
 2212 
 2213         cc = 0;
 2214         for (u = 0; u < len; u ++) {
 2215                 uint32_t w;
 2216 
 2217                 w = a[u] + b[u] + cc;
 2218                 a[u] = w & 0x7FFFFFFF;
 2219                 cc = w >> 31;
 2220         }
 2221         return cc;
 2222 }
 2223 
 2224 /*
 2225  * Subtract integer b from integer a. Both integers are supposed to have
 2226  * the same size. The carry (0 or 1) is returned. Source arrays a and b
 2227  * MUST be distinct.
 2228  */
 2229 static uint32_t
 2230 zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len)
 2231 {
 2232         size_t u;
 2233         uint32_t cc;
 2234 
 2235         cc = 0;
 2236         for (u = 0; u < len; u ++) {
 2237                 uint32_t w;
 2238 
 2239                 w = a[u] - b[u] - cc;
 2240                 a[u] = w & 0x7FFFFFFF;
 2241                 cc = w >> 31;
 2242         }
 2243         return cc;
 2244 }
 2245 
 2246 /*
 2247  * Mutiply the provided big integer m with a small value x.
 2248  * This function assumes that x < 2^31. The carry word is returned.
 2249  */
 2250 static uint32_t
 2251 zint_mul_small(uint32_t *m, size_t mlen, uint32_t x)
 2252 {
 2253         size_t u;
 2254         uint32_t cc;
 2255 
 2256         cc = 0;
 2257         for (u = 0; u < mlen; u ++) {
 2258                 uint64_t z;
 2259 
 2260                 z = (uint64_t)m[u] * (uint64_t)x + cc;
 2261                 m[u] = (uint32_t)z & 0x7FFFFFFF;
 2262                 cc = (uint32_t)(z >> 31);
 2263         }
 2264         return cc;
 2265 }
 2266 
 2267 /*
 2268  * Reduce a big integer d modulo a small integer p.
 2269  * Rules:
 2270  *  d is unsigned
 2271  *  p is prime
 2272  *  2^30 < p < 2^31
 2273  *  p0i = -(1/p) mod 2^31
 2274  *  R2 = 2^62 mod p
 2275  */
 2276 static uint32_t
 2277 zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
 2278         uint32_t p, uint32_t p0i, uint32_t R2)
 2279 {
 2280         uint32_t x;
 2281         size_t u;
 2282 
 2283         /*
 2284          * Algorithm: we inject words one by one, starting with the high
 2285          * word. Each step is:
 2286          *  - multiply x by 2^31
 2287          *  - add new word
 2288          */
 2289         x = 0;
 2290         u = dlen;
 2291         while (u -- > 0) {
 2292                 uint32_t w;
 2293 
 2294                 x = modp_montymul(x, R2, p, p0i);
 2295                 w = d[u] - p;
 2296                 w += p & -(w >> 31);
 2297                 x = modp_add(x, w, p);
 2298         }
 2299         return x;
 2300 }
 2301 
 2302 /*
 2303  * Similar to zint_mod_small_unsigned(), except that d may be signed.
 2304  * Extra parameter is Rx = 2^(31*dlen) mod p.
 2305  */
 2306 static uint32_t
 2307 zint_mod_small_signed(const uint32_t *d, size_t dlen,
 2308         uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx)
 2309 {
 2310         uint32_t z;
 2311 
 2312         if (dlen == 0) {
 2313                 return 0;
 2314         }
 2315         z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
 2316         z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
 2317         return z;
 2318 }
 2319 
 2320 /*
 2321  * Add y*s to x. x and y initially have length 'len' words; the new x
 2322  * has length 'len+1' words. 's' must fit on 31 bits.
 2323  */
 2324 static void
 2325 zint_add_mul_small(uint32_t *restrict x,
 2326         const uint32_t *restrict y, size_t len, uint32_t s)
 2327 {
 2328         size_t u;
 2329         uint32_t cc;
 2330 
 2331         cc = 0;
 2332         for (u = 0; u < len; u ++) {
 2333                 uint32_t xw, yw;
 2334                 uint64_t z;
 2335 
 2336                 xw = x[u];
 2337                 yw = y[u];
 2338                 z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
 2339                 x[u] = (uint32_t)z & 0x7FFFFFFF;
 2340                 cc = (uint32_t)(z >> 31);
 2341         }
 2342         x[len] = cc;
 2343 }
 2344 
 2345 /*
 2346  * Right-shift an _unsigned_ integer by one bit. The least significant
 2347  * bit is returned. The new integer length is returned.
 2348  */
 2349 static uint32_t
 2350 zint_rshift1(uint32_t *d, size_t len)
 2351 {
 2352         uint32_t cc;
 2353         size_t k;
 2354 
 2355         cc = 0;
 2356         k = len;
 2357         while (k -- > 0) {
 2358                 uint32_t w;
 2359 
 2360                 w = d[k];
 2361                 d[k] = (w >> 1) | (cc << 30);
 2362                 cc = w & 1;
 2363         }
 2364         return cc;
 2365 }
 2366 
 2367 /*
 2368  * Halve integer x modulo integer p. The modulus p MUST be odd.
 2369  */
 2370 static void
 2371 zint_rshift1_mod(uint32_t *restrict x, const uint32_t *restrict p, size_t len)
 2372 {
 2373         uint32_t hi;
 2374 
 2375         if ((x[0] & 1) != 0) {
 2376                 hi = zint_add(x, p, len);
 2377         } else {
 2378                 hi = 0;
 2379         }
 2380         zint_rshift1(x, len);
 2381         x[len - 1] |= hi << 30;
 2382 }
 2383 
 2384 /*
 2385  * Subtract y from x, modulo p.
 2386  */
 2387 static void
 2388 zint_sub_mod(uint32_t *restrict x, const uint32_t *restrict y,
 2389         const uint32_t *restrict p, size_t len)
 2390 {
 2391         if (zint_sub(x, y, len)) {
 2392                 zint_add(x, p, len);
 2393         }
 2394 }
 2395 
 2396 /*
 2397  * Compare a with b. Both integers are unsigned and have the same encoded
 2398  * length.
 2399  */
 2400 static int
 2401 zint_ucmp(const uint32_t *a, const uint32_t *b, size_t len)
 2402 {
 2403         while (len -- > 0) {
 2404                 uint32_t wa, wb;
 2405 
 2406                 wa = a[len];
 2407                 wb = b[len];
 2408                 if (wa < wb) {
 2409                         return -1;
 2410                 }
 2411                 if (wa > wb) {
 2412                         return 1;
 2413                 }
 2414         }
 2415         return 0;
 2416 }
 2417 
 2418 /*
 2419  * Normalize a modular integer around 0: if x > p/2, then x is replaced
 2420  * with x - p (signed encoding with two's complement); otherwise, x is
 2421  * untouched. The two integers x and p are encoded over the same length.
 2422  */
 2423 static void
 2424 zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len)
 2425 {
 2426         uint32_t cc;
 2427         size_t u;
 2428 
 2429         cc = 0;
 2430         u = len;
 2431         while (u -- > 0) {
 2432                 uint32_t w;
 2433 
 2434                 w = (p[u] >> 1) | (cc << 30);
 2435                 cc = p[u] & 1;
 2436                 if (x[u] < w) {
 2437                         return;
 2438                 }
 2439                 if (x[u] > w) {
 2440                         zint_sub(x, p, len);
 2441                         return;
 2442                 }
 2443         }
 2444 }
 2445 
 2446 /*
 2447  * Rebuild integers from their RNS representation. There are 'num'
 2448  * integers, and each consists in 'xlen' words. 'xx' points at that
 2449  * first word of the first integer; subsequent integers are accessed
 2450  * by adding 'xstride' repeatedly.
 2451  *
 2452  * The words of an integer are the RNS representation of that integer,
 2453  * using the provided 'primes' are moduli. This function replaces
 2454  * each integer with its multi-word value (little-endian order).
 2455  *
 2456  * If "normalize_signed" is non-zero, then the returned value is
 2457  * normalized to the -m/2..m/2 interval (where m is the product of all
 2458  * small prime moduli); two's complement is used for negative values.
 2459  */
 2460 static void
 2461 zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
 2462         size_t num, const small_prime *primes, int normalize_signed,
 2463         uint32_t *restrict tmp)
 2464 {
 2465         size_t u;
 2466         uint32_t *x;
 2467 
 2468         tmp[0] = primes[0].p;
 2469         for (u = 1; u < xlen; u ++) {
 2470                 /*
 2471                  * At the entry of each loop iteration:
 2472                  *  - the first u words of each array have been
 2473                  *    reassembled;
 2474                  *  - the first u words of tmp[] contains the
 2475                  * product of the prime moduli processed so far.
 2476                  *
 2477                  * We call 'q' the product of all previous primes.
 2478                  */
 2479                 uint32_t p, p0i, s, R2;
 2480                 size_t v;
 2481 
 2482                 p = primes[u].p;
 2483                 s = primes[u].s;
 2484                 p0i = modp_ninv31(p);
 2485                 R2 = modp_R2(p, p0i);
 2486 
 2487                 for (v = 0, x = xx; v < num; v ++, x += xstride) {
 2488                         uint32_t xp, xq, xr;
 2489                         /*
 2490                          * xp = the integer x modulo the prime p for this
 2491                          *      iteration
 2492                          * xq = (x mod q) mod p
 2493                          */
 2494                         xp = x[u];
 2495                         xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
 2496 
 2497                         /*
 2498                          * New value is (x mod q) + q * (s * (xp - xq) mod p)
 2499                          */
 2500                         xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
 2501                         zint_add_mul_small(x, tmp, u, xr);
 2502                 }
 2503 
 2504                 /*
 2505                  * Update product of primes in tmp[].
 2506                  */
 2507                 tmp[u] = zint_mul_small(tmp, u, p);
 2508         }
 2509 
 2510         /*
 2511          * Normalize the reconstructed values around 0.
 2512          */
 2513         if (normalize_signed) {
 2514                 for (u = 0, x = xx; u < num; u ++, x += xstride) {
 2515                         zint_norm_zero(x, tmp, xlen);
 2516                 }
 2517         }
 2518 }
 2519 
 2520 /*
 2521  * Compute exact length of an integer (i.e. reduce it to remove high
 2522  * words of value 0).
 2523  */
 2524 static size_t
 2525 zint_exact_length(const uint32_t *x, size_t xlen)
 2526 {
 2527         while (xlen > 0) {
 2528                 if (x[xlen - 1] != 0) {
 2529                         return xlen;
 2530                 }
 2531                 xlen --;
 2532         }
 2533         return xlen;
 2534 }
 2535 
 2536 /*
 2537  * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
 2538  * The low bits are dropped (the caller should compute the coefficients
 2539  * such that these dropped bits are all zeros). If either or both
 2540  * yields a negative value, then the value is negated.
 2541  *
 2542  * Returned value is:
 2543  *  0  both values were positive
 2544  *  1  new a had to be negated
 2545  *  2  new b had to be negated
 2546  *  3  both new a and new b had to be negated
 2547  *
 2548  * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
 2549  */
 2550 static int
 2551 zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
 2552         int32_t xa, int32_t xb, int32_t ya, int32_t yb)
 2553 {
 2554         size_t u;
 2555         int32_t cca, ccb;
 2556         int r;
 2557 
 2558         cca = 0;
 2559         ccb = 0;
 2560         for (u = 0; u < len; u ++) {
 2561                 int32_t wa, wb;
 2562                 int64_t za, zb;
 2563                 uint32_t tta, ttb;
 2564 
 2565                 wa = (int32_t)a[u];
 2566                 wb = (int32_t)b[u];
 2567                 za = (int64_t)wa * xa + (int64_t)wb * xb + cca;
 2568                 zb = (int64_t)wa * ya + (int64_t)wb * yb + ccb;
 2569                 if (u > 0) {
 2570                         a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
 2571                         b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
 2572                 }
 2573                 tta = (uint32_t)((uint64_t)za >> 31);
 2574                 ttb = (uint32_t)((uint64_t)zb >> 31);
 2575                 cca = *(int32_t *)&tta;
 2576                 ccb = *(int32_t *)&ttb;
 2577         }
 2578         a[len - 1] = (uint32_t)cca;
 2579         b[len - 1] = (uint32_t)ccb;
 2580         r = 0;
 2581         if (cca < 0) {
 2582                 uint32_t c;
 2583 
 2584                 c = 1;
 2585                 for (u = 0; u < len; u ++) {
 2586                         uint32_t w;
 2587 
 2588                         w = c + ~a[u];
 2589                         a[u] = w & 0x7FFFFFFF;
 2590                         c = (~w) >> 31;
 2591                 }
 2592                 r |= 1;
 2593         }
 2594         if (ccb < 0) {
 2595                 uint32_t c;
 2596 
 2597                 c = 1;
 2598                 for (u = 0; u < len; u ++) {
 2599                         uint32_t w;
 2600 
 2601                         w = c + ~b[u];
 2602                         b[u] = w & 0x7FFFFFFF;
 2603                         c = (~w) >> 31;
 2604                 }
 2605                 r |= 2;
 2606         }
 2607 
 2608         return r;
 2609 }
 2610 
 2611 /*
 2612  * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
 2613  * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
 2614  */
 2615 static void
 2616 zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
 2617         uint32_t m0i, int32_t xa, int32_t xb, int32_t ya, int32_t yb)
 2618 {
 2619         size_t u;
 2620         uint32_t fx, fy;
 2621         int64_t cca, ccb;
 2622 
 2623         /*
 2624          * These are actually four combined Montgomery multiplications.
 2625          */
 2626         fx = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
 2627         fy = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
 2628         cca = 0;
 2629         ccb = 0;
 2630         for (u = 0; u < len; u ++) {
 2631                 uint32_t wa, wb;
 2632                 int64_t za, zb;
 2633                 uint64_t tta, ttb;
 2634 
 2635                 wa = a[u];
 2636                 wb = b[u];
 2637                 za = (int64_t)wa * (int64_t)xa + (int64_t)wb * (int64_t)xb;
 2638                 zb = (int64_t)wa * (int64_t)ya + (int64_t)wb * (int64_t)yb;
 2639                 za += cca;
 2640                 zb += ccb;
 2641                 za += (uint64_t)m[u] * (uint64_t)fx;
 2642                 zb += (uint64_t)m[u] * (uint64_t)fy;
 2643                 if (u > 0) {
 2644                         a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
 2645                         b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
 2646                 }
 2647 
 2648                 /*
 2649                  * Weird code below is a sign-extension. We want to
 2650                  * perform an arithmetic shift, but the C standard does
 2651                  * not guarantee that right-shifting signed negative
 2652                  * values performs an arithmetic shift (it's
 2653                  * "implementation-defined").
 2654                  */
 2655 #define M   ((uint64_t)1 << 32)
 2656                 tta = (uint64_t)za >> 31;
 2657                 ttb = (uint64_t)zb >> 31;
 2658                 tta = (tta ^ M) - M;
 2659                 ttb = (ttb ^ M) - M;
 2660                 cca = *(int64_t *)&tta;
 2661                 ccb = *(int64_t *)&ttb;
 2662 #undef M
 2663         }
 2664         a[len - 1] = (uint32_t)cca & 0x7FFFFFFF;
 2665         b[len - 1] = (uint32_t)ccb & 0x7FFFFFFF;
 2666 
 2667         /*
 2668          * For each value a and b:
 2669          *  - if negative, add modulus
 2670          *  - if positive and not lower than modulus, subtract modulus
 2671          */
 2672         if (cca < 0) {
 2673                 zint_add(a, m, len);
 2674         } else {
 2675                 if (zint_ucmp(a, m, len) >= 0) {
 2676                         zint_sub(a, m, len);
 2677                 }
 2678         }
 2679         if (ccb < 0) {
 2680                 zint_add(b, m, len);
 2681         } else {
 2682                 if (zint_ucmp(b, m, len) >= 0) {
 2683                         zint_sub(b, m, len);
 2684                 }
 2685         }
 2686 }
 2687 
 2688 /*
 2689  * Replace a with (a+k*b)/(2^31). If the result it negative, then it is
 2690  * negated and 1 is returned; otherwise, 0 is returned.
 2691  */
 2692 static int
 2693 zint_reduce(uint32_t *a, const uint32_t *b, size_t len, int32_t k)
 2694 {
 2695         size_t u;
 2696         int32_t cc;
 2697 
 2698         cc = 0;
 2699         for (u = 0; u < len; u ++) {
 2700                 int32_t wa, wb;
 2701                 int64_t z;
 2702                 uint32_t tt;
 2703 
 2704                 wa = (int32_t)a[u];
 2705                 wb = (int32_t)b[u];
 2706                 z = (int64_t)wb * k + (int64_t)wa + cc;
 2707                 if (u > 0) {
 2708                         a[u - 1] = (uint32_t)z & 0x7FFFFFFF;
 2709                 }
 2710                 tt = (uint32_t)((uint64_t)z >> 31);
 2711                 cc = *(int32_t *)&tt;
 2712         }
 2713         a[len - 1] = (uint32_t)cc;
 2714         if (cc < 0) {
 2715                 uint32_t c;
 2716 
 2717                 c = 1;
 2718                 for (u = 0; u < len; u ++) {
 2719                         uint32_t w;
 2720 
 2721                         w = c + ~a[u];
 2722                         a[u] = w & 0x7FFFFFFF;
 2723                         c = (~w) >> 31;
 2724                 }
 2725                 return 1;
 2726         } else {
 2727                 return 0;
 2728         }
 2729 }
 2730 
 2731 /*
 2732  * Replace a with (a+k*b)/(2^31) mod m.
 2733  * Modulus m must be odd; m0i = -1/m[0] mod 2^31.
 2734  */
 2735 static void
 2736 zint_reduce_mod(uint32_t *a, const uint32_t *b, const uint32_t *m,
 2737         size_t len, uint32_t m0i, int32_t k)
 2738 {
 2739         size_t u;
 2740         uint32_t f;
 2741         int32_t cc;
 2742 
 2743         f = ((a[0] + b[0] * (uint32_t)k) * m0i) & 0x7FFFFFFF;
 2744         cc = 0;
 2745         for (u = 0; u < len; u ++) {
 2746                 uint32_t wa, wb;
 2747                 int64_t z;
 2748                 uint32_t tt;
 2749 
 2750                 wa = a[u];
 2751                 wb = b[u];
 2752                 z = (int64_t)wa + (int64_t)wb * (int64_t)k + cc;
 2753                 z += (uint64_t)m[u] * (uint64_t)f;
 2754                 if (u > 0) {
 2755                         a[u - 1] = (uint32_t)z & 0x7FFFFFFF;
 2756                 }
 2757                 tt = (uint32_t)((uint64_t)z >> 31);
 2758                 cc = *(int32_t *)&tt;
 2759         }
 2760         a[len - 1] = (uint32_t)cc & 0x7FFFFFFF;
 2761 
 2762         /*
 2763          * - if negative, add modulus
 2764          * - if positive and not lower than modulus, subtract modulus
 2765          */
 2766         if (cc < 0) {
 2767                 zint_add(a, m, len);
 2768         } else {
 2769                 if (zint_ucmp(a, m, len) >= 0) {
 2770                         zint_sub(a, m, len);
 2771                 }
 2772         }
 2773 }
 2774 
 2775 /*
 2776  * Compute a GCD between two positive big integers x and y. The two
 2777  * integers must be odd. Returned value is 1 if the GCD is 1, 0
 2778  * otherwise. When 1 is returned, arrays u and v are filled with values
 2779  * such that:
 2780  *   0 <= u <= y
 2781  *   0 <= v <= x
 2782  *   x*u - y*v = 1
 2783  * x[] and y[] are unmodified. Both input values must have the same
 2784  * encoded length. Temporary array must be large enough to accommodate 4
 2785  * extra values of that length. Arrays u, v and tmp may not overlap with
 2786  * each other, or with either x or y.
 2787  */
 2788 static int
 2789 zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
 2790         const uint32_t *restrict x, const uint32_t *restrict y,
 2791         size_t len, uint32_t *restrict tmp)
 2792 {
 2793         uint32_t *u0, *u1, *v0, *v1, *a, *b;
 2794         size_t xlen, ylen, alen, blen, mlen;
 2795         uint32_t x0i, y0i;
 2796 
 2797         /*
 2798          * Algorithm is an extended binary GCD. We maintain 6 values
 2799          * a, b, u0, u1, v0 and v1 with the following invariants:
 2800          *
 2801          *  a = x*u0 - y*v0
 2802          *  b = x*u1 - y*v1
 2803          *  0 <= u0 < y
 2804          *  0 <= v0 < x
 2805          *  0 <= u1 <= y
 2806          *  0 <= v1 <= x
 2807          *
 2808          * Initial values are:
 2809          *
 2810          *  a = x   u0 = 1   v0 = 0
 2811          *  b = y   u1 = y   v1 = x-1
 2812          *
 2813          * Each iteration reduces either a or b, and maintain the
 2814          * invariants. Algorithm stops when a = b, at which point their
 2815          * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
 2816          * the values (u,v) we want to return.
 2817          *
 2818          * We must handle specially the cases of x = 1 or y = 1, which
 2819          * make the solution trivial. If x > 1 and y > 1, and GCD(x,y) = 1,
 2820          * then there will be a solution (u,v) such that 0 < u < y and
 2821          * 0 < v < x (it can be shown that u = 1/x mod y and v = -1/y mod x).
 2822          */
 2823 
 2824         u0 = u;
 2825         v0 = v;
 2826         u1 = tmp;
 2827         v1 = u1 + len;
 2828         a = v1 + len;
 2829         b = a + len;
 2830 
 2831         /*
 2832          * Compute actual lengths of x and y.
 2833          */
 2834         xlen = zint_exact_length(x, len);
 2835         ylen = zint_exact_length(y, len);
 2836 
 2837         /*
 2838          * Filter out bad values:
 2839          *   x and y must not be zero.
 2840          *   x and y must be odd.
 2841          */
 2842         if (xlen == 0 || ylen == 0 || (x[0] & y[0] & 1) == 0) {
 2843                 return 0;
 2844         }
 2845 
 2846         /*
 2847          * Initialize a, b, u0, u1, v0 and v1.
 2848          *  a = x   u0 = 1   v0 = 0
 2849          *  b = y   u1 = y   v1 = x-1
 2850          * Note that x is odd, so computing x-1 is easy.
 2851          */
 2852         memcpy(a, x, xlen * sizeof *x);
 2853         memcpy(b, y, ylen * sizeof *y);
 2854         alen = xlen;
 2855         blen = ylen;
 2856         u0[0] = 1;
 2857         memset(u0 + 1, 0, (ylen - 1) * sizeof *u0);
 2858         memset(v0, 0, xlen * sizeof *v0);
 2859         memcpy(u1, y, ylen * sizeof *u1);
 2860         memcpy(v1, x, xlen * sizeof *v1);
 2861         v1[0] &= ~(uint32_t)1;
 2862 
 2863         /*
 2864          * We also zero out the upper unused words of the returned array
 2865          * u and v (caller expects it).
 2866          */
 2867         memset(u + ylen, 0, (len - ylen) * sizeof *u);
 2868         memset(v + xlen, 0, (len - xlen) * sizeof *v);
 2869 
 2870         /*
 2871          * We zero out the upper unused words of a and b as well, so that
 2872          * we may subtract one from the other with a common length.
 2873          */
 2874         mlen = alen < blen ? blen : alen;
 2875         memset(a + alen, 0, (mlen - alen) * sizeof *a);
 2876         memset(b + blen, 0, (mlen - blen) * sizeof *b);
 2877 
 2878         /*
 2879          * If x = 1 then the current values in u and v are just fine
 2880          * and we can return them (because u0 and u are the same array,
 2881          * and similarly v0 and v).
 2882          * If y = 1, then the values in u1 and v1 must be returned.
 2883          */
 2884         if (xlen == 1 && x[0] == 1) {
 2885                 return 1;
 2886         }
 2887         if (ylen == 1 && y[0] == 1) {
 2888                 memcpy(u, u1, ylen * sizeof *u);
 2889                 memcpy(v, v1, xlen * sizeof *v);
 2890                 return 1;
 2891         }
 2892 
 2893         x0i = modp_ninv31(x[0]);
 2894         y0i = modp_ninv31(y[0]);
 2895 
 2896         /*
 2897          * We are now all set for the main algorithm.
 2898          */
 2899         for (;;) {
 2900                 int r;
 2901 
 2902                 /*
 2903                  * If either word is large enough, we use the
 2904                  * accelerated approximation.
 2905                  */
 2906                 if (alen >= 3 || blen >= 3) {
 2907                         size_t len;
 2908                         uint64_t a_hi, b_hi;
 2909                         uint32_t a_lo, b_lo;
 2910                         uint32_t uxa, uya, uxb, uyb;
 2911                         int i, r;
 2912 
 2913                         len = alen < blen ? blen : alen;
 2914 
 2915                         /*
 2916                          * Get the top and low bits of each value.
 2917                          */
 2918                         a_hi = ((uint64_t)a[len - 1] << 31) | a[len - 2];
 2919                         b_hi = ((uint64_t)b[len - 1] << 31) | b[len - 2];
 2920                         a_lo = a[0];
 2921                         b_lo = b[0];
 2922                         uxa = 1;
 2923                         uxb = 0;
 2924                         uya = 0;
 2925                         uyb = 1;
 2926                         for (i = 0; i < 31; i ++) {
 2927                                 uint32_t m;
 2928 
 2929                                 m = (uint32_t)1 << i;
 2930                                 if ((a_lo & m) == 0) {
 2931                                         a_hi >>= 1;
 2932                                         b_lo <<= 1;
 2933                                         uya <<= 1;
 2934                                         uyb <<= 1;
 2935                                 } else if ((b_lo & m) == 0) {
 2936                                         b_hi >>= 1;
 2937                                         a_lo <<= 1;
 2938                                         uxa <<= 1;
 2939                                         uxb <<= 1;
 2940                                 } else if (a_hi > b_hi) {
 2941                                         a_hi -= b_hi;
 2942                                         a_lo -= b_lo;
 2943                                         uxa -= uya;
 2944                                         uxb -= uyb;
 2945                                         a_hi >>= 1;
 2946                                         b_lo <<= 1;
 2947                                         uya <<= 1;
 2948                                         uyb <<= 1;
 2949                                 } else {
 2950                                         b_hi -= a_hi;
 2951                                         b_lo -= a_lo;
 2952                                         uya -= uxa;
 2953                                         uyb -= uxb;
 2954                                         b_hi >>= 1;
 2955                                         a_lo <<= 1;
 2956                                         uxa <<= 1;
 2957                                         uxb <<= 1;
 2958                                 }
 2959                         }
 2960 
 2961                         /*
 2962                          * It may happen that one of the factors is
 2963                          * equal to 2^31. In that case, we must use a
 2964                          * specialized function, because that value will
 2965                          * not fit in an int32_t.
 2966                          */
 2967                         if (uxa == 0x80000000) {
 2968                                 int32_t ya;
 2969 
 2970                                 if (uxb != 0 || uyb != 1) {
 2971                                         return 0;
 2972                                 }
 2973                                 ya = *(int32_t *)&uya;
 2974                                 if (zint_reduce(b, a, len, ya)) {
 2975                                         ya = -ya;
 2976                                 }
 2977                                 zint_reduce_mod(u1, u0, y, ylen, y0i, ya);
 2978                                 zint_reduce_mod(v1, v0, x, xlen, x0i, ya);
 2979                         } else if (uyb == 0x80000000) {
 2980                                 int32_t xb;
 2981 
 2982                                 if (uya != 0 || uxa != 1) {
 2983                                         return 0;
 2984                                 }
 2985                                 xb = *(int32_t *)&uxb;
 2986                                 if (zint_reduce(a, b, len, xb)) {
 2987                                         xb = -xb;
 2988                                 }
 2989                                 zint_reduce_mod(u0, u1, y, ylen, y0i, xb);
 2990                                 zint_reduce_mod(v0, v1, x, xlen, x0i, xb);
 2991                         } else {
 2992                                 int32_t xa, xb, ya, yb;
 2993 
 2994                                 xa = *(int32_t *)&uxa;
 2995                                 xb = *(int32_t *)&uxb;
 2996                                 ya = *(int32_t *)&uya;
 2997                                 yb = *(int32_t *)&uyb;
 2998 
 2999                                 r = zint_co_reduce(a, b, len, xa, xb, ya, yb);
 3000                                 if ((r & 1) != 0) {
 3001                                         xa = -xa;
 3002                                         xb = -xb;
 3003                                 }
 3004                                 if ((r & 2) != 0) {
 3005                                         ya = -ya;
 3006                                         yb = -yb;
 3007                                 }
 3008                                 zint_co_reduce_mod(u0, u1, y, ylen, y0i,
 3009                                         xa, xb, ya, yb);
 3010                                 zint_co_reduce_mod(v0, v1, x, xlen, x0i,
 3011                                         xa, xb, ya, yb);
 3012                         }
 3013                         alen = zint_exact_length(a, alen);
 3014                         blen = zint_exact_length(b, blen);
 3015 
 3016                         continue;
 3017                 }
 3018 
 3019                 /*
 3020                  * If a is even, divide it by 2 and adjust u0 and v0.
 3021                  */
 3022                 if ((a[0] & 1) == 0) {
 3023                         zint_rshift1(a, alen);
 3024                         alen = zint_exact_length(a, alen);
 3025                         zint_rshift1_mod(u0, y, ylen);
 3026                         zint_rshift1_mod(v0, x, xlen);
 3027                         continue;
 3028                 }
 3029 
 3030                 /*
 3031                  * If b is even, divide it by 2 and adjust u1 and v1.
 3032                  */
 3033                 if ((b[0] & 1) == 0) {
 3034                         zint_rshift1(b, blen);
 3035                         blen = zint_exact_length(b, blen);
 3036                         zint_rshift1_mod(u1, y, ylen);
 3037                         zint_rshift1_mod(v1, x, xlen);
 3038                         continue;
 3039                 }
 3040 
 3041                 /*
 3042                  * Compare a to b. If equal, then the algorithm
 3043                  * terminates.
 3044                  */
 3045                 if (alen < blen) {
 3046                         r = -1;
 3047                 } else if (alen > blen) {
 3048                         r = 1;
 3049                 } else {
 3050                         r = zint_ucmp(a, b, alen);
 3051                         if (r == 0) {
 3052                                 /*
 3053                                  * If a == b, then the algorithm terminate;
 3054                                  * they both contain the GCD of x and y.
 3055                                  * This is a success only if that GCD is 1.
 3056                                  * Arrays u and v are already filled with
 3057                                  * the proper results.
 3058                                  */
 3059                                 return alen == 1 && a[0] == 1;
 3060                         }
 3061                 }
 3062 
 3063                 /*
 3064                  * If a > b, then set a <- a-b, and adjust u0 and v0
 3065                  * accordingly. Analysis shows that we will be able to
 3066                  * maintain 0 < u0 < y and 0 < v0 < x.
 3067                  *
 3068                  * If a < b, then set b <- b-a, and adjust u1 and v1
 3069                  * accordingly. Analysis shows that we will be able to
 3070                  * maintain 0 < u1 < y and 0 < v1 < x.
 3071                  */
 3072                 if (r > 0) {
 3073                         zint_sub(a, b, alen);
 3074                         alen = zint_exact_length(a, alen);
 3075                         zint_sub_mod(u0, u1, y, ylen);
 3076                         zint_sub_mod(v0, v1, x, xlen);
 3077                 } else {
 3078                         zint_sub(b, a, blen);
 3079                         blen = zint_exact_length(b, blen);
 3080                         zint_sub_mod(u1, u0, y, ylen);
 3081                         zint_sub_mod(v1, v0, x, xlen);
 3082                 }
 3083         }
 3084 }
 3085 
 3086 /*
 3087  * Compute bit length of a word. Input word x must have length at most 31
 3088  * bits (i.e. top bit is cleared).
 3089  */
 3090 unsigned
 3091 bitlength(uint32_t x)
 3092 {
 3093         /*
 3094          * This algorithm is inspired from an algorithm devised by
 3095          * Robert Harley and explained in Hacker's Delight, 2nd edition
 3096          * (section 5.3).
 3097          *
 3098          * We first propagate the highest non-zero bit to the right, so
 3099          * that the value becomes equal to 2^bl-1; at that point, we thus
 3100          * have 32 possible values for x (0, and powers of 2 from 2^0 to
 3101          * 2^30). Then, we multiply the value with a specific constant
 3102          * that makes it so that the top 5 bits of the 32-bit result will
 3103          * contain 32 different values for the 32 possible values of x
 3104          * at this point. These top 5 bits thus contain a permutation of
 3105          * the 0..31 result we need; a table look-up implements the
 3106          * reverse permutation.
 3107          */
 3108         static const unsigned vv[] = {
 3109                  0, 31,  4,  5,  6, 10,  7, 15,
 3110                 11, 20,  8, 18, 16, 25, 12, 27,
 3111                 21, 30,  3,  9, 14, 19, 17, 24,
 3112                 26, 29,  2, 13, 23, 28,  1, 22
 3113         };
 3114 
 3115         x |= x >> 1;
 3116         x |= x >> 2;
 3117         x |= x >> 4;
 3118         x |= x >> 8;
 3119         x |= x >> 16;
 3120         return vv[(x * 0xF04653AE) >> 27];
 3121 }
 3122 
 3123 /*
 3124  * Get the bit length of a signed big integer: this is the minimum number
 3125  * of bits required to hold the value, _without_ the signed bit (thus, -1
 3126  * has bit length 0).
 3127  */
 3128 static uint32_t
 3129 zint_signed_bit_length(const uint32_t *x, size_t xlen)
 3130 {
 3131         uint32_t sign;
 3132 
 3133         if (xlen == 0) {
 3134                 return 0;
 3135         }
 3136         sign = (-(x[xlen - 1] >> 30)) >> 1;
 3137         while (xlen > 0) {
 3138                 if (x[xlen - 1] != sign) {
 3139                         break;
 3140                 }
 3141                 xlen --;
 3142         }
 3143         if (xlen == 0) {
 3144                 return 0;
 3145         }
 3146         return (uint32_t)(xlen - 1) * 31 + bitlength(x[xlen - 1] ^ sign);
 3147 }
 3148 
 3149 /*
 3150  * Get the top 63 bits of a signed big integer, starting at the provided
 3151  * index (in bits). The integer absolute value MUST fit in sc+63 bits.
 3152  */
 3153 static int64_t
 3154 zint_get_top(const uint32_t *x, size_t xlen, uint32_t sc)
 3155 {
 3156         uint64_t z;
 3157         uint32_t sign, w0, w1, w2;
 3158         uint32_t k, off;
 3159 
 3160         if (xlen == 0) {
 3161                 return 0;
 3162         }
 3163 
 3164         /*
 3165          * The "sign word" is -1 for negative values, 0 for positive values.
 3166          */
 3167         sign = -(x[xlen - 1] >> 30);
 3168 
 3169         k = sc / 31;
 3170         off = sc - (31 * k);
 3171 
 3172         /*
 3173          * To obtain 63 bits, we always need exactly three words.
 3174          */
 3175         if ((k + 2) < xlen) {
 3176                 w0 = x[k + 0];
 3177                 w1 = x[k + 1];
 3178                 w2 = x[k + 2] | (sign << 31);
 3179         } else if ((k + 1) < xlen) {
 3180                 w0 = x[k + 0];
 3181                 w1 = x[k + 1];
 3182                 w2 = sign;
 3183         } else if (k < xlen) {
 3184                 w0 = x[k + 0];
 3185                 w1 = sign;
 3186                 w2 = sign;
 3187         } else {
 3188                 w0 = sign;
 3189                 w1 = sign;
 3190                 w2 = sign;
 3191         }
 3192         z = ((uint64_t)w0 >> off)
 3193                 | ((uint64_t)w1 << (31 - off))
 3194                 | ((uint64_t)w2 << (62 - off));
 3195 
 3196         /*
 3197          * Properties of the exact-width types (no padding, no trap
 3198          * representation, two's complement representation) means that
 3199          * we can use a cast on the in-memory representation to
 3200          * convert from unsigned to signed values, without incurring
 3201          * any undefined behaviour.
 3202          */
 3203         return *(int64_t *)&z;
 3204 }
 3205 
 3206 /*
 3207  * Add k*y*2^sc to x. The result is assumed to fit in the array of
 3208  * size xlen (truncation is applied if necessary).
 3209  * Scale factor 'sc' is provided as sch and scl, such that:
 3210  *   sch = sc / 31
 3211  *   scl = sc % 31
 3212  * xlen MUST NOT be lower than ylen.
 3213  *
 3214  * x[] and y[] are both signed integers, using two's complement for
 3215  * negative values.
 3216  */
 3217 static void
 3218 zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
 3219         const uint32_t *restrict y, size_t ylen, int32_t k,
 3220         uint32_t sch, uint32_t scl)
 3221 {
 3222         size_t u;
 3223         uint32_t ysign, tw;
 3224         int32_t cc;
 3225 
 3226         if (ylen == 0) {
 3227                 return;
 3228         }
 3229 
 3230         ysign = -(y[ylen - 1] >> 30) >> 1;
 3231         tw = 0;
 3232         cc = 0;
 3233         for (u = sch; u < xlen; u ++) {
 3234                 size_t v;
 3235                 uint32_t wy, wys, ccu;
 3236                 uint64_t z;
 3237 
 3238                 /*
 3239                  * Get the next word of y (scaled).
 3240                  */
 3241                 v = u - sch;
 3242                 wy = v < ylen ? y[v] : ysign;
 3243                 wys = ((wy << scl) & 0x7FFFFFFF) | tw;
 3244                 tw = wy >> (31 - scl);
 3245 
 3246                 /*
 3247                  * The expression below does not overflow.
 3248                  */
 3249                 z = (int64_t)wys * (int64_t)k + (int64_t)x[u] + cc;
 3250                 x[u] = (uint32_t)z & 0x7FFFFFFF;
 3251 
 3252                 /*
 3253                  * Right-shifting the signed value z would yield
 3254                  * implementation-defined results (arithmetic shift is
 3255                  * not guaranteed). However, we can cast to unsigned,
 3256                  * and get the next carry as an unsigned word. We can
 3257                  * then convert it back to signed by using the guaranteed
 3258                  * fact that 'int32_t' uses two's complement with no
 3259                  * trap representation or padding bit, and with a layout
 3260                  * compatible with that of 'uint32_t'.
 3261                  */
 3262                 ccu = (uint32_t)((uint64_t)z >> 31);
 3263                 cc = *(int32_t *)&ccu;
 3264         }
 3265 }
 3266 
 3267 /*
 3268  * Subtract y*2^sc from x. The result is assumed to fit in the array of
 3269  * size xlen (truncation is applied if necessary).
 3270  * Scale factor 'sc' is provided as sch and scl, such that:
 3271  *   sch = sc / 31
 3272  *   scl = sc % 31
 3273  * xlen MUST NOT be lower than ylen.
 3274  *
 3275  * x[] and y[] are both signed integers, using two's complement for
 3276  * negative values.
 3277  */
 3278 static void
 3279 zint_sub_scaled(uint32_t *restrict x, size_t xlen,
 3280         const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl)
 3281 {
 3282         size_t u;
 3283         uint32_t ysign, tw;
 3284         uint32_t cc;
 3285 
 3286         if (ylen == 0) {
 3287                 return;
 3288         }
 3289 
 3290         ysign = -(y[ylen - 1] >> 30) >> 1;
 3291         tw = 0;
 3292         cc = 0;
 3293         for (u = sch; u < xlen; u ++) {
 3294                 size_t v;
 3295                 uint32_t w, wy, wys;
 3296 
 3297                 /*
 3298                  * Get the next word of y (scaled).
 3299                  */
 3300                 v = u - sch;
 3301                 wy = v < ylen ? y[v] : ysign;
 3302                 wys = ((wy << scl) & 0x7FFFFFFF) | tw;
 3303                 tw = wy >> (31 - scl);
 3304 
 3305                 w = x[u] - wys - cc;
 3306                 x[u] = w & 0x7FFFFFFF;
 3307                 cc = w >> 31;
 3308         }
 3309 }
 3310 
 3311 /*
 3312  * Convert a one-word signed big integer into a signed value.
 3313  */
 3314 static inline int32_t
 3315 zint_one_to_plain(const uint32_t *x)
 3316 {
 3317         uint32_t w;
 3318 
 3319         w = x[0];
 3320         w |= (w & 0x40000000) << 1;
 3321         return *(int32_t *)&w;
 3322 }
 3323 
 3324 /* ==================================================================== */
 3325 
 3326 /*
 3327  * Get the maximum bitlength of coordinates for a polynomial.
 3328  */
 3329 static uint32_t
 3330 poly_max_bitlength(const uint32_t *f, size_t flen, size_t fstride,
 3331         unsigned logn, unsigned ter)
 3332 {
 3333         size_t n, u;
 3334         uint32_t maxbl;
 3335 
 3336         n = MKN(logn, ter);
 3337         maxbl = 0;
 3338         for (u = 0; u < n; u ++, f += fstride) {
 3339                 uint32_t bl;
 3340 
 3341                 bl = zint_signed_bit_length(f, flen);
 3342                 if (bl > maxbl) {
 3343                         maxbl = bl;
 3344                 }
 3345         }
 3346         return maxbl;
 3347 }
 3348 
 3349 /*
 3350  * Convert a polynomial to floating-point values; the maximum bit length
 3351  * of all coefficients is provided as 'maxbl' parameter. Returned values are
 3352  * scaled down by 'scale' bits: if the integer value is z, this function
 3353  * computes an approximation of z*2^(-scale).
 3354  */
 3355 static void
 3356 poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
 3357         unsigned logn, unsigned ter, uint32_t maxbl, uint32_t scale)
 3358 {
 3359         size_t n, u;
 3360         uint32_t off;
 3361 
 3362         n = MKN(logn, ter);
 3363         off = maxbl < 63 ? 0 : maxbl - 63;
 3364         for (u = 0; u < n; u ++, f += fstride) {
 3365                 d[u] = fpr_scaled(zint_get_top(f, flen, off),
 3366                         (int)(off - scale));
 3367         }
 3368 }
 3369 
 3370 /*
 3371  * Convert a polynomial to small integers. Source values are supposed
 3372  * to be one-word integers, signed over 31 bits. Returned value is 0
 3373  * if any of the coefficients exceeds 2047 (in absolute value), or 1
 3374  * on success.
 3375  */
 3376 static int
 3377 poly_big_to_small(int16_t *d, const uint32_t *s, unsigned logn, unsigned ter)
 3378 {
 3379         size_t n, u;
 3380 
 3381         n = MKN(logn, ter);
 3382         for (u = 0; u < n; u ++) {
 3383                 int32_t z;
 3384 
 3385                 z = zint_one_to_plain(s + u);
 3386                 if (z < -2047 || z > 2047) {
 3387                         return 0;
 3388                 }
 3389                 d[u] = (int16_t)z;
 3390         }
 3391         return 1;
 3392 }
 3393 
 3394 /*
 3395  * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
 3396  * Coefficients of polynomial k are small integers (signed values in the
 3397  * -2^31..2^31 range) scaled by 2^sc.
 3398  *
 3399  * This function implements the basic quadratic multiplication algorithm,
 3400  * which is efficient in space (no extra buffer needed) but slow at
 3401  * high degree.
 3402  */
 3403 static void
 3404 poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
 3405         const uint32_t *restrict f, size_t flen, size_t fstride,
 3406         const int32_t *restrict k, uint32_t sc,
 3407         unsigned logn, unsigned full, unsigned ternary)
 3408 {
 3409         size_t n, hn, u;
 3410         uint32_t sch, scl;
 3411 
 3412         n = MKN(logn, full);
 3413         hn = n >> 1;
 3414         sch = sc / 31;
 3415         scl = sc % 31;
 3416         if (ternary) {
 3417                 size_t off1, off2, off3;
 3418 
 3419                 off1 = hn * Fstride;
 3420                 off2 = n * Fstride;
 3421                 off3 = off1 + off2;
 3422                 for (u = 0; u < n; u ++) {
 3423                         int32_t kf;
 3424                         size_t v, j;
 3425                         const uint32_t *y;
 3426 
 3427                         kf = -k[u];
 3428                         j = u * Fstride;
 3429                         y = f;
 3430                         for (v = 0; v < n; v ++, j += Fstride, y += fstride) {
 3431                                 if (u + v < n) {
 3432                                         zint_add_scaled_mul_small(
 3433                                                 F + j, Flen,
 3434                                                 y, flen, kf, sch, scl);
 3435                                 } else if (u + v < (n + hn)) {
 3436                                         zint_add_scaled_mul_small(
 3437                                                 F + j - off1, Flen,
 3438                                                 y, flen, kf, sch, scl);
 3439                                         zint_add_scaled_mul_small(
 3440                                                 F + j - off2, Flen,
 3441                                                 y, flen, -kf, sch, scl);
 3442                                 } else {
 3443                                         zint_add_scaled_mul_small(
 3444                                                 F + j - off3, Flen,
 3445                                                 y, flen, -kf, sch, scl);
 3446                                 }
 3447                         }
 3448                 }
 3449         } else {
 3450                 for (u = 0; u < n; u ++) {
 3451                         int32_t kf;
 3452                         size_t v;
 3453                         uint32_t *x;
 3454                         const uint32_t *y;
 3455 
 3456                         kf = -k[u];
 3457                         x = F + u * Fstride;
 3458                         y = f;
 3459                         for (v = 0; v < n; v ++) {
 3460                                 zint_add_scaled_mul_small(
 3461                                         x, Flen, y, flen, kf, sch, scl);
 3462                                 if (u + v == n - 1) {
 3463                                         x = F;
 3464                                         kf = -kf;
 3465                                 } else {
 3466                                         x += Fstride;
 3467                                 }
 3468                                 y += fstride;
 3469                         }
 3470                 }
 3471         }
 3472 }
 3473 
 3474 /*
 3475  * Subtract k*f from F. Coefficients of polynomial k are small integers
 3476  * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
 3477  * assumes that the degree is large, and integers relatively small.
 3478  */
 3479 static void
 3480 poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
 3481         const uint32_t *restrict f, size_t flen, size_t fstride,
 3482         const int32_t *restrict k, uint32_t sc,
 3483         unsigned logn, unsigned full, int ternary, uint32_t *restrict tmp)
 3484 {
 3485         uint32_t *gm, *igm, *fk, *t1, *x;
 3486         const uint32_t *y;
 3487         size_t n, u, tlen;
 3488         uint32_t sch, scl;
 3489         const small_prime *primes;
 3490 
 3491         n = MKN(logn, full);
 3492         tlen = flen + 1;
 3493         gm = tmp;
 3494         igm = gm + MKN(logn, 0);
 3495         fk = igm + MKN(logn, 0);
 3496         t1 = fk + n * tlen;
 3497 
 3498         primes = ternary ? PRIMES3 : PRIMES2;
 3499 
 3500         /*
 3501          * Compute k*f in fk[], in RNS notation.
 3502          */
 3503         for (u = 0; u < tlen; u ++) {
 3504                 uint32_t p, p0i, R2, Rx;
 3505                 size_t v;
 3506 
 3507                 p = primes[u].p;
 3508                 p0i = modp_ninv31(p);
 3509                 R2 = modp_R2(p, p0i);
 3510                 Rx = modp_Rx(flen, p, p0i, R2);
 3511                 if (ternary) {
 3512                         modp_mkgm3(gm, igm, logn, full, primes[u].g, p, p0i);
 3513                 } else {
 3514                         modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 3515                 }
 3516 
 3517                 for (v = 0; v < n; v ++) {
 3518                         t1[v] = modp_set(k[v], p);
 3519                 }
 3520                 if (ternary) {
 3521                         modp_NTT3(t1, gm, logn, full, p, p0i);
 3522                 } else {
 3523                         modp_NTT2(t1, gm, logn, p, p0i);
 3524                 }
 3525                 for (v = 0, y = f, x = fk + u;
 3526                         v < n; v ++, y += fstride, x += tlen)
 3527                 {
 3528                         *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
 3529                 }
 3530                 if (ternary) {
 3531                         modp_NTT3_ext(fk + u, tlen, gm, logn, full, p, p0i);
 3532                 } else {
 3533                         modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
 3534                 }
 3535                 for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
 3536                         *x = modp_montymul(
 3537                                 modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
 3538                 }
 3539                 if (ternary) {
 3540                         modp_iNTT3_ext(fk + u, tlen, igm, logn, full, p, p0i);
 3541                 } else {
 3542                         modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
 3543                 }
 3544         }
 3545 
 3546         /*
 3547          * Rebuild k*f.
 3548          */
 3549         zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
 3550 
 3551         /*
 3552          * Subtract k*f, scaled, from F.
 3553          */
 3554         sch = sc / 31;
 3555         scl = sc % 31;
 3556         for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
 3557                 zint_sub_scaled(x, Flen, y, tlen, sch, scl);
 3558         }
 3559 }
 3560 
 3561 /* ==================================================================== */
 3562 
 3563 struct falcon_keygen_ {
 3564 
 3565         /* Base-2 logarithm of the degree. */
 3566         unsigned logn;
 3567 
 3568         /* 1 for a ternary modulus, 0 for binary. */
 3569         unsigned ternary;
 3570 
 3571         /* RNG:
 3572            seeded    non-zero when a 'replace' seed or system RNG was pushed
 3573            flipped   non-zero when flipped */
 3574         shake_context rng;
 3575         int seeded;
 3576         int flipped;
 3577 
 3578         /* Temporary storage for key generation. 'tmp_len' is expressed
 3579            in 32-bit words. */
 3580         uint32_t *tmp;
 3581         size_t tmp_len;
 3582 };
 3583 
 3584 /*
 3585  * Get a random 8-byte integer from a SHAKE-based RNG. This function
 3586  * ensures consistent interpretation of the SHAKE output so that
 3587  * the same values will be obtained over different platforms, in case
 3588  * a known seed is used.
 3589  */
 3590 static inline uint64_t
 3591 get_rng_u64(shake_context *rng)
 3592 {
 3593         /*
 3594          * On little-endian systems we just interpret the bytes "as is"
 3595          * (this is correct because the exact-width types such as
 3596          * 'uint64_t' are guaranteed to have no padding and no trap
 3597          * representation).
 3598          *
 3599          * On other systems we enforce little-endian representation.
 3600          */
 3601 #if FALCON_LE_U
 3602         uint64_t r;
 3603 
 3604         shake_extract(rng, &r, sizeof r);
 3605         return r;
 3606 #else
 3607         unsigned char tmp[8];
 3608 
 3609         shake_extract(rng, tmp, sizeof tmp);
 3610         return (uint64_t)tmp[0]
 3611                 | ((uint64_t)tmp[1] << 8)
 3612                 | ((uint64_t)tmp[2] << 16)
 3613                 | ((uint64_t)tmp[3] << 24)
 3614                 | ((uint64_t)tmp[4] << 32)
 3615                 | ((uint64_t)tmp[5] << 40)
 3616                 | ((uint64_t)tmp[6] << 48)
 3617                 | ((uint64_t)tmp[7] << 56);
 3618 #endif
 3619 }
 3620 
 3621 /*
 3622  * Table below incarnates a discrete Gaussian distribution:
 3623  *    D(x) = exp(-(x^2)/(2*sigma^2))
 3624  * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
 3625  * Element 0 of the table is P(x = 0).
 3626  * For k > 0, element k is P(x >= k+1 | x > 0).
 3627  * Probabilities are scaled up by 2^63.
 3628  */
 3629 static const uint64_t gauss_1024_12289[] = {
 3630          1283868770400643928u,  6416574995475331444u,  4078260278032692663u,
 3631          2353523259288686585u,  1227179971273316331u,   575931623374121527u,
 3632           242543240509105209u,    91437049221049666u,    30799446349977173u,
 3633             9255276791179340u,     2478152334826140u,      590642893610164u,
 3634              125206034929641u,       23590435911403u,        3948334035941u,
 3635                 586753615614u,          77391054539u,           9056793210u,
 3636                    940121950u,             86539696u,              7062824u,
 3637                       510971u,                32764u,                 1862u,
 3638                           94u,                    4u,                    0u
 3639 };
 3640 
 3641 /*
 3642  * Generate a random value with a Gaussian distribution centered on 0.
 3643  * The RNG must be ready for extraction (already flipped).
 3644  *
 3645  * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
 3646  * precomputed table is for N = 1024. Since the sum of two independent
 3647  * values of standard deviation sigma has standard deviation
 3648  * sigma*sqrt(2), then we can just generate more values and add them
 3649  * together for lower dimensions.
 3650  *
 3651  * This function is only for the binary case.
 3652  */
 3653 static int
 3654 mkgauss(falcon_keygen *fk, unsigned logn)
 3655 {
 3656         unsigned u, g;
 3657         int val;
 3658 
 3659         g = 1U << (10 - logn);
 3660         val = 0;
 3661         for (u = 0; u < g; u ++) {
 3662                 uint64_t r;
 3663                 int k, neg;
 3664 
 3665                 r = get_rng_u64(&fk->rng);
 3666                 neg = (int)(r >> 63);
 3667                 r &= ~((uint64_t)1 << 63);
 3668                 if (r < gauss_1024_12289[0]) {
 3669                         continue;
 3670                 }
 3671                 r = get_rng_u64(&fk->rng);
 3672                 r &= ~((uint64_t)1 << 63);
 3673                 k = 1;
 3674                 while (gauss_1024_12289[k] > r) {
 3675                         k ++;
 3676                 }
 3677                 k *= (int)(1 - (neg << 1));
 3678                 val += k;
 3679         }
 3680         return val;
 3681 }
 3682 
 3683 /*
 3684  * The MAX_BL_SMALL*[] and MAX_BL_LARGE*[] contain the lengths, in 31-bit
 3685  * words, of intermediate values in the computation:
 3686  *
 3687  *   MAX_BL_SMALL*[depth]: length for the input f and g at that depth
 3688  *   MAX_BL_LARGE*[depth]: length for the unreduced F and G at that depth
 3689  *
 3690  * MAX_BL_SMALL2[] and MAX_BL_LARGE2[] are for the binary case, for depth
 3691  * up to 10. MAX_BL_SMALL3[] and MAX_BL_LARGE3[] are for the ternary case.
 3692  *
 3693  * Rules:
 3694  *
 3695  *  - Within an array, values grow.
 3696  *
 3697  *  - The 'SMALL' array must have an entry for maximum depth, corresponding
 3698  *    to the size of values used in the binary GCD. There is no such value
 3699  *    for the 'LARGE' array (the binary GCD yields already reduced
 3700  *    coefficients).
 3701  *
 3702  *  - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
 3703  *
 3704  *  - Values must be large enough to handle the common cases, with some
 3705  *    margins.
 3706  *
 3707  * The following average lengths, in bits, have been measured on thousands
 3708  * of random keys (fg = max length of the absolute value of coefficients
 3709  * of f and g at that depth; FG = idem for the unreduced F and G; for the
 3710  * maximum depth, F and G are the output of binary GCD, multiplied by q;
 3711  * for each value, the average and standard deviation are provided).
 3712  *
 3713  * Binary case:
 3714  *    depth: 10    fg: 6307.52 (24.48)    FG: 6319.66 (24.51)
 3715  *    depth:  9    fg: 3138.35 (12.25)    FG: 9403.29 (27.55)
 3716  *    depth:  8    fg: 1576.87 ( 7.49)    FG: 4703.30 (14.77)
 3717  *    depth:  7    fg:  794.17 ( 4.98)    FG: 2361.84 ( 9.31)
 3718  *    depth:  6    fg:  400.67 ( 3.10)    FG: 1188.68 ( 6.04)
 3719  *    depth:  5    fg:  202.22 ( 1.87)    FG:  599.81 ( 3.87)
 3720  *    depth:  4    fg:  101.62 ( 1.02)    FG:  303.49 ( 2.38)
 3721  *    depth:  3    fg:   50.37 ( 0.53)    FG:  153.65 ( 1.39)
 3722  *    depth:  2    fg:   24.07 ( 0.25)    FG:   78.20 ( 0.73)
 3723  *    depth:  1    fg:   10.99 ( 0.08)    FG:   39.82 ( 0.41)
 3724  *    depth:  0    fg:    4.00 ( 0.00)    FG:   19.61 ( 0.49)
 3725  *
 3726  * Ternary case:
 3727  *    depth:  9    fg: 4975.81 (22.38)    FG: 4988.54 (22.43)
 3728  *    depth:  8    fg: 2472.64 (11.20)    FG: 7409.66 (25.93)
 3729  *    depth:  7    fg: 1243.34 ( 6.78)    FG: 3705.30 (13.66)
 3730  *    depth:  6    fg:  626.61 ( 4.40)    FG: 1861.86 ( 8.50)
 3731  *    depth:  5    fg:  316.34 ( 2.75)    FG:  937.68 ( 5.51)
 3732  *    depth:  4    fg:  159.68 ( 1.61)    FG:  473.48 ( 3.45)
 3733  *    depth:  3    fg:   80.16 ( 0.86)    FG:  239.82 ( 2.06)
 3734  *    depth:  2    fg:   39.58 ( 0.51)    FG:  121.80 ( 1.14)
 3735  *    depth:  1    fg:   18.98 ( 0.13)    FG:   61.93 ( 0.61)
 3736  *    depth:  0    fg:    4.76 ( 0.42)    FG:   34.97 ( 0.28)
 3737  *
 3738  * Integers are actually represented either in binary notation over
 3739  * 31-bit words (signed, using two's complement), or in RNS, modulo
 3740  * many small primes. These small primes are close to, but slightly
 3741  * lower than, 2^31. Use of RNS loses less than two bits, even for
 3742  * the largest values.
 3743  */
 3744 
 3745 static const size_t MAX_BL_SMALL2[] = {
 3746         1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 212
 3747 };
 3748 
 3749 static const size_t MAX_BL_LARGE2[] = {
 3750         2, 2, 5, 7, 12, 22, 42, 80, 157, 310
 3751 };
 3752 
 3753 static const size_t MAX_BL_SMALL3[] = {
 3754         1, 1, 2, 3, 6, 12, 22, 42, 82, 166
 3755 };
 3756 
 3757 static const size_t MAX_BL_LARGE3[] = {
 3758         2, 3, 5, 9, 16, 32, 62, 123, 245
 3759 };
 3760 
 3761 /*
 3762  * Minimal recursion depth at which we rebuild intermediate values
 3763  * when reconstructing f and g.
 3764  */
 3765 #define DEPTH_INT_FG   4
 3766 
 3767 /*
 3768  * Compute size of temporary array for key generation.
 3769  * Returned size is expressed in bytes.
 3770  */
 3771 static size_t
 3772 temp_size(unsigned logn, int ternary)
 3773 {
 3774 #define ALIGN_FP(tt)   ((((tt) + sizeof(fpr) - 1) / sizeof(fpr)) * sizeof(fpr))
 3775 #define ALIGN_UW(tt)   ((((tt) + sizeof(uint32_t) - 1) \
 3776                        / sizeof(uint32_t)) * sizeof(uint32_t))
 3777 
 3778         size_t gmax;
 3779         unsigned depth;
 3780 
 3781         gmax = 0;
 3782 
 3783         /*
 3784          * Compute memory requirements for make_fg() at each depth.
 3785          */
 3786         for (depth = 0; depth < logn; depth ++) {
 3787                 size_t cur;
 3788 
 3789                 if (depth == 0 && ternary) {
 3790                         size_t n, dn, tn;
 3791 
 3792                         n = (size_t)3 << (logn - 1);
 3793                         dn = (size_t)1 << logn;
 3794                         tn = (size_t)1 << (logn - 1);
 3795                         cur = (2 * tn + 2 * n + 2 * dn) * sizeof(uint32_t);
 3796                         gmax = cur > gmax ? cur : gmax;
 3797                 } else {
 3798                         size_t n, slen, tlen;
 3799 
 3800                         n = (size_t)1 << (logn - depth);
 3801                         slen = ternary
 3802                                 ? MAX_BL_SMALL3[depth]
 3803                                 : MAX_BL_SMALL2[depth];
 3804                         tlen = ternary
 3805                                 ? MAX_BL_SMALL3[depth + 1]
 3806                                 : MAX_BL_SMALL2[depth + 1];
 3807                         cur = (n * tlen + 2 * n * slen + 3 * n)
 3808                                 * sizeof(uint32_t);
 3809                         gmax = cur > gmax ? cur : gmax;
 3810                         cur = (n * tlen + 2 * n * slen + slen)
 3811                                 * sizeof(uint32_t);
 3812                         gmax = cur > gmax ? cur : gmax;
 3813                 }
 3814         }
 3815 
 3816         /*
 3817          * Compute memory requirements for each depth.
 3818          */
 3819         for (depth = 0; depth <= logn; depth ++) {
 3820                 size_t cur, max;
 3821 
 3822                 max = 0;
 3823                 if (depth == logn) {
 3824                         size_t slen;
 3825 
 3826                         slen = ternary
 3827                                 ? MAX_BL_SMALL3[depth]
 3828                                 : MAX_BL_SMALL2[depth];
 3829                         cur = 8 * slen * sizeof(uint32_t);
 3830                         max = cur > max ? cur : max;
 3831                 } else if (ternary && depth == 0 && logn > 2) {
 3832                         size_t n, tn, hn;
 3833 
 3834                         n = (size_t)3 << (logn - 1);
 3835                         tn = (size_t)1 << (logn - 1);
 3836                         hn = n >> 1;
 3837                         cur = ALIGN_FP(2 * tn * sizeof(uint32_t))
 3838                                 + (2 * n + hn) * sizeof(fpr);
 3839                         max = cur > max ? cur : max;
 3840                         cur = (hn + 4 * n) * sizeof(fpr);
 3841                         max = cur > max ? cur : max;
 3842                         cur = ALIGN_FP(2 * n * sizeof(uint32_t))
 3843                                 + 2 * n * sizeof(fpr);
 3844                         max = cur > max ? cur : max;
 3845                 } else if (!ternary && depth == 0 && logn > 2) {
 3846                         size_t n, hn;
 3847 
 3848                         n = (size_t)1 << logn;
 3849                         hn = n >> 1;
 3850                         cur = 7 * n * sizeof(uint32_t);
 3851                         max = cur > max ? cur : max;
 3852                         cur = ALIGN_FP(4 * n * sizeof(uint32_t))
 3853                                 + n * sizeof(fpr);
 3854                         max = cur > max ? cur : max;
 3855                         cur = ALIGN_FP(3 * n * sizeof(uint32_t))
 3856                                 + (n + hn) * sizeof(fpr);
 3857                         max = cur > max ? cur : max;
 3858                 } else if (!ternary && depth == 1 && logn > 2) {
 3859                         size_t n, hn, slen, dlen, llen;
 3860 
 3861                         n = (size_t)1 << (logn - 1);
 3862                         hn = n >> 1;
 3863                         slen = MAX_BL_SMALL2[depth];
 3864                         dlen = MAX_BL_SMALL2[depth + 1];
 3865                         llen = MAX_BL_LARGE2[depth];
 3866                         cur = (2 * hn * dlen + 2 * n * llen) * sizeof(uint32_t);
 3867                         max = cur > max ? cur : max;
 3868                         cur = (2 * n * llen + 2 * n * slen + 7 * n)
 3869                                 * sizeof(uint32_t);
 3870                         max = cur > max ? cur : max;
 3871                         cur = (2 * n * llen + 2 * n * slen + llen)
 3872                                 * sizeof(uint32_t);
 3873                         max = cur > max ? cur : max;
 3874                         cur = ALIGN_FP((2 * n * llen + 2 * n * slen)
 3875                                 * sizeof(uint32_t))
 3876                                 + 2 * n * sizeof(fpr);
 3877                         max = cur > max ? cur : max;
 3878                         cur = ALIGN_FP(2 * n * slen * sizeof(uint32_t))
 3879                                 + 4 * n * sizeof(fpr);
 3880                         max = cur > max ? cur : max;
 3881                         cur = (5 * n + hn) * sizeof(fpr);
 3882                         max = cur > max ? cur : max;
 3883                         cur = ALIGN_FP(2 * n * sizeof(uint32_t))
 3884                                 + 2 * n * sizeof(fpr);
 3885                         max = cur > max ? cur : max;
 3886                 } else {
 3887                         size_t n, hn, slen, llen, tmp1, tmp2;
 3888 
 3889                         if (ternary && depth == 0 && logn == 2) {
 3890                                 n = 6;
 3891                                 hn = 2;
 3892                         } else {
 3893                                 n = (size_t)1 << (logn - depth);
 3894                                 hn = n >> 1;
 3895                         }
 3896                         if (ternary) {
 3897                                 slen = MAX_BL_SMALL3[depth];
 3898                                 llen = MAX_BL_LARGE3[depth];
 3899                         } else {
 3900                                 slen = MAX_BL_SMALL2[depth];
 3901                                 llen = MAX_BL_LARGE2[depth];
 3902                         }
 3903                         cur = (2 * n * llen + 2 * n * slen + 4 * n)
 3904                                 * sizeof(uint32_t);
 3905                         max = cur > max ? cur : max;
 3906                         cur = (2 * n * llen + 2 * n * slen + llen)
 3907                                 * sizeof(uint32_t);
 3908                         max = cur > max ? cur : max;
 3909                         tmp1 = ALIGN_UW(
 3910                                 ALIGN_FP((2 * n * llen + 2 * n * slen)
 3911                                         * sizeof(uint32_t))
 3912                                 + (2 * n + hn) * sizeof(fpr))
 3913                                 + n * sizeof(uint32_t);
 3914                         tmp2 = ALIGN_FP((2 * n * llen + 2 * n * slen)
 3915                                 * sizeof(uint32_t))
 3916                                 + (3 * n + hn) * sizeof(fpr);
 3917                         cur = tmp1 > tmp2 ? tmp1 : tmp2;
 3918                         cur = ALIGN_FP(cur) + n * sizeof(fpr);
 3919                         max = cur > max ? cur : max;
 3920                         cur = ALIGN_UW(
 3921                                 ALIGN_FP((2 * n * llen + 2 * n * slen)
 3922                                         * sizeof(uint32_t))
 3923                                 + (2 * n + hn) * sizeof(fpr))
 3924                                 + (5 * n + n * slen) * sizeof(uint32_t);
 3925                         max = cur > max ? cur : max;
 3926                 }
 3927 
 3928                 gmax = max > gmax ? max : gmax;
 3929         }
 3930 
 3931         return gmax;
 3932 
 3933 #undef ALIGN_FP
 3934 #undef ALIGN_UW
 3935 }
 3936 
 3937 #if MEMCHECK
 3938 static const char MEMCHECK_MARK[] = "memcheck";
 3939 #endif
 3940 
 3941 /* see falcon.h */
 3942 falcon_keygen *
 3943 falcon_keygen_new(unsigned logn, int ternary)
 3944 {
 3945         falcon_keygen *fk;
 3946 
 3947         if (ternary) {
 3948                 if (logn < 3 || logn > 9) {
 3949                         return NULL;
 3950                 }
 3951         } else {
 3952                 if (logn < 1 || logn > 10) {
 3953                         return NULL;
 3954                 }
 3955         }
 3956         fk = malloc(sizeof *fk);
 3957         if (fk == NULL) {
 3958                 return NULL;
 3959         }
 3960         fk->logn = logn;
 3961         fk->ternary = ternary;
 3962         shake_init(&fk->rng, 512);
 3963         fk->seeded = 0;
 3964         fk->flipped = 0;
 3965 
 3966         fk->tmp_len = temp_size(logn, ternary);
 3967 #if MEMCHECK
 3968         fk->tmp = malloc(fk->tmp_len + sizeof MEMCHECK_MARK);
 3969 #else
 3970         fk->tmp = malloc(fk->tmp_len);
 3971 #endif
 3972         if (fk->tmp == NULL) {
 3973                 free(fk);
 3974                 return NULL;
 3975         }
 3976 #if MEMCHECK
 3977         memcpy((unsigned char *)fk->tmp + fk->tmp_len,
 3978                 MEMCHECK_MARK, sizeof MEMCHECK_MARK);
 3979 #endif
 3980 
 3981         return fk;
 3982 }
 3983 
 3984 /* see falcon.h */
 3985 void
 3986 falcon_keygen_free(falcon_keygen *fk)
 3987 {
 3988         if (fk != NULL) {
 3989 #if CLEANSE
 3990                 cleanse(fk->tmp, fk->tmp_len);
 3991 #endif
 3992 #if MEMCHECK
 3993                 if (memcmp((unsigned char *)fk->tmp + fk->tmp_len,
 3994                         MEMCHECK_MARK, sizeof MEMCHECK_MARK) != 0)
 3995                 {
 3996                         fprintf(stderr,
 3997                                 "buffer overflow! temp_size() is wrong\n");
 3998                         abort();
 3999                 }
 4000 #endif
 4001                 free(fk->tmp);
 4002                 free(fk);
 4003         }
 4004 }
 4005 
 4006 /* see falcon.h */
 4007 size_t
 4008 falcon_keygen_max_privkey_size(falcon_keygen *fk)
 4009 {
 4010         /*
 4011          * Uncompressed private key is a one-byte header, followed by
 4012          * f, g, F and G, each represented as an array of 16-bit values.
 4013          */
 4014         unsigned logn;
 4015         size_t n;
 4016 
 4017         logn = fk->logn;
 4018         n = fk->ternary ? ((size_t)3 << (logn - 1)) : ((size_t)1 << logn);
 4019         return 1 + (n << 3);
 4020 }
 4021 
 4022 /* see falcon.h */
 4023 size_t
 4024 falcon_keygen_max_pubkey_size(falcon_keygen *fk)
 4025 {
 4026         /*
 4027          * Public key is a one-byte header, followed by the public
 4028          * vector h. Each element of h uses 14 bits in the binary case,
 4029          * 15 bits in the ternary case.
 4030          */
 4031         unsigned logn;
 4032         unsigned z;
 4033 
 4034         logn = fk->logn;
 4035         z = fk->ternary ? (45U << (logn - 1)) : (14U << logn);
 4036         return 1 + ((z + 7) >> 3);
 4037 }
 4038 
 4039 /* see falcon.h */
 4040 void
 4041 falcon_keygen_set_seed(falcon_keygen *fk,
 4042         const void *seed, size_t len, int replace)
 4043 {
 4044         if (replace) {
 4045                 shake_init(&fk->rng, 512);
 4046                 shake_inject(&fk->rng, seed, len);
 4047                 fk->seeded = 1;
 4048                 fk->flipped = 0;
 4049                 return;
 4050         }
 4051         if (fk->flipped) {
 4052                 unsigned char tmp[32];
 4053 
 4054                 shake_extract(&fk->rng, tmp, sizeof tmp);
 4055                 shake_init(&fk->rng, 512);
 4056                 shake_inject(&fk->rng, tmp, sizeof tmp);
 4057                 fk->flipped = 0;
 4058         }
 4059         shake_inject(&fk->rng, seed, len);
 4060 }
 4061 
 4062 static int
 4063 rng_ready(falcon_keygen *fk)
 4064 {
 4065         if (!fk->seeded) {
 4066                 unsigned char tmp[32];
 4067 
 4068                 if (!falcon_get_seed(tmp, sizeof tmp)) {
 4069                         return 0;
 4070                 }
 4071                 falcon_keygen_set_seed(fk, tmp, sizeof tmp, 0);
 4072                 fk->seeded = 1;
 4073         }
 4074         if (!fk->flipped) {
 4075                 shake_flip(&fk->rng);
 4076                 fk->flipped = 1;
 4077         }
 4078         return 1;
 4079 }
 4080 
 4081 /*
 4082  * Compute squared norm of a short vector. Returned value is saturated to
 4083  * 2^32-1 if it is not lower than 2^31.
 4084  */
 4085 static uint32_t
 4086 poly_small_sqnorm(const int16_t *f, unsigned logn, unsigned ter)
 4087 {
 4088         size_t n, u;
 4089         uint32_t s, ng;
 4090 
 4091         n = MKN(logn, ter);
 4092         s = 0;
 4093         ng = 0;
 4094         for (u = 0; u < n; u ++) {
 4095                 int32_t z;
 4096 
 4097                 z = f[u];
 4098                 s += (uint32_t)(z * z);
 4099                 ng |= s;
 4100         }
 4101         return s | -(ng >> 31);
 4102 }
 4103 
 4104 /*
 4105  * Align (upwards) the provided 'data' pointer with regards to 'base'
 4106  * so that the offset is a multiple of the size of 'fpr'.
 4107  */
 4108 static fpr *
 4109 align_fpr(void *base, void *data)
 4110 {
 4111         unsigned char *cb, *cd;
 4112         size_t k, km;
 4113 
 4114         cb = base;
 4115         cd = data;
 4116         k = (size_t)(cd - cb);
 4117         km = k % sizeof(fpr);
 4118         if (km) {
 4119                 k += (sizeof(fpr)) - km;
 4120         }
 4121         return (fpr *)(cb + k);
 4122 }
 4123 
 4124 /*
 4125  * Align (upwards) the provided 'data' pointer with regards to 'base'
 4126  * so that the offset is a multiple of the size of 'uint32_t'.
 4127  */
 4128 static uint32_t *
 4129 align_u32(void *base, void *data)
 4130 {
 4131         unsigned char *cb, *cd;
 4132         size_t k, km;
 4133 
 4134         cb = base;
 4135         cd = data;
 4136         k = (size_t)(cd - cb);
 4137         km = k % sizeof(uint32_t);
 4138         if (km) {
 4139                 k += (sizeof(uint32_t)) - km;
 4140         }
 4141         return (uint32_t *)(cb + k);
 4142 }
 4143 
 4144 /*
 4145  * Convert a small vector to floating point.
 4146  */
 4147 static void
 4148 poly_small_to_fp(fpr *x, const int16_t *f, unsigned logn, unsigned ter)
 4149 {
 4150         size_t n, u;
 4151 
 4152         n = MKN(logn, ter);
 4153         for (u = 0; u < n; u ++) {
 4154                 x[u] = fpr_of(f[u]);
 4155         }
 4156 }
 4157 
 4158 /*
 4159  * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
 4160  * individual length. If 'ter' is 1, then this is for the ternary case.
 4161  * This function is never invoked for the top-level of the ternary case,
 4162  * though.
 4163  *
 4164  * Output: f',g' of degree N/2, with the length for 'depth+1'.
 4165  *
 4166  * Values are in RNS; input and/or output may also be in NTT.
 4167  */
 4168 static void
 4169 make_fg_step(uint32_t *data, unsigned logn, unsigned depth, unsigned ter,
 4170         int in_ntt, int out_ntt)
 4171 {
 4172         size_t n, hn, u;
 4173         size_t slen, tlen;
 4174         uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
 4175         const small_prime *primes;
 4176 
 4177         n = (size_t)1 << logn;
 4178         hn = n >> 1;
 4179         if (ter) {
 4180                 slen = MAX_BL_SMALL3[depth];
 4181                 tlen = MAX_BL_SMALL3[depth + 1];
 4182                 primes = PRIMES3;
 4183         } else {
 4184                 slen = MAX_BL_SMALL2[depth];
 4185                 tlen = MAX_BL_SMALL2[depth + 1];
 4186                 primes = PRIMES2;
 4187         }
 4188 
 4189         /*
 4190          * Prepare room for the result.
 4191          */
 4192         fd = data;
 4193         gd = fd + hn * tlen;
 4194         fs = gd + hn * tlen;
 4195         gs = fs + n * slen;
 4196         gm = gs + n * slen;
 4197         igm = gm + n;
 4198         t1 = igm + n;
 4199         memmove(fs, data, 2 * n * slen * sizeof *data);
 4200 
 4201         /*
 4202          * First slen words: we use the input values directly, and apply
 4203          * inverse NTT as we go.
 4204          */
 4205         for (u = 0; u < slen; u ++) {
 4206                 uint32_t p, p0i, R2;
 4207                 size_t v;
 4208                 uint32_t *x;
 4209 
 4210                 p = primes[u].p;
 4211                 p0i = modp_ninv31(p);
 4212                 R2 = modp_R2(p, p0i);
 4213                 if (ter) {
 4214                         modp_mkgm3(gm, igm, logn, 0, primes[u].g, p, p0i);
 4215                 } else {
 4216                         modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 4217                 }
 4218 
 4219                 for (v = 0, x = fs + u; v < n; v ++, x += slen) {
 4220                         t1[v] = *x;
 4221                 }
 4222                 if (!in_ntt) {
 4223                         if (ter) {
 4224                                 modp_NTT3(t1, gm, logn, 0, p, p0i);
 4225                         } else {
 4226                                 modp_NTT2(t1, gm, logn, p, p0i);
 4227                         }
 4228                 }
 4229                 for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
 4230                         uint32_t w0, w1;
 4231 
 4232                         w0 = t1[(v << 1) + 0];
 4233                         w1 = t1[(v << 1) + 1];
 4234                         *x = modp_montymul(
 4235                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 4236                 }
 4237                 if (in_ntt) {
 4238                         if (ter) {
 4239                                 modp_iNTT3_ext(fs + u, slen, igm,
 4240                                         logn, 0, p, p0i);
 4241                         } else {
 4242                                 modp_iNTT2_ext(fs + u, slen, igm,
 4243                                         logn, p, p0i);
 4244                         }
 4245                 }
 4246 
 4247                 for (v = 0, x = gs + u; v < n; v ++, x += slen) {
 4248                         t1[v] = *x;
 4249                 }
 4250                 if (!in_ntt) {
 4251                         if (ter) {
 4252                                 modp_NTT3(t1, gm, logn, 0, p, p0i);
 4253                         } else {
 4254                                 modp_NTT2(t1, gm, logn, p, p0i);
 4255                         }
 4256                 }
 4257                 for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
 4258                         uint32_t w0, w1;
 4259 
 4260                         w0 = t1[(v << 1) + 0];
 4261                         w1 = t1[(v << 1) + 1];
 4262                         *x = modp_montymul(
 4263                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 4264                 }
 4265                 if (in_ntt) {
 4266                         if (ter) {
 4267                                 modp_iNTT3_ext(gs + u, slen, igm,
 4268                                         logn, 0, p, p0i);
 4269                         } else {
 4270                                 modp_iNTT2_ext(gs + u, slen, igm,
 4271                                         logn, p, p0i);
 4272                         }
 4273                 }
 4274 
 4275                 if (!out_ntt) {
 4276                         if (ter) {
 4277                                 modp_iNTT3_ext(fd + u, tlen, igm,
 4278                                         logn - 1, 0, p, p0i);
 4279                                 modp_iNTT3_ext(gd + u, tlen, igm,
 4280                                         logn - 1, 0, p, p0i);
 4281                         } else {
 4282                                 modp_iNTT2_ext(fd + u, tlen, igm,
 4283                                         logn - 1, p, p0i);
 4284                                 modp_iNTT2_ext(gd + u, tlen, igm,
 4285                                         logn - 1, p, p0i);
 4286                         }
 4287                 }
 4288         }
 4289 
 4290         /*
 4291          * Since the fs and gs words have been de-NTTized, we can use the
 4292          * CRT to rebuild the values.
 4293          */
 4294         zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
 4295         zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
 4296 
 4297         /*
 4298          * Remaining words: use modular reductions to extract the values.
 4299          */
 4300         for (u = slen; u < tlen; u ++) {
 4301                 uint32_t p, p0i, R2, Rx;
 4302                 size_t v;
 4303                 uint32_t *x;
 4304 
 4305                 p = primes[u].p;
 4306                 p0i = modp_ninv31(p);
 4307                 R2 = modp_R2(p, p0i);
 4308                 Rx = modp_Rx(slen, p, p0i, R2);
 4309                 if (ter) {
 4310                         modp_mkgm3(gm, igm, logn, 0, primes[u].g, p, p0i);
 4311                 } else {
 4312                         modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 4313                 }
 4314                 for (v = 0, x = fs; v < n; v ++, x += slen) {
 4315                         t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
 4316                 }
 4317                 if (ter) {
 4318                         modp_NTT3(t1, gm, logn, 0, p, p0i);
 4319                 } else {
 4320                         modp_NTT2(t1, gm, logn, p, p0i);
 4321                 }
 4322                 for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
 4323                         uint32_t w0, w1;
 4324 
 4325                         w0 = t1[(v << 1) + 0];
 4326                         w1 = t1[(v << 1) + 1];
 4327                         *x = modp_montymul(
 4328                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 4329                 }
 4330                 for (v = 0, x = gs; v < n; v ++, x += slen) {
 4331                         t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
 4332                 }
 4333                 if (ter) {
 4334                         modp_NTT3(t1, gm, logn, 0, p, p0i);
 4335                 } else {
 4336                         modp_NTT2(t1, gm, logn, p, p0i);
 4337                 }
 4338                 for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
 4339                         uint32_t w0, w1;
 4340 
 4341                         w0 = t1[(v << 1) + 0];
 4342                         w1 = t1[(v << 1) + 1];
 4343                         *x = modp_montymul(
 4344                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 4345                 }
 4346 
 4347                 if (!out_ntt) {
 4348                         if (ter) {
 4349                                 modp_iNTT3_ext(fd + u, tlen, igm,
 4350                                         logn - 1, 0, p, p0i);
 4351                                 modp_iNTT3_ext(gd + u, tlen, igm,
 4352                                         logn - 1, 0, p, p0i);
 4353                         } else {
 4354                                 modp_iNTT2_ext(fd + u, tlen, igm,
 4355                                         logn - 1, p, p0i);
 4356                                 modp_iNTT2_ext(gd + u, tlen, igm,
 4357                                         logn - 1, p, p0i);
 4358                         }
 4359                 }
 4360         }
 4361 }
 4362 
 4363 /*
 4364  * Input: f,g of degree N = 1.5*2^logn (normal representation).
 4365  *
 4366  * Output: f',g' of degree N/3, with the length for depth 1. Output
 4367  * may be in NTT.
 4368  *
 4369  * Values are in RNS.
 4370  */
 4371 static void
 4372 make_fg_ternary_top(uint32_t *data, unsigned logn, int out_ntt)
 4373 {
 4374         /*
 4375          * Let f = f0(x^3) + x*f1(x^3) + x^2*f2(x^3).
 4376          * We define the field norm N(f) as the derminant of the
 4377          * map y |-> y*f (from the sub-field of degree N/3 to the
 4378          * field of degree N); this yields:
 4379          *
 4380          *   N(f) = f0^3 + f1^3*x + f2^3*x^2 - 3*f0*f1*f2*x
 4381          *
 4382          * For w such that w^3 = 1, but w != 1:
 4383          *
 4384          *   c1(f) = f(x*w) = f0(x^3) + w*f1(x^3)*x + w^2*f2(x^3)*x^2
 4385          *   c2(f) = f(x*w^2) = f0(x^3) + w^2*f1(x^3)*x + w*f2(x^3)*x^2
 4386          *
 4387          * It can be verified that:
 4388          *
 4389          *   N(f)(x^3) = f*c1(f)*c2(f)
 4390          *
 4391          * If c(f) = c1(f)*c2(f), then:
 4392          *
 4393          *   c(f) = (f0^2)(x^3)
 4394          *        - (f0*f1)(x^3) * x
 4395          *        + (f1^2 - f0*f2)(x^3) * x^2
 4396          *        - (f1*f2)(x^3) * x^3
 4397          *        + (f2^2)(x^3) * x^4
 4398          *
 4399          * Therefore, if the coefficients of f are integers, then so
 4400          * are the coefficients of c(f).
 4401          *
 4402          * Thus, the NTRU equation:
 4403          *   f*G - g*F = q
 4404          * can be solved recursively: we get F' and G' such that:
 4405          *   N(f)*G' - N(g)*F' = q
 4406          * and we then set:
 4407          *   F = F'(x^3) * c(g)
 4408          *   G = G'(x^3) * c(f)
 4409          *
 4410          * In this function, we simply compute N(f) and N(g). Since the
 4411          * coefficients of f and g are very small, all computations can
 4412          * be done modulo a single small prime.
 4413          *
 4414          * Moreover, in NTT representation, we have:
 4415          *
 4416          *   N(f)(x^3) = f(x)*c1(f)(x)*c2(f)(x)
 4417          *             = f(x)*f(x*w)*f(x*w^2)
 4418          *
 4419          * which allows us to compute the coefficients of N(f) by simply
 4420          * multiplying together the coefficients of f.
 4421          */
 4422 
 4423         size_t n, dn, tn, u, v;
 4424         uint32_t *gm, *igm, *fd, *gd, *fs, *gs;
 4425         uint32_t p, p0i, R2, R3;
 4426 
 4427         n = MKN(logn, 1);
 4428         dn = MKN(logn, 0);
 4429         tn = MKN(logn - 1, 0);
 4430         fd = data;
 4431         gd = fd + tn;
 4432         fs = gd + tn;
 4433         gs = fs + n;
 4434         gm = gs + n;
 4435         igm = gm + dn;
 4436         memmove(fs, data, n * 2 * sizeof *data);
 4437 
 4438         p = PRIMES3[0].p;
 4439         p0i = modp_ninv31(p);
 4440         R2 = modp_R2(p, p0i);
 4441 
 4442         modp_mkgm3(gm, igm, logn, 1, PRIMES3[0].g, p, p0i);
 4443         modp_NTT3(fs, gm, logn, 1, p, p0i);
 4444         modp_NTT3(gs, gm, logn, 1, p, p0i);
 4445 
 4446         R3 = modp_montymul(R2, R2, p, p0i);
 4447 
 4448         for (u = 0, v = 0; u < n; u += 3, v ++) {
 4449                 fd[v] = modp_montymul(R3, modp_montymul(fs[u],
 4450                         modp_montymul(fs[u + 1], fs[u + 2], p, p0i),
 4451                         p, p0i), p, p0i);
 4452                 gd[v] = modp_montymul(R3, modp_montymul(gs[u],
 4453                         modp_montymul(gs[u + 1], gs[u + 2], p, p0i),
 4454                         p, p0i), p, p0i);
 4455         }
 4456         if (!out_ntt) {
 4457                 modp_iNTT3(fd, igm, logn - 1, 0, p, p0i);
 4458                 modp_iNTT3(gd, igm, logn - 1, 0, p, p0i);
 4459         }
 4460 }
 4461 
 4462 /*
 4463  * Compute f and g at a specific depth, in RNS notation.
 4464  *
 4465  * Returned values are stored in the data[] array, at slen words per integer.
 4466  *
 4467  * Conditions:
 4468  *   0 <= depth <= logn
 4469  *
 4470  * Space use in data[]: enough room for any two successive values (f', g',
 4471  * f and g).
 4472  */
 4473 static void
 4474 make_fg(uint32_t *data, const int16_t *f, const int16_t *g,
 4475         unsigned logn, unsigned ter, unsigned depth, int out_ntt)
 4476 {
 4477         size_t n, u;
 4478         uint32_t *ft, *gt, p0;
 4479         unsigned d;
 4480         const small_prime *primes;
 4481 
 4482         n = MKN(logn, ter);
 4483         ft = data;
 4484         gt = ft + n;
 4485         primes = ter ? PRIMES3 : PRIMES2;
 4486         p0 = primes[0].p;
 4487         for (u = 0; u < n; u ++) {
 4488                 ft[u] = modp_set(f[u], p0);
 4489                 gt[u] = modp_set(g[u], p0);
 4490         }
 4491 
 4492         if (depth == 0 && out_ntt) {
 4493                 uint32_t *gm, *igm;
 4494                 uint32_t p, p0i;
 4495 
 4496                 p = primes[0].p;
 4497                 p0i = modp_ninv31(p);
 4498                 gm = gt + n;
 4499                 igm = gm + MKN(logn, 0);
 4500                 if (ter) {
 4501                         modp_mkgm3(gm, igm, logn, 1, primes[0].g, p, p0i);
 4502                         modp_NTT3(ft, gm, logn, 1, p, p0i);
 4503                         modp_NTT3(gt, gm, logn, 1, p, p0i);
 4504                 } else {
 4505                         modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
 4506                         modp_NTT2(ft, gm, logn, p, p0i);
 4507                         modp_NTT2(gt, gm, logn, p, p0i);
 4508                 }
 4509                 return;
 4510         }
 4511 
 4512         if (ter) {
 4513                 make_fg_ternary_top(data, logn, depth > 1 || out_ntt);
 4514                 for (d = 1; d < depth; d ++) {
 4515                         make_fg_step(data, logn - d, d, 1,
 4516                                 1, (d + 1) < depth || out_ntt);
 4517                 }
 4518         } else {
 4519                 for (d = 0; d < depth; d ++) {
 4520                         make_fg_step(data, logn - d, d, 0,
 4521                                 d != 0, (d + 1) < depth || out_ntt);
 4522                 }
 4523         }
 4524 }
 4525 
 4526 /*
 4527  * Solving the NTRU equation, deepest level: compute the resultants of
 4528  * f and g with X^N+1, and use binary GCD. The F and G values are
 4529  * returned in fk->tmp.
 4530  *
 4531  * Returned value: 1 on success, 0 on error.
 4532  */
 4533 static int
 4534 solve_NTRU_deepest(falcon_keygen *fk, const int16_t *f, const int16_t *g)
 4535 {
 4536         unsigned logn;
 4537         size_t len;
 4538         uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
 4539         const small_prime *primes;
 4540 
 4541         logn = fk->logn;
 4542         if (fk->ternary) {
 4543                 len = MAX_BL_SMALL3[logn];
 4544                 primes = PRIMES3;
 4545         } else {
 4546                 len = MAX_BL_SMALL2[logn];
 4547                 primes = PRIMES2;
 4548         }
 4549 
 4550         Fp = fk->tmp;
 4551         Gp = Fp + len;
 4552         fp = Gp + len;
 4553         gp = fp + len;
 4554         t1 = gp + len;
 4555 
 4556         make_fg(fp, f, g, logn, fk->ternary, logn, 0);
 4557 
 4558         /*
 4559          * We use the CRT to rebuild the resultants as big integers.
 4560          * There are two such big integers.
 4561          */
 4562         zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
 4563 
 4564         /*
 4565          * Apply the binary GCD. The zint_bezout() function works only
 4566          * if both inputs are odd.
 4567          */
 4568         if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
 4569                 return 0;
 4570         }
 4571 
 4572         /*
 4573          * Multiply the two values by the target value q. Values must
 4574          * fit in the destination arrays.
 4575          */
 4576         q = fk->ternary ? 18433 : 12289;
 4577         if (zint_mul_small(Fp, len, q) != 0
 4578                 || zint_mul_small(Gp, len, q) != 0)
 4579         {
 4580                 return 0;
 4581         }
 4582 
 4583         return 1;
 4584 }
 4585 
 4586 /*
 4587  * Solving the NTRU equation, intermediate level. Upon entry, the F and G
 4588  * from the previous level should be in the fk->tmp array.
 4589  * This function MAY be invoked for the top-level (in which case depth = 0).
 4590  *
 4591  * Returned value: 1 on success, 0 on error.
 4592  */
 4593 static int
 4594 solve_NTRU_intermediate(falcon_keygen *fk,
 4595         const int16_t *f, const int16_t *g, unsigned depth)
 4596 {
 4597         /*
 4598          * In this function, 'logn' is the log2 of the degree for
 4599          * this step. If N = 2^logn, then:
 4600          *  - the F and G values already in fk->tmp (from the deeper
 4601          *    levels) have degree N/2;
 4602          *  - this function should return F and G of degree N.
 4603          */
 4604         unsigned logn_top, logn, full;
 4605         size_t n, hn, slen, dlen, llen, FGlen, u;
 4606         uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
 4607         fpr *rt1, *rt2, *rt3, *rt4, *rt5;
 4608         uint32_t maxbl_f, maxbl_g, maxbl_fg, maxbl_FG, prev_maxbl_FG;
 4609         uint32_t *x, *y;
 4610         int32_t *k;
 4611         const small_prime *primes;
 4612 
 4613         logn_top = fk->logn;
 4614         logn = logn_top - depth;
 4615 
 4616         /*
 4617          * In the ternary case _and_ top-level, n is a multiple of 3,
 4618          * and hn = n/3. Otherwise, n is a power of 2, and hn = n/2.
 4619          */
 4620         if (fk->ternary && depth == 0) {
 4621                 full = 1;
 4622                 n = (size_t)3 << (logn - 1);
 4623                 hn = (size_t)1 << (logn - 1);
 4624         } else {
 4625                 full = 0;
 4626                 n = (size_t)1 << logn;
 4627                 hn = n >> 1;
 4628         }
 4629 
 4630         /*
 4631          * slen = size for our input f and g; also size of the reduced
 4632          *        F and G we return (degree N)
 4633          *
 4634          * dlen = size of the F and G obtained from the deeper level
 4635          *        (degree N/2 or N/3)
 4636          *
 4637          * llen = size for intermediary F and G before reduction (degree N)
 4638          *
 4639          * We build our non-reduced F and G as two independent halves each,
 4640          * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
 4641          */
 4642         if (fk->ternary) {
 4643                 slen = MAX_BL_SMALL3[depth];
 4644                 dlen = MAX_BL_SMALL3[depth + 1];
 4645                 llen = MAX_BL_LARGE3[depth];
 4646                 primes = PRIMES3;
 4647         } else {
 4648                 slen = MAX_BL_SMALL2[depth];
 4649                 dlen = MAX_BL_SMALL2[depth + 1];
 4650                 llen = MAX_BL_LARGE2[depth];
 4651                 primes = PRIMES2;
 4652         }
 4653 
 4654         /*
 4655          * Fd and Gd are the F and G from the deeper level.
 4656          */
 4657         Fd = fk->tmp;
 4658         Gd = Fd + dlen * hn;
 4659 
 4660         /*
 4661          * Compute the input f and g for this level. Note that we get f
 4662          * and g in RNS + NTT representation.
 4663          */
 4664         ft = Gd + dlen * hn;
 4665         make_fg(ft, f, g, logn_top, fk->ternary, depth, 1);
 4666 
 4667         /*
 4668          * Move the newly computed f and g to make room for our candidate
 4669          * F and G (unreduced).
 4670          */
 4671         Ft = fk->tmp;
 4672         Gt = Ft + n * llen;
 4673         t1 = Gt + n * llen;
 4674         memmove(t1, ft, 2 * n * slen * sizeof *ft);
 4675         ft = t1;
 4676         gt = ft + slen * n;
 4677         t1 = gt + slen * n;
 4678 
 4679         /*
 4680          * Move Fd and Gd _after_ f and g.
 4681          */
 4682         memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd);
 4683         Fd = t1;
 4684         Gd = Fd + hn * dlen;
 4685 
 4686         /*
 4687          * We reduce Fd and Gd modulo all the small primes we will need,
 4688          * and store the values in Ft and Gt (only n/2 values in each).
 4689          */
 4690         for (u = 0; u < llen; u ++) {
 4691                 uint32_t p, p0i, R2, Rx;
 4692                 size_t v;
 4693                 uint32_t *xs, *ys, *xd, *yd;
 4694 
 4695                 p = primes[u].p;
 4696                 p0i = modp_ninv31(p);
 4697                 R2 = modp_R2(p, p0i);
 4698                 Rx = modp_Rx(dlen, p, p0i, R2);
 4699                 for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
 4700                         v < hn;
 4701                         v ++, xs += dlen, ys += dlen, xd += llen, yd += llen)
 4702                 {
 4703                         *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
 4704                         *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
 4705                 }
 4706         }
 4707 
 4708         /*
 4709          * We do not need Fd and Gd after that point.
 4710          */
 4711 
 4712         /*
 4713          * Compute our F and G modulo sufficiently many small primes.
 4714          */
 4715         for (u = 0; u < llen; u ++) {
 4716                 uint32_t p, p0i, R2;
 4717                 uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
 4718                 size_t v;
 4719 
 4720                 /*
 4721                  * All computations are done modulo p.
 4722                  */
 4723                 p = primes[u].p;
 4724                 p0i = modp_ninv31(p);
 4725                 R2 = modp_R2(p, p0i);
 4726 
 4727                 /*
 4728                  * If we processed slen words, then f and g have been
 4729                  * de-NTTized, and are in RNS; we can rebuild them.
 4730                  */
 4731                 if (u == slen) {
 4732                         zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
 4733                         zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
 4734                 }
 4735 
 4736                 gm = t1;
 4737                 if (full) {
 4738                         igm = gm + ((size_t)1 << logn);
 4739                         fx = igm + ((size_t)1 << logn);
 4740                 } else {
 4741                         gm = t1;
 4742                         igm = gm + n;
 4743                         fx = igm + n;
 4744                 }
 4745                 gx = fx + n;
 4746 
 4747                 if (fk->ternary) {
 4748                         modp_mkgm3(gm, igm, logn, full, primes[u].g, p, p0i);
 4749                 } else {
 4750                         modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 4751                 }
 4752 
 4753                 if (u < slen) {
 4754                         size_t v;
 4755 
 4756                         for (v = 0, x = ft + u, y = gt + u;
 4757                                 v < n; v ++, x += slen, y += slen)
 4758                         {
 4759                                 fx[v] = *x;
 4760                                 gx[v] = *y;
 4761                         }
 4762                         if (fk->ternary) {
 4763                                 modp_iNTT3_ext(ft + u, slen, igm,
 4764                                         logn, full, p, p0i);
 4765                                 modp_iNTT3_ext(gt + u, slen, igm,
 4766                                         logn, full, p, p0i);
 4767                         } else {
 4768                                 modp_iNTT2_ext(ft + u, slen, igm,
 4769                                         logn, p, p0i);
 4770                                 modp_iNTT2_ext(gt + u, slen, igm,
 4771                                         logn, p, p0i);
 4772                         }
 4773                 } else {
 4774                         uint32_t Rx;
 4775                         size_t v;
 4776 
 4777                         Rx = modp_Rx(slen, p, p0i, R2);
 4778                         for (v = 0, x = ft, y = gt;
 4779                                 v < n; v ++, x += slen, y += slen)
 4780                         {
 4781                                 fx[v] = zint_mod_small_signed(x, slen,
 4782                                         p, p0i, R2, Rx);
 4783                                 gx[v] = zint_mod_small_signed(y, slen,
 4784                                         p, p0i, R2, Rx);
 4785                         }
 4786                         if (fk->ternary) {
 4787                                 modp_NTT3(fx, gm, logn, full, p, p0i);
 4788                                 modp_NTT3(gx, gm, logn, full, p, p0i);
 4789                         } else {
 4790                                 modp_NTT2(fx, gm, logn, p, p0i);
 4791                                 modp_NTT2(gx, gm, logn, p, p0i);
 4792                         }
 4793                 }
 4794 
 4795                 /*
 4796                  * Get F' and G' modulo p and in NTT representation
 4797                  * (they have degree n/2 or n/3). These values were
 4798                  * computed in a previous step, and stored in Ft and Gt.
 4799                  */
 4800                 Fp = gx + n;
 4801                 Gp = Fp + hn;
 4802                 for (v = 0, x = Ft + u, y = Gt + u;
 4803                         v < hn; v ++, x += llen, y += llen)
 4804                 {
 4805                         Fp[v] = *x;
 4806                         Gp[v] = *y;
 4807                 }
 4808                 if (fk->ternary) {
 4809                         modp_NTT3(Fp, gm, logn - 1, 0, p, p0i);
 4810                         modp_NTT3(Gp, gm, logn - 1, 0, p, p0i);
 4811                 } else {
 4812                         modp_NTT2(Fp, gm, logn - 1, p, p0i);
 4813                         modp_NTT2(Gp, gm, logn - 1, p, p0i);
 4814                 }
 4815 
 4816                 /*
 4817                  * Compute our F and G modulo p.
 4818                  *
 4819                  * General case:
 4820                  *
 4821                  *   we divide degree by d = 2 or 3
 4822                  *   f'(x^d) = N(f)(x^d) = f * adj(f)
 4823                  *   g'(x^d) = N(g)(x^d) = g * adj(g)
 4824                  *   f'*G' - g'*F' = q
 4825                  *   F = F'(x^d) * adj(g)
 4826                  *   G = G'(x^d) * adj(f)
 4827                  *
 4828                  * We compute things in the NTT. We group roots of phi
 4829                  * such that all roots x in a group share the same x^d.
 4830                  * If the roots in a group are x_1, x_2... x_d, then:
 4831                  *
 4832                  *   N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
 4833                  *
 4834                  * Thus, we have:
 4835                  *
 4836                  *   G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
 4837                  *   G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
 4838                  *   ...
 4839                  *   G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
 4840                  *
 4841                  * In all cases, we can thus compute F and G in NTT
 4842                  * representation by a few simple multiplications.
 4843                  * Moreover, in our chosen NTT representation, roots
 4844                  * from the same group are consecutive in RAM.
 4845                  */
 4846                 if (full) {
 4847                         uint32_t R3;
 4848                         size_t v2;
 4849 
 4850                         R3 = modp_montymul(R2, R2, p, p0i);
 4851                         for (v = 0, v2 = 0, x = Ft + u, y = Gt + u; v < n;
 4852                                 v += 3, v2 ++, x += 3 * llen, y += 3 * llen)
 4853                         {
 4854                                 uint32_t ftA, ftB, ftC, gtA, gtB, gtC;
 4855                                 uint32_t mFp, mGp;
 4856 
 4857                                 ftA = fx[v + 0];
 4858                                 ftB = fx[v + 1];
 4859                                 ftC = fx[v + 2];
 4860                                 gtA = gx[v + 0];
 4861                                 gtB = gx[v + 1];
 4862                                 gtC = gx[v + 2];
 4863                                 mFp = modp_montymul(Fp[v2], R3, p, p0i);
 4864                                 mGp = modp_montymul(Gp[v2], R3, p, p0i);
 4865                                 x[0] = modp_montymul(mFp,
 4866                                         modp_montymul(gtB, gtC, p, p0i),
 4867                                         p, p0i);
 4868                                 x[llen] = modp_montymul(mFp,
 4869                                         modp_montymul(gtC, gtA, p, p0i),
 4870                                         p, p0i);
 4871                                 x[2 * llen] = modp_montymul(mFp,
 4872                                         modp_montymul(gtA, gtB, p, p0i),
 4873                                         p, p0i);
 4874                                 y[0] = modp_montymul(mGp,
 4875                                         modp_montymul(ftB, ftC, p, p0i),
 4876                                         p, p0i);
 4877                                 y[llen] = modp_montymul(mGp,
 4878                                         modp_montymul(ftC, ftA, p, p0i),
 4879                                         p, p0i);
 4880                                 y[2 * llen] = modp_montymul(mGp,
 4881                                         modp_montymul(ftA, ftB, p, p0i),
 4882                                         p, p0i);
 4883                         }
 4884                 } else {
 4885                         for (v = 0, x = Ft + u, y = Gt + u; v < hn;
 4886                                 v ++, x += (llen << 1), y += (llen << 1))
 4887                         {
 4888                                 uint32_t ftA, ftB, gtA, gtB;
 4889                                 uint32_t mFp, mGp;
 4890 
 4891                                 ftA = fx[(v << 1) + 0];
 4892                                 ftB = fx[(v << 1) + 1];
 4893                                 gtA = gx[(v << 1) + 0];
 4894                                 gtB = gx[(v << 1) + 1];
 4895                                 mFp = modp_montymul(Fp[v], R2, p, p0i);
 4896                                 mGp = modp_montymul(Gp[v], R2, p, p0i);
 4897                                 x[0] = modp_montymul(gtB, mFp, p, p0i);
 4898                                 x[llen] = modp_montymul(gtA, mFp, p, p0i);
 4899                                 y[0] = modp_montymul(ftB, mGp, p, p0i);
 4900                                 y[llen] = modp_montymul(ftA, mGp, p, p0i);
 4901                         }
 4902                 }
 4903                 if (fk->ternary) {
 4904                         modp_iNTT3_ext(Ft + u, llen, igm, logn, full, p, p0i);
 4905                         modp_iNTT3_ext(Gt + u, llen, igm, logn, full, p, p0i);
 4906                 } else {
 4907                         modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
 4908                         modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
 4909                 }
 4910         }
 4911 
 4912         /*
 4913          * Rebuild F and G with the CRT.
 4914          */
 4915         zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
 4916         zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
 4917 
 4918         /*
 4919          * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
 4920          * order).
 4921          */
 4922 
 4923         /*
 4924          * Apply Babai reduction to bring back F and G to size slen.
 4925          *
 4926          * We use the FFT to compute successive approximations of the
 4927          * reduction coefficient. We first isolate the top bits of
 4928          * the coefficients of f and g, and convert them to floating
 4929          * point; with the FFT, we compute adj(f), adj(g), and
 4930          * 1/(f*adj(f)+g*adj(g)).
 4931          *
 4932          * Then, we repeatedly apply the following:
 4933          *
 4934          *   - Get the top bits of the coefficients of F and G into
 4935          *     floating point, and use the FFT to compute:
 4936          *        (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
 4937          *
 4938          *   - Convert back that value into normal representation, and
 4939          *     round it to the nearest integers, yielding a polynomial k.
 4940          *     Proper scaling is applied to f, g, F and G so that the
 4941          *     coefficients fit on 32 bits (signed).
 4942          *
 4943          *   - Subtract k*f from F and k*g from G.
 4944          *
 4945          * The process is repeated as long as it works, i.e. it decreases
 4946          * the maximum bit length of coefficients of F and G. This will
 4947          * normally bring down F and G down from llen to slen words at
 4948          * most.
 4949          */
 4950 
 4951         /*
 4952          * Memory layout:
 4953          *  - We need to compute and keep adj(f), adj(g), and
 4954          *    1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
 4955          *    respectively).
 4956          *  - At each iteration we need two extra fp buffer (N fp values),
 4957          *    and produce a k (N 32-bit words). k will be shared with one
 4958          *    of the fp buffers.
 4959          *  - To compute k*f and k*g efficiently (with the NTT), we need
 4960          *    some extra room; we reuse the space of the temporary buffers.
 4961          *
 4962          * Arrays of 'fpr' are obtained from the temporary array itself.
 4963          * We ensure that the base is at a properly aligned offset (the
 4964          * source array fk->tmp was obtained with malloc(), and is thus
 4965          * already aligned).
 4966          */
 4967 
 4968         rt3 = align_fpr(fk->tmp, t1);
 4969         rt4 = rt3 + n;
 4970         rt5 = rt4 + n;
 4971         rt1 = rt5 + (n >> 1);
 4972         k = (int32_t *)align_u32(fk->tmp, rt1);
 4973         rt2 = align_fpr(fk->tmp, k + n);
 4974         if (rt2 < (rt1 + n)) {
 4975                 rt2 = rt1 + n;
 4976         }
 4977         t1 = (uint32_t *)k + n;
 4978 
 4979         /*
 4980          * Get the maximum bit lengths of f and g. f and g are scaled
 4981          * down by maxbl_fg bits, so that values will be below 1.
 4982          */
 4983         maxbl_f = poly_max_bitlength(ft, slen, slen, logn, full);
 4984         maxbl_g = poly_max_bitlength(gt, slen, slen, logn, full);
 4985         maxbl_fg = maxbl_f &