Falcon source files (reference implementation)


keygen.c

    1 /*
    2  * Falcon key pair generation.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017-2019  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
   30  */
   31 
   32 #include "inner.h"
   33 
   34 #define MKN(logn)   ((size_t)1 << (logn))
   35 
   36 /* ==================================================================== */
   37 /*
   38  * Modular arithmetics.
   39  *
   40  * We implement a few functions for computing modulo a small integer p.
   41  *
   42  * All functions require that 2^30 < p < 2^31. Moreover, operands must
   43  * be in the 0..p-1 range.
   44  *
   45  * Modular addition and subtraction work for all such p.
   46  *
   47  * Montgomery multiplication requires that p is odd, and must be provided
   48  * with an additional value p0i = -1/p mod 2^31. See below for some basics
   49  * on Montgomery multiplication.
   50  *
   51  * Division computes an inverse modulo p by an exponentiation (with
   52  * exponent p-2): this works only if p is prime. Multiplication
   53  * requirements also apply, i.e. p must be odd and p0i must be provided.
   54  *
   55  * The NTT and inverse NTT need all of the above, and also that
   56  * p = 1 mod 2048.
   57  *
   58  * -----------------------------------------------------------------------
   59  *
   60  * We use Montgomery representation with 31-bit values:
   61  *
   62  *   Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
   63  *   Montgomery representation of an integer x modulo p is x*R mod p.
   64  *
   65  *   Montgomery multiplication computes (x*y)/R mod p for
   66  *   operands x and y. Therefore:
   67  *
   68  *    - if operands are x*R and y*R (Montgomery representations of x and
   69  *      y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
   70  *      mod p, which is the Montgomery representation of the product x*y;
   71  *
   72  *    - if operands are x*R and y (or x and y*R), then Montgomery
   73  *      multiplication returns x*y mod p: mixed-representation
   74  *      multiplications yield results in normal representation.
   75  *
   76  * To convert to Montgomery representation, we multiply by R, which is done
   77  * by Montgomery-multiplying by R^2. Stand-alone conversion back from
   78  * Montgomery representation is Montgomery-multiplication by 1.
   79  */
   80 
   81 /*
   82  * Precomputed small primes. Each element contains the following:
   83  *
   84  *  p   The prime itself.
   85  *
   86  *  g   A primitive root of phi = X^N+1 (in field Z_p).
   87  *
   88  *  s   The inverse of the product of all previous primes in the array,
   89  *      computed modulo p and in Montgomery representation.
   90  *
   91  * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
   92  * are listed in decreasing order.
   93  */
   94 
   95 typedef struct {
   96         uint32_t p;
   97         uint32_t g;
   98         uint32_t s;
   99 } small_prime;
  100 
  101 static const small_prime PRIMES[] = {
  102         { 2147473409,  383167813,      10239 },
  103         { 2147389441,  211808905,  471403745 },
  104         { 2147387393,   37672282, 1329335065 },
  105         { 2147377153, 1977035326,  968223422 },
  106         { 2147358721, 1067163706,  132460015 },
  107         { 2147352577, 1606082042,  598693809 },
  108         { 2147346433, 2033915641, 1056257184 },
  109         { 2147338241, 1653770625,  421286710 },
  110         { 2147309569,  631200819, 1111201074 },
  111         { 2147297281, 2038364663, 1042003613 },
  112         { 2147295233, 1962540515,   19440033 },
  113         { 2147239937, 2100082663,  353296760 },
  114         { 2147235841, 1991153006, 1703918027 },
  115         { 2147217409,  516405114, 1258919613 },
  116         { 2147205121,  409347988, 1089726929 },
  117         { 2147196929,  927788991, 1946238668 },
  118         { 2147178497, 1136922411, 1347028164 },
  119         { 2147100673,  868626236,  701164723 },
  120         { 2147082241, 1897279176,  617820870 },
  121         { 2147074049, 1888819123,  158382189 },
  122         { 2147051521,   25006327,  522758543 },
  123         { 2147043329,  327546255,   37227845 },
  124         { 2147039233,  766324424, 1133356428 },
  125         { 2146988033, 1862817362,   73861329 },
  126         { 2146963457,  404622040,  653019435 },
  127         { 2146959361, 1936581214,  995143093 },
  128         { 2146938881, 1559770096,  634921513 },
  129         { 2146908161,  422623708, 1985060172 },
  130         { 2146885633, 1751189170,  298238186 },
  131         { 2146871297,  578919515,  291810829 },
  132         { 2146846721, 1114060353,  915902322 },
  133         { 2146834433, 2069565474,   47859524 },
  134         { 2146818049, 1552824584,  646281055 },
  135         { 2146775041, 1906267847, 1597832891 },
  136         { 2146756609, 1847414714, 1228090888 },
  137         { 2146744321, 1818792070, 1176377637 },
  138         { 2146738177, 1118066398, 1054971214 },
  139         { 2146736129,   52057278,  933422153 },
  140         { 2146713601,  592259376, 1406621510 },
  141         { 2146695169,  263161877, 1514178701 },
  142         { 2146656257,  685363115,  384505091 },
  143         { 2146650113,  927727032,  537575289 },
  144         { 2146646017,   52575506, 1799464037 },
  145         { 2146643969, 1276803876, 1348954416 },
  146         { 2146603009,  814028633, 1521547704 },
  147         { 2146572289, 1846678872, 1310832121 },
  148         { 2146547713,  919368090, 1019041349 },
  149         { 2146508801,  671847612,   38582496 },
  150         { 2146492417,  283911680,  532424562 },
  151         { 2146490369, 1780044827,  896447978 },
  152         { 2146459649,  327980850, 1327906900 },
  153         { 2146447361, 1310561493,  958645253 },
  154         { 2146441217,  412148926,  287271128 },
  155         { 2146437121,  293186449, 2009822534 },
  156         { 2146430977,  179034356, 1359155584 },
  157         { 2146418689, 1517345488, 1790248672 },
  158         { 2146406401, 1615820390, 1584833571 },
  159         { 2146404353,  826651445,  607120498 },
  160         { 2146379777,    3816988, 1897049071 },
  161         { 2146363393, 1221409784, 1986921567 },
  162         { 2146355201, 1388081168,  849968120 },
  163         { 2146336769, 1803473237, 1655544036 },
  164         { 2146312193, 1023484977,  273671831 },
  165         { 2146293761, 1074591448,  467406983 },
  166         { 2146283521,  831604668, 1523950494 },
  167         { 2146203649,  712865423, 1170834574 },
  168         { 2146154497, 1764991362, 1064856763 },
  169         { 2146142209,  627386213, 1406840151 },
  170         { 2146127873, 1638674429, 2088393537 },
  171         { 2146099201, 1516001018,  690673370 },
  172         { 2146093057, 1294931393,  315136610 },
  173         { 2146091009, 1942399533,  973539425 },
  174         { 2146078721, 1843461814, 2132275436 },
  175         { 2146060289, 1098740778,  360423481 },
  176         { 2146048001, 1617213232, 1951981294 },
  177         { 2146041857, 1805783169, 2075683489 },
  178         { 2146019329,  272027909, 1753219918 },
  179         { 2145986561, 1206530344, 2034028118 },
  180         { 2145976321, 1243769360, 1173377644 },
  181         { 2145964033,  887200839, 1281344586 },
  182         { 2145906689, 1651026455,  906178216 },
  183         { 2145875969, 1673238256, 1043521212 },
  184         { 2145871873, 1226591210, 1399796492 },
  185         { 2145841153, 1465353397, 1324527802 },
  186         { 2145832961, 1150638905,  554084759 },
  187         { 2145816577,  221601706,  427340863 },
  188         { 2145785857,  608896761,  316590738 },
  189         { 2145755137, 1712054942, 1684294304 },
  190         { 2145742849, 1302302867,  724873116 },
  191         { 2145728513,  516717693,  431671476 },
  192         { 2145699841,  524575579, 1619722537 },
  193         { 2145691649, 1925625239,  982974435 },
  194         { 2145687553,  463795662, 1293154300 },
  195         { 2145673217,  771716636,  881778029 },
  196         { 2145630209, 1509556977,  837364988 },
  197         { 2145595393,  229091856,  851648427 },
  198         { 2145587201, 1796903241,  635342424 },
  199         { 2145525761,  715310882, 1677228081 },
  200         { 2145495041, 1040930522,  200685896 },
  201         { 2145466369,  949804237, 1809146322 },
  202         { 2145445889, 1673903706,   95316881 },
  203         { 2145390593,  806941852, 1428671135 },
  204         { 2145372161, 1402525292,  159350694 },
  205         { 2145361921, 2124760298, 1589134749 },
  206         { 2145359873, 1217503067, 1561543010 },
  207         { 2145355777,  338341402,   83865711 },
  208         { 2145343489, 1381532164,  641430002 },
  209         { 2145325057, 1883895478, 1528469895 },
  210         { 2145318913, 1335370424,   65809740 },
  211         { 2145312769, 2000008042, 1919775760 },
  212         { 2145300481,  961450962, 1229540578 },
  213         { 2145282049,  910466767, 1964062701 },
  214         { 2145232897,  816527501,  450152063 },
  215         { 2145218561, 1435128058, 1794509700 },
  216         { 2145187841,   33505311, 1272467582 },
  217         { 2145181697,  269767433, 1380363849 },
  218         { 2145175553,   56386299, 1316870546 },
  219         { 2145079297, 2106880293, 1391797340 },
  220         { 2145021953, 1347906152,  720510798 },
  221         { 2145015809,  206769262, 1651459955 },
  222         { 2145003521, 1885513236, 1393381284 },
  223         { 2144960513, 1810381315,   31937275 },
  224         { 2144944129, 1306487838, 2019419520 },
  225         { 2144935937,   37304730, 1841489054 },
  226         { 2144894977, 1601434616,  157985831 },
  227         { 2144888833,   98749330, 2128592228 },
  228         { 2144880641, 1772327002, 2076128344 },
  229         { 2144864257, 1404514762, 2029969964 },
  230         { 2144827393,  801236594,  406627220 },
  231         { 2144806913,  349217443, 1501080290 },
  232         { 2144796673, 1542656776, 2084736519 },
  233         { 2144778241, 1210734884, 1746416203 },
  234         { 2144759809, 1146598851,  716464489 },
  235         { 2144757761,  286328400, 1823728177 },
  236         { 2144729089, 1347555695, 1836644881 },
  237         { 2144727041, 1795703790,  520296412 },
  238         { 2144696321, 1302475157,  852964281 },
  239         { 2144667649, 1075877614,  504992927 },
  240         { 2144573441,  198765808, 1617144982 },
  241         { 2144555009,  321528767,  155821259 },
  242         { 2144550913,  814139516, 1819937644 },
  243         { 2144536577,  571143206,  962942255 },
  244         { 2144524289, 1746733766,    2471321 },
  245         { 2144512001, 1821415077,  124190939 },
  246         { 2144468993,  917871546, 1260072806 },
  247         { 2144458753,  378417981, 1569240563 },
  248         { 2144421889,  175229668, 1825620763 },
  249         { 2144409601, 1699216963,  351648117 },
  250         { 2144370689, 1071885991,  958186029 },
  251         { 2144348161, 1763151227,  540353574 },
  252         { 2144335873, 1060214804,  919598847 },
  253         { 2144329729,  663515846, 1448552668 },
  254         { 2144327681, 1057776305,  590222840 },
  255         { 2144309249, 1705149168, 1459294624 },
  256         { 2144296961,  325823721, 1649016934 },
  257         { 2144290817,  738775789,  447427206 },
  258         { 2144243713,  962347618,  893050215 },
  259         { 2144237569, 1655257077,  900860862 },
  260         { 2144161793,  242206694, 1567868672 },
  261         { 2144155649,  769415308, 1247993134 },
  262         { 2144137217,  320492023,  515841070 },
  263         { 2144120833, 1639388522,  770877302 },
  264         { 2144071681, 1761785233,  964296120 },
  265         { 2144065537,  419817825,  204564472 },
  266         { 2144028673,  666050597, 2091019760 },
  267         { 2144010241, 1413657615, 1518702610 },
  268         { 2143952897, 1238327946,  475672271 },
  269         { 2143940609,  307063413, 1176750846 },
  270         { 2143918081, 2062905559,  786785803 },
  271         { 2143899649, 1338112849, 1562292083 },
  272         { 2143891457,   68149545,   87166451 },
  273         { 2143885313,  921750778,  394460854 },
  274         { 2143854593,  719766593,  133877196 },
  275         { 2143836161, 1149399850, 1861591875 },
  276         { 2143762433, 1848739366, 1335934145 },
  277         { 2143756289, 1326674710,  102999236 },
  278         { 2143713281,  808061791, 1156900308 },
  279         { 2143690753,  388399459, 1926468019 },
  280         { 2143670273, 1427891374, 1756689401 },
  281         { 2143666177, 1912173949,  986629565 },
  282         { 2143645697, 2041160111,  371842865 },
  283         { 2143641601, 1279906897, 2023974350 },
  284         { 2143635457,  720473174, 1389027526 },
  285         { 2143621121, 1298309455, 1732632006 },
  286         { 2143598593, 1548762216, 1825417506 },
  287         { 2143567873,  620475784, 1073787233 },
  288         { 2143561729, 1932954575,  949167309 },
  289         { 2143553537,  354315656, 1652037534 },
  290         { 2143541249,  577424288, 1097027618 },
  291         { 2143531009,  357862822,  478640055 },
  292         { 2143522817, 2017706025, 1550531668 },
  293         { 2143506433, 2078127419, 1824320165 },
  294         { 2143488001,  613475285, 1604011510 },
  295         { 2143469569, 1466594987,  502095196 },
  296         { 2143426561, 1115430331, 1044637111 },
  297         { 2143383553,    9778045, 1902463734 },
  298         { 2143377409, 1557401276, 2056861771 },
  299         { 2143363073,  652036455, 1965915971 },
  300         { 2143260673, 1464581171, 1523257541 },
  301         { 2143246337, 1876119649,  764541916 },
  302         { 2143209473, 1614992673, 1920672844 },
  303         { 2143203329,  981052047, 2049774209 },
  304         { 2143160321, 1847355533,  728535665 },
  305         { 2143129601,  965558457,  603052992 },
  306         { 2143123457, 2140817191,    8348679 },
  307         { 2143100929, 1547263683,  694209023 },
  308         { 2143092737,  643459066, 1979934533 },
  309         { 2143082497,  188603778, 2026175670 },
  310         { 2143062017, 1657329695,  377451099 },
  311         { 2143051777,  114967950,  979255473 },
  312         { 2143025153, 1698431342, 1449196896 },
  313         { 2143006721, 1862741675, 1739650365 },
  314         { 2142996481,  756660457,  996160050 },
  315         { 2142976001,  927864010, 1166847574 },
  316         { 2142965761,  905070557,  661974566 },
  317         { 2142916609,   40932754, 1787161127 },
  318         { 2142892033, 1987985648,  675335382 },
  319         { 2142885889,  797497211, 1323096997 },
  320         { 2142871553, 2068025830, 1411877159 },
  321         { 2142861313, 1217177090, 1438410687 },
  322         { 2142830593,  409906375, 1767860634 },
  323         { 2142803969, 1197788993,  359782919 },
  324         { 2142785537,  643817365,  513932862 },
  325         { 2142779393, 1717046338,  218943121 },
  326         { 2142724097,   89336830,  416687049 },
  327         { 2142707713,    5944581, 1356813523 },
  328         { 2142658561,  887942135, 2074011722 },
  329         { 2142638081,  151851972, 1647339939 },
  330         { 2142564353, 1691505537, 1483107336 },
  331         { 2142533633, 1989920200, 1135938817 },
  332         { 2142529537,  959263126, 1531961857 },
  333         { 2142527489,  453251129, 1725566162 },
  334         { 2142502913, 1536028102,  182053257 },
  335         { 2142498817,  570138730,  701443447 },
  336         { 2142416897,  326965800,  411931819 },
  337         { 2142363649, 1675665410, 1517191733 },
  338         { 2142351361,  968529566, 1575712703 },
  339         { 2142330881, 1384953238, 1769087884 },
  340         { 2142314497, 1977173242, 1833745524 },
  341         { 2142289921,   95082313, 1714775493 },
  342         { 2142283777,  109377615, 1070584533 },
  343         { 2142277633,   16960510,  702157145 },
  344         { 2142263297,  553850819,  431364395 },
  345         { 2142208001,  241466367, 2053967982 },
  346         { 2142164993, 1795661326, 1031836848 },
  347         { 2142097409, 1212530046,  712772031 },
  348         { 2142087169, 1763869720,  822276067 },
  349         { 2142078977,  644065713, 1765268066 },
  350         { 2142074881,  112671944,  643204925 },
  351         { 2142044161, 1387785471, 1297890174 },
  352         { 2142025729,  783885537, 1000425730 },
  353         { 2142011393,  905662232, 1679401033 },
  354         { 2141974529,  799788433,  468119557 },
  355         { 2141943809, 1932544124,  449305555 },
  356         { 2141933569, 1527403256,  841867925 },
  357         { 2141931521, 1247076451,  743823916 },
  358         { 2141902849, 1199660531,  401687910 },
  359         { 2141890561,  150132350, 1720336972 },
  360         { 2141857793, 1287438162,  663880489 },
  361         { 2141833217,  618017731, 1819208266 },
  362         { 2141820929,  999578638, 1403090096 },
  363         { 2141786113,   81834325, 1523542501 },
  364         { 2141771777,  120001928,  463556492 },
  365         { 2141759489,  122455485, 2124928282 },
  366         { 2141749249,  141986041,  940339153 },
  367         { 2141685761,  889088734,  477141499 },
  368         { 2141673473,  324212681, 1122558298 },
  369         { 2141669377, 1175806187, 1373818177 },
  370         { 2141655041, 1113654822,  296887082 },
  371         { 2141587457,  991103258, 1585913875 },
  372         { 2141583361, 1401451409, 1802457360 },
  373         { 2141575169, 1571977166,  712760980 },
  374         { 2141546497, 1107849376, 1250270109 },
  375         { 2141515777,  196544219,  356001130 },
  376         { 2141495297, 1733571506, 1060744866 },
  377         { 2141483009,  321552363, 1168297026 },
  378         { 2141458433,  505818251,  733225819 },
  379         { 2141360129, 1026840098,  948342276 },
  380         { 2141325313,  945133744, 2129965998 },
  381         { 2141317121, 1871100260, 1843844634 },
  382         { 2141286401, 1790639498, 1750465696 },
  383         { 2141267969, 1376858592,  186160720 },
  384         { 2141255681, 2129698296, 1876677959 },
  385         { 2141243393, 2138900688, 1340009628 },
  386         { 2141214721, 1933049835, 1087819477 },
  387         { 2141212673, 1898664939, 1786328049 },
  388         { 2141202433,  990234828,  940682169 },
  389         { 2141175809, 1406392421,  993089586 },
  390         { 2141165569, 1263518371,  289019479 },
  391         { 2141073409, 1485624211,  507864514 },
  392         { 2141052929, 1885134788,  311252465 },
  393         { 2141040641, 1285021247,  280941862 },
  394         { 2141028353, 1527610374,  375035110 },
  395         { 2141011969, 1400626168,  164696620 },
  396         { 2140999681,  632959608,  966175067 },
  397         { 2140997633, 2045628978, 1290889438 },
  398         { 2140993537, 1412755491,  375366253 },
  399         { 2140942337,  719477232,  785367828 },
  400         { 2140925953,   45224252,  836552317 },
  401         { 2140917761, 1157376588, 1001839569 },
  402         { 2140887041,  278480752, 2098732796 },
  403         { 2140837889, 1663139953,  924094810 },
  404         { 2140788737,  802501511, 2045368990 },
  405         { 2140766209, 1820083885, 1800295504 },
  406         { 2140764161, 1169561905, 2106792035 },
  407         { 2140696577,  127781498, 1885987531 },
  408         { 2140684289,   16014477, 1098116827 },
  409         { 2140653569,  665960598, 1796728247 },
  410         { 2140594177, 1043085491,  377310938 },
  411         { 2140579841, 1732838211, 1504505945 },
  412         { 2140569601,  302071939,  358291016 },
  413         { 2140567553,  192393733, 1909137143 },
  414         { 2140557313,  406595731, 1175330270 },
  415         { 2140549121, 1748850918,  525007007 },
  416         { 2140477441,  499436566, 1031159814 },
  417         { 2140469249, 1886004401, 1029951320 },
  418         { 2140426241, 1483168100, 1676273461 },
  419         { 2140420097, 1779917297,  846024476 },
  420         { 2140413953,  522948893, 1816354149 },
  421         { 2140383233, 1931364473, 1296921241 },
  422         { 2140366849, 1917356555,  147196204 },
  423         { 2140354561,   16466177, 1349052107 },
  424         { 2140348417, 1875366972, 1860485634 },
  425         { 2140323841,  456498717, 1790256483 },
  426         { 2140321793, 1629493973,  150031888 },
  427         { 2140315649, 1904063898,  395510935 },
  428         { 2140280833, 1784104328,  831417909 },
  429         { 2140250113,  256087139,  697349101 },
  430         { 2140229633,  388553070,  243875754 },
  431         { 2140223489,  747459608, 1396270850 },
  432         { 2140200961,  507423743, 1895572209 },
  433         { 2140162049,  580106016, 2045297469 },
  434         { 2140149761,  712426444,  785217995 },
  435         { 2140137473, 1441607584,  536866543 },
  436         { 2140119041,  346538902, 1740434653 },
  437         { 2140090369,  282642885,   21051094 },
  438         { 2140076033, 1407456228,  319910029 },
  439         { 2140047361, 1619330500, 1488632070 },
  440         { 2140041217, 2089408064, 2012026134 },
  441         { 2140008449, 1705524800, 1613440760 },
  442         { 2139924481, 1846208233, 1280649481 },
  443         { 2139906049,  989438755, 1185646076 },
  444         { 2139867137, 1522314850,  372783595 },
  445         { 2139842561, 1681587377,  216848235 },
  446         { 2139826177, 2066284988, 1784999464 },
  447         { 2139824129,  480888214, 1513323027 },
  448         { 2139789313,  847937200,  858192859 },
  449         { 2139783169, 1642000434, 1583261448 },
  450         { 2139770881,  940699589,  179702100 },
  451         { 2139768833,  315623242,  964612676 },
  452         { 2139666433,  331649203,  764666914 },
  453         { 2139641857, 2118730799, 1313764644 },
  454         { 2139635713,  519149027,  519212449 },
  455         { 2139598849, 1526413634, 1769667104 },
  456         { 2139574273,  551148610,  820739925 },
  457         { 2139568129, 1386800242,  472447405 },
  458         { 2139549697,  813760130, 1412328531 },
  459         { 2139537409, 1615286260, 1609362979 },
  460         { 2139475969, 1352559299, 1696720421 },
  461         { 2139455489, 1048691649, 1584935400 },
  462         { 2139432961,  836025845,  950121150 },
  463         { 2139424769, 1558281165, 1635486858 },
  464         { 2139406337, 1728402143, 1674423301 },
  465         { 2139396097, 1727715782, 1483470544 },
  466         { 2139383809, 1092853491, 1741699084 },
  467         { 2139369473,  690776899, 1242798709 },
  468         { 2139351041, 1768782380, 2120712049 },
  469         { 2139334657, 1739968247, 1427249225 },
  470         { 2139332609, 1547189119,  623011170 },
  471         { 2139310081, 1346827917, 1605466350 },
  472         { 2139303937,  369317948,  828392831 },
  473         { 2139301889, 1560417239, 1788073219 },
  474         { 2139283457, 1303121623,  595079358 },
  475         { 2139248641, 1354555286,  573424177 },
  476         { 2139240449,   60974056,  885781403 },
  477         { 2139222017,  355573421, 1221054839 },
  478         { 2139215873,  566477826, 1724006500 },
  479         { 2139150337,  871437673, 1609133294 },
  480         { 2139144193, 1478130914, 1137491905 },
  481         { 2139117569, 1854880922,  964728507 },
  482         { 2139076609,  202405335,  756508944 },
  483         { 2139062273, 1399715741,  884826059 },
  484         { 2139045889, 1051045798, 1202295476 },
  485         { 2139033601, 1707715206,  632234634 },
  486         { 2139006977, 2035853139,  231626690 },
  487         { 2138951681,  183867876,  838350879 },
  488         { 2138945537, 1403254661,  404460202 },
  489         { 2138920961,  310865011, 1282911681 },
  490         { 2138910721, 1328496553,  103472415 },
  491         { 2138904577,   78831681,  993513549 },
  492         { 2138902529, 1319697451, 1055904361 },
  493         { 2138816513,  384338872, 1706202469 },
  494         { 2138810369, 1084868275,  405677177 },
  495         { 2138787841,  401181788, 1964773901 },
  496         { 2138775553, 1850532988, 1247087473 },
  497         { 2138767361,  874261901, 1576073565 },
  498         { 2138757121, 1187474742,  993541415 },
  499         { 2138748929, 1782458888, 1043206483 },
  500         { 2138744833, 1221500487,  800141243 },
  501         { 2138738689,  413465368, 1450660558 },
  502         { 2138695681,  739045140,  342611472 },
  503         { 2138658817, 1355845756,  672674190 },
  504         { 2138644481,  608379162, 1538874380 },
  505         { 2138632193, 1444914034,  686911254 },
  506         { 2138607617,  484707818, 1435142134 },
  507         { 2138591233,  539460669, 1290458549 },
  508         { 2138572801, 2093538990, 2011138646 },
  509         { 2138552321, 1149786988, 1076414907 },
  510         { 2138546177,  840688206, 2108985273 },
  511         { 2138533889,  209669619,  198172413 },
  512         { 2138523649, 1975879426, 1277003968 },
  513         { 2138490881, 1351891144, 1976858109 },
  514         { 2138460161, 1817321013, 1979278293 },
  515         { 2138429441, 1950077177,  203441928 },
  516         { 2138400769,  908970113,  628395069 },
  517         { 2138398721,  219890864,  758486760 },
  518         { 2138376193, 1306654379,  977554090 },
  519         { 2138351617,  298822498, 2004708503 },
  520         { 2138337281,  441457816, 1049002108 },
  521         { 2138320897, 1517731724, 1442269609 },
  522         { 2138290177, 1355911197, 1647139103 },
  523         { 2138234881,  531313247, 1746591962 },
  524         { 2138214401, 1899410930,  781416444 },
  525         { 2138202113, 1813477173, 1622508515 },
  526         { 2138191873, 1086458299, 1025408615 },
  527         { 2138183681, 1998800427,  827063290 },
  528         { 2138173441, 1921308898,  749670117 },
  529         { 2138103809, 1620902804, 2126787647 },
  530         { 2138099713,  828647069, 1892961817 },
  531         { 2138085377,  179405355, 1525506535 },
  532         { 2138060801,  615683235, 1259580138 },
  533         { 2138044417, 2030277840, 1731266562 },
  534         { 2138042369, 2087222316, 1627902259 },
  535         { 2138032129,  126388712, 1108640984 },
  536         { 2138011649,  715026550, 1017980050 },
  537         { 2137993217, 1693714349, 1351778704 },
  538         { 2137888769, 1289762259, 1053090405 },
  539         { 2137853953,  199991890, 1254192789 },
  540         { 2137833473,  941421685,  896995556 },
  541         { 2137817089,  750416446, 1251031181 },
  542         { 2137792513,  798075119,  368077456 },
  543         { 2137786369,  878543495, 1035375025 },
  544         { 2137767937,    9351178, 1156563902 },
  545         { 2137755649, 1382297614, 1686559583 },
  546         { 2137724929, 1345472850, 1681096331 },
  547         { 2137704449,  834666929,  630551727 },
  548         { 2137673729, 1646165729, 1892091571 },
  549         { 2137620481,  778943821,   48456461 },
  550         { 2137618433, 1730837875, 1713336725 },
  551         { 2137581569,  805610339, 1378891359 },
  552         { 2137538561,  204342388, 1950165220 },
  553         { 2137526273, 1947629754, 1500789441 },
  554         { 2137516033,  719902645, 1499525372 },
  555         { 2137491457,  230451261,  556382829 },
  556         { 2137440257,  979573541,  412760291 },
  557         { 2137374721,  927841248, 1954137185 },
  558         { 2137362433, 1243778559,  861024672 },
  559         { 2137313281, 1341338501,  980638386 },
  560         { 2137311233,  937415182, 1793212117 },
  561         { 2137255937,  795331324, 1410253405 },
  562         { 2137243649,  150756339, 1966999887 },
  563         { 2137182209,  163346914, 1939301431 },
  564         { 2137171969, 1952552395,  758913141 },
  565         { 2137159681,  570788721,  218668666 },
  566         { 2137147393, 1896656810, 2045670345 },
  567         { 2137141249,  358493842,  518199643 },
  568         { 2137139201, 1505023029,  674695848 },
  569         { 2137133057,   27911103,  830956306 },
  570         { 2137122817,  439771337, 1555268614 },
  571         { 2137116673,  790988579, 1871449599 },
  572         { 2137110529,  432109234,  811805080 },
  573         { 2137102337, 1357900653, 1184997641 },
  574         { 2137098241,  515119035, 1715693095 },
  575         { 2137090049,  408575203, 2085660657 },
  576         { 2137085953, 2097793407, 1349626963 },
  577         { 2137055233, 1556739954, 1449960883 },
  578         { 2137030657, 1545758650, 1369303716 },
  579         { 2136987649,  332602570,  103875114 },
  580         { 2136969217, 1499989506, 1662964115 },
  581         { 2136924161,  857040753,    4738842 },
  582         { 2136895489, 1948872712,  570436091 },
  583         { 2136893441,   58969960, 1568349634 },
  584         { 2136887297, 2127193379,  273612548 },
  585         { 2136850433,  111208983, 1181257116 },
  586         { 2136809473, 1627275942, 1680317971 },
  587         { 2136764417, 1574888217,   14011331 },
  588         { 2136741889,   14011055, 1129154251 },
  589         { 2136727553,   35862563, 1838555253 },
  590         { 2136721409,  310235666, 1363928244 },
  591         { 2136698881, 1612429202, 1560383828 },
  592         { 2136649729, 1138540131,  800014364 },
  593         { 2136606721,  602323503, 1433096652 },
  594         { 2136563713,  182209265, 1919611038 },
  595         { 2136555521,  324156477,  165591039 },
  596         { 2136549377,  195513113,  217165345 },
  597         { 2136526849, 1050768046,  939647887 },
  598         { 2136508417, 1886286237, 1619926572 },
  599         { 2136477697,  609647664,   35065157 },
  600         { 2136471553,  679352216, 1452259468 },
  601         { 2136457217,  128630031,  824816521 },
  602         { 2136422401,   19787464, 1526049830 },
  603         { 2136420353,  698316836, 1530623527 },
  604         { 2136371201, 1651862373, 1804812805 },
  605         { 2136334337,  326596005,  336977082 },
  606         { 2136322049,   63253370, 1904972151 },
  607         { 2136297473,  312176076,  172182411 },
  608         { 2136248321,  381261841,  369032670 },
  609         { 2136242177,  358688773, 1640007994 },
  610         { 2136229889,  512677188,   75585225 },
  611         { 2136219649, 2095003250, 1970086149 },
  612         { 2136207361, 1909650722,  537760675 },
  613         { 2136176641, 1334616195, 1533487619 },
  614         { 2136158209, 2096285632, 1793285210 },
  615         { 2136143873, 1897347517,  293843959 },
  616         { 2136133633,  923586222, 1022655978 },
  617         { 2136096769, 1464868191, 1515074410 },
  618         { 2136094721, 2020679520, 2061636104 },
  619         { 2136076289,  290798503, 1814726809 },
  620         { 2136041473,  156415894, 1250757633 },
  621         { 2135996417,  297459940, 1132158924 },
  622         { 2135955457,  538755304, 1688831340 },
  623         { 0, 0, 0 }
  624 };
  625 
  626 /*
  627  * Reduce a small signed integer modulo a small prime. The source
  628  * value x MUST be such that -p < x < p.
  629  */
  630 static inline uint32_t
  631 modp_set(int32_t x, uint32_t p)
  632 {
  633         uint32_t w;
  634 
  635         w = (uint32_t)x;
  636         w += p & -(w >> 31);
  637         return w;
  638 }
  639 
  640 /*
  641  * Normalize a modular integer around 0.
  642  */
  643 static inline int32_t
  644 modp_norm(uint32_t x, uint32_t p)
  645 {
  646         return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
  647 }
  648 
  649 /*
  650  * Compute -1/p mod 2^31. This works for all odd integers p that fit
  651  * on 31 bits.
  652  */
  653 static uint32_t
  654 modp_ninv31(uint32_t p)
  655 {
  656         uint32_t y;
  657 
  658         y = 2 - p;
  659         y *= 2 - p * y;
  660         y *= 2 - p * y;
  661         y *= 2 - p * y;
  662         y *= 2 - p * y;
  663         return (uint32_t)0x7FFFFFFF & -y;
  664 }
  665 
  666 /*
  667  * Compute R = 2^31 mod p.
  668  */
  669 static inline uint32_t
  670 modp_R(uint32_t p)
  671 {
  672         /*
  673          * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
  674          * 2^31 - p.
  675          */
  676         return ((uint32_t)1 << 31) - p;
  677 }
  678 
  679 /*
  680  * Addition modulo p.
  681  */
  682 static inline uint32_t
  683 modp_add(uint32_t a, uint32_t b, uint32_t p)
  684 {
  685         uint32_t d;
  686 
  687         d = a + b - p;
  688         d += p & -(d >> 31);
  689         return d;
  690 }
  691 
  692 /*
  693  * Subtraction modulo p.
  694  */
  695 static inline uint32_t
  696 modp_sub(uint32_t a, uint32_t b, uint32_t p)
  697 {
  698         uint32_t d;
  699 
  700         d = a - b;
  701         d += p & -(d >> 31);
  702         return d;
  703 }
  704 
  705 /*
  706  * Halving modulo p.
  707  */
  708 /* unused
  709 static inline uint32_t
  710 modp_half(uint32_t a, uint32_t p)
  711 {
  712         a += p & -(a & 1);
  713         return a >> 1;
  714 }
  715 */
  716 
  717 /*
  718  * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
  719  * It is required that p is an odd integer.
  720  */
  721 static inline uint32_t
  722 modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
  723 {
  724         uint64_t z, w;
  725         uint32_t d;
  726 
  727         z = (uint64_t)a * (uint64_t)b;
  728         w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
  729         d = (uint32_t)((z + w) >> 31) - p;
  730         d += p & -(d >> 31);
  731         return d;
  732 }
  733 
  734 /*
  735  * Compute R2 = 2^62 mod p.
  736  */
  737 static uint32_t
  738 modp_R2(uint32_t p, uint32_t p0i)
  739 {
  740         uint32_t z;
  741 
  742         /*
  743          * Compute z = 2^31 mod p (this is the value 1 in Montgomery
  744          * representation), then double it with an addition.
  745          */
  746         z = modp_R(p);
  747         z = modp_add(z, z, p);
  748 
  749         /*
  750          * Square it five times to obtain 2^32 in Montgomery representation
  751          * (i.e. 2^63 mod p).
  752          */
  753         z = modp_montymul(z, z, p, p0i);
  754         z = modp_montymul(z, z, p, p0i);
  755         z = modp_montymul(z, z, p, p0i);
  756         z = modp_montymul(z, z, p, p0i);
  757         z = modp_montymul(z, z, p, p0i);
  758 
  759         /*
  760          * Halve the value mod p to get 2^62.
  761          */
  762         z = (z + (p & -(z & 1))) >> 1;
  763         return z;
  764 }
  765 
  766 /*
  767  * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
  768  * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
  769  * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
  770  */
  771 static inline uint32_t
  772 modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2)
  773 {
  774         int i;
  775         uint32_t r, z;
  776 
  777         /*
  778          * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
  779          * representation of (2^31)^e mod p, where e = x-1.
  780          * R2 is 2^31 in Montgomery representation.
  781          */
  782         x --;
  783         r = R2;
  784         z = modp_R(p);
  785         for (i = 0; (1U << i) <= x; i ++) {
  786                 if ((x & (1U << i)) != 0) {
  787                         z = modp_montymul(z, r, p, p0i);
  788                 }
  789                 r = modp_montymul(r, r, p, p0i);
  790         }
  791         return z;
  792 }
  793 
  794 /*
  795  * Division modulo p. If the divisor (b) is 0, then 0 is returned.
  796  * This function computes proper results only when p is prime.
  797  * Parameters:
  798  *   a     dividend
  799  *   b     divisor
  800  *   p     odd prime modulus
  801  *   p0i   -1/p mod 2^31
  802  *   R     2^31 mod R
  803  */
  804 static uint32_t
  805 modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R)
  806 {
  807         uint32_t z, e;
  808         int i;
  809 
  810         e = p - 2;
  811         z = R;
  812         for (i = 30; i >= 0; i --) {
  813                 uint32_t z2;
  814 
  815                 z = modp_montymul(z, z, p, p0i);
  816                 z2 = modp_montymul(z, b, p, p0i);
  817                 z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
  818         }
  819 
  820         /*
  821          * The loop above just assumed that b was in Montgomery
  822          * representation, i.e. really contained b*R; under that
  823          * assumption, it returns 1/b in Montgomery representation,
  824          * which is R/b. But we gave it b in normal representation,
  825          * so the loop really returned R/(b/R) = R^2/b.
  826          *
  827          * We want a/b, so we need one Montgomery multiplication with a,
  828          * which also remove one of the R factors, and another such
  829          * multiplication to remove the second R factor.
  830          */
  831         z = modp_montymul(z, 1, p, p0i);
  832         return modp_montymul(a, z, p, p0i);
  833 }
  834 
  835 /*
  836  * Bit-reversal index table.
  837  */
  838 static const uint16_t REV10[] = {
  839            0,  512,  256,  768,  128,  640,  384,  896,   64,  576,  320,  832,
  840          192,  704,  448,  960,   32,  544,  288,  800,  160,  672,  416,  928,
  841           96,  608,  352,  864,  224,  736,  480,  992,   16,  528,  272,  784,
  842          144,  656,  400,  912,   80,  592,  336,  848,  208,  720,  464,  976,
  843           48,  560,  304,  816,  176,  688,  432,  944,  112,  624,  368,  880,
  844          240,  752,  496, 1008,    8,  520,  264,  776,  136,  648,  392,  904,
  845           72,  584,  328,  840,  200,  712,  456,  968,   40,  552,  296,  808,
  846          168,  680,  424,  936,  104,  616,  360,  872,  232,  744,  488, 1000,
  847           24,  536,  280,  792,  152,  664,  408,  920,   88,  600,  344,  856,
  848          216,  728,  472,  984,   56,  568,  312,  824,  184,  696,  440,  952,
  849          120,  632,  376,  888,  248,  760,  504, 1016,    4,  516,  260,  772,
  850          132,  644,  388,  900,   68,  580,  324,  836,  196,  708,  452,  964,
  851           36,  548,  292,  804,  164,  676,  420,  932,  100,  612,  356,  868,
  852          228,  740,  484,  996,   20,  532,  276,  788,  148,  660,  404,  916,
  853           84,  596,  340,  852,  212,  724,  468,  980,   52,  564,  308,  820,
  854          180,  692,  436,  948,  116,  628,  372,  884,  244,  756,  500, 1012,
  855           12,  524,  268,  780,  140,  652,  396,  908,   76,  588,  332,  844,
  856          204,  716,  460,  972,   44,  556,  300,  812,  172,  684,  428,  940,
  857          108,  620,  364,  876,  236,  748,  492, 1004,   28,  540,  284,  796,
  858          156,  668,  412,  924,   92,  604,  348,  860,  220,  732,  476,  988,
  859           60,  572,  316,  828,  188,  700,  444,  956,  124,  636,  380,  892,
  860          252,  764,  508, 1020,    2,  514,  258,  770,  130,  642,  386,  898,
  861           66,  578,  322,  834,  194,  706,  450,  962,   34,  546,  290,  802,
  862          162,  674,  418,  930,   98,  610,  354,  866,  226,  738,  482,  994,
  863           18,  530,  274,  786,  146,  658,  402,  914,   82,  594,  338,  850,
  864          210,  722,  466,  978,   50,  562,  306,  818,  178,  690,  434,  946,
  865          114,  626,  370,  882,  242,  754,  498, 1010,   10,  522,  266,  778,
  866          138,  650,  394,  906,   74,  586,  330,  842,  202,  714,  458,  970,
  867           42,  554,  298,  810,  170,  682,  426,  938,  106,  618,  362,  874,
  868          234,  746,  490, 1002,   26,  538,  282,  794,  154,  666,  410,  922,
  869           90,  602,  346,  858,  218,  730,  474,  986,   58,  570,  314,  826,
  870          186,  698,  442,  954,  122,  634,  378,  890,  250,  762,  506, 1018,
  871            6,  518,  262,  774,  134,  646,  390,  902,   70,  582,  326,  838,
  872          198,  710,  454,  966,   38,  550,  294,  806,  166,  678,  422,  934,
  873          102,  614,  358,  870,  230,  742,  486,  998,   22,  534,  278,  790,
  874          150,  662,  406,  918,   86,  598,  342,  854,  214,  726,  470,  982,
  875           54,  566,  310,  822,  182,  694,  438,  950,  118,  630,  374,  886,
  876          246,  758,  502, 1014,   14,  526,  270,  782,  142,  654,  398,  910,
  877           78,  590,  334,  846,  206,  718,  462,  974,   46,  558,  302,  814,
  878          174,  686,  430,  942,  110,  622,  366,  878,  238,  750,  494, 1006,
  879           30,  542,  286,  798,  158,  670,  414,  926,   94,  606,  350,  862,
  880          222,  734,  478,  990,   62,  574,  318,  830,  190,  702,  446,  958,
  881          126,  638,  382,  894,  254,  766,  510, 1022,    1,  513,  257,  769,
  882          129,  641,  385,  897,   65,  577,  321,  833,  193,  705,  449,  961,
  883           33,  545,  289,  801,  161,  673,  417,  929,   97,  609,  353,  865,
  884          225,  737,  481,  993,   17,  529,  273,  785,  145,  657,  401,  913,
  885           81,  593,  337,  849,  209,  721,  465,  977,   49,  561,  305,  817,
  886          177,  689,  433,  945,  113,  625,  369,  881,  241,  753,  497, 1009,
  887            9,  521,  265,  777,  137,  649,  393,  905,   73,  585,  329,  841,
  888          201,  713,  457,  969,   41,  553,  297,  809,  169,  681,  425,  937,
  889          105,  617,  361,  873,  233,  745,  489, 1001,   25,  537,  281,  793,
  890          153,  665,  409,  921,   89,  601,  345,  857,  217,  729,  473,  985,
  891           57,  569,  313,  825,  185,  697,  441,  953,  121,  633,  377,  889,
  892          249,  761,  505, 1017,    5,  517,  261,  773,  133,  645,  389,  901,
  893           69,  581,  325,  837,  197,  709,  453,  965,   37,  549,  293,  805,
  894          165,  677,  421,  933,  101,  613,  357,  869,  229,  741,  485,  997,
  895           21,  533,  277,  789,  149,  661,  405,  917,   85,  597,  341,  853,
  896          213,  725,  469,  981,   53,  565,  309,  821,  181,  693,  437,  949,
  897          117,  629,  373,  885,  245,  757,  501, 1013,   13,  525,  269,  781,
  898          141,  653,  397,  909,   77,  589,  333,  845,  205,  717,  461,  973,
  899           45,  557,  301,  813,  173,  685,  429,  941,  109,  621,  365,  877,
  900          237,  749,  493, 1005,   29,  541,  285,  797,  157,  669,  413,  925,
  901           93,  605,  349,  861,  221,  733,  477,  989,   61,  573,  317,  829,
  902          189,  701,  445,  957,  125,  637,  381,  893,  253,  765,  509, 1021,
  903            3,  515,  259,  771,  131,  643,  387,  899,   67,  579,  323,  835,
  904          195,  707,  451,  963,   35,  547,  291,  803,  163,  675,  419,  931,
  905           99,  611,  355,  867,  227,  739,  483,  995,   19,  531,  275,  787,
  906          147,  659,  403,  915,   83,  595,  339,  851,  211,  723,  467,  979,
  907           51,  563,  307,  819,  179,  691,  435,  947,  115,  627,  371,  883,
  908          243,  755,  499, 1011,   11,  523,  267,  779,  139,  651,  395,  907,
  909           75,  587,  331,  843,  203,  715,  459,  971,   43,  555,  299,  811,
  910          171,  683,  427,  939,  107,  619,  363,  875,  235,  747,  491, 1003,
  911           27,  539,  283,  795,  155,  667,  411,  923,   91,  603,  347,  859,
  912          219,  731,  475,  987,   59,  571,  315,  827,  187,  699,  443,  955,
  913          123,  635,  379,  891,  251,  763,  507, 1019,    7,  519,  263,  775,
  914          135,  647,  391,  903,   71,  583,  327,  839,  199,  711,  455,  967,
  915           39,  551,  295,  807,  167,  679,  423,  935,  103,  615,  359,  871,
  916          231,  743,  487,  999,   23,  535,  279,  791,  151,  663,  407,  919,
  917           87,  599,  343,  855,  215,  727,  471,  983,   55,  567,  311,  823,
  918          183,  695,  439,  951,  119,  631,  375,  887,  247,  759,  503, 1015,
  919           15,  527,  271,  783,  143,  655,  399,  911,   79,  591,  335,  847,
  920          207,  719,  463,  975,   47,  559,  303,  815,  175,  687,  431,  943,
  921          111,  623,  367,  879,  239,  751,  495, 1007,   31,  543,  287,  799,
  922          159,  671,  415,  927,   95,  607,  351,  863,  223,  735,  479,  991,
  923           63,  575,  319,  831,  191,  703,  447,  959,  127,  639,  383,  895,
  924          255,  767,  511, 1023
  925 };
  926 
  927 /*
  928  * Compute the roots for NTT and inverse NTT (binary case). Input
  929  * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
  930  * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
  931  *   gm[rev(i)] = g^i mod p
  932  *   igm[rev(i)] = (1/g)^i mod p
  933  * where rev() is the "bit reversal" function over 10 bits. It fills
  934  * the arrays only up to N = 2^logn values.
  935  *
  936  * The values stored in gm[] and igm[] are in Montgomery representation.
  937  *
  938  * p must be a prime such that p = 1 mod 2048.
  939  */
  940 static void
  941 modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
  942         uint32_t g, uint32_t p, uint32_t p0i)
  943 {
  944         size_t u, n;
  945         unsigned k;
  946         uint32_t ig, x1, x2, R2;
  947 
  948         n = (size_t)1 << logn;
  949 
  950         /*
  951          * We want g such that g^(2N) = 1 mod p, but the provided
  952          * generator has order 2048. We must square it a few times.
  953          */
  954         R2 = modp_R2(p, p0i);
  955         g = modp_montymul(g, R2, p, p0i);
  956         for (k = logn; k < 10; k ++) {
  957                 g = modp_montymul(g, g, p, p0i);
  958         }
  959 
  960         ig = modp_div(R2, g, p, p0i, modp_R(p));
  961         k = 10 - logn;
  962         x1 = x2 = modp_R(p);
  963         for (u = 0; u < n; u ++) {
  964                 size_t v;
  965 
  966                 v = REV10[u << k];
  967                 gm[v] = x1;
  968                 igm[v] = x2;
  969                 x1 = modp_montymul(x1, g, p, p0i);
  970                 x2 = modp_montymul(x2, ig, p, p0i);
  971         }
  972 }
  973 
  974 /*
  975  * Compute the NTT over a polynomial (binary case). Polynomial elements
  976  * are a[0], a[stride], a[2 * stride]...
  977  */
  978 static void
  979 modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
  980         uint32_t p, uint32_t p0i)
  981 {
  982         size_t t, m, n;
  983 
  984         if (logn == 0) {
  985                 return;
  986         }
  987         n = (size_t)1 << logn;
  988         t = n;
  989         for (m = 1; m < n; m <<= 1) {
  990                 size_t ht, u, v1;
  991 
  992                 ht = t >> 1;
  993                 for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
  994                         uint32_t s;
  995                         size_t v;
  996                         uint32_t *r1, *r2;
  997 
  998                         s = gm[m + u];
  999                         r1 = a + v1 * stride;
 1000                         r2 = r1 + ht * stride;
 1001                         for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
 1002                                 uint32_t x, y;
 1003 
 1004                                 x = *r1;
 1005                                 y = modp_montymul(*r2, s, p, p0i);
 1006                                 *r1 = modp_add(x, y, p);
 1007                                 *r2 = modp_sub(x, y, p);
 1008                         }
 1009                 }
 1010                 t = ht;
 1011         }
 1012 }
 1013 
 1014 /*
 1015  * Compute the inverse NTT over a polynomial (binary case).
 1016  */
 1017 static void
 1018 modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
 1019         uint32_t p, uint32_t p0i)
 1020 {
 1021         size_t t, m, n, k;
 1022         uint32_t ni;
 1023         uint32_t *r;
 1024 
 1025         if (logn == 0) {
 1026                 return;
 1027         }
 1028         n = (size_t)1 << logn;
 1029         t = 1;
 1030         for (m = n; m > 1; m >>= 1) {
 1031                 size_t hm, dt, u, v1;
 1032 
 1033                 hm = m >> 1;
 1034                 dt = t << 1;
 1035                 for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
 1036                         uint32_t s;
 1037                         size_t v;
 1038                         uint32_t *r1, *r2;
 1039 
 1040                         s = igm[hm + u];
 1041                         r1 = a + v1 * stride;
 1042                         r2 = r1 + t * stride;
 1043                         for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
 1044                                 uint32_t x, y;
 1045 
 1046                                 x = *r1;
 1047                                 y = *r2;
 1048                                 *r1 = modp_add(x, y, p);
 1049                                 *r2 = modp_montymul(
 1050                                         modp_sub(x, y, p), s, p, p0i);;
 1051                         }
 1052                 }
 1053                 t = dt;
 1054         }
 1055 
 1056         /*
 1057          * We need 1/n in Montgomery representation, i.e. R/n. Since
 1058          * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p,
 1059          * thus a simple shift will do.
 1060          */
 1061         ni = (uint32_t)1 << (31 - logn);
 1062         for (k = 0, r = a; k < n; k ++, r += stride) {
 1063                 *r = modp_montymul(*r, ni, p, p0i);
 1064         }
 1065 }
 1066 
 1067 /*
 1068  * Simplified macros for NTT and iNTT (binary case) when the elements
 1069  * are consecutive in RAM.
 1070  */
 1071 #define modp_NTT2(a, gm, logn, p, p0i)   modp_NTT2_ext(a, 1, gm, logn, p, p0i)
 1072 #define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
 1073 
 1074 /*
 1075  * Given polynomial f in NTT representation modulo p, compute f' of degree
 1076  * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
 1077  * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
 1078  *
 1079  * The new polynomial is written "in place" over the first N/2 elements
 1080  * of f.
 1081  *
 1082  * If applied logn times successively on a given polynomial, the resulting
 1083  * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
 1084  *
 1085  * This function applies only to the binary case; it is invoked from
 1086  * solve_NTRU_binary_depth1().
 1087  */
 1088 static void
 1089 modp_poly_rec_res(uint32_t *f, unsigned logn,
 1090         uint32_t p, uint32_t p0i, uint32_t R2)
 1091 {
 1092         size_t hn, u;
 1093 
 1094         hn = (size_t)1 << (logn - 1);
 1095         for (u = 0; u < hn; u ++) {
 1096                 uint32_t w0, w1;
 1097 
 1098                 w0 = f[(u << 1) + 0];
 1099                 w1 = f[(u << 1) + 1];
 1100                 f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 1101         }
 1102 }
 1103 
 1104 /* ==================================================================== */
 1105 /*
 1106  * Custom bignum implementation.
 1107  *
 1108  * This is a very reduced set of functionalities. We need to do the
 1109  * following operations:
 1110  *
 1111  *  - Rebuild the resultant and the polynomial coefficients from their
 1112  *    values modulo small primes (of length 31 bits each).
 1113  *
 1114  *  - Compute an extended GCD between the two computed resultants.
 1115  *
 1116  *  - Extract top bits and add scaled values during the successive steps
 1117  *    of Babai rounding.
 1118  *
 1119  * When rebuilding values using CRT, we must also recompute the product
 1120  * of the small prime factors. We always do it one small factor at a
 1121  * time, so the "complicated" operations can be done modulo the small
 1122  * prime with the modp_* functions. CRT coefficients (inverses) are
 1123  * precomputed.
 1124  *
 1125  * All values are positive until the last step: when the polynomial
 1126  * coefficients have been rebuilt, we normalize them around 0. But then,
 1127  * only additions and subtractions on the upper few bits are needed
 1128  * afterwards.
 1129  *
 1130  * We keep big integers as arrays of 31-bit words (in uint32_t values);
 1131  * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
 1132  * makes it easier to keep track of carries. When negative values are
 1133  * used, two's complement is used.
 1134  */
 1135 
 1136 /*
 1137  * Subtract integer b from integer a. Both integers are supposed to have
 1138  * the same size. The carry (0 or 1) is returned. Source arrays a and b
 1139  * MUST be distinct.
 1140  *
 1141  * The operation is performed as described above if ctr = 1. If
 1142  * ctl = 0, the value a[] is unmodified, but all memory accesses are
 1143  * still performed, and the carry is computed and returned.
 1144  */
 1145 static uint32_t
 1146 zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
 1147         uint32_t ctl)
 1148 {
 1149         size_t u;
 1150         uint32_t cc, m;
 1151 
 1152         cc = 0;
 1153         m = -ctl;
 1154         for (u = 0; u < len; u ++) {
 1155                 uint32_t aw, w;
 1156 
 1157                 aw = a[u];
 1158                 w = aw - b[u] - cc;
 1159                 cc = w >> 31;
 1160                 aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
 1161                 a[u] = aw;
 1162         }
 1163         return cc;
 1164 }
 1165 
 1166 /*
 1167  * Mutiply the provided big integer m with a small value x.
 1168  * This function assumes that x < 2^31. The carry word is returned.
 1169  */
 1170 static uint32_t
 1171 zint_mul_small(uint32_t *m, size_t mlen, uint32_t x)
 1172 {
 1173         size_t u;
 1174         uint32_t cc;
 1175 
 1176         cc = 0;
 1177         for (u = 0; u < mlen; u ++) {
 1178                 uint64_t z;
 1179 
 1180                 z = (uint64_t)m[u] * (uint64_t)x + cc;
 1181                 m[u] = (uint32_t)z & 0x7FFFFFFF;
 1182                 cc = (uint32_t)(z >> 31);
 1183         }
 1184         return cc;
 1185 }
 1186 
 1187 /*
 1188  * Reduce a big integer d modulo a small integer p.
 1189  * Rules:
 1190  *  d is unsigned
 1191  *  p is prime
 1192  *  2^30 < p < 2^31
 1193  *  p0i = -(1/p) mod 2^31
 1194  *  R2 = 2^62 mod p
 1195  */
 1196 static uint32_t
 1197 zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
 1198         uint32_t p, uint32_t p0i, uint32_t R2)
 1199 {
 1200         uint32_t x;
 1201         size_t u;
 1202 
 1203         /*
 1204          * Algorithm: we inject words one by one, starting with the high
 1205          * word. Each step is:
 1206          *  - multiply x by 2^31
 1207          *  - add new word
 1208          */
 1209         x = 0;
 1210         u = dlen;
 1211         while (u -- > 0) {
 1212                 uint32_t w;
 1213 
 1214                 x = modp_montymul(x, R2, p, p0i);
 1215                 w = d[u] - p;
 1216                 w += p & -(w >> 31);
 1217                 x = modp_add(x, w, p);
 1218         }
 1219         return x;
 1220 }
 1221 
 1222 /*
 1223  * Similar to zint_mod_small_unsigned(), except that d may be signed.
 1224  * Extra parameter is Rx = 2^(31*dlen) mod p.
 1225  */
 1226 static uint32_t
 1227 zint_mod_small_signed(const uint32_t *d, size_t dlen,
 1228         uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx)
 1229 {
 1230         uint32_t z;
 1231 
 1232         if (dlen == 0) {
 1233                 return 0;
 1234         }
 1235         z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
 1236         z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
 1237         return z;
 1238 }
 1239 
 1240 /*
 1241  * Add y*s to x. x and y initially have length 'len' words; the new x
 1242  * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
 1243  * not overlap.
 1244  */
 1245 static void
 1246 zint_add_mul_small(uint32_t *restrict x,
 1247         const uint32_t *restrict y, size_t len, uint32_t s)
 1248 {
 1249         size_t u;
 1250         uint32_t cc;
 1251 
 1252         cc = 0;
 1253         for (u = 0; u < len; u ++) {
 1254                 uint32_t xw, yw;
 1255                 uint64_t z;
 1256 
 1257                 xw = x[u];
 1258                 yw = y[u];
 1259                 z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
 1260                 x[u] = (uint32_t)z & 0x7FFFFFFF;
 1261                 cc = (uint32_t)(z >> 31);
 1262         }
 1263         x[len] = cc;
 1264 }
 1265 
 1266 /*
 1267  * Normalize a modular integer around 0: if x > p/2, then x is replaced
 1268  * with x - p (signed encoding with two's complement); otherwise, x is
 1269  * untouched. The two integers x and p are encoded over the same length.
 1270  */
 1271 static void
 1272 zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len)
 1273 {
 1274         size_t u;
 1275         uint32_t r, bb;
 1276 
 1277         /*
 1278          * Compare x with p/2. We use the shifted version of p, and p
 1279          * is odd, so we really compare with (p-1)/2; we want to perform
 1280          * the subtraction if and only if x > (p-1)/2.
 1281          */
 1282         r = 0;
 1283         bb = 0;
 1284         u = len;
 1285         while (u -- > 0) {
 1286                 uint32_t wx, wp, cc;
 1287 
 1288                 /*
 1289                  * Get the two words to compare in wx and wp (both over
 1290                  * 31 bits exactly).
 1291                  */
 1292                 wx = x[u];
 1293                 wp = (p[u] >> 1) | (bb << 30);
 1294                 bb = p[u] & 1;
 1295 
 1296                 /*
 1297                  * We set cc to -1, 0 or 1, depending on whether wp is
 1298                  * lower than, equal to, or greater than wx.
 1299                  */
 1300                 cc = wp - wx;
 1301                 cc = ((-cc) >> 31) | -(cc >> 31);
 1302 
 1303                 /*
 1304                  * If r != 0 then it is either 1 or -1, and we keep its
 1305                  * value. Otherwise, if r = 0, then we replace it with cc.
 1306                  */
 1307                 r |= cc & ((r & 1) - 1);
 1308         }
 1309 
 1310         /*
 1311          * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
 1312          * is lower than, equal to, or greater than x. We thus want to
 1313          * do the subtraction only if r = -1.
 1314          */
 1315         zint_sub(x, p, len, r >> 31);
 1316 }
 1317 
 1318 /*
 1319  * Rebuild integers from their RNS representation. There are 'num'
 1320  * integers, and each consists in 'xlen' words. 'xx' points at that
 1321  * first word of the first integer; subsequent integers are accessed
 1322  * by adding 'xstride' repeatedly.
 1323  *
 1324  * The words of an integer are the RNS representation of that integer,
 1325  * using the provided 'primes' are moduli. This function replaces
 1326  * each integer with its multi-word value (little-endian order).
 1327  *
 1328  * If "normalize_signed" is non-zero, then the returned value is
 1329  * normalized to the -m/2..m/2 interval (where m is the product of all
 1330  * small prime moduli); two's complement is used for negative values.
 1331  */
 1332 static void
 1333 zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
 1334         size_t num, const small_prime *primes, int normalize_signed,
 1335         uint32_t *restrict tmp)
 1336 {
 1337         size_t u;
 1338         uint32_t *x;
 1339 
 1340         tmp[0] = primes[0].p;
 1341         for (u = 1; u < xlen; u ++) {
 1342                 /*
 1343                  * At the entry of each loop iteration:
 1344                  *  - the first u words of each array have been
 1345                  *    reassembled;
 1346                  *  - the first u words of tmp[] contains the
 1347                  * product of the prime moduli processed so far.
 1348                  *
 1349                  * We call 'q' the product of all previous primes.
 1350                  */
 1351                 uint32_t p, p0i, s, R2;
 1352                 size_t v;
 1353 
 1354                 p = primes[u].p;
 1355                 s = primes[u].s;
 1356                 p0i = modp_ninv31(p);
 1357                 R2 = modp_R2(p, p0i);
 1358 
 1359                 for (v = 0, x = xx; v < num; v ++, x += xstride) {
 1360                         uint32_t xp, xq, xr;
 1361                         /*
 1362                          * xp = the integer x modulo the prime p for this
 1363                          *      iteration
 1364                          * xq = (x mod q) mod p
 1365                          */
 1366                         xp = x[u];
 1367                         xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
 1368 
 1369                         /*
 1370                          * New value is (x mod q) + q * (s * (xp - xq) mod p)
 1371                          */
 1372                         xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
 1373                         zint_add_mul_small(x, tmp, u, xr);
 1374                 }
 1375 
 1376                 /*
 1377                  * Update product of primes in tmp[].
 1378                  */
 1379                 tmp[u] = zint_mul_small(tmp, u, p);
 1380         }
 1381 
 1382         /*
 1383          * Normalize the reconstructed values around 0.
 1384          */
 1385         if (normalize_signed) {
 1386                 for (u = 0, x = xx; u < num; u ++, x += xstride) {
 1387                         zint_norm_zero(x, tmp, xlen);
 1388                 }
 1389         }
 1390 }
 1391 
 1392 /*
 1393  * Negate a big integer conditionally: value a is replaced with -a if
 1394  * and only if ctl = 1. Control value ctl must be 0 or 1.
 1395  */
 1396 static void
 1397 zint_negate(uint32_t *a, size_t len, uint32_t ctl)
 1398 {
 1399         size_t u;
 1400         uint32_t cc, m;
 1401 
 1402         /*
 1403          * If ctl = 1 then we flip the bits of a by XORing with
 1404          * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
 1405          * with 0 and add 0, which leaves the value unchanged.
 1406          */
 1407         cc = ctl;
 1408         m = -ctl >> 1;
 1409         for (u = 0; u < len; u ++) {
 1410                 uint32_t aw;
 1411 
 1412                 aw = a[u];
 1413                 aw = (aw ^ m) + cc;
 1414                 a[u] = aw & 0x7FFFFFFF;
 1415                 cc = aw >> 31;
 1416         }
 1417 }
 1418 
 1419 /*
 1420  * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
 1421  * The low bits are dropped (the caller should compute the coefficients
 1422  * such that these dropped bits are all zeros). If either or both
 1423  * yields a negative value, then the value is negated.
 1424  *
 1425  * Returned value is:
 1426  *  0  both values were positive
 1427  *  1  new a had to be negated
 1428  *  2  new b had to be negated
 1429  *  3  both new a and new b had to be negated
 1430  *
 1431  * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
 1432  */
 1433 static uint32_t
 1434 zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
 1435         int64_t xa, int64_t xb, int64_t ya, int64_t yb)
 1436 {
 1437         size_t u;
 1438         int64_t cca, ccb;
 1439         uint32_t nega, negb;
 1440 
 1441         cca = 0;
 1442         ccb = 0;
 1443         for (u = 0; u < len; u ++) {
 1444                 uint32_t wa, wb;
 1445                 uint64_t za, zb;
 1446 
 1447                 wa = a[u];
 1448                 wb = b[u];
 1449                 za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
 1450                 zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
 1451                 if (u > 0) {
 1452                         a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
 1453                         b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
 1454                 }
 1455                 cca = *(int64_t *)&za >> 31;
 1456                 ccb = *(int64_t *)&zb >> 31;
 1457         }
 1458         a[len - 1] = (uint32_t)cca;
 1459         b[len - 1] = (uint32_t)ccb;
 1460 
 1461         nega = (uint32_t)((uint64_t)cca >> 63);
 1462         negb = (uint32_t)((uint64_t)ccb >> 63);
 1463         zint_negate(a, len, nega);
 1464         zint_negate(b, len, negb);
 1465         return nega | (negb << 1);
 1466 }
 1467 
 1468 /*
 1469  * Finish modular reduction. Rules on input parameters:
 1470  *
 1471  *   if neg = 1, then -m <= a < 0
 1472  *   if neg = 0, then 0 <= a < 2*m
 1473  *
 1474  * If neg = 0, then the top word of a[] is allowed to use 32 bits.
 1475  *
 1476  * Modulus m must be odd.
 1477  */
 1478 static void
 1479 zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg)
 1480 {
 1481         size_t u;
 1482         uint32_t cc, xm, ym;
 1483 
 1484         /*
 1485          * First pass: compare a (assumed nonnegative) with m. Note that
 1486          * if the top word uses 32 bits, subtracting m must yield a
 1487          * value less than 2^31 since a < 2*m.
 1488          */
 1489         cc = 0;
 1490         for (u = 0; u < len; u ++) {
 1491                 cc = (a[u] - m[u] - cc) >> 31;
 1492         }
 1493 
 1494         /*
 1495          * If neg = 1 then we must add m (regardless of cc)
 1496          * If neg = 0 and cc = 0 then we must subtract m
 1497          * If neg = 0 and cc = 1 then we must do nothing
 1498          *
 1499          * In the loop below, we conditionally subtract either m or -m
 1500          * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
 1501          * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
 1502          */
 1503         xm = -neg >> 1;
 1504         ym = -(neg | (1 - cc));
 1505         cc = neg;
 1506         for (u = 0; u < len; u ++) {
 1507                 uint32_t aw, mw;
 1508 
 1509                 aw = a[u];
 1510                 mw = (m[u] ^ xm) & ym;
 1511                 aw = aw - mw - cc;
 1512                 a[u] = aw & 0x7FFFFFFF;
 1513                 cc = aw >> 31;
 1514         }
 1515 }
 1516 
 1517 /*
 1518  * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
 1519  * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
 1520  */
 1521 static void
 1522 zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
 1523         uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb)
 1524 {
 1525         size_t u;
 1526         int64_t cca, ccb;
 1527         uint32_t fa, fb;
 1528 
 1529         /*
 1530          * These are actually four combined Montgomery multiplications.
 1531          */
 1532         cca = 0;
 1533         ccb = 0;
 1534         fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
 1535         fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
 1536         for (u = 0; u < len; u ++) {
 1537                 uint32_t wa, wb;
 1538                 uint64_t za, zb;
 1539 
 1540                 wa = a[u];
 1541                 wb = b[u];
 1542                 za = wa * (uint64_t)xa + wb * (uint64_t)xb
 1543                         + m[u] * (uint64_t)fa + (uint64_t)cca;
 1544                 zb = wa * (uint64_t)ya + wb * (uint64_t)yb
 1545                         + m[u] * (uint64_t)fb + (uint64_t)ccb;
 1546                 if (u > 0) {
 1547                         a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
 1548                         b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
 1549                 }
 1550                 cca = *(int64_t *)&za >> 31;
 1551                 ccb = *(int64_t *)&zb >> 31;
 1552         }
 1553         a[len - 1] = (uint32_t)cca;
 1554         b[len - 1] = (uint32_t)ccb;
 1555 
 1556         /*
 1557          * At this point:
 1558          *   -m <= a < 2*m
 1559          *   -m <= b < 2*m
 1560          * (this is a case of Montgomery reduction)
 1561          * The top words of 'a' and 'b' may have a 32-th bit set.
 1562          * We want to add or subtract the modulus, as required.
 1563          */
 1564         zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
 1565         zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
 1566 }
 1567 
 1568 /*
 1569  * Compute a GCD between two positive big integers x and y. The two
 1570  * integers must be odd. Returned value is 1 if the GCD is 1, 0
 1571  * otherwise. When 1 is returned, arrays u and v are filled with values
 1572  * such that:
 1573  *   0 <= u <= y
 1574  *   0 <= v <= x
 1575  *   x*u - y*v = 1
 1576  * x[] and y[] are unmodified. Both input values must have the same
 1577  * encoded length. Temporary array must be large enough to accommodate 4
 1578  * extra values of that length. Arrays u, v and tmp may not overlap with
 1579  * each other, or with either x or y.
 1580  */
 1581 static int
 1582 zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
 1583         const uint32_t *restrict x, const uint32_t *restrict y,
 1584         size_t len, uint32_t *restrict tmp)
 1585 {
 1586         /*
 1587          * Algorithm is an extended binary GCD. We maintain 6 values
 1588          * a, b, u0, u1, v0 and v1 with the following invariants:
 1589          *
 1590          *  a = x*u0 - y*v0
 1591          *  b = x*u1 - y*v1
 1592          *  0 <= a <= x
 1593          *  0 <= b <= y
 1594          *  0 <= u0 < y
 1595          *  0 <= v0 < x
 1596          *  0 <= u1 <= y
 1597          *  0 <= v1 < x
 1598          *
 1599          * Initial values are:
 1600          *
 1601          *  a = x   u0 = 1   v0 = 0
 1602          *  b = y   u1 = y   v1 = x-1
 1603          *
 1604          * Each iteration reduces either a or b, and maintains the
 1605          * invariants. Algorithm stops when a = b, at which point their
 1606          * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
 1607          * the values (u,v) we want to return.
 1608          *
 1609          * The formal definition of the algorithm is a sequence of steps:
 1610          *
 1611          *  - If a is even, then:
 1612          *        a <- a/2
 1613          *        u0 <- u0/2 mod y
 1614          *        v0 <- v0/2 mod x
 1615          *
 1616          *  - Otherwise, if b is even, then:
 1617          *        b <- b/2
 1618          *        u1 <- u1/2 mod y
 1619          *        v1 <- v1/2 mod x
 1620          *
 1621          *  - Otherwise, if a > b, then:
 1622          *        a <- (a-b)/2
 1623          *        u0 <- (u0-u1)/2 mod y
 1624          *        v0 <- (v0-v1)/2 mod x
 1625          *
 1626          *  - Otherwise:
 1627          *        b <- (b-a)/2
 1628          *        u1 <- (u1-u0)/2 mod y
 1629          *        v1 <- (v1-v0)/2 mod y
 1630          *
 1631          * We can show that the operations above preserve the invariants:
 1632          *
 1633          *  - If a is even, then u0 and v0 are either both even or both
 1634          *    odd (since a = x*u0 - y*v0, and x and y are both odd).
 1635          *    If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
 1636          *    Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
 1637          *    the a = x*u0 - y*v0 invariant is preserved.
 1638          *
 1639          *  - The same holds for the case where b is even.
 1640          *
 1641          *  - If a and b are odd, and a > b, then:
 1642          *
 1643          *      a-b = x*(u0-u1) - y*(v0-v1)
 1644          *
 1645          *    In that situation, if u0 < u1, then x*(u0-u1) < 0, but
 1646          *    a-b > 0; therefore, it must be that v0 < v1, and the
 1647          *    first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
 1648          *    which preserves the invariants. Otherwise, if u0 > u1,
 1649          *    then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
 1650          *    b >= 0, hence a-b <= x. It follows that, in that case,
 1651          *    v0-v1 >= 0. The first part of the update is then:
 1652          *    (u0,v0) <- (u0-u1,v0-v1), which again preserves the
 1653          *    invariants.
 1654          *
 1655          *    Either way, once the subtraction is done, the new value of
 1656          *    a, which is the difference of two odd values, is even,
 1657          *    and the remaining of this step is a subcase of the
 1658          *    first algorithm case (i.e. when a is even).
 1659          *
 1660          *  - If a and b are odd, and b > a, then the a similar
 1661          *    argument holds.
 1662          *
 1663          * The values a and b start at x and y, respectively. Since x
 1664          * and y are odd, their GCD is odd, and it is easily seen that
 1665          * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
 1666          * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
 1667          * or b is reduced by at least one bit at each iteration, so
 1668          * the algorithm necessarily converges on the case a = b, at
 1669          * which point the common value is the GCD.
 1670          *
 1671          * In the algorithm expressed above, when a = b, the fourth case
 1672          * applies, and sets b = 0. Since a contains the GCD of x and y,
 1673          * which are both odd, a must be odd, and subsequent iterations
 1674          * (if any) will simply divide b by 2 repeatedly, which has no
 1675          * consequence. Thus, the algorithm can run for more iterations
 1676          * than necessary; the final GCD will be in a, and the (u,v)
 1677          * coefficients will be (u0,v0).
 1678          *
 1679          *
 1680          * The presentation above is bit-by-bit. It can be sped up by
 1681          * noticing that all decisions are taken based on the low bits
 1682          * and high bits of a and b. We can extract the two top words
 1683          * and low word of each of a and b, and compute reduction
 1684          * parameters pa, pb, qa and qb such that the new values for
 1685          * a and b are:
 1686          *    a' = (a*pa + b*pb) / (2^31)
 1687          *    b' = (a*qa + b*qb) / (2^31)
 1688          * the two divisions being exact. The coefficients are obtained
 1689          * just from the extracted words, and may be slightly off, requiring
 1690          * an optional correction: if a' < 0, then we replace pa with -pa
 1691          * and pb with -pb. Each such step will reduce the total length
 1692          * (sum of lengths of a and b) by at least 30 bits at each
 1693          * iteration.
 1694          */
 1695         uint32_t *u0, *u1, *v0, *v1, *a, *b;
 1696         uint32_t x0i, y0i;
 1697         uint32_t num, rc;
 1698         size_t j;
 1699 
 1700         if (len == 0) {
 1701                 return 0;
 1702         }
 1703 
 1704         /*
 1705          * u0 and v0 are the u and v result buffers; the four other
 1706          * values (u1, v1, a and b) are taken from tmp[].
 1707          */
 1708         u0 = u;
 1709         v0 = v;
 1710         u1 = tmp;
 1711         v1 = u1 + len;
 1712         a = v1 + len;
 1713         b = a + len;
 1714 
 1715         /*
 1716          * We'll need the Montgomery reduction coefficients.
 1717          */
 1718         x0i = modp_ninv31(x[0]);
 1719         y0i = modp_ninv31(y[0]);
 1720 
 1721         /*
 1722          * Initialize a, b, u0, u1, v0 and v1.
 1723          *  a = x   u0 = 1   v0 = 0
 1724          *  b = y   u1 = y   v1 = x-1
 1725          * Note that x is odd, so computing x-1 is easy.
 1726          */
 1727         memcpy(a, x, len * sizeof *x);
 1728         memcpy(b, y, len * sizeof *y);
 1729         u0[0] = 1;
 1730         memset(u0 + 1, 0, (len - 1) * sizeof *u0);
 1731         memset(v0, 0, len * sizeof *v0);
 1732         memcpy(u1, y, len * sizeof *u1);
 1733         memcpy(v1, x, len * sizeof *v1);
 1734         v1[0] --;
 1735 
 1736         /*
 1737          * Each input operand may be as large as 31*len bits, and we
 1738          * reduce the total length by at least 30 bits at each iteration.
 1739          */
 1740         for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
 1741                 uint32_t c0, c1;
 1742                 uint32_t a0, a1, b0, b1;
 1743                 uint64_t a_hi, b_hi;
 1744                 uint32_t a_lo, b_lo;
 1745                 int64_t pa, pb, qa, qb;
 1746                 int i;
 1747                 uint32_t r;
 1748 
 1749                 /*
 1750                  * Extract the top words of a and b. If j is the highest
 1751                  * index >= 1 such that a[j] != 0 or b[j] != 0, then we
 1752                  * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
 1753                  * If a and b are down to one word each, then we use
 1754                  * a[0] and b[0].
 1755                  */
 1756                 c0 = (uint32_t)-1;
 1757                 c1 = (uint32_t)-1;
 1758                 a0 = 0;
 1759                 a1 = 0;
 1760                 b0 = 0;
 1761                 b1 = 0;
 1762                 j = len;
 1763                 while (j -- > 0) {
 1764                         uint32_t aw, bw;
 1765 
 1766                         aw = a[j];
 1767                         bw = b[j];
 1768                         a0 ^= (a0 ^ aw) & c0;
 1769                         a1 ^= (a1 ^ aw) & c1;
 1770                         b0 ^= (b0 ^ bw) & c0;
 1771                         b1 ^= (b1 ^ bw) & c1;
 1772                         c1 = c0;
 1773                         c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
 1774                 }
 1775 
 1776                 /*
 1777                  * If c1 = 0, then we grabbed two words for a and b.
 1778                  * If c1 != 0 but c0 = 0, then we grabbed one word. It
 1779                  * is not possible that c1 != 0 and c0 != 0, because that
 1780                  * would mean that both integers are zero.
 1781                  */
 1782                 a1 |= a0 & c1;
 1783                 a0 &= ~c1;
 1784                 b1 |= b0 & c1;
 1785                 b0 &= ~c1;
 1786                 a_hi = ((uint64_t)a0 << 31) + a1;
 1787                 b_hi = ((uint64_t)b0 << 31) + b1;
 1788                 a_lo = a[0];
 1789                 b_lo = b[0];
 1790 
 1791                 /*
 1792                  * Compute reduction factors:
 1793                  *
 1794                  *   a' = a*pa + b*pb
 1795                  *   b' = a*qa + b*qb
 1796                  *
 1797                  * such that a' and b' are both multiple of 2^31, but are
 1798                  * only marginally larger than a and b.
 1799                  */
 1800                 pa = 1;
 1801                 pb = 0;
 1802                 qa = 0;
 1803                 qb = 1;
 1804                 for (i = 0; i < 31; i ++) {
 1805                         /*
 1806                          * At each iteration:
 1807                          *
 1808                          *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
 1809                          *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
 1810                          *   a <- a/2 if: a is even
 1811                          *   b <- b/2 if: a is odd, b is even
 1812                          *
 1813                          * We multiply a_lo and b_lo by 2 at each
 1814                          * iteration, thus a division by 2 really is a
 1815                          * non-multiplication by 2.
 1816                          */
 1817                         uint32_t rt, oa, ob, cAB, cBA, cA;
 1818                         uint64_t rz;
 1819 
 1820                         /*
 1821                          * rt = 1 if a_hi > b_hi, 0 otherwise.
 1822                          */
 1823                         rz = b_hi - a_hi;
 1824                         rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
 1825                                 & (a_hi ^ rz))) >> 63);
 1826 
 1827                         /*
 1828                          * cAB = 1 if b must be subtracted from a
 1829                          * cBA = 1 if a must be subtracted from b
 1830                          * cA = 1 if a must be divided by 2
 1831                          *
 1832                          * Rules:
 1833                          *
 1834                          *   cAB and cBA cannot both be 1.
 1835                          *   If a is not divided by 2, b is.
 1836                          */
 1837                         oa = (a_lo >> i) & 1;
 1838                         ob = (b_lo >> i) & 1;
 1839                         cAB = oa & ob & rt;
 1840                         cBA = oa & ob & ~rt;
 1841                         cA = cAB | (oa ^ 1);
 1842 
 1843                         /*
 1844                          * Conditional subtractions.
 1845                          */
 1846                         a_lo -= b_lo & -cAB;
 1847                         a_hi -= b_hi & -(uint64_t)cAB;
 1848                         pa -= qa & -(int64_t)cAB;
 1849                         pb -= qb & -(int64_t)cAB;
 1850                         b_lo -= a_lo & -cBA;
 1851                         b_hi -= a_hi & -(uint64_t)cBA;
 1852                         qa -= pa & -(int64_t)cBA;
 1853                         qb -= pb & -(int64_t)cBA;
 1854 
 1855                         /*
 1856                          * Shifting.
 1857                          */
 1858                         a_lo += a_lo & (cA - 1);
 1859                         pa += pa & ((int64_t)cA - 1);
 1860                         pb += pb & ((int64_t)cA - 1);
 1861                         a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
 1862                         b_lo += b_lo & -cA;
 1863                         qa += qa & -(int64_t)cA;
 1864                         qb += qb & -(int64_t)cA;
 1865                         b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
 1866                 }
 1867 
 1868                 /*
 1869                  * Apply the computed parameters to our values. We
 1870                  * may have to correct pa and pb depending on the
 1871                  * returned value of zint_co_reduce() (when a and/or b
 1872                  * had to be negated).
 1873                  */
 1874                 r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
 1875                 pa -= (pa + pa) & -(int64_t)(r & 1);
 1876                 pb -= (pb + pb) & -(int64_t)(r & 1);
 1877                 qa -= (qa + qa) & -(int64_t)(r >> 1);
 1878                 qb -= (qb + qb) & -(int64_t)(r >> 1);
 1879                 zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
 1880                 zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
 1881         }
 1882 
 1883         /*
 1884          * At that point, array a[] should contain the GCD, and the
 1885          * results (u,v) should already be set. We check that the GCD
 1886          * is indeed 1. We also check that the two operands x and y
 1887          * are odd.
 1888          */
 1889         rc = a[0] ^ 1;
 1890         for (j = 1; j < len; j ++) {
 1891                 rc |= a[j];
 1892         }
 1893         return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
 1894 }
 1895 
 1896 /*
 1897  * Add k*y*2^sc to x. The result is assumed to fit in the array of
 1898  * size xlen (truncation is applied if necessary).
 1899  * Scale factor 'sc' is provided as sch and scl, such that:
 1900  *   sch = sc / 31
 1901  *   scl = sc % 31
 1902  * xlen MUST NOT be lower than ylen.
 1903  *
 1904  * x[] and y[] are both signed integers, using two's complement for
 1905  * negative values.
 1906  */
 1907 static void
 1908 zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
 1909         const uint32_t *restrict y, size_t ylen, int32_t k,
 1910         uint32_t sch, uint32_t scl)
 1911 {
 1912         size_t u;
 1913         uint32_t ysign, tw;
 1914         int32_t cc;
 1915 
 1916         if (ylen == 0) {
 1917                 return;
 1918         }
 1919 
 1920         ysign = -(y[ylen - 1] >> 30) >> 1;
 1921         tw = 0;
 1922         cc = 0;
 1923         for (u = sch; u < xlen; u ++) {
 1924                 size_t v;
 1925                 uint32_t wy, wys, ccu;
 1926                 uint64_t z;
 1927 
 1928                 /*
 1929                  * Get the next word of y (scaled).
 1930                  */
 1931                 v = u - sch;
 1932                 wy = v < ylen ? y[v] : ysign;
 1933                 wys = ((wy << scl) & 0x7FFFFFFF) | tw;
 1934                 tw = wy >> (31 - scl);
 1935 
 1936                 /*
 1937                  * The expression below does not overflow.
 1938                  */
 1939                 z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
 1940                 x[u] = (uint32_t)z & 0x7FFFFFFF;
 1941 
 1942                 /*
 1943                  * Right-shifting the signed value z would yield
 1944                  * implementation-defined results (arithmetic shift is
 1945                  * not guaranteed). However, we can cast to unsigned,
 1946                  * and get the next carry as an unsigned word. We can
 1947                  * then convert it back to signed by using the guaranteed
 1948                  * fact that 'int32_t' uses two's complement with no
 1949                  * trap representation or padding bit, and with a layout
 1950                  * compatible with that of 'uint32_t'.
 1951                  */
 1952                 ccu = (uint32_t)(z >> 31);
 1953                 cc = *(int32_t *)&ccu;
 1954         }
 1955 }
 1956 
 1957 /*
 1958  * Subtract y*2^sc from x. The result is assumed to fit in the array of
 1959  * size xlen (truncation is applied if necessary).
 1960  * Scale factor 'sc' is provided as sch and scl, such that:
 1961  *   sch = sc / 31
 1962  *   scl = sc % 31
 1963  * xlen MUST NOT be lower than ylen.
 1964  *
 1965  * x[] and y[] are both signed integers, using two's complement for
 1966  * negative values.
 1967  */
 1968 static void
 1969 zint_sub_scaled(uint32_t *restrict x, size_t xlen,
 1970         const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl)
 1971 {
 1972         size_t u;
 1973         uint32_t ysign, tw;
 1974         uint32_t cc;
 1975 
 1976         if (ylen == 0) {
 1977                 return;
 1978         }
 1979 
 1980         ysign = -(y[ylen - 1] >> 30) >> 1;
 1981         tw = 0;
 1982         cc = 0;
 1983         for (u = sch; u < xlen; u ++) {
 1984                 size_t v;
 1985                 uint32_t w, wy, wys;
 1986 
 1987                 /*
 1988                  * Get the next word of y (scaled).
 1989                  */
 1990                 v = u - sch;
 1991                 wy = v < ylen ? y[v] : ysign;
 1992                 wys = ((wy << scl) & 0x7FFFFFFF) | tw;
 1993                 tw = wy >> (31 - scl);
 1994 
 1995                 w = x[u] - wys - cc;
 1996                 x[u] = w & 0x7FFFFFFF;
 1997                 cc = w >> 31;
 1998         }
 1999 }
 2000 
 2001 /*
 2002  * Convert a one-word signed big integer into a signed value.
 2003  */
 2004 static inline int32_t
 2005 zint_one_to_plain(const uint32_t *x)
 2006 {
 2007         uint32_t w;
 2008 
 2009         w = x[0];
 2010         w |= (w & 0x40000000) << 1;
 2011         return *(int32_t *)&w;
 2012 }
 2013 
 2014 /* ==================================================================== */
 2015 
 2016 /*
 2017  * Convert a polynomial to floating-point values.
 2018  *
 2019  * Each coefficient has length flen words, and starts fstride words after
 2020  * the previous.
 2021  *
 2022  * IEEE-754 binary64 values can represent values in a finite range,
 2023  * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
 2024  * they should be "trimmed" by pointing not to the lowest word of each,
 2025  * but upper.
 2026  */
 2027 static void
 2028 poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
 2029         unsigned logn)
 2030 {
 2031         size_t n, u;
 2032 
 2033         n = MKN(logn);
 2034         if (flen == 0) {
 2035                 for (u = 0; u < n; u ++) {
 2036                         d[u] = fpr_zero;
 2037                 }
 2038                 return;
 2039         }
 2040         for (u = 0; u < n; u ++, f += fstride) {
 2041                 size_t v;
 2042                 uint32_t neg, cc, xm;
 2043                 fpr x, fsc;
 2044 
 2045                 /*
 2046                  * Get sign of the integer; if it is negative, then we
 2047                  * will load its absolute value instead, and negate the
 2048                  * result.
 2049                  */
 2050                 neg = -(f[flen - 1] >> 30);
 2051                 xm = neg >> 1;
 2052                 cc = neg & 1;
 2053                 x = fpr_zero;
 2054                 fsc = fpr_one;
 2055                 for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
 2056                         uint32_t w;
 2057 
 2058                         w = (f[v] ^ xm) + cc;
 2059                         cc = w >> 31;
 2060                         w &= 0x7FFFFFFF;
 2061                         w -= (w << 1) & neg;
 2062                         x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
 2063                 }
 2064                 d[u] = x;
 2065         }
 2066 }
 2067 
 2068 /*
 2069  * Convert a polynomial to small integers. Source values are supposed
 2070  * to be one-word integers, signed over 31 bits. Returned value is 0
 2071  * if any of the coefficients exceeds the provided limit (in absolute
 2072  * value), or 1 on success.
 2073  *
 2074  * This is not constant-time; this is not a problem here, because on
 2075  * any failure, the NTRU-solving process will be deemed to have failed
 2076  * and the (f,g) polynomials will be discarded.
 2077  */
 2078 static int
 2079 poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn)
 2080 {
 2081         size_t n, u;
 2082 
 2083         n = MKN(logn);
 2084         for (u = 0; u < n; u ++) {
 2085                 int32_t z;
 2086 
 2087                 z = zint_one_to_plain(s + u);
 2088                 if (z < -lim || z > lim) {
 2089                         return 0;
 2090                 }
 2091                 d[u] = (int8_t)z;
 2092         }
 2093         return 1;
 2094 }
 2095 
 2096 /*
 2097  * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
 2098  * Coefficients of polynomial k are small integers (signed values in the
 2099  * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
 2100  * and scl = sc % 31.
 2101  *
 2102  * This function implements the basic quadratic multiplication algorithm,
 2103  * which is efficient in space (no extra buffer needed) but slow at
 2104  * high degree.
 2105  */
 2106 static void
 2107 poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
 2108         const uint32_t *restrict f, size_t flen, size_t fstride,
 2109         const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn)
 2110 {
 2111         size_t n, u;
 2112 
 2113         n = MKN(logn);
 2114         for (u = 0; u < n; u ++) {
 2115                 int32_t kf;
 2116                 size_t v;
 2117                 uint32_t *x;
 2118                 const uint32_t *y;
 2119 
 2120                 kf = -k[u];
 2121                 x = F + u * Fstride;
 2122                 y = f;
 2123                 for (v = 0; v < n; v ++) {
 2124                         zint_add_scaled_mul_small(
 2125                                 x, Flen, y, flen, kf, sch, scl);
 2126                         if (u + v == n - 1) {
 2127                                 x = F;
 2128                                 kf = -kf;
 2129                         } else {
 2130                                 x += Fstride;
 2131                         }
 2132                         y += fstride;
 2133                 }
 2134         }
 2135 }
 2136 
 2137 /*
 2138  * Subtract k*f from F. Coefficients of polynomial k are small integers
 2139  * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
 2140  * assumes that the degree is large, and integers relatively small.
 2141  * The value sc is provided as sch = sc / 31 and scl = sc % 31.
 2142  */
 2143 static void
 2144 poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
 2145         const uint32_t *restrict f, size_t flen, size_t fstride,
 2146         const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
 2147         uint32_t *restrict tmp)
 2148 {
 2149         uint32_t *gm, *igm, *fk, *t1, *x;
 2150         const uint32_t *y;
 2151         size_t n, u, tlen;
 2152         const small_prime *primes;
 2153 
 2154         n = MKN(logn);
 2155         tlen = flen + 1;
 2156         gm = tmp;
 2157         igm = gm + MKN(logn);
 2158         fk = igm + MKN(logn);
 2159         t1 = fk + n * tlen;
 2160 
 2161         primes = PRIMES;
 2162 
 2163         /*
 2164          * Compute k*f in fk[], in RNS notation.
 2165          */
 2166         for (u = 0; u < tlen; u ++) {
 2167                 uint32_t p, p0i, R2, Rx;
 2168                 size_t v;
 2169 
 2170                 p = primes[u].p;
 2171                 p0i = modp_ninv31(p);
 2172                 R2 = modp_R2(p, p0i);
 2173                 Rx = modp_Rx((unsigned)flen, p, p0i, R2);
 2174                 modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 2175 
 2176                 for (v = 0; v < n; v ++) {
 2177                         t1[v] = modp_set(k[v], p);
 2178                 }
 2179                 modp_NTT2(t1, gm, logn, p, p0i);
 2180                 for (v = 0, y = f, x = fk + u;
 2181                         v < n; v ++, y += fstride, x += tlen)
 2182                 {
 2183                         *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
 2184                 }
 2185                 modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
 2186                 for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
 2187                         *x = modp_montymul(
 2188                                 modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
 2189                 }
 2190                 modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
 2191         }
 2192 
 2193         /*
 2194          * Rebuild k*f.
 2195          */
 2196         zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
 2197 
 2198         /*
 2199          * Subtract k*f, scaled, from F.
 2200          */
 2201         for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
 2202                 zint_sub_scaled(x, Flen, y, tlen, sch, scl);
 2203         }
 2204 }
 2205 
 2206 /* ==================================================================== */
 2207 
 2208 #if FALCON_KG_CHACHA20  // yyyKG_CHACHA20+1
 2209 
 2210 #define RNG_CONTEXT   prng
 2211 #define get_rng_u64   prng_get_u64
 2212 
 2213 #else  // yyyKG_CHACHA20+0
 2214 
 2215 #define RNG_CONTEXT   inner_shake256_context
 2216 
 2217 /*
 2218  * Get a random 8-byte integer from a SHAKE-based RNG. This function
 2219  * ensures consistent interpretation of the SHAKE output so that
 2220  * the same values will be obtained over different platforms, in case
 2221  * a known seed is used.
 2222  */
 2223 static inline uint64_t
 2224 get_rng_u64(inner_shake256_context *rng)
 2225 {
 2226         /*
 2227          * We enforce little-endian representation.
 2228          */
 2229 
 2230 #if FALCON_LE  // yyyLE+1
 2231         /*
 2232          * On little-endian systems we just interpret the bytes "as is"
 2233          * (this is correct because the exact-width types such as
 2234          * 'uint64_t' are guaranteed to have no padding and no trap
 2235          * representation).
 2236          */
 2237         uint64_t r;
 2238 
 2239         inner_shake256_extract(rng, (uint8_t *)&r, sizeof r);
 2240         return r;
 2241 #else  // yyyLE+0
 2242         uint8_t tmp[8];
 2243 
 2244         inner_shake256_extract(rng, tmp, sizeof tmp);
 2245         return (uint64_t)tmp[0]
 2246                 | ((uint64_t)tmp[1] << 8)
 2247                 | ((uint64_t)tmp[2] << 16)
 2248                 | ((uint64_t)tmp[3] << 24)
 2249                 | ((uint64_t)tmp[4] << 32)
 2250                 | ((uint64_t)tmp[5] << 40)
 2251                 | ((uint64_t)tmp[6] << 48)
 2252                 | ((uint64_t)tmp[7] << 56);
 2253 #endif  // yyyLE-
 2254 }
 2255 
 2256 #endif  // yyyKG_CHACHA20-
 2257 
 2258 /*
 2259  * Table below incarnates a discrete Gaussian distribution:
 2260  *    D(x) = exp(-(x^2)/(2*sigma^2))
 2261  * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
 2262  * Element 0 of the table is P(x = 0).
 2263  * For k > 0, element k is P(x >= k+1 | x > 0).
 2264  * Probabilities are scaled up by 2^63.
 2265  */
 2266 static const uint64_t gauss_1024_12289[] = {
 2267          1283868770400643928u,  6416574995475331444u,  4078260278032692663u,
 2268          2353523259288686585u,  1227179971273316331u,   575931623374121527u,
 2269           242543240509105209u,    91437049221049666u,    30799446349977173u,
 2270             9255276791179340u,     2478152334826140u,      590642893610164u,
 2271              125206034929641u,       23590435911403u,        3948334035941u,
 2272                 586753615614u,          77391054539u,           9056793210u,
 2273                    940121950u,             86539696u,              7062824u,
 2274                       510971u,                32764u,                 1862u,
 2275                           94u,                    4u,                    0u
 2276 };
 2277 
 2278 /*
 2279  * Generate a random value with a Gaussian distribution centered on 0.
 2280  * The RNG must be ready for extraction (already flipped).
 2281  *
 2282  * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
 2283  * precomputed table is for N = 1024. Since the sum of two independent
 2284  * values of standard deviation sigma has standard deviation
 2285  * sigma*sqrt(2), then we can just generate more values and add them
 2286  * together for lower dimensions.
 2287  */
 2288 static int
 2289 mkgauss(RNG_CONTEXT *rng, unsigned logn)
 2290 {
 2291         unsigned u, g;
 2292         int val;
 2293 
 2294         g = 1U << (10 - logn);
 2295         val = 0;
 2296         for (u = 0; u < g; u ++) {
 2297                 /*
 2298                  * Each iteration generates one value with the
 2299                  * Gaussian distribution for N = 1024.
 2300                  *
 2301                  * We use two random 64-bit values. First value
 2302                  * decides on whether the generated value is 0, and,
 2303                  * if not, the sign of the value. Second random 64-bit
 2304                  * word is used to generate the non-zero value.
 2305                  *
 2306                  * For constant-time code we have to read the complete
 2307                  * table. This has negligible cost, compared with the
 2308                  * remainder of the keygen process (solving the NTRU
 2309                  * equation).
 2310                  */
 2311                 uint64_t r;
 2312                 uint32_t f, v, k, neg;
 2313 
 2314                 /*
 2315                  * First value:
 2316                  *  - flag 'neg' is randomly selected to be 0 or 1.
 2317                  *  - flag 'f' is set to 1 if the generated value is zero,
 2318                  *    or set to 0 otherwise.
 2319                  */
 2320                 r = get_rng_u64(rng);
 2321                 neg = (uint32_t)(r >> 63);
 2322                 r &= ~((uint64_t)1 << 63);
 2323                 f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
 2324 
 2325                 /*
 2326                  * We produce a new random 63-bit integer r, and go over
 2327                  * the array, starting at index 1. We store in v the
 2328                  * index of the first array element which is not greater
 2329                  * than r, unless the flag f was already 1.
 2330                  */
 2331                 v = 0;
 2332                 r = get_rng_u64(rng);
 2333                 r &= ~((uint64_t)1 << 63);
 2334                 for (k = 1; k < (sizeof gauss_1024_12289)
 2335                         / (sizeof gauss_1024_12289[0]); k ++)
 2336                 {
 2337                         uint32_t t;
 2338 
 2339                         t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
 2340                         v |= k & -(t & (f ^ 1));
 2341                         f |= t;
 2342                 }
 2343 
 2344                 /*
 2345                  * We apply the sign ('neg' flag). If the value is zero,
 2346                  * the sign has no effect.
 2347                  */
 2348                 v = (v ^ -neg) + neg;
 2349 
 2350                 /*
 2351                  * Generated value is added to val.
 2352                  */
 2353                 val += *(int32_t *)&v;
 2354         }
 2355         return val;
 2356 }
 2357 
 2358 /*
 2359  * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit
 2360  * words, of intermediate values in the computation:
 2361  *
 2362  *   MAX_BL_SMALL[depth]: length for the input f and g at that depth
 2363  *   MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
 2364  *
 2365  * Rules:
 2366  *
 2367  *  - Within an array, values grow.
 2368  *
 2369  *  - The 'SMALL' array must have an entry for maximum depth, corresponding
 2370  *    to the size of values used in the binary GCD. There is no such value
 2371  *    for the 'LARGE' array (the binary GCD yields already reduced
 2372  *    coefficients).
 2373  *
 2374  *  - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
 2375  *
 2376  *  - Values must be large enough to handle the common cases, with some
 2377  *    margins.
 2378  *
 2379  *  - Values must not be "too large" either because we will convert some
 2380  *    integers into floating-point values by considering the top 10 words,
 2381  *    i.e. 310 bits; hence, for values of length more than 10 words, we
 2382  *    should take care to have the length centered on the expected size.
 2383  *
 2384  * The following average lengths, in bits, have been measured on thousands
 2385  * of random keys (fg = max length of the absolute value of coefficients
 2386  * of f and g at that depth; FG = idem for the unreduced F and G; for the
 2387  * maximum depth, F and G are the output of binary GCD, multiplied by q;
 2388  * for each value, the average and standard deviation are provided).
 2389  *
 2390  * Binary case:
 2391  *    depth: 10    fg: 6307.52 (24.48)    FG: 6319.66 (24.51)
 2392  *    depth:  9    fg: 3138.35 (12.25)    FG: 9403.29 (27.55)
 2393  *    depth:  8    fg: 1576.87 ( 7.49)    FG: 4703.30 (14.77)
 2394  *    depth:  7    fg:  794.17 ( 4.98)    FG: 2361.84 ( 9.31)
 2395  *    depth:  6    fg:  400.67 ( 3.10)    FG: 1188.68 ( 6.04)
 2396  *    depth:  5    fg:  202.22 ( 1.87)    FG:  599.81 ( 3.87)
 2397  *    depth:  4    fg:  101.62 ( 1.02)    FG:  303.49 ( 2.38)
 2398  *    depth:  3    fg:   50.37 ( 0.53)    FG:  153.65 ( 1.39)
 2399  *    depth:  2    fg:   24.07 ( 0.25)    FG:   78.20 ( 0.73)
 2400  *    depth:  1    fg:   10.99 ( 0.08)    FG:   39.82 ( 0.41)
 2401  *    depth:  0    fg:    4.00 ( 0.00)    FG:   19.61 ( 0.49)
 2402  *
 2403  * Integers are actually represented either in binary notation over
 2404  * 31-bit words (signed, using two's complement), or in RNS, modulo
 2405  * many small primes. These small primes are close to, but slightly
 2406  * lower than, 2^31. Use of RNS loses less than two bits, even for
 2407  * the largest values.
 2408  *
 2409  * IMPORTANT: if these values are modified, then the temporary buffer
 2410  * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
 2411  * accordingly.
 2412  */
 2413 
 2414 static const size_t MAX_BL_SMALL[] = {
 2415         1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
 2416 };
 2417 
 2418 static const size_t MAX_BL_LARGE[] = {
 2419         2, 2, 5, 7, 12, 21, 40, 78, 157, 308
 2420 };
 2421 
 2422 /*
 2423  * Average and standard deviation for the maximum size (in bits) of
 2424  * coefficients of (f,g), depending on depth. These values are used
 2425  * to compute bounds for Babai's reduction.
 2426  */
 2427 static const struct {
 2428         int avg;
 2429         int std;
 2430 } BITLENGTH[] = {
 2431         {    4,  0 },
 2432         {   11,  1 },
 2433         {   24,  1 },
 2434         {   50,  1 },
 2435         {  102,  1 },
 2436         {  202,  2 },
 2437         {  401,  4 },
 2438         {  794,  5 },
 2439         { 1577,  8 },
 2440         { 3138, 13 },
 2441         { 6308, 25 }
 2442 };
 2443 
 2444 /*
 2445  * Minimal recursion depth at which we rebuild intermediate values
 2446  * when reconstructing f and g.
 2447  */
 2448 #define DEPTH_INT_FG   4
 2449 
 2450 /*
 2451  * Compute squared norm of a short vector. Returned value is saturated to
 2452  * 2^32-1 if it is not lower than 2^31.
 2453  */
 2454 static uint32_t
 2455 poly_small_sqnorm(const int8_t *f, unsigned logn)
 2456 {
 2457         size_t n, u;
 2458         uint32_t s, ng;
 2459 
 2460         n = MKN(logn);
 2461         s = 0;
 2462         ng = 0;
 2463         for (u = 0; u < n; u ++) {
 2464                 int32_t z;
 2465 
 2466                 z = f[u];
 2467                 s += (uint32_t)(z * z);
 2468                 ng |= s;
 2469         }
 2470         return s | -(ng >> 31);
 2471 }
 2472 
 2473 /*
 2474  * Align (upwards) the provided 'data' pointer with regards to 'base'
 2475  * so that the offset is a multiple of the size of 'fpr'.
 2476  */
 2477 static fpr *
 2478 align_fpr(void *base, void *data)
 2479 {
 2480         uint8_t *cb, *cd;
 2481         size_t k, km;
 2482 
 2483         cb = base;
 2484         cd = data;
 2485         k = (size_t)(cd - cb);
 2486         km = k % sizeof(fpr);
 2487         if (km) {
 2488                 k += (sizeof(fpr)) - km;
 2489         }
 2490         return (fpr *)(cb + k);
 2491 }
 2492 
 2493 /*
 2494  * Align (upwards) the provided 'data' pointer with regards to 'base'
 2495  * so that the offset is a multiple of the size of 'uint32_t'.
 2496  */
 2497 static uint32_t *
 2498 align_u32(void *base, void *data)
 2499 {
 2500         uint8_t *cb, *cd;
 2501         size_t k, km;
 2502 
 2503         cb = base;
 2504         cd = data;
 2505         k = (size_t)(cd - cb);
 2506         km = k % sizeof(uint32_t);
 2507         if (km) {
 2508                 k += (sizeof(uint32_t)) - km;
 2509         }
 2510         return (uint32_t *)(cb + k);
 2511 }
 2512 
 2513 /*
 2514  * Convert a small vector to floating point.
 2515  */
 2516 static void
 2517 poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn)
 2518 {
 2519         size_t n, u;
 2520 
 2521         n = MKN(logn);
 2522         for (u = 0; u < n; u ++) {
 2523                 x[u] = fpr_of(f[u]);
 2524         }
 2525 }
 2526 
 2527 /*
 2528  * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
 2529  * individual length.
 2530  *
 2531  * Output: f',g' of degree N/2, with the length for 'depth+1'.
 2532  *
 2533  * Values are in RNS; input and/or output may also be in NTT.
 2534  */
 2535 static void
 2536 make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
 2537         int in_ntt, int out_ntt)
 2538 {
 2539         size_t n, hn, u;
 2540         size_t slen, tlen;
 2541         uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
 2542         const small_prime *primes;
 2543 
 2544         n = (size_t)1 << logn;
 2545         hn = n >> 1;
 2546         slen = MAX_BL_SMALL[depth];
 2547         tlen = MAX_BL_SMALL[depth + 1];
 2548         primes = PRIMES;
 2549 
 2550         /*
 2551          * Prepare room for the result.
 2552          */
 2553         fd = data;
 2554         gd = fd + hn * tlen;
 2555         fs = gd + hn * tlen;
 2556         gs = fs + n * slen;
 2557         gm = gs + n * slen;
 2558         igm = gm + n;
 2559         t1 = igm + n;
 2560         memmove(fs, data, 2 * n * slen * sizeof *data);
 2561 
 2562         /*
 2563          * First slen words: we use the input values directly, and apply
 2564          * inverse NTT as we go.
 2565          */
 2566         for (u = 0; u < slen; u ++) {
 2567                 uint32_t p, p0i, R2;
 2568                 size_t v;
 2569                 uint32_t *x;
 2570 
 2571                 p = primes[u].p;
 2572                 p0i = modp_ninv31(p);
 2573                 R2 = modp_R2(p, p0i);
 2574                 modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 2575 
 2576                 for (v = 0, x = fs + u; v < n; v ++, x += slen) {
 2577                         t1[v] = *x;
 2578                 }
 2579                 if (!in_ntt) {
 2580                         modp_NTT2(t1, gm, logn, p, p0i);
 2581                 }
 2582                 for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
 2583                         uint32_t w0, w1;
 2584 
 2585                         w0 = t1[(v << 1) + 0];
 2586                         w1 = t1[(v << 1) + 1];
 2587                         *x = modp_montymul(
 2588                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 2589                 }
 2590                 if (in_ntt) {
 2591                         modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
 2592                 }
 2593 
 2594                 for (v = 0, x = gs + u; v < n; v ++, x += slen) {
 2595                         t1[v] = *x;
 2596                 }
 2597                 if (!in_ntt) {
 2598                         modp_NTT2(t1, gm, logn, p, p0i);
 2599                 }
 2600                 for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
 2601                         uint32_t w0, w1;
 2602 
 2603                         w0 = t1[(v << 1) + 0];
 2604                         w1 = t1[(v << 1) + 1];
 2605                         *x = modp_montymul(
 2606                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 2607                 }
 2608                 if (in_ntt) {
 2609                         modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
 2610                 }
 2611 
 2612                 if (!out_ntt) {
 2613                         modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
 2614                         modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
 2615                 }
 2616         }
 2617 
 2618         /*
 2619          * Since the fs and gs words have been de-NTTized, we can use the
 2620          * CRT to rebuild the values.
 2621          */
 2622         zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
 2623         zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
 2624 
 2625         /*
 2626          * Remaining words: use modular reductions to extract the values.
 2627          */
 2628         for (u = slen; u < tlen; u ++) {
 2629                 uint32_t p, p0i, R2, Rx;
 2630                 size_t v;
 2631                 uint32_t *x;
 2632 
 2633                 p = primes[u].p;
 2634                 p0i = modp_ninv31(p);
 2635                 R2 = modp_R2(p, p0i);
 2636                 Rx = modp_Rx((unsigned)slen, p, p0i, R2);
 2637                 modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 2638                 for (v = 0, x = fs; v < n; v ++, x += slen) {
 2639                         t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
 2640                 }
 2641                 modp_NTT2(t1, gm, logn, p, p0i);
 2642                 for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
 2643                         uint32_t w0, w1;
 2644 
 2645                         w0 = t1[(v << 1) + 0];
 2646                         w1 = t1[(v << 1) + 1];
 2647                         *x = modp_montymul(
 2648                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 2649                 }
 2650                 for (v = 0, x = gs; v < n; v ++, x += slen) {
 2651                         t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
 2652                 }
 2653                 modp_NTT2(t1, gm, logn, p, p0i);
 2654                 for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
 2655                         uint32_t w0, w1;
 2656 
 2657                         w0 = t1[(v << 1) + 0];
 2658                         w1 = t1[(v << 1) + 1];
 2659                         *x = modp_montymul(
 2660                                 modp_montymul(w0, w1, p, p0i), R2, p, p0i);
 2661                 }
 2662 
 2663                 if (!out_ntt) {
 2664                         modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
 2665                         modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
 2666                 }
 2667         }
 2668 }
 2669 
 2670 /*
 2671  * Compute f and g at a specific depth, in RNS notation.
 2672  *
 2673  * Returned values are stored in the data[] array, at slen words per integer.
 2674  *
 2675  * Conditions:
 2676  *   0 <= depth <= logn
 2677  *
 2678  * Space use in data[]: enough room for any two successive values (f', g',
 2679  * f and g).
 2680  */
 2681 static void
 2682 make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
 2683         unsigned logn, unsigned depth, int out_ntt)
 2684 {
 2685         size_t n, u;
 2686         uint32_t *ft, *gt, p0;
 2687         unsigned d;
 2688         const small_prime *primes;
 2689 
 2690         n = MKN(logn);
 2691         ft = data;
 2692         gt = ft + n;
 2693         primes = PRIMES;
 2694         p0 = primes[0].p;
 2695         for (u = 0; u < n; u ++) {
 2696                 ft[u] = modp_set(f[u], p0);
 2697                 gt[u] = modp_set(g[u], p0);
 2698         }
 2699 
 2700         if (depth == 0 && out_ntt) {
 2701                 uint32_t *gm, *igm;
 2702                 uint32_t p, p0i;
 2703 
 2704                 p = primes[0].p;
 2705                 p0i = modp_ninv31(p);
 2706                 gm = gt + n;
 2707                 igm = gm + MKN(logn);
 2708                 modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
 2709                 modp_NTT2(ft, gm, logn, p, p0i);
 2710                 modp_NTT2(gt, gm, logn, p, p0i);
 2711                 return;
 2712         }
 2713 
 2714         for (d = 0; d < depth; d ++) {
 2715                 make_fg_step(data, logn - d, d,
 2716                         d != 0, (d + 1) < depth || out_ntt);
 2717         }
 2718 }
 2719 
 2720 /*
 2721  * Solving the NTRU equation, deepest level: compute the resultants of
 2722  * f and g with X^N+1, and use binary GCD. The F and G values are
 2723  * returned in tmp[].
 2724  *
 2725  * Returned value: 1 on success, 0 on error.
 2726  */
 2727 static int
 2728 solve_NTRU_deepest(unsigned logn_top,
 2729         const int8_t *f, const int8_t *g, uint32_t *tmp)
 2730 {
 2731         size_t len;
 2732         uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
 2733         const small_prime *primes;
 2734 
 2735         len = MAX_BL_SMALL[logn_top];
 2736         primes = PRIMES;
 2737 
 2738         Fp = tmp;
 2739         Gp = Fp + len;
 2740         fp = Gp + len;
 2741         gp = fp + len;
 2742         t1 = gp + len;
 2743 
 2744         make_fg(fp, f, g, logn_top, logn_top, 0);
 2745 
 2746         /*
 2747          * We use the CRT to rebuild the resultants as big integers.
 2748          * There are two such big integers. The resultants are always
 2749          * nonnegative.
 2750          */
 2751         zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
 2752 
 2753         /*
 2754          * Apply the binary GCD. The zint_bezout() function works only
 2755          * if both inputs are odd.
 2756          *
 2757          * We can test on the result and return 0 because that would
 2758          * imply failure of the NTRU solving equation, and the (f,g)
 2759          * values will be abandoned in that case.
 2760          */
 2761         if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
 2762                 return 0;
 2763         }
 2764 
 2765         /*
 2766          * Multiply the two values by the target value q. Values must
 2767          * fit in the destination arrays.
 2768          * We can again test on the returned words: a non-zero output
 2769          * of zint_mul_small() means that we exceeded our array
 2770          * capacity, and that implies failure and rejection of (f,g).
 2771          */
 2772         q = 12289;
 2773         if (zint_mul_small(Fp, len, q) != 0
 2774                 || zint_mul_small(Gp, len, q) != 0)
 2775         {
 2776                 return 0;
 2777         }
 2778 
 2779         return 1;
 2780 }
 2781 
 2782 /*
 2783  * Solving the NTRU equation, intermediate level. Upon entry, the F and G
 2784  * from the previous level should be in the tmp[] array.
 2785  * This function MAY be invoked for the top-level (in which case depth = 0).
 2786  *
 2787  * Returned value: 1 on success, 0 on error.
 2788  */
 2789 static int
 2790 solve_NTRU_intermediate(unsigned logn_top,
 2791         const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp)
 2792 {
 2793         /*
 2794          * In this function, 'logn' is the log2 of the degree for
 2795          * this step. If N = 2^logn, then:
 2796          *  - the F and G values already in fk->tmp (from the deeper
 2797          *    levels) have degree N/2;
 2798          *  - this function should return F and G of degree N.
 2799          */
 2800         unsigned logn;
 2801         size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
 2802         uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
 2803         fpr *rt1, *rt2, *rt3, *rt4, *rt5;
 2804         int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
 2805         uint32_t *x, *y;
 2806         int32_t *k;
 2807         const small_prime *primes;
 2808 
 2809         logn = logn_top - depth;
 2810         n = (size_t)1 << logn;
 2811         hn = n >> 1;
 2812 
 2813         /*
 2814          * slen = size for our input f and g; also size of the reduced
 2815          *        F and G we return (degree N)
 2816          *
 2817          * dlen = size of the F and G obtained from the deeper level
 2818          *        (degree N/2 or N/3)
 2819          *
 2820          * llen = size for intermediary F and G before reduction (degree N)
 2821          *
 2822          * We build our non-reduced F and G as two independent halves each,
 2823          * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
 2824          */
 2825         slen = MAX_BL_SMALL[depth];
 2826         dlen = MAX_BL_SMALL[depth + 1];
 2827         llen = MAX_BL_LARGE[depth];
 2828         primes = PRIMES;
 2829 
 2830         /*
 2831          * Fd and Gd are the F and G from the deeper level.
 2832          */
 2833         Fd = tmp;
 2834         Gd = Fd + dlen * hn;
 2835 
 2836         /*
 2837          * Compute the input f and g for this level. Note that we get f
 2838          * and g in RNS + NTT representation.
 2839          */
 2840         ft = Gd + dlen * hn;
 2841         make_fg(ft, f, g, logn_top, depth, 1);
 2842 
 2843         /*
 2844          * Move the newly computed f and g to make room for our candidate
 2845          * F and G (unreduced).
 2846          */
 2847         Ft = tmp;
 2848         Gt = Ft + n * llen;
 2849         t1 = Gt + n * llen;
 2850         memmove(t1, ft, 2 * n * slen * sizeof *ft);
 2851         ft = t1;
 2852         gt = ft + slen * n;
 2853         t1 = gt + slen * n;
 2854 
 2855         /*
 2856          * Move Fd and Gd _after_ f and g.
 2857          */
 2858         memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd);
 2859         Fd = t1;
 2860         Gd = Fd + hn * dlen;
 2861 
 2862         /*
 2863          * We reduce Fd and Gd modulo all the small primes we will need,
 2864          * and store the values in Ft and Gt (only n/2 values in each).
 2865          */
 2866         for (u = 0; u < llen; u ++) {
 2867                 uint32_t p, p0i, R2, Rx;
 2868                 size_t v;
 2869                 uint32_t *xs, *ys, *xd, *yd;
 2870 
 2871                 p = primes[u].p;
 2872                 p0i = modp_ninv31(p);
 2873                 R2 = modp_R2(p, p0i);
 2874                 Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
 2875                 for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
 2876                         v < hn;
 2877                         v ++, xs += dlen, ys += dlen, xd += llen, yd += llen)
 2878                 {
 2879                         *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
 2880                         *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
 2881                 }
 2882         }
 2883 
 2884         /*
 2885          * We do not need Fd and Gd after that point.
 2886          */
 2887 
 2888         /*
 2889          * Compute our F and G modulo sufficiently many small primes.
 2890          */
 2891         for (u = 0; u < llen; u ++) {
 2892                 uint32_t p, p0i, R2;
 2893                 uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
 2894                 size_t v;
 2895 
 2896                 /*
 2897                  * All computations are done modulo p.
 2898                  */
 2899                 p = primes[u].p;
 2900                 p0i = modp_ninv31(p);
 2901                 R2 = modp_R2(p, p0i);
 2902 
 2903                 /*
 2904                  * If we processed slen words, then f and g have been
 2905                  * de-NTTized, and are in RNS; we can rebuild them.
 2906                  */
 2907                 if (u == slen) {
 2908                         zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
 2909                         zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
 2910                 }
 2911 
 2912                 gm = t1;
 2913                 igm = gm + n;
 2914                 fx = igm + n;
 2915                 gx = fx + n;
 2916 
 2917                 modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
 2918 
 2919                 if (u < slen) {
 2920                         for (v = 0, x = ft + u, y = gt + u;
 2921                                 v < n; v ++, x += slen, y += slen)
 2922                         {
 2923                                 fx[v] = *x;
 2924                                 gx[v] = *y;
 2925                         }
 2926                         modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
 2927                         modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
 2928                 } else {
 2929                         uint32_t Rx;
 2930 
 2931                         Rx = modp_Rx((unsigned)slen, p, p0i, R2);
 2932                         for (v = 0, x = ft, y = gt;
 2933                                 v < n; v ++, x += slen, y += slen)
 2934                         {
 2935                                 fx[v] = zint_mod_small_signed(x, slen,
 2936                                         p, p0i, R2, Rx);
 2937                                 gx[v] = zint_mod_small_signed(y, slen,
 2938                                         p, p0i, R2, Rx);
 2939                         }
 2940                         modp_NTT2(fx, gm, logn, p, p0i);
 2941                         modp_NTT2(gx, gm, logn, p, p0i);
 2942                 }
 2943 
 2944                 /*
 2945                  * Get F' and G' modulo p and in NTT representation
 2946                  * (they have degree n/2). These values were computed in
 2947                  * a previous step, and stored in Ft and Gt.
 2948                  */
 2949                 Fp = gx + n;
 2950                 Gp = Fp + hn;
 2951                 for (v = 0, x = Ft + u, y = Gt + u;
 2952                         v < hn; v ++, x += llen, y += llen)
 2953                 {
 2954                         Fp[v] = *x;
 2955                         Gp[v] = *y;
 2956                 }
 2957                 modp_NTT2(Fp, gm, logn - 1, p, p0i);
 2958                 modp_NTT2(Gp, gm, logn - 1, p, p0i);
 2959 
 2960                 /*
 2961                  * Compute our F and G modulo p.
 2962                  *
 2963                  * General case:
 2964                  *
 2965                  *   we divide degree by d = 2 or 3
 2966                  *   f'(x^d) = N(f)(x^d) = f * adj(f)
 2967                  *   g'(x^d) = N(g)(x^d) = g * adj(g)
 2968                  *   f'*G' - g'*F' = q
 2969                  *   F = F'(x^d) * adj(g)
 2970                  *   G = G'(x^d) * adj(f)
 2971                  *
 2972                  * We compute things in the NTT. We group roots of phi
 2973                  * such that all roots x in a group share the same x^d.
 2974                  * If the roots in a group are x_1, x_2... x_d, then:
 2975                  *
 2976                  *   N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
 2977                  *
 2978                  * Thus, we have:
 2979                  *
 2980                  *   G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
 2981                  *   G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
 2982                  *   ...
 2983                  *   G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
 2984                  *
 2985                  * In all cases, we can thus compute F and G in NTT
 2986                  * representation by a few simple multiplications.
 2987                  * Moreover, in our chosen NTT representation, roots
 2988                  * from the same group are consecutive in RAM.
 2989                  */
 2990                 for (v = 0, x = Ft + u, y = Gt + u; v < hn;
 2991                         v ++, x += (llen << 1), y += (llen << 1))
 2992                 {
 2993                         uint32_t ftA, ftB, gtA, gtB;
 2994                         uint32_t mFp, mGp;
 2995 
 2996                         ftA = fx[(v << 1) + 0];
 2997                         ftB = fx[(v << 1) + 1];
 2998                         gtA = gx[(v << 1) + 0];
 2999                         gtB = gx[(v << 1) + 1];
 3000                         mFp = modp_montymul(Fp[v], R2, p, p0i);
 3001                         mGp = modp_montymul(Gp[v], R2, p, p0i);
 3002                         x[0] = modp_montymul(gtB, mFp, p, p0i);
 3003                         x[llen] = modp_montymul(gtA, mFp, p, p0i);
 3004                         y[0] = modp_montymul(ftB, mGp, p, p0i);
 3005                         y[llen] = modp_montymul(ftA, mGp, p, p0i);
 3006                 }
 3007                 modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
 3008                 modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
 3009         }
 3010 
 3011         /*
 3012          * Rebuild F and G with the CRT.
 3013          */
 3014         zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
 3015         zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
 3016 
 3017         /*
 3018          * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
 3019          * order).
 3020          */
 3021 
 3022         /*
 3023          * Apply Babai reduction to bring back F and G to size slen.
 3024          *
 3025          * We use the FFT to compute successive approximations of the
 3026          * reduction coefficient. We first isolate the top bits of
 3027          * the coefficients of f and g, and convert them to floating
 3028          * point; with the FFT, we compute adj(f), adj(g), and
 3029          * 1/(f*adj(f)+g*adj(g)).
 3030          *
 3031          * Then, we repeatedly apply the following:
 3032          *
 3033          *   - Get the top bits of the coefficients of F and G into
 3034          *     floating point, and use the FFT to compute:
 3035          *        (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
 3036          *
 3037          *   - Convert back that value into normal representation, and
 3038          *     round it to the nearest integers, yielding a polynomial k.
 3039          *     Proper scaling is applied to f, g, F and G so that the
 3040          *     coefficients fit on 32 bits (signed).
 3041          *
 3042          *   - Subtract k*f from F and k*g from G.
 3043          *
 3044          * Under normal conditions, this process reduces the size of F
 3045          * and G by some bits at each iteration. For constant-time
 3046          * operation, we do not want to measure the actual length of
 3047          * F and G; instead, we do the following:
 3048          *
 3049          *   - f and g are converted to floating-point, with some scaling
 3050          *     if necessary to keep values in the representable range.
 3051          *
 3052          *   - For each iteration, we _assume_ a maximum size for F and G,
 3053          *     and use the values at that size. If we overreach, then
 3054          *     we get zeros, which is harmless: the resulting coefficients
 3055          *     of k will be 0 and the value won't be reduced.
 3056          *
 3057          *   - We conservatively assume that F and G will be reduced by
 3058          *     at least 25 bits at each iteration.
 3059          *
 3060          * Even when reaching the bottom of the reduction, reduction
 3061          * coefficient will remain low. If it goes out-of-range, then
 3062          * something wrong occurred and the whole NTRU solving fails.
 3063          */
 3064 
 3065         /*
 3066          * Memory layout:
 3067          *  - We need to compute and keep adj(f), adj(g), and
 3068          *    1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
 3069          *    respectively).
 3070          *  - At each iteration we need two extra fp buffer (N fp values),
 3071          *    and produce a k (N 32-bit words). k will be shared with one
 3072          *    of the fp buffers.
 3073          *  - To compute k*f and k*g efficiently (with the NTT), we need
 3074          *    some extra room; we reuse the space of the temporary buffers.
 3075          *
 3076          * Arrays of 'fpr' are obtained from the temporary array itself.
 3077          * We ensure that the base is at a properly aligned offset (the
 3078          * source array tmp[] is supposed to be already aligned).
 3079          */
 3080 
 3081         rt3 = align_fpr(tmp, t1);
 3082         rt4 = rt3 + n;
 3083         rt5 = rt4 + n;
 3084         rt1 = rt5 + (n >> 1);
 3085         k = (int32_t *)align_u32(tmp, rt1);
 3086         rt2 = align_fpr(tmp, k + n);
 3087         if (rt2 < (rt1 + n)) {
 3088                 rt2 = rt1 + n;
 3089         }
 3090         t1 = (uint32_t *)k + n;
 3091 
 3092         /*
 3093          * Get f and g into rt3 and rt4 as floating-point approximations.
 3094          *
 3095          * We need to "scale down" the floating-point representation of
 3096          * coefficients when they are too big. We want to keep the value
 3097          * below 2^310 or so. Thus, when values are larger than 10 words,
 3098          * we consider only the top 10 words. Array lengths have been
 3099          * computed so that average maximum length will fall in the
 3100          * middle or the upper half of these top 10 words.
 3101          */
 3102         rlen = (slen > 10) ? 10 : slen;
 3103         poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
 3104         poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
 3105 
 3106         /*
 3107          * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
 3108          */
 3109         scale_fg = 31 * (int)(slen - rlen);
 3110 
 3111         /*
 3112          * Estimated boundaries for the maximum size (in bits) of the
 3113          * coefficients of (f,g). We use the measured average, and
 3114          * allow for a deviation of at most six times the standard
 3115          * deviation.
 3116          */
 3117         minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
 3118         maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
 3119 
 3120         /*
 3121          * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
 3122          * and adj(g) in rt3 and rt4, respectively.
 3123          */
 3124         Zf(FFT)(rt3, logn);
 3125         Zf(FFT)(rt4, logn);
 3126         Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn);
 3127         Zf(poly_adj_fft)(rt3, logn);
 3128         Zf(poly_adj_fft)(rt4, logn);
 3129 
 3130         /*
 3131          * Reduce F and G repeatedly.
 3132          *
 3133          * The expected maximum bit length of coefficients of F and G
 3134          * is kept in maxbl_FG, with the corresponding word length in
 3135          * FGlen.
 3136          */
 3137         FGlen = llen;
 3138         maxbl_FG = 31 * (int)llen;
 3139 
 3140         /*
 3141          * Each reduction operation computes the reduction polynomial
 3142          * "k". We need that polynomial to have coefficients that fit
 3143          * on 32-bit signed integers, with some scaling; thus, we use
 3144          * a descending sequence of scaling values, down to zero.
 3145          *
 3146          * The size of the coefficients of k is (roughly) the difference
 3147          * between the size of the coefficients of (F,G) and the size
 3148          * of the coefficients of (f,g). Thus, the maximum size of the
 3149          * coefficients of k is, at the start, maxbl_FG - minbl_fg;
 3150          * this is our starting scale value for k.
 3151          *
 3152          * We need to estimate the size of (F,G) during the execution of
 3153          * the algorithm; we are allowed some overestimation but not too
 3154          * much (poly_big_to_fp() uses a 310-bit window). Generally
 3155          * speaking, after applying a reduction with k scaled to
 3156          * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
 3157          * where 'dd' is a few bits to account for the fact that the
 3158          * reduction is never perfect (intuitively, dd is on the order
 3159          * of sqrt(N), so at most 5 bits; we here allow for 10 extra
 3160          * bits).
 3161          *
 3162          * The size of (f,g) is not known exactly, but maxbl_fg is an
 3163          * upper bound.
 3164          */
 3165         scale_k = maxbl_FG - minbl_fg;
 3166 
 3167         for (;;) {
 3168                 int scale_FG, dc, new_maxbl_FG;
 3169                 uint32_t scl, sch;
 3170                 fpr pdc, pt;
 3171 
 3172                 /*
 3173                  * Convert current F and G into floating-point. We apply
 3174                  * scaling if the current length is more than 10 words.
 3175                  */
 3176                 rlen = (FGlen > 10) ? 10 : FGlen;
 3177                 scale_FG = 31 * (int)(FGlen - rlen);
 3178                 poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
 3179                 poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
 3180 
 3181                 /*
 3182                  * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
 3183                  */
 3184                 Zf(FFT)(rt1, logn);
 3185                 Zf(FFT)(rt2, logn);
 3186                 Zf(poly_mul_fft)(rt1, rt3, logn);
 3187                 Zf(poly_mul_fft)(rt2, rt4, logn);
 3188                 Zf(poly_add)(rt2, rt1, logn);
 3189                 Zf(poly_mul_autoadj_fft)(rt2, rt5, logn);
 3190                 Zf(iFFT)(rt2, logn);
 3191 
 3192                 /*
 3193                  * (f,g) are scaled by 'scale_fg', meaning that the
 3194                  * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
 3195                  * to have their true mathematical value.
 3196                  *
 3197                  * (F,G) are similarly scaled by 'scale_FG'. Therefore,
 3198                  * the value we computed in rt2 is scaled by
 3199                  * 'scale_FG-scale_fg'.
 3200                  *
 3201                  * We want that value to be scaled by 'scale_k', hence we
 3202                  * apply a corrective scaling. After scaling, the values
 3203                  * should fit in -2^31-1..+2^31-1.
 3204                  */
 3205                 dc = scale_k - scale_FG + scale_fg;
 3206 
 3207                 /*
 3208                  * We will need to multiply values by 2^(-dc). The value
 3209                  * 'dc' is not secret, so we can compute 2^(-dc) with a
 3210                  * non-constant-time process.
 3211                  * (We could use ldexp(), but we prefer to avoid any
 3212                  * dependency on libm. When using FP emulation, we could
 3213                  * use our fpr_ldexp(), which is constant-time.)
 3214                  */
 3215                 if (dc < 0) {
 3216                         dc = -dc;
 3217                         pt = fpr_two;
 3218                 } else {
 3219                         pt = fpr_onehalf;
 3220                 }
 3221                 pdc = fpr_one;
 3222                 while (dc != 0) {
 3223                         if ((dc & 1) != 0) {
 3224                                 pdc = fpr_mul(pdc, pt);
 3225                         }
 3226                         dc >>= 1;
 3227                         pt = fpr_sqr(pt);
 3228                 }
 3229 
 3230                 for (u = 0; u < n; u ++) {
 3231                         fpr xv;
 3232 
 3233                         xv = fpr_mul(rt2[u], pdc);
 3234 
 3235                         /*
 3236                          * Sometimes the values can be out-of-bounds if
 3237                          * the algorithm fails; we must not call
 3238                          * fpr_rint() (and cast to int32_t) if the value
 3239                          * is not in-bounds. Note that the test does not
 3240                          * break constant-time discipline, since any
 3241                          * failure here implies that we discard the current
 3242                          * secret key (f,g).
 3243                          */
 3244                         if (!fpr_lt(fpr_mtwo31m1, xv)
 3245                                 || !fpr_lt(xv, fpr_ptwo31m1))
 3246                         {
 3247                                 return 0;
 3248                         }
 3249                         k[u] = (int32_t)fpr_rint(xv);
 3250                 }
 3251 
 3252                 /*
 3253                  * Values in k[] are integers. They really are scaled
 3254                  * down by maxbl_FG - minbl_fg bits.
 3255                  *
 3256                  * If we are at low depth, then we use the NTT to
 3257                  * compute k*f and k*g.
 3258                  */
 3259                 sch = (uint32_t)(scale_k / 31);
 3260                 scl = (uint32_t)(scale_k % 31);
 3261                 if (depth <= DEPTH_INT_FG) {
 3262                         poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
 3263                                 k, sch, scl, logn, t1);
 3264                         poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
 3265                                 k, sch, scl, logn, t1);
 3266                 } else {
 3267                         poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
 3268                                 k, sch, scl, logn);
 3269                         poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
 3270                                 k, sch, scl, logn);
 3271                 }
 3272 
 3273                 /*
 3274                  * We compute the new maximum size of (F,G), assuming that
 3275                  * (f,g) has _maximal_ length (i.e. that reduction is
 3276                  * "late" instead of "early". We also adjust FGlen
 3277                  * accordingly.
 3278                  */
 3279                 new_maxbl_FG = scale_k + maxbl_fg + 10;
 3280                 if (new_maxbl_FG < maxbl_FG) {
 3281                         maxbl_FG = new_maxbl_FG;
 3282                         if ((int)FGlen * 31 >= maxbl_FG + 31) {
 3283                                 FGlen --;
 3284                         }
 3285                 }
 3286 
 3287                 /*
 3288                  * We suppose that scaling down achieves a reduction by
 3289                  * at least 25 bits per iteration. We stop when we have
 3290                  * done the loop with an unscaled k.
 3291                  */
 3292                 if (scale_k <= 0) {
 3293                         break;
 3294                 }
 3295                 scale_k -= 25;
 3296                 if (scale_k < 0) {
 3297                         scale_k = 0;
 3298                 }
 3299         }
 3300 
 3301         /*
 3302          * If (F,G) length was lowered below 'slen', then we must take
 3303          * care to re-extend the sign.
 3304          */
 3305         if (FGlen < slen) {
 3306                 for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
 3307                         size_t v;
 3308                         uint32_t sw;
 3309 
 3310                         sw = -(Ft[FGlen - 1] >> 30) >> 1;
 3311                         for (v = FGlen; v < slen; v ++) {
 3312                                 Ft[v] = sw;
 3313                         }
 3314                         sw = -(Gt[FGlen - 1] >> 30) >> 1;
 3315                         for (v = FGlen; v < slen; v ++) {
 3316                                 Gt[v] = sw;
 3317                         }
 3318                 }
 3319         }
 3320 
 3321         /*
 3322          * Compress encoding of all values to 'slen' words (this is the
 3323          * expected output format).
 3324          */
 3325         for (u = 0, x = tmp, y = tmp;
 3326                 u < (n << 1); u ++, x += slen, y += llen)
 3327         {
 3328                 memmove(x, y, slen * sizeof *y);
 3329         }
 3330         return 1;
 3331 }
 3332 
 3333 /*
 3334  * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
 3335  * F and G from the previous level should be in the tmp[] array.
 3336  *
 3337  * Returned value: 1 on success, 0 on error.
 3338  */
 3339 static int
 3340 solve_NTRU_binary_depth1(unsigned logn_top,
 3341         const int8_t *f, const int8_t *g, uint32_t *tmp)
 3342 {
 3343         /*
 3344          * The first half of this function is a copy of the corresponding
 3345          * part in solve_NTRU_intermediate(), for the reconstruction of
 3346          * the unreduced F and G. The second half (Babai reduction) is
 3347          * done differently, because the unreduced F and G fit in 53 bits
 3348          * of precision, allowing a much simpler process with lower RAM
 3349          * usage.
 3350          */
 3351         unsigned depth, logn;
 3352         size_t n_top, n, hn, slen, dlen, llen, u;
 3353         uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
 3354         fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
 3355         uint32_t *x, *y;
 3356 
 3357         depth = 1;
 3358         n_top = (size_t)1 << logn_top;
 3359         logn = logn_top - depth;
 3360         n = (size_t)1 << logn;
 3361         hn = n >> 1;
 3362 
 3363         /*
 3364          * Equations are:
 3365          *
 3366          *   f' = f0^2 - X^2*f1^2
 3367          *   g' = g0^2 - X^2*g1^2
 3368          *   F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
 3369          *   F = F'*(g0 - X*g1)
 3370          *   G = G'*(f0 - X*f1)
 3371          *
 3372          * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
 3373          * degree N/2 (their odd-indexed coefficients are all zero).
 3374          */
 3375 
 3376         /*
 3377          * slen = size for our input f and g; also size of the reduced
 3378          *        F and G we return (degree N)
 3379          *
 3380          * dlen = size of the F and G obtained from the deeper level
 3381          *        (degree N/2)
 3382          *
 3383          * llen = size for intermediary F and G before reduction (degree N)
 3384          *
 3385          * We build our non-reduced F and G as two independent halves each,
 3386          * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
 3387          */
 3388         slen = MAX_BL_SMALL[depth];
 3389         dlen = MAX_BL_SMALL[depth + 1];
 3390         llen = MAX_BL_LARGE[depth];
 3391 
 3392         /*
 3393          * Fd and Gd are the F and G from the deeper level. Ft and Gt
 3394          * are the destination arrays for the unreduced F and G.
 3395          */
 3396         Fd = tmp;
 3397         Gd = Fd + dlen * hn;
 3398         Ft = Gd + dlen * hn;
 3399         Gt = Ft + llen * n;
 3400 
 3401         /*
 3402          * We reduce Fd and Gd modulo all the small primes we will need,
 3403          * and store the values in Ft and Gt.
 3404          */
 3405         for (u = 0; u < llen; u ++) {
 3406                 uint32_t p, p0i, R2, Rx;
 3407                 size_t v;
 3408                 uint32_t *xs, *ys, *xd, *yd;
 3409 
 3410                 p = PRIMES[u].p;
 3411                 p0i = modp_ninv31(p);
 3412                 R2 = modp_R2(p, p0i);
 3413                 Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
 3414                 for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
 3415                         v < hn;
 3416                         v ++, xs += dlen, ys += dlen, xd += llen, yd += llen)
 3417                 {
 3418                         *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
 3419                         *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
 3420                 }
 3421         }
 3422 
 3423         /*
 3424          * Now Fd and Gd are not needed anymore; we can squeeze them out.
 3425          */
 3426         memmove(tmp, Ft, llen * n * sizeof(uint32_t));
 3427         Ft = tmp;
 3428         memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
 3429         Gt = Ft + llen * n;
 3430         ft = Gt + llen * n;
 3431         gt = ft + slen * n;
 3432 
 3433         t1 = gt + slen * n;
 3434 
 3435         /*
 3436          * Compute our F and G modulo sufficiently many small primes.
 3437          */
 3438         for (u = 0; u < llen; u ++) {
 3439                 uint32_t p, p0i, R2;
 3440                 uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
 3441                 unsigned e;
 3442                 size_t v;
 3443 
 3444                 /*
 3445                  * All computations are done modulo p.
 3446                  */
 3447                 p = PRIMES[u].p;
 3448                 p0i = modp_ninv31(p);
 3449                 R2 = modp_R2(p, p0i);
 3450 
 3451                 /*
 3452                  * We recompute things from the source f and g, of full
 3453                  * degree. However, we will need only the n first elements
 3454                  * of the inverse NTT table (igm); the call to modp_mkgm()
 3455                  * below will fill n_top elements in igm[] (thus overflowing
 3456                  * into fx[]) but later code will overwrite these extra
 3457                  * elements.
 3458                  */
 3459                 gm = t1;
 3460                 igm = gm + n_top;
 3461                 fx = igm + n;
 3462                 gx = fx + n_top;
 3463                 modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
 3464 
 3465                 /*
 3466                  * Set ft and gt to f and g modulo p, respectively.
 3467                  */
 3468                 for (v = 0; v < n_top; v ++) {
 3469                         fx[v] = modp_set(f[v], p);
 3470                         gx[v] = modp_set(g[v], p);
 3471                 }
 3472 
 3473                 /*
 3474                  * Convert to NTT and compute our f and g.
 3475                  */
 3476                 modp_NTT2(fx, gm, logn_top, p, p0i);
 3477                 modp_NTT2(gx, gm, logn_top, p, p0i);
 3478                 for (e = logn_top; e > logn; e --) {
 3479                         modp_poly_rec_res(fx, e, p, p0i, R2);
 3480                         modp_poly_rec_res(gx, e, p, p0i, R2);
 3481                 }
 3482 
 3483                 /*
 3484                  * From that point onward, we only need tables for
 3485                  * degree n, so we can save some space.
 3486                  */
 3487                 if (depth > 0) { /* always true */
 3488                         memmove(gm + n, igm, n * sizeof *igm);
 3489                         igm = gm + n;
 3490                         memmove(igm + n, fx, n * sizeof *ft);
 3491                         fx = igm + n;
 3492                         memmove(fx + n, gx, n * sizeof *gt);
 3493                         gx = fx + n;
 3494                 }
 3495 
 3496                 /*
 3497                  * Get F' and G' modulo p and in NTT representation
 3498                  * (they have degree n/2). These values were computed
 3499                  * in a previous step, and stored in Ft and Gt.
 3500                  */
 3501                 Fp = gx + n;
 3502                 Gp = Fp + hn;
 3503                 for (v = 0, x = Ft + u, y = Gt + u;
 3504                         v < hn; v ++, x += llen, y += llen)
 3505                 {
 3506                         Fp[v] = *x;
 3507                         Gp[v] = *y;
 3508                 }
 3509                 modp_NTT2(Fp, gm, logn - 1, p, p0i);
 3510                 modp_NTT2(Gp, gm, logn - 1, p, p0i);
 3511 
 3512                 /*
 3513                  * Compute our F and G modulo p.
 3514                  *
 3515                  * Equations are:
 3516                  *
 3517                  *   f'(x^2) = N(f)(x^2) = f * adj(f)
 3518                  *   g'(x^2) = N(g)(x^2) = g * adj(g)
 3519                  *
 3520                  *   f'*G' - g'*F' = q
 3521                  *
 3522                  *   F = F'(x^2) * adj(g)
 3523                  *   G = G'(x^2) * adj(f)
 3524                  *
 3525                  * The NTT representation of f is f(w) for all w which
 3526                  * are roots of phi. In the binary case, as well as in
 3527                  * the ternary case for all depth except the deepest,
 3528                  * these roots can be grouped in pairs (w,-w), and we
 3529                  * then have:
 3530                  *
 3531                  *   f(w) = adj(f)(-w)
 3532                  *   f(-w) = adj(f)(w)
 3533                  *
 3534                  * and w^2 is then a root for phi at the half-degree.
 3535                  *
 3536                  * At the deepest level in the ternary case, this still
 3537                  * holds, in the following sense: the roots of x^2-x+1
 3538                  * are (w,-w^2) (for w^3 = -1, and w != -1), and we
 3539                  * have:
 3540                  *
 3541                  *   f(w) = adj(f)(-w^2)
 3542                  *   f(-w^2) = adj(f)(w)
 3543                  *
 3544                  * In all case, we can thus compute F and G in NTT
 3545                  * representation by a few simple multiplications.
 3546                  * Moreover, the two roots for each pair are consecutive
 3547                  * in our bit-reversal encoding.
 3548                  */
 3549                 for (v = 0, x = Ft + u, y = Gt + u;
 3550                         v < hn; v ++, x += (llen << 1), y += (llen << 1))
 3551                 {
 3552                         uint32_t ftA, ftB, gtA, gtB;
 3553                         uint32_t mFp, mGp;
 3554 
 3555                         ftA = fx[(v << 1) + 0];
 3556                         ftB = fx[(v << 1) + 1];
 3557                         gtA = gx[(v << 1) + 0];
 3558                         gtB = gx[(v << 1) + 1];
 3559                         mFp = modp_montymul(Fp[v], R2, p, p0i);
 3560                         mGp = modp_montymul(Gp[v], R2, p, p0i);
 3561                         x[0] = modp_montymul(gtB, mFp, p, p0i);
 3562                         x[llen] = modp_montymul(gtA, mFp, p, p0i);
 3563                         y[0] = modp_montymul(ftB, mGp, p, p0i);
 3564                         y[llen] = modp_montymul(ftA, mGp, p, p0i);
 3565                 }
 3566                 modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
 3567                 modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
 3568 
 3569                 /*
 3570                  * Also save ft and gt (only up to size slen).
 3571                  */
 3572                 if (u < slen) {
 3573                         modp_iNTT2(fx, igm, logn, p, p0i);
 3574                         modp_iNTT2(gx, igm, logn, p, p0i);
 3575                         for (v = 0, x = ft + u, y = gt + u;
 3576                                 v < n; v ++, x += slen, y += slen)
 3577                         {
 3578                                 *x = fx[v];
 3579                                 *y = gx[v];
 3580                         }
 3581                 }
 3582         }
 3583 
 3584         /*
 3585          * Rebuild f, g, F and G with the CRT. Note that the elements of F
 3586          * and G are consecutive, and thus can be rebuilt in a single
 3587          * loop; similarly, the elements of f and g are consecutive.
 3588          */
 3589         zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
 3590         zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
 3591 
 3592         /*
 3593          * Here starts the Babai reduction, specialized for depth = 1.
 3594          *
 3595          * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
 3596          * are converted to floating point. There is no scaling, and a
 3597          * single pass is sufficient.
 3598          */
 3599 
 3600         /*
 3601          * Convert F and G into floating point (rt1 and rt2).
 3602          */
 3603         rt1 = align_fpr(tmp, gt + slen * n);
 3604         rt2 = rt1 + n;
 3605         poly_big_to_fp(rt1, Ft, llen, llen, logn);
 3606         poly_big_to_fp(rt2, Gt, llen, llen, logn);
 3607 
 3608         /*
 3609          * Integer representation of F and G is no longer needed, we
 3610          * can remove it.
 3611          */
 3612         memmove(tmp, ft, 2 * slen * n * sizeof *ft);
 3613         ft = tmp;
 3614         gt = ft + slen * n;
 3615         rt3 = align_fpr(tmp, gt + slen * n);
 3616         memmove(rt3, rt1, 2 * n * sizeof *rt1);
 3617         rt1 = rt3;
 3618         rt2 = rt1 + n;
 3619         rt3 = rt2 + n;
 3620         rt4 = rt3 + n;
 3621 
 3622         /*
 3623          * Convert f and g into floating point (rt3 and rt4).
 3624          */
 3625         poly_big_to_fp(rt3, ft, slen, slen, logn);
 3626         poly_big_to_fp(rt4, gt, slen, slen, logn);
 3627 
 3628         /*
 3629          * Remove unneeded ft and gt.
 3630          */
 3631         memmove(tmp, rt1, 4 * n * sizeof *rt1);
 3632         rt1 = (fpr *)tmp;
 3633         rt2 = rt1 + n;
 3634         rt3 = rt2 + n;
 3635         rt4 = rt3 + n;
 3636 
 3637         /*
 3638          * We now have:
 3639          *   rt1 = F
 3640          *   rt2 = G
 3641          *   rt3 = f
 3642          *   rt4 = g
 3643          * in that order in RAM. We convert all of them to FFT.
 3644          */
 3645         Zf(FFT)(rt1, logn);
 3646         Zf(FFT)(rt2, logn);
 3647         Zf(FFT)(rt3, logn);
 3648         Zf(FFT)(rt4, logn);
 3649 
 3650         /*
 3651          * Compute:
 3652          *   rt5 = F*adj(f) + G*adj(g)
 3653          *   rt6 = 1 / (f*adj(f) + g*adj(g))
 3654          * (Note that rt6 is half-length.)
 3655          */
 3656         rt5 = rt4 + n;
 3657         rt6 = rt5 + n;
 3658         Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn);
 3659         Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn);
 3660 
 3661         /*
 3662          * Compute:
 3663          *   rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
 3664          */
 3665         Zf(poly_mul_autoadj_fft)(rt5, rt6, logn);
 3666 
 3667         /*
 3668          * Compute k as the rounded version of rt5. Check that none of
 3669          * the values is larger than 2^63-1 (in absolute value)
 3670          * because that would make the fpr_rint() do something undefined;
 3671          * note that any out-of-bounds value here implies a failure and
 3672          * (f,g) will be discarded, so we can make a simple test.
 3673          */
 3674         Zf(iFFT)(rt5, logn);
 3675         for (u = 0; u < n; u ++) {
 3676                 fpr z;
 3677 
 3678                 z = rt5[u];
 3679                 if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
 3680                         return 0;
 3681                 }
 3682                 rt5[u] = fpr_of(fpr_rint(z));
 3683         }
 3684         Zf(FFT)(rt5, logn);
 3685 
 3686         /*
 3687          * Subtract k*f from F, and k*g from G.
 3688          */
 3689         Zf(poly_mul_fft)(rt3, rt5, logn);
 3690         Zf(poly_mul_fft)(rt4, rt5, logn);
 3691         Zf(poly_sub)(rt1, rt3, logn);
 3692         Zf(poly_sub)(rt2, rt4, logn);
 3693         Zf(iFFT)(rt1, logn);
 3694         Zf(iFFT)(rt2, logn);
 3695 
 3696         /*
 3697          * Convert back F and G to integers, and return.
 3698          */
 3699         Ft = tmp;
 3700         Gt = Ft + n;
 3701         rt3 = align_fpr(tmp, Gt + n);
 3702         memmove(rt3, rt1, 2 * n * sizeof *rt1);
 3703         rt1 = rt3;
 3704         rt2 = rt1 + n;
 3705         for (u = 0; u < n; u ++) {
 3706                 Ft[u] = (uint32_t)fpr_rint(rt1[u]);
 3707                 Gt[u] = (uint32_t)fpr_rint(rt2[u]);
 3708         }
 3709 
 3710         return 1;
 3711 }
 3712 
 3713 /*
 3714  * Solving the NTRU equation, top level. Upon entry, the F and G
 3715  * from the previous level should be in the tmp[] array.
 3716  *
 3717  * Returned value: 1 on success, 0 on error.
 3718  */
 3719 static int
 3720 solve_NTRU_binary_depth0(unsigned logn,
 3721         const int8_t *f, const int8_t *g, uint32_t *tmp)
 3722 {
 3723         size_t n, hn, u;
 3724         uint32_t p, p0i, R2;
 3725         uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
 3726         uint32_t *gm, *igm, *ft, *gt;
 3727         fpr *rt2, *rt3;
 3728 
 3729         n = (size_t)1 << logn;
 3730         hn = n >> 1;
 3731 
 3732         /*
 3733          * Equations are:
 3734          *
 3735          *   f' = f0^2 - X^2*f1^2
 3736          *   g' = g0^2 - X^2*g1^2
 3737          *   F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
 3738          *   F = F'*(g0 - X*g1)
 3739          *   G = G'*(f0 - X*f1)
 3740          *
 3741          * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
 3742          * degree N/2 (their odd-indexed coefficients are all zero).
 3743          *
 3744          * Everything should fit in 31-bit integers, hence we can just use
 3745          * the first small prime p = 2147473409.
 3746          */
 3747         p = PRIMES[0].p;
 3748         p0i = modp_ninv31(p);
 3749         R2 = modp_R2(p, p0i);
 3750 
 3751         Fp = tmp;
 3752         Gp = Fp + hn;
 3753         ft = Gp + hn;
 3754         gt = ft + n;
 3755         gm = gt + n;
 3756         igm = gm + n;
 3757 
 3758         modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
 3759 
 3760         /*
 3761          * Convert F' anf G' in NTT representation.
 3762          */
 3763         for (u = 0; u < hn; u ++) {
 3764                 Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
 3765                 Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
 3766         }
 3767         modp_NTT2(Fp, gm, logn - 1, p, p0i);
 3768         modp_NTT2(Gp, gm, logn - 1, p, p0i);
 3769 
 3770         /*
 3771          * Load f and g and convert them to NTT representation.
 3772          */
 3773         for (u = 0; u < n; u ++) {
 3774                 ft[u] = modp_set(f[u], p);
 3775                 gt[u] = modp_set(g[u], p);
 3776         }
 3777         modp_NTT2(ft, gm, logn, p, p0i);
 3778         modp_NTT2(gt, gm, logn, p, p0i);
 3779 
 3780         /*
 3781          * Build the unreduced F,G in ft and gt.
 3782          */
 3783         for (u = 0; u < n; u += 2) {
 3784                 uint32_t ftA, ftB, gtA, gtB;
 3785                 uint32_t mFp, mGp;
 3786 
 3787                 ftA = ft[u + 0];
 3788                 ftB = ft[u + 1];
 3789                 gtA = gt[u + 0];
 3790                 gtB = gt[u + 1];
 3791                 mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
 3792                 mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
 3793                 ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
 3794                 ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
 3795                 gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
 3796                 gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
 3797         }
 3798         modp_iNTT2(ft, igm, logn, p, p0i);
 3799         modp_iNTT2(gt, igm, logn, p, p0i);
 3800 
 3801         Gp = Fp + n;
 3802         t1 = Gp + n;
 3803         memmove(Fp, ft, 2 * n * sizeof *ft);
 3804 
 3805         /*
 3806          * We now need to apply the Babai reduction. At that point,
 3807          * we have F and G in two n-word arrays.
 3808          *
 3809          * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
 3810          * modulo p, using the NTT. We still move memory around in
 3811          * order to save RAM.
 3812          */
 3813         t2 = t1 + n;
 3814         t3 = t2 + n;
 3815         t4 = t3 + n;
 3816         t5 = t4 + n;
 3817 
 3818         /*
 3819          * Compute the NTT tables in t1 and t2. We do not keep t2
 3820          * (we'll recompute it later on).
 3821          */
 3822         modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
 3823 
 3824         /*
 3825          * Convert F and G to NTT.
 3826          */
 3827         modp_NTT2(Fp, t1, logn, p, p0i);
 3828         modp_NTT2(Gp, t1, logn, p, p0i);
 3829 
 3830         /*
 3831          * Load f and adj(f) in t4 and t5, and convert them to NTT
 3832          * representation.
 3833          */
 3834         t4[0] = t5[0] = modp_set(f[0], p);
 3835         for (u = 1; u < n; u ++) {
 3836                 t4[u] = modp_set(f[u], p);
 3837                 t5[n - u] = modp_set(-f[u], p);
 3838         }
 3839         modp_NTT2(t4, t1, logn, p, p0i);
 3840         modp_NTT2(t5, t1, logn, p, p0i);
 3841 
 3842         /*
 3843          * Compute F*adj(f) in t2, and f*adj(f) in t3.
 3844          */
 3845         for (u = 0; u < n; u ++) {
 3846                 uint32_t w;
 3847 
 3848                 w = modp_montymul(t5[u], R2, p, p0i);
 3849                 t2[u] = modp_montymul(w, Fp[u], p, p0i);
 3850                 t3[u] = modp_montymul(w, t4[u], p, p0i);
 3851         }
 3852 
 3853         /*
 3854          * Load g and adj(g) in t4 and t5, and convert them to NTT
 3855          * representation.
 3856          */
 3857         t4[0] = t5[0] = modp_set(g[0], p);
 3858         for (u = 1; u < n; u ++) {
 3859                 t4[u] = modp_set(g[u], p);
 3860                 t5[n - u] = modp_set(-g[u], p);
 3861         }
 3862         modp_NTT2(t4, t1, logn, p, p0i);
 3863         modp_NTT2(t5, t1, logn, p, p0i);
 3864 
 3865         /*
 3866          * Add G*adj(g) to t2, and g*adj(g) to t3.
 3867          */
 3868         for (u = 0; u < n; u ++) {
 3869                 uint32_t w;
 3870 
 3871                 w = modp_montymul(t5[u], R2, p, p0i);
 3872                 t2[u] = modp_add(t2[u],
 3873                         modp_montymul(w, Gp[u], p, p0i), p);
 3874                 t3[u] = modp_add(t3[u],
 3875                         modp_montymul(w, t4[u], p, p0i), p);
 3876         }
 3877 
 3878         /*
 3879          * Convert back t2 and t3 to normal representation (normalized
 3880          * around 0), and then
 3881          * move them to t1 and t2. We first need to recompute the
 3882          * inverse table for NTT.
 3883          */
 3884         modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
 3885         modp_iNTT2(t2, t4, logn, p, p0i);
 3886         modp_iNTT2(t3, t4, logn, p, p0i);
 3887         for (u = 0; u < n; u ++) {
 3888                 t1[u] = (uint32_t)modp_norm(t2[u], p);
 3889                 t2[u] = (uint32_t)modp_norm(t3[u], p);
 3890         }
 3891 
 3892         /*
 3893          * At that point, array contents are:
 3894          *
 3895          *   F (NTT representation) (Fp)
 3896          *   G (NTT representation) (Gp)
 3897          *   F*adj(f)+G*adj(g) (t1)
 3898          *   f*adj(f)+g*adj(g) (t2)
 3899          *
 3900          * We want to divide t1 by t2. The result is not integral; it
 3901          * must be rounded. We thus need to use the FFT.
 3902          */
 3903 
 3904         /*
 3905          * Get f*adj(f)+g*adj(g) in FFT representation. Since this
 3906          * polynomial is auto-adjoint, all its coordinates in FFT
 3907          * representation are actually real, so we can truncate off
 3908          * the imaginary parts.
 3909          */
 3910         rt3 = align_fpr(tmp, t3);
 3911         for (u = 0; u < n; u ++) {
 3912                 rt3[u] = fpr_of(((int32_t *)t2)[u]);
 3913         }
 3914         Zf(FFT)(rt3, logn);
 3915         rt2 = align_fpr(tmp, t2);
 3916         memmove(rt2, rt3, hn * sizeof *rt3);
 3917 
 3918         /*
 3919          * Convert F*adj(f)+G*adj(g) in FFT representation.
 3920          */
 3921         rt3 = rt2 + hn;
 3922         for (u = 0; u < n; u ++) {
 3923                 rt3[u] = fpr_of(((int32_t *)t1)[u]);
 3924         }
 3925         Zf(FFT)(rt3, logn);
 3926 
 3927         /*
 3928          * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
 3929          * its rounded normal representation in t1.
 3930          */
 3931         Zf(poly_div_autoadj_fft)(rt3, rt2, logn);
 3932         Zf(iFFT)(rt3, logn);
 3933         for (u = 0; u < n; u ++) {
 3934                 t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
 3935         }
 3936 
 3937         /*
 3938          * RAM contents are now:
 3939          *
 3940          *   F (NTT representation) (Fp)
 3941          *   G (NTT representation) (Gp)
 3942          *   k (t1)
 3943          *
 3944          * We want to compute F-k*f, and G-k*g.
 3945          */
 3946         t2 = t1 + n;
 3947         t3 = t2 + n;
 3948         t4 = t3 + n;
 3949         t5 = t4 + n;
 3950         modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
 3951         for (u = 0; u < n; u ++) {
 3952                 t4[u] = modp_set(f[u], p);
 3953                 t5[u] = modp_set(g[u], p);
 3954         }
 3955         modp_NTT2(t1, t2, logn, p, p0i);
 3956         modp_NTT2(t4, t2, logn, p, p0i);
 3957         modp_NTT2(t5, t2, logn, p, p0i);
 3958         for (u = 0; u < n; u ++) {
 3959                 uint32_t kw;
 3960 
 3961                 kw = modp_montymul(t1[u], R2, p, p0i);
 3962                 Fp[u] = modp_sub(Fp[u],
 3963                         modp_montymul(kw, t4[u], p, p0i), p);
 3964                 Gp[u] = modp_sub(Gp[u],
 3965                         modp_montymul(kw, t5[u], p, p0i), p);
 3966         }
 3967         modp_iNTT2(Fp, t3, logn, p, p0i);
 3968         modp_iNTT2(Gp, t3, logn, p, p0i);
 3969         for (u = 0; u < n; u ++) {
 3970                 Fp[u] = (uint32_t)modp_norm(Fp[u], p);
 3971                 Gp[u] = (uint32_t)modp_norm(Gp[u], p);
 3972         }
 3973 
 3974         return 1;
 3975 }
 3976 
 3977 /*
 3978  * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
 3979  * G can be NULL, in which case that value is computed but not returned.
 3980  * If any of the coefficients of F and G exceeds lim (in absolute value),
 3981  * then 0 is returned.
 3982  */
 3983 static int
 3984 solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
 3985         const int8_t *f, const int8_t *g, int lim, uint32_t *tmp)
 3986 {
 3987         size_t n, u;
 3988         uint32_t *ft, *gt, *Ft, *Gt, *gm;
 3989         uint32_t p, p0i, r;
 3990         const small_prime *primes;
 3991 
 3992         n = MKN(logn);
 3993 
 3994         if (!solve_NTRU_deepest(logn, f, g, tmp)) {
 3995                 return 0;
 3996         }
 3997 
 3998         /*
 3999          * For logn <= 2, we need to use solve_NTRU_intermediate()
 4000          * directly, because coefficients are a bit too large and
 4001          * do not fit the hypotheses in solve_NTRU_binary_depth0().
 4002          */
 4003         if (logn <= 2) {
 4004                 unsigned depth;
 4005 
 4006                 depth = logn;
 4007                 while (depth -- > 0) {
 4008                         if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
 4009                                 return 0;
 4010                         }
 4011                 }
 4012         } else {
 4013                 unsigned depth;
 4014 
 4015                 depth = logn;
 4016                 while (depth -- > 2) {
 4017                         if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
 4018                                 return 0;
 4019                         }
 4020                 }
 4021                 if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
 4022                         return 0;
 4023                 }
 4024                 if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
 4025                         return 0;
 4026                 }
 4027         }
 4028 
 4029         /*
 4030          * If no buffer has been provided for G, use a temporary one.
 4031          */
 4032         if (G == NULL) {
 4033                 G = (int8_t *)(tmp + 2 * n);
 4034         }
 4035 
 4036         /*
 4037          * Final F and G are in fk->tmp, one word per coefficient
 4038          * (signed value over 31 bits).
 4039          */
 4040         if (!poly_big_to_small(F, tmp, lim, logn)
 4041                 || !poly_big_to_small(G, tmp + n, lim, logn))
 4042         {
 4043                 return 0;
 4044         }
 4045 
 4046         /*
 4047          * Verify that the NTRU equation is fulfilled. Since all elements
 4048          * have short lengths, verifying modulo a small prime p works, and
 4049          * allows using the NTT.
 4050          *
 4051          * We put Gt[] first in tmp[], and process it first, so that it does
 4052          * not overlap with G[] in case we allocated it ourselves.
 4053          */
 4054         Gt = tmp;
 4055         ft = Gt + n;
 4056         gt = ft + n;
 4057         Ft = gt + n;
 4058         gm = Ft + n;
 4059 
 4060         primes = PRIMES;
 4061         p = primes[0].p;
 4062         p0i = modp_ninv31(p);
 4063         modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
 4064         for (u = 0; u < n; u ++) {
 4065                 Gt[u] = modp_set(G[u], p);
 4066         }
 4067         for (u = 0; u < n; u ++) {
 4068                 ft[u] = modp_set(f[u], p);
 4069                 gt[u] = modp_set(g[u], p);
 4070                 Ft[u] = modp_set(F[u], p);
 4071         }
 4072         modp_NTT2(ft, gm, logn, p, p0i);
 4073         modp_NTT2(gt, gm, logn, p, p0i);
 4074         modp_NTT2(Ft, gm, logn, p, p0i);
 4075         modp_NTT2(Gt, gm, logn, p, p0i);
 4076         r = modp_montymul(12289, 1, p, p0i);
 4077         for (u = 0; u < n; u ++) {
 4078                 uint32_t z;
 4079 
 4080                 z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
 4081                         modp_montymul(gt[u], Ft[u], p, p0i), p);
 4082                 if (z != r) {
 4083                         return 0;
 4084                 }
 4085         }
 4086 
 4087         return 1;
 4088 }
 4089 
 4090 /*
 4091  * Generate a random polynomial with a Gaussian distribution. This function
 4092  * also makes sure that the resultant of the polynomial with phi is odd.
 4093  */
 4094 static void
 4095 poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn)
 4096 {
 4097         size_t n, u;
 4098         unsigned mod2;
 4099 
 4100         n = MKN(logn);
 4101         mod2 = 0;
 4102         for (u = 0; u < n; u ++) {
 4103                 int s;
 4104 
 4105         restart:
 4106                 s = mkgauss(rng, logn);
 4107 
 4108                 /*
 4109                  * We need the coefficient to fit within -127..+127;
 4110                  * realistically, this is always the case except for
 4111                  * the very low degrees (N = 2 or 4), for which there
 4112                  * is no real security anyway.
 4113                  */
 4114                 if (s < -127 || s > 127) {
 4115                         goto restart;
 4116                 }
 4117 
 4118                 /*
 4119                  * We need the sum of all coefficients to be 1; otherwise,
 4120                  * the resultant of the polynomial with X^N+1 will be even,
 4121                  * and the binary GCD will fail.
 4122                  */
 4123                 if (u == n - 1) {
 4124                         if ((mod2 ^ (unsigned)(s & 1)) == 0) {
 4125                                 goto restart;
 4126                         }
 4127                 } else {
 4128                         mod2 ^= (unsigned)(s & 1);
 4129                 }
 4130                 f[u] = (int8_t)s;
 4131         }
 4132 }
 4133 
 4134 /* see falcon.h */
 4135 void
 4136 Zf(keygen)(inner_shake256_context *rng,
 4137         int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
 4138         unsigned logn, uint8_t *tmp)
 4139 {
 4140         /*
 4141          * Algorithm is the following:
 4142          *
 4143          *  - Generate f and g with the Gaussian distribution.
 4144          *
 4145          *  - If either Res(f,phi) or Res(g,phi) is even, try again.
 4146          *
 4147          *  - If ||(f,g)|| is too large, try again.
 4148          *
 4149          *  - If ||B~_{f,g}|| is too large, try again.
 4150          *
 4151          *  - If f is not invertible mod phi mod q, try again.
 4152          *
 4153          *  - Compute h = g/f mod phi mod q.
 4154          *
 4155          *  - Solve the NTRU equation fG - gF = q; if the solving fails,
 4156          *    try again. Usual failure condition is when Res(f,phi)
 4157          *    and Res(g,phi) are not prime to each other.
 4158          */
 4159         size_t n, u;
 4160         uint16_t *h2, *tmp2;
 4161         RNG_CONTEXT *rc;
 4162 #if FALCON_KG_CHACHA20  // yyyKG_CHACHA20+1
 4163         prng p;
 4164 #endif  // yyyKG_CHACHA20-
 4165 
 4166         n = MKN(logn);
 4167 #if FALCON_KG_CHACHA20  // yyyKG_CHACHA20+1
 4168         Zf(prng_init)(&p, rng);
 4169         rc = &p;
 4170 #else // yyyKG_CHACHA20+0
 4171         rc = rng;
 4172 #endif  // yyyKG_CHACHA20-
 4173 
 4174         /*
 4175          * We need to generate f and g randomly, until we find values
 4176          * such that the norm of (g,-f), and of the orthogonalized
 4177          * vector, are satisfying. The orthogonalized vector is:
 4178          *   (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
 4179          * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
 4180          *
 4181          * In the binary case, coefficients of f and g are generated
 4182          * independently of each other, with a discrete Gaussian
 4183          * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
 4184          * the two vectors have expected norm 1.17*sqrt(q), which is
 4185          * also our acceptance bound: we require both vectors to be no
 4186          * larger than that (this will be satisfied about 1/4th of the
 4187          * time, thus we expect sampling new (f,g) about 4 times for that
 4188          * step).
 4189          *
 4190          * We require that Res(f,phi) and Res(g,phi) are both odd (the
 4191          * NTRU equation solver requires it).
 4192          */
 4193         for (;;) {
 4194                 fpr *rt1, *rt2, *rt3;
 4195                 fpr bnorm;
 4196                 uint32_t normf, normg, norm;
 4197                 int lim;
 4198 
 4199                 /*
 4200                  * The poly_small_mkgauss() function makes sure
 4201                  * that the sum of coefficients is 1 modulo 2
 4202                  * (i.e. the resultant of the polynomial with phi
 4203                  * will be odd).
 4204                  */
 4205                 poly_small_mkgauss(rc, f, logn);
 4206                 poly_small_mkgauss(rc, g, logn);
 4207 
 4208                 /*
 4209                  * Verify that all coefficients are within the bounds
 4210                  * defined in max_fg_bits. This is the case with
 4211                  * overwhelming probability; this guarantees that the
 4212                  * key will be encodable with FALCON_COMP_TRIM.
 4213                  */
 4214                 lim = 1 << (Zf(max_fg_bits)[logn] - 1);
 4215                 for (u = 0; u < n; u ++) {
 4216                         /*
 4217                          * We can use non-CT tests since on any failure
 4218                          * we will discard f and g.
 4219                          */
 4220                         if (f[u] >= lim || f[u] <= -lim
 4221                                 || g[u] >= lim || g[u] <= -lim)
 4222                         {
 4223                                 lim = -1;
 4224                                 break;
 4225                         }
 4226                 }
 4227                 if (lim < 0) {
 4228                         continue;
 4229                 }
 4230 
 4231                 /*
 4232                  * Bound is 1.17*sqrt(q). We compute the squared
 4233                  * norms. With q = 12289, the squared bound is:
 4234                  *   (1.17^2)* 12289 = 16822.4121
 4235                  * Since f and g are integral, the squared norm
 4236                  * of (g,-f) is an integer.
 4237                  */
 4238                 normf = poly_small_sqnorm(f, logn);
 4239                 normg = poly_small_sqnorm(g, logn);
 4240                 norm = (normf + normg) | -((normf | normg) >> 31);
 4241                 if (norm >= 16823) {
 4242                         continue;
 4243                 }
 4244 
 4245                 /*
 4246                  * We compute the orthogonalized vector norm.
 4247                  */
 4248                 rt1 = (fpr *)tmp;
 4249                 rt2 = rt1 + n;
 4250                 rt3 = rt2 + n;
 4251                 poly_small_to_fp(rt1, f, logn);
 4252                 poly_small_to_fp(rt2, g, logn);
 4253                 Zf(FFT)(rt1, logn);
 4254                 Zf(FFT)(rt2, logn);
 4255                 Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn);
 4256                 Zf(poly_adj_fft)(rt1, logn);
 4257                 Zf(poly_adj_fft)(rt2, logn);
 4258                 Zf(poly_mulconst)(rt1, fpr_q, logn);
 4259                 Zf(poly_mulconst)(rt2, fpr_q, logn);
 4260                 Zf(poly_mul_autoadj_fft)(rt1, rt3, logn);
 4261                 Zf(poly_mul_autoadj_fft)(rt2, rt3, logn);
 4262                 Zf(iFFT)(rt1, logn);
 4263                 Zf(iFFT)(rt2, logn);
 4264                 bnorm = fpr_zero;
 4265                 for (u = 0; u < n; u ++) {
 4266                         bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
 4267                         bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
 4268                 }
 4269                 if (!fpr_lt(bnorm, fpr_bnorm_max)) {
 4270                         continue;
 4271                 }
 4272 
 4273                 /*
 4274                  * Compute public key h = g/f mod X^N+1 mod q. If this
 4275                  * fails, we must restart.
 4276                  */
 4277                 if (h == NULL) {
 4278                         h2 = (uint16_t *)tmp;
 4279                         tmp2 = h2 + n;
 4280                 } else {
 4281                         h2 = h;
 4282                         tmp2 = (uint16_t *)tmp;
 4283                 }
 4284                 if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) {
 4285                         continue;
 4286                 }
 4287 
 4288                 /*
 4289                  * Solve the NTRU equation to get F and G.
 4290                  */
 4291                 lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1;
 4292                 if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
 4293                         continue;
 4294                 }
 4295 
 4296                 /*
 4297                  * Key pair is generated.
 4298                  */
 4299                 break;
 4300         }
 4301 }