keygen.c
1 /*
2 * Falcon key pair generation.
3 *
4 * ==========================(LICENSE BEGIN)============================
5 *
6 * Copyright (c) 2017-2019 Falcon Project
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining
9 * a copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be
17 * included in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * ===========================(LICENSE END)=============================
28 *
29 * @author Thomas Pornin <thomas.pornin@nccgroup.com>
30 */
31
32 #include "inner.h"
33
/* MKN(logn): number of polynomial coefficients, N = 2^logn. */
#define MKN(logn) ((size_t)1 << (logn))
35
36 /* ==================================================================== */
37 /*
38 * Modular arithmetics.
39 *
40 * We implement a few functions for computing modulo a small integer p.
41 *
42 * All functions require that 2^30 < p < 2^31. Moreover, operands must
43 * be in the 0..p-1 range.
44 *
45 * Modular addition and subtraction work for all such p.
46 *
47 * Montgomery multiplication requires that p is odd, and must be provided
48 * with an additional value p0i = -1/p mod 2^31. See below for some basics
49 * on Montgomery multiplication.
50 *
51 * Division computes an inverse modulo p by an exponentiation (with
52 * exponent p-2): this works only if p is prime. Multiplication
53 * requirements also apply, i.e. p must be odd and p0i must be provided.
54 *
55 * The NTT and inverse NTT need all of the above, and also that
56 * p = 1 mod 2048.
57 *
58 * -----------------------------------------------------------------------
59 *
60 * We use Montgomery representation with 31-bit values:
61 *
62 * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
63 * Montgomery representation of an integer x modulo p is x*R mod p.
64 *
65 * Montgomery multiplication computes (x*y)/R mod p for
66 * operands x and y. Therefore:
67 *
68 * - if operands are x*R and y*R (Montgomery representations of x and
69 * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
70 * mod p, which is the Montgomery representation of the product x*y;
71 *
72 * - if operands are x*R and y (or x and y*R), then Montgomery
73 * multiplication returns x*y mod p: mixed-representation
74 * multiplications yield results in normal representation.
75 *
76 * To convert to Montgomery representation, we multiply by R, which is done
77 * by Montgomery-multiplying by R^2. Stand-alone conversion back from
78 * Montgomery representation is Montgomery-multiplication by 1.
79 */
80
81 /*
82 * Precomputed small primes. Each element contains the following:
83 *
84 * p The prime itself.
85 *
86 * g A primitive root of phi = X^N+1 (in field Z_p).
87 *
88 * s The inverse of the product of all previous primes in the array,
89 * computed modulo p and in Montgomery representation.
90 *
91 * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
92 * are listed in decreasing order.
93 */
94
typedef struct {
	uint32_t p;   /* the prime itself (2^30 < p < 2^31, p = 1 mod 2048) */
	uint32_t g;   /* primitive root of phi = X^N+1 in Z_p (see comment above) */
	uint32_t s;   /* 1/(product of previous primes) mod p, Montgomery repr. */
} small_prime;
100
/*
 * Each entry is { p, g, s } as described above; the list is terminated
 * by an all-zero sentinel entry.
 */
static const small_prime PRIMES[] = {
	{ 2147473409,  383167813,      10239 },
	{ 2147389441,  211808905,  471403745 },
	{ 2147387393,   37672282, 1329335065 },
	{ 2147377153, 1977035326,  968223422 },
	{ 2147358721, 1067163706,  132460015 },
	{ 2147352577, 1606082042,  598693809 },
	{ 2147346433, 2033915641, 1056257184 },
	{ 2147338241, 1653770625,  421286710 },
	{ 2147309569,  631200819, 1111201074 },
	{ 2147297281, 2038364663, 1042003613 },
	{ 2147295233, 1962540515,   19440033 },
	{ 2147239937, 2100082663,  353296760 },
	{ 2147235841, 1991153006, 1703918027 },
	{ 2147217409,  516405114, 1258919613 },
	{ 2147205121,  409347988, 1089726929 },
	{ 2147196929,  927788991, 1946238668 },
	{ 2147178497, 1136922411, 1347028164 },
	{ 2147100673,  868626236,  701164723 },
	{ 2147082241, 1897279176,  617820870 },
	{ 2147074049, 1888819123,  158382189 },
	{ 2147051521,   25006327,  522758543 },
	{ 2147043329,  327546255,   37227845 },
	{ 2147039233,  766324424, 1133356428 },
	{ 2146988033, 1862817362,   73861329 },
	{ 2146963457,  404622040,  653019435 },
	{ 2146959361, 1936581214,  995143093 },
	{ 2146938881, 1559770096,  634921513 },
	{ 2146908161,  422623708, 1985060172 },
	{ 2146885633, 1751189170,  298238186 },
	{ 2146871297,  578919515,  291810829 },
	{ 2146846721, 1114060353,  915902322 },
	{ 2146834433, 2069565474,   47859524 },
	{ 2146818049, 1552824584,  646281055 },
	{ 2146775041, 1906267847, 1597832891 },
	{ 2146756609, 1847414714, 1228090888 },
	{ 2146744321, 1818792070, 1176377637 },
	{ 2146738177, 1118066398, 1054971214 },
	{ 2146736129,   52057278,  933422153 },
	{ 2146713601,  592259376, 1406621510 },
	{ 2146695169,  263161877, 1514178701 },
	{ 2146656257,  685363115,  384505091 },
	{ 2146650113,  927727032,  537575289 },
	{ 2146646017,   52575506, 1799464037 },
	{ 2146643969, 1276803876, 1348954416 },
	{ 2146603009,  814028633, 1521547704 },
	{ 2146572289, 1846678872, 1310832121 },
	{ 2146547713,  919368090, 1019041349 },
	{ 2146508801,  671847612,   38582496 },
	{ 2146492417,  283911680,  532424562 },
	{ 2146490369, 1780044827,  896447978 },
	{ 2146459649,  327980850, 1327906900 },
	{ 2146447361, 1310561493,  958645253 },
	{ 2146441217,  412148926,  287271128 },
	{ 2146437121,  293186449, 2009822534 },
	{ 2146430977,  179034356, 1359155584 },
	{ 2146418689, 1517345488, 1790248672 },
	{ 2146406401, 1615820390, 1584833571 },
	{ 2146404353,  826651445,  607120498 },
	{ 2146379777,    3816988, 1897049071 },
	{ 2146363393, 1221409784, 1986921567 },
	{ 2146355201, 1388081168,  849968120 },
	{ 2146336769, 1803473237, 1655544036 },
	{ 2146312193, 1023484977,  273671831 },
	{ 2146293761, 1074591448,  467406983 },
	{ 2146283521,  831604668, 1523950494 },
	{ 2146203649,  712865423, 1170834574 },
	{ 2146154497, 1764991362, 1064856763 },
	{ 2146142209,  627386213, 1406840151 },
	{ 2146127873, 1638674429, 2088393537 },
	{ 2146099201, 1516001018,  690673370 },
	{ 2146093057, 1294931393,  315136610 },
	{ 2146091009, 1942399533,  973539425 },
	{ 2146078721, 1843461814, 2132275436 },
	{ 2146060289, 1098740778,  360423481 },
	{ 2146048001, 1617213232, 1951981294 },
	{ 2146041857, 1805783169, 2075683489 },
	{ 2146019329,  272027909, 1753219918 },
	{ 2145986561, 1206530344, 2034028118 },
	{ 2145976321, 1243769360, 1173377644 },
	{ 2145964033,  887200839, 1281344586 },
	{ 2145906689, 1651026455,  906178216 },
	{ 2145875969, 1673238256, 1043521212 },
	{ 2145871873, 1226591210, 1399796492 },
	{ 2145841153, 1465353397, 1324527802 },
	{ 2145832961, 1150638905,  554084759 },
	{ 2145816577,  221601706,  427340863 },
	{ 2145785857,  608896761,  316590738 },
	{ 2145755137, 1712054942, 1684294304 },
	{ 2145742849, 1302302867,  724873116 },
	{ 2145728513,  516717693,  431671476 },
	{ 2145699841,  524575579, 1619722537 },
	{ 2145691649, 1925625239,  982974435 },
	{ 2145687553,  463795662, 1293154300 },
	{ 2145673217,  771716636,  881778029 },
	{ 2145630209, 1509556977,  837364988 },
	{ 2145595393,  229091856,  851648427 },
	{ 2145587201, 1796903241,  635342424 },
	{ 2145525761,  715310882, 1677228081 },
	{ 2145495041, 1040930522,  200685896 },
	{ 2145466369,  949804237, 1809146322 },
	{ 2145445889, 1673903706,   95316881 },
	{ 2145390593,  806941852, 1428671135 },
	{ 2145372161, 1402525292,  159350694 },
	{ 2145361921, 2124760298, 1589134749 },
	{ 2145359873, 1217503067, 1561543010 },
	{ 2145355777,  338341402,   83865711 },
	{ 2145343489, 1381532164,  641430002 },
	{ 2145325057, 1883895478, 1528469895 },
	{ 2145318913, 1335370424,   65809740 },
	{ 2145312769, 2000008042, 1919775760 },
	{ 2145300481,  961450962, 1229540578 },
	{ 2145282049,  910466767, 1964062701 },
	{ 2145232897,  816527501,  450152063 },
	{ 2145218561, 1435128058, 1794509700 },
	{ 2145187841,   33505311, 1272467582 },
	{ 2145181697,  269767433, 1380363849 },
	{ 2145175553,   56386299, 1316870546 },
	{ 2145079297, 2106880293, 1391797340 },
	{ 2145021953, 1347906152,  720510798 },
	{ 2145015809,  206769262, 1651459955 },
	{ 2145003521, 1885513236, 1393381284 },
	{ 2144960513, 1810381315,   31937275 },
	{ 2144944129, 1306487838, 2019419520 },
	{ 2144935937,   37304730, 1841489054 },
	{ 2144894977, 1601434616,  157985831 },
	{ 2144888833,   98749330, 2128592228 },
	{ 2144880641, 1772327002, 2076128344 },
	{ 2144864257, 1404514762, 2029969964 },
	{ 2144827393,  801236594,  406627220 },
	{ 2144806913,  349217443, 1501080290 },
	{ 2144796673, 1542656776, 2084736519 },
	{ 2144778241, 1210734884, 1746416203 },
	{ 2144759809, 1146598851,  716464489 },
	{ 2144757761,  286328400, 1823728177 },
	{ 2144729089, 1347555695, 1836644881 },
	{ 2144727041, 1795703790,  520296412 },
	{ 2144696321, 1302475157,  852964281 },
	{ 2144667649, 1075877614,  504992927 },
	{ 2144573441,  198765808, 1617144982 },
	{ 2144555009,  321528767,  155821259 },
	{ 2144550913,  814139516, 1819937644 },
	{ 2144536577,  571143206,  962942255 },
	{ 2144524289, 1746733766,    2471321 },
	{ 2144512001, 1821415077,  124190939 },
	{ 2144468993,  917871546, 1260072806 },
	{ 2144458753,  378417981, 1569240563 },
	{ 2144421889,  175229668, 1825620763 },
	{ 2144409601, 1699216963,  351648117 },
	{ 2144370689, 1071885991,  958186029 },
	{ 2144348161, 1763151227,  540353574 },
	{ 2144335873, 1060214804,  919598847 },
	{ 2144329729,  663515846, 1448552668 },
	{ 2144327681, 1057776305,  590222840 },
	{ 2144309249, 1705149168, 1459294624 },
	{ 2144296961,  325823721, 1649016934 },
	{ 2144290817,  738775789,  447427206 },
	{ 2144243713,  962347618,  893050215 },
	{ 2144237569, 1655257077,  900860862 },
	{ 2144161793,  242206694, 1567868672 },
	{ 2144155649,  769415308, 1247993134 },
	{ 2144137217,  320492023,  515841070 },
	{ 2144120833, 1639388522,  770877302 },
	{ 2144071681, 1761785233,  964296120 },
	{ 2144065537,  419817825,  204564472 },
	{ 2144028673,  666050597, 2091019760 },
	{ 2144010241, 1413657615, 1518702610 },
	{ 2143952897, 1238327946,  475672271 },
	{ 2143940609,  307063413, 1176750846 },
	{ 2143918081, 2062905559,  786785803 },
	{ 2143899649, 1338112849, 1562292083 },
	{ 2143891457,   68149545,   87166451 },
	{ 2143885313,  921750778,  394460854 },
	{ 2143854593,  719766593,  133877196 },
	{ 2143836161, 1149399850, 1861591875 },
	{ 2143762433, 1848739366, 1335934145 },
	{ 2143756289, 1326674710,  102999236 },
	{ 2143713281,  808061791, 1156900308 },
	{ 2143690753,  388399459, 1926468019 },
	{ 2143670273, 1427891374, 1756689401 },
	{ 2143666177, 1912173949,  986629565 },
	{ 2143645697, 2041160111,  371842865 },
	{ 2143641601, 1279906897, 2023974350 },
	{ 2143635457,  720473174, 1389027526 },
	{ 2143621121, 1298309455, 1732632006 },
	{ 2143598593, 1548762216, 1825417506 },
	{ 2143567873,  620475784, 1073787233 },
	{ 2143561729, 1932954575,  949167309 },
	{ 2143553537,  354315656, 1652037534 },
	{ 2143541249,  577424288, 1097027618 },
	{ 2143531009,  357862822,  478640055 },
	{ 2143522817, 2017706025, 1550531668 },
	{ 2143506433, 2078127419, 1824320165 },
	{ 2143488001,  613475285, 1604011510 },
	{ 2143469569, 1466594987,  502095196 },
	{ 2143426561, 1115430331, 1044637111 },
	{ 2143383553,    9778045, 1902463734 },
	{ 2143377409, 1557401276, 2056861771 },
	{ 2143363073,  652036455, 1965915971 },
	{ 2143260673, 1464581171, 1523257541 },
	{ 2143246337, 1876119649,  764541916 },
	{ 2143209473, 1614992673, 1920672844 },
	{ 2143203329,  981052047, 2049774209 },
	{ 2143160321, 1847355533,  728535665 },
	{ 2143129601,  965558457,  603052992 },
	{ 2143123457, 2140817191,    8348679 },
	{ 2143100929, 1547263683,  694209023 },
	{ 2143092737,  643459066, 1979934533 },
	{ 2143082497,  188603778, 2026175670 },
	{ 2143062017, 1657329695,  377451099 },
	{ 2143051777,  114967950,  979255473 },
	{ 2143025153, 1698431342, 1449196896 },
	{ 2143006721, 1862741675, 1739650365 },
	{ 2142996481,  756660457,  996160050 },
	{ 2142976001,  927864010, 1166847574 },
	{ 2142965761,  905070557,  661974566 },
	{ 2142916609,   40932754, 1787161127 },
	{ 2142892033, 1987985648,  675335382 },
	{ 2142885889,  797497211, 1323096997 },
	{ 2142871553, 2068025830, 1411877159 },
	{ 2142861313, 1217177090, 1438410687 },
	{ 2142830593,  409906375, 1767860634 },
	{ 2142803969, 1197788993,  359782919 },
	{ 2142785537,  643817365,  513932862 },
	{ 2142779393, 1717046338,  218943121 },
	{ 2142724097,   89336830,  416687049 },
	{ 2142707713,    5944581, 1356813523 },
	{ 2142658561,  887942135, 2074011722 },
	{ 2142638081,  151851972, 1647339939 },
	{ 2142564353, 1691505537, 1483107336 },
	{ 2142533633, 1989920200, 1135938817 },
	{ 2142529537,  959263126, 1531961857 },
	{ 2142527489,  453251129, 1725566162 },
	{ 2142502913, 1536028102,  182053257 },
	{ 2142498817,  570138730,  701443447 },
	{ 2142416897,  326965800,  411931819 },
	{ 2142363649, 1675665410, 1517191733 },
	{ 2142351361,  968529566, 1575712703 },
	{ 2142330881, 1384953238, 1769087884 },
	{ 2142314497, 1977173242, 1833745524 },
	{ 2142289921,   95082313, 1714775493 },
	{ 2142283777,  109377615, 1070584533 },
	{ 2142277633,   16960510,  702157145 },
	{ 2142263297,  553850819,  431364395 },
	{ 2142208001,  241466367, 2053967982 },
	{ 2142164993, 1795661326, 1031836848 },
	{ 2142097409, 1212530046,  712772031 },
	{ 2142087169, 1763869720,  822276067 },
	{ 2142078977,  644065713, 1765268066 },
	{ 2142074881,  112671944,  643204925 },
	{ 2142044161, 1387785471, 1297890174 },
	{ 2142025729,  783885537, 1000425730 },
	{ 2142011393,  905662232, 1679401033 },
	{ 2141974529,  799788433,  468119557 },
	{ 2141943809, 1932544124,  449305555 },
	{ 2141933569, 1527403256,  841867925 },
	{ 2141931521, 1247076451,  743823916 },
	{ 2141902849, 1199660531,  401687910 },
	{ 2141890561,  150132350, 1720336972 },
	{ 2141857793, 1287438162,  663880489 },
	{ 2141833217,  618017731, 1819208266 },
	{ 2141820929,  999578638, 1403090096 },
	{ 2141786113,   81834325, 1523542501 },
	{ 2141771777,  120001928,  463556492 },
	{ 2141759489,  122455485, 2124928282 },
	{ 2141749249,  141986041,  940339153 },
	{ 2141685761,  889088734,  477141499 },
	{ 2141673473,  324212681, 1122558298 },
	{ 2141669377, 1175806187, 1373818177 },
	{ 2141655041, 1113654822,  296887082 },
	{ 2141587457,  991103258, 1585913875 },
	{ 2141583361, 1401451409, 1802457360 },
	{ 2141575169, 1571977166,  712760980 },
	{ 2141546497, 1107849376, 1250270109 },
	{ 2141515777,  196544219,  356001130 },
	{ 2141495297, 1733571506, 1060744866 },
	{ 2141483009,  321552363, 1168297026 },
	{ 2141458433,  505818251,  733225819 },
	{ 2141360129, 1026840098,  948342276 },
	{ 2141325313,  945133744, 2129965998 },
	{ 2141317121, 1871100260, 1843844634 },
	{ 2141286401, 1790639498, 1750465696 },
	{ 2141267969, 1376858592,  186160720 },
	{ 2141255681, 2129698296, 1876677959 },
	{ 2141243393, 2138900688, 1340009628 },
	{ 2141214721, 1933049835, 1087819477 },
	{ 2141212673, 1898664939, 1786328049 },
	{ 2141202433,  990234828,  940682169 },
	{ 2141175809, 1406392421,  993089586 },
	{ 2141165569, 1263518371,  289019479 },
	{ 2141073409, 1485624211,  507864514 },
	{ 2141052929, 1885134788,  311252465 },
	{ 2141040641, 1285021247,  280941862 },
	{ 2141028353, 1527610374,  375035110 },
	{ 2141011969, 1400626168,  164696620 },
	{ 2140999681,  632959608,  966175067 },
	{ 2140997633, 2045628978, 1290889438 },
	{ 2140993537, 1412755491,  375366253 },
	{ 2140942337,  719477232,  785367828 },
	{ 2140925953,   45224252,  836552317 },
	{ 2140917761, 1157376588, 1001839569 },
	{ 2140887041,  278480752, 2098732796 },
	{ 2140837889, 1663139953,  924094810 },
	{ 2140788737,  802501511, 2045368990 },
	{ 2140766209, 1820083885, 1800295504 },
	{ 2140764161, 1169561905, 2106792035 },
	{ 2140696577,  127781498, 1885987531 },
	{ 2140684289,   16014477, 1098116827 },
	{ 2140653569,  665960598, 1796728247 },
	{ 2140594177, 1043085491,  377310938 },
	{ 2140579841, 1732838211, 1504505945 },
	{ 2140569601,  302071939,  358291016 },
	{ 2140567553,  192393733, 1909137143 },
	{ 2140557313,  406595731, 1175330270 },
	{ 2140549121, 1748850918,  525007007 },
	{ 2140477441,  499436566, 1031159814 },
	{ 2140469249, 1886004401, 1029951320 },
	{ 2140426241, 1483168100, 1676273461 },
	{ 2140420097, 1779917297,  846024476 },
	{ 2140413953,  522948893, 1816354149 },
	{ 2140383233, 1931364473, 1296921241 },
	{ 2140366849, 1917356555,  147196204 },
	{ 2140354561,   16466177, 1349052107 },
	{ 2140348417, 1875366972, 1860485634 },
	{ 2140323841,  456498717, 1790256483 },
	{ 2140321793, 1629493973,  150031888 },
	{ 2140315649, 1904063898,  395510935 },
	{ 2140280833, 1784104328,  831417909 },
	{ 2140250113,  256087139,  697349101 },
	{ 2140229633,  388553070,  243875754 },
	{ 2140223489,  747459608, 1396270850 },
	{ 2140200961,  507423743, 1895572209 },
	{ 2140162049,  580106016, 2045297469 },
	{ 2140149761,  712426444,  785217995 },
	{ 2140137473, 1441607584,  536866543 },
	{ 2140119041,  346538902, 1740434653 },
	{ 2140090369,  282642885,   21051094 },
	{ 2140076033, 1407456228,  319910029 },
	{ 2140047361, 1619330500, 1488632070 },
	{ 2140041217, 2089408064, 2012026134 },
	{ 2140008449, 1705524800, 1613440760 },
	{ 2139924481, 1846208233, 1280649481 },
	{ 2139906049,  989438755, 1185646076 },
	{ 2139867137, 1522314850,  372783595 },
	{ 2139842561, 1681587377,  216848235 },
	{ 2139826177, 2066284988, 1784999464 },
	{ 2139824129,  480888214, 1513323027 },
	{ 2139789313,  847937200,  858192859 },
	{ 2139783169, 1642000434, 1583261448 },
	{ 2139770881,  940699589,  179702100 },
	{ 2139768833,  315623242,  964612676 },
	{ 2139666433,  331649203,  764666914 },
	{ 2139641857, 2118730799, 1313764644 },
	{ 2139635713,  519149027,  519212449 },
	{ 2139598849, 1526413634, 1769667104 },
	{ 2139574273,  551148610,  820739925 },
	{ 2139568129, 1386800242,  472447405 },
	{ 2139549697,  813760130, 1412328531 },
	{ 2139537409, 1615286260, 1609362979 },
	{ 2139475969, 1352559299, 1696720421 },
	{ 2139455489, 1048691649, 1584935400 },
	{ 2139432961,  836025845,  950121150 },
	{ 2139424769, 1558281165, 1635486858 },
	{ 2139406337, 1728402143, 1674423301 },
	{ 2139396097, 1727715782, 1483470544 },
	{ 2139383809, 1092853491, 1741699084 },
	{ 2139369473,  690776899, 1242798709 },
	{ 2139351041, 1768782380, 2120712049 },
	{ 2139334657, 1739968247, 1427249225 },
	{ 2139332609, 1547189119,  623011170 },
	{ 2139310081, 1346827917, 1605466350 },
	{ 2139303937,  369317948,  828392831 },
	{ 2139301889, 1560417239, 1788073219 },
	{ 2139283457, 1303121623,  595079358 },
	{ 2139248641, 1354555286,  573424177 },
	{ 2139240449,   60974056,  885781403 },
	{ 2139222017,  355573421, 1221054839 },
	{ 2139215873,  566477826, 1724006500 },
	{ 2139150337,  871437673, 1609133294 },
	{ 2139144193, 1478130914, 1137491905 },
	{ 2139117569, 1854880922,  964728507 },
	{ 2139076609,  202405335,  756508944 },
	{ 2139062273, 1399715741,  884826059 },
	{ 2139045889, 1051045798, 1202295476 },
	{ 2139033601, 1707715206,  632234634 },
	{ 2139006977, 2035853139,  231626690 },
	{ 2138951681,  183867876,  838350879 },
	{ 2138945537, 1403254661,  404460202 },
	{ 2138920961,  310865011, 1282911681 },
	{ 2138910721, 1328496553,  103472415 },
	{ 2138904577,   78831681,  993513549 },
	{ 2138902529, 1319697451, 1055904361 },
	{ 2138816513,  384338872, 1706202469 },
	{ 2138810369, 1084868275,  405677177 },
	{ 2138787841,  401181788, 1964773901 },
	{ 2138775553, 1850532988, 1247087473 },
	{ 2138767361,  874261901, 1576073565 },
	{ 2138757121, 1187474742,  993541415 },
	{ 2138748929, 1782458888, 1043206483 },
	{ 2138744833, 1221500487,  800141243 },
	{ 2138738689,  413465368, 1450660558 },
	{ 2138695681,  739045140,  342611472 },
	{ 2138658817, 1355845756,  672674190 },
	{ 2138644481,  608379162, 1538874380 },
	{ 2138632193, 1444914034,  686911254 },
	{ 2138607617,  484707818, 1435142134 },
	{ 2138591233,  539460669, 1290458549 },
	{ 2138572801, 2093538990, 2011138646 },
	{ 2138552321, 1149786988, 1076414907 },
	{ 2138546177,  840688206, 2108985273 },
	{ 2138533889,  209669619,  198172413 },
	{ 2138523649, 1975879426, 1277003968 },
	{ 2138490881, 1351891144, 1976858109 },
	{ 2138460161, 1817321013, 1979278293 },
	{ 2138429441, 1950077177,  203441928 },
	{ 2138400769,  908970113,  628395069 },
	{ 2138398721,  219890864,  758486760 },
	{ 2138376193, 1306654379,  977554090 },
	{ 2138351617,  298822498, 2004708503 },
	{ 2138337281,  441457816, 1049002108 },
	{ 2138320897, 1517731724, 1442269609 },
	{ 2138290177, 1355911197, 1647139103 },
	{ 2138234881,  531313247, 1746591962 },
	{ 2138214401, 1899410930,  781416444 },
	{ 2138202113, 1813477173, 1622508515 },
	{ 2138191873, 1086458299, 1025408615 },
	{ 2138183681, 1998800427,  827063290 },
	{ 2138173441, 1921308898,  749670117 },
	{ 2138103809, 1620902804, 2126787647 },
	{ 2138099713,  828647069, 1892961817 },
	{ 2138085377,  179405355, 1525506535 },
	{ 2138060801,  615683235, 1259580138 },
	{ 2138044417, 2030277840, 1731266562 },
	{ 2138042369, 2087222316, 1627902259 },
	{ 2138032129,  126388712, 1108640984 },
	{ 2138011649,  715026550, 1017980050 },
	{ 2137993217, 1693714349, 1351778704 },
	{ 2137888769, 1289762259, 1053090405 },
	{ 2137853953,  199991890, 1254192789 },
	{ 2137833473,  941421685,  896995556 },
	{ 2137817089,  750416446, 1251031181 },
	{ 2137792513,  798075119,  368077456 },
	{ 2137786369,  878543495, 1035375025 },
	{ 2137767937,    9351178, 1156563902 },
	{ 2137755649, 1382297614, 1686559583 },
	{ 2137724929, 1345472850, 1681096331 },
	{ 2137704449,  834666929,  630551727 },
	{ 2137673729, 1646165729, 1892091571 },
	{ 2137620481,  778943821,   48456461 },
	{ 2137618433, 1730837875, 1713336725 },
	{ 2137581569,  805610339, 1378891359 },
	{ 2137538561,  204342388, 1950165220 },
	{ 2137526273, 1947629754, 1500789441 },
	{ 2137516033,  719902645, 1499525372 },
	{ 2137491457,  230451261,  556382829 },
	{ 2137440257,  979573541,  412760291 },
	{ 2137374721,  927841248, 1954137185 },
	{ 2137362433, 1243778559,  861024672 },
	{ 2137313281, 1341338501,  980638386 },
	{ 2137311233,  937415182, 1793212117 },
	{ 2137255937,  795331324, 1410253405 },
	{ 2137243649,  150756339, 1966999887 },
	{ 2137182209,  163346914, 1939301431 },
	{ 2137171969, 1952552395,  758913141 },
	{ 2137159681,  570788721,  218668666 },
	{ 2137147393, 1896656810, 2045670345 },
	{ 2137141249,  358493842,  518199643 },
	{ 2137139201, 1505023029,  674695848 },
	{ 2137133057,   27911103,  830956306 },
	{ 2137122817,  439771337, 1555268614 },
	{ 2137116673,  790988579, 1871449599 },
	{ 2137110529,  432109234,  811805080 },
	{ 2137102337, 1357900653, 1184997641 },
	{ 2137098241,  515119035, 1715693095 },
	{ 2137090049,  408575203, 2085660657 },
	{ 2137085953, 2097793407, 1349626963 },
	{ 2137055233, 1556739954, 1449960883 },
	{ 2137030657, 1545758650, 1369303716 },
	{ 2136987649,  332602570,  103875114 },
	{ 2136969217, 1499989506, 1662964115 },
	{ 2136924161,  857040753,    4738842 },
	{ 2136895489, 1948872712,  570436091 },
	{ 2136893441,   58969960, 1568349634 },
	{ 2136887297, 2127193379,  273612548 },
	{ 2136850433,  111208983, 1181257116 },
	{ 2136809473, 1627275942, 1680317971 },
	{ 2136764417, 1574888217,   14011331 },
	{ 2136741889,   14011055, 1129154251 },
	{ 2136727553,   35862563, 1838555253 },
	{ 2136721409,  310235666, 1363928244 },
	{ 2136698881, 1612429202, 1560383828 },
	{ 2136649729, 1138540131,  800014364 },
	{ 2136606721,  602323503, 1433096652 },
	{ 2136563713,  182209265, 1919611038 },
	{ 2136555521,  324156477,  165591039 },
	{ 2136549377,  195513113,  217165345 },
	{ 2136526849, 1050768046,  939647887 },
	{ 2136508417, 1886286237, 1619926572 },
	{ 2136477697,  609647664,   35065157 },
	{ 2136471553,  679352216, 1452259468 },
	{ 2136457217,  128630031,  824816521 },
	{ 2136422401,   19787464, 1526049830 },
	{ 2136420353,  698316836, 1530623527 },
	{ 2136371201, 1651862373, 1804812805 },
	{ 2136334337,  326596005,  336977082 },
	{ 2136322049,   63253370, 1904972151 },
	{ 2136297473,  312176076,  172182411 },
	{ 2136248321,  381261841,  369032670 },
	{ 2136242177,  358688773, 1640007994 },
	{ 2136229889,  512677188,   75585225 },
	{ 2136219649, 2095003250, 1970086149 },
	{ 2136207361, 1909650722,  537760675 },
	{ 2136176641, 1334616195, 1533487619 },
	{ 2136158209, 2096285632, 1793285210 },
	{ 2136143873, 1897347517,  293843959 },
	{ 2136133633,  923586222, 1022655978 },
	{ 2136096769, 1464868191, 1515074410 },
	{ 2136094721, 2020679520, 2061636104 },
	{ 2136076289,  290798503, 1814726809 },
	{ 2136041473,  156415894, 1250757633 },
	{ 2135996417,  297459940, 1132158924 },
	{ 2135955457,  538755304, 1688831340 },
	{ 0, 0, 0 }
};
625
626 /*
627 * Reduce a small signed integer modulo a small prime. The source
628 * value x MUST be such that -p < x < p.
629 */
static inline uint32_t
modp_set(int32_t x, uint32_t p)
{
	uint32_t u, neg;

	/*
	 * For x >= 0 the cast is the identity; for x < 0 it yields
	 * x + 2^32, whose top bit is set, and adding p brings the
	 * value back into 0..p-1. Branchless on purpose.
	 */
	u = (uint32_t)x;
	neg = -(u >> 31);
	return u + (p & neg);
}
639
640 /*
641 * Normalize a modular integer around 0.
642 */
static inline int32_t
modp_norm(uint32_t x, uint32_t p)
{
	uint32_t half, m;

	/*
	 * Values below ceil(p/2) are kept; values at or above it are
	 * mapped to x - p. 'm' is an all-one mask exactly when
	 * x >= (p+1)/2, so the subtraction of p is branchless.
	 */
	half = (p + 1) >> 1;
	m = ((x - half) >> 31) - 1;
	return (int32_t)(x - (p & m));
}
648
649 /*
650 * Compute -1/p mod 2^31. This works for all odd integers p that fit
651 * on 31 bits.
652 */
static uint32_t
modp_ninv31(uint32_t p)
{
	uint32_t y;
	int i;

	/*
	 * Newton iteration over Z/2^32: for odd p, y = 2 - p is an
	 * inverse of p modulo 2^3 at least, and each step at least
	 * doubles the number of correct low bits, so four steps give
	 * 1/p mod 2^32.
	 */
	y = 2 - p;
	for (i = 0; i < 4; i ++) {
		y *= 2 - p * y;
	}

	/* Negate and keep only the low 31 bits: -1/p mod 2^31. */
	return (uint32_t)0x7FFFFFFF & -y;
}
665
666 /*
667 * Compute R = 2^31 mod p.
668 */
static inline uint32_t
modp_R(uint32_t p)
{
	/*
	 * Since 2^30 < p < 2^31, the value 2^31 mod p is exactly
	 * 2^31 - p (a single subtraction suffices).
	 */
	return (uint32_t)0x80000000 - p;
}
678
679 /*
680 * Addition modulo p.
681 */
static inline uint32_t
modp_add(uint32_t a, uint32_t b, uint32_t p)
{
	uint32_t r, m;

	/*
	 * a + b - p lies in -p..p-1 (wrapped modulo 2^32); when the
	 * subtraction wrapped, the top bit is set and p is added back.
	 */
	r = a + b - p;
	m = -(r >> 31);
	return r + (p & m);
}
691
692 /*
693 * Subtraction modulo p.
694 */
static inline uint32_t
modp_sub(uint32_t a, uint32_t b, uint32_t p)
{
	uint32_t r, m;

	/*
	 * a - b wraps modulo 2^32 when a < b; in that case the top bit
	 * is set and p is added back, branchlessly.
	 */
	r = a - b;
	m = -(r >> 31);
	return r + (p & m);
}
704
705 /*
706 * Halving modulo p.
707 */
708 /* unused
709 static inline uint32_t
710 modp_half(uint32_t a, uint32_t p)
711 {
712 a += p & -(a & 1);
713 return a >> 1;
714 }
715 */
716
717 /*
718 * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
719 * It is required that p is an odd integer.
720 */
static inline uint32_t
modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i)
{
	uint64_t ab, red;
	uint32_t r, m;

	/*
	 * Standard Montgomery reduction with R = 2^31: add to a*b the
	 * multiple of p that clears its low 31 bits, shift right by 31,
	 * then perform one conditional (branchless) subtraction of p.
	 */
	ab = (uint64_t)a * (uint64_t)b;
	red = ((ab * p0i) & (uint64_t)0x7FFFFFFF) * p;
	r = (uint32_t)((ab + red) >> 31) - p;
	m = -(r >> 31);
	return r + (p & m);
}
733
734 /*
735 * Compute R2 = 2^62 mod p.
736 */
static uint32_t
modp_R2(uint32_t p, uint32_t p0i)
{
	uint32_t z;
	int i;

	/*
	 * Start from 2*R mod p, i.e. the value 2 in Montgomery
	 * representation.
	 */
	z = modp_add(modp_R(p), modp_R(p), p);

	/*
	 * Five Montgomery squarings turn the representation of 2 into
	 * that of 2^(2^5) = 2^32, which as a plain value is 2^63 mod p.
	 */
	for (i = 0; i < 5; i ++) {
		z = modp_montymul(z, z, p, p0i);
	}

	/*
	 * Divide by two modulo p (add p when odd, then shift) to get
	 * 2^62 mod p.
	 */
	z = (z + (p & -(z & 1))) >> 1;
	return z;
}
765
766 /*
767 * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
768 * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
769 * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
770 */
static inline uint32_t
modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2)
{
	unsigned e;
	uint32_t acc, base;

	/*
	 * 2^(31*x) = (2^31)^(x-1) * 2^31. R2 is the Montgomery
	 * representation of 2^31 and modp_R(p) that of 1, so a
	 * square-and-multiply on the exponent e = x - 1 yields the
	 * Montgomery representation of (2^31)^e, i.e. 2^(31*x) mod p.
	 * (x must be at least 1.)
	 */
	e = x - 1;
	acc = modp_R(p);
	base = R2;
	while (e != 0) {
		if ((e & 1) != 0) {
			acc = modp_montymul(acc, base, p, p0i);
		}
		base = modp_montymul(base, base, p, p0i);
		e >>= 1;
	}
	return acc;
}
793
794 /*
795 * Division modulo p. If the divisor (b) is 0, then 0 is returned.
796 * This function computes proper results only when p is prime.
797 * Parameters:
798 * a dividend
799 * b divisor
800 * p odd prime modulus
801 * p0i -1/p mod 2^31
 *   R     2^31 mod p
803 */
static uint32_t
modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R)
{
	uint32_t acc, e;
	int j;

	/*
	 * By Fermat's little theorem, 1/b = b^(p-2) mod p (when p is
	 * prime). Square-and-multiply from bit 30 down to bit 0 (p fits
	 * on 31 bits); the multiply result is folded in with a
	 * branchless select, keeping the code constant-time with
	 * respect to the exponent bits.
	 */
	e = p - 2;
	acc = R;
	for (j = 30; j >= 0; j --) {
		uint32_t mul;

		acc = modp_montymul(acc, acc, p, p0i);
		mul = modp_montymul(acc, b, p, p0i);
		/* Keep 'mul' only when exponent bit j is set. */
		acc ^= (acc ^ mul) & -(uint32_t)((e >> j) & 1);
	}

	/*
	 * The loop treated b as if it were in Montgomery representation
	 * (b*R); since it was actually in normal representation, the
	 * loop produced R^2/b. One Montgomery multiplication by 1 drops
	 * one R factor; the final multiplication by a drops the other
	 * and yields a/b in normal representation.
	 */
	acc = modp_montymul(acc, 1, p, p0i);
	return modp_montymul(a, acc, p, p0i);
}
834
/*
 * Bit-reversal index table: REV10[i] is the reversal of the 10-bit
 * binary representation of i (0 <= i < 1024). Used to store NTT roots
 * in bit-reversed order.
 */
static const uint16_t REV10[] = {
	   0,  512,  256,  768,  128,  640,  384,  896,   64,  576,  320,  832,
	 192,  704,  448,  960,   32,  544,  288,  800,  160,  672,  416,  928,
	  96,  608,  352,  864,  224,  736,  480,  992,   16,  528,  272,  784,
	 144,  656,  400,  912,   80,  592,  336,  848,  208,  720,  464,  976,
	  48,  560,  304,  816,  176,  688,  432,  944,  112,  624,  368,  880,
	 240,  752,  496, 1008,    8,  520,  264,  776,  136,  648,  392,  904,
	  72,  584,  328,  840,  200,  712,  456,  968,   40,  552,  296,  808,
	 168,  680,  424,  936,  104,  616,  360,  872,  232,  744,  488, 1000,
	  24,  536,  280,  792,  152,  664,  408,  920,   88,  600,  344,  856,
	 216,  728,  472,  984,   56,  568,  312,  824,  184,  696,  440,  952,
	 120,  632,  376,  888,  248,  760,  504, 1016,    4,  516,  260,  772,
	 132,  644,  388,  900,   68,  580,  324,  836,  196,  708,  452,  964,
	  36,  548,  292,  804,  164,  676,  420,  932,  100,  612,  356,  868,
	 228,  740,  484,  996,   20,  532,  276,  788,  148,  660,  404,  916,
	  84,  596,  340,  852,  212,  724,  468,  980,   52,  564,  308,  820,
	 180,  692,  436,  948,  116,  628,  372,  884,  244,  756,  500, 1012,
	  12,  524,  268,  780,  140,  652,  396,  908,   76,  588,  332,  844,
	 204,  716,  460,  972,   44,  556,  300,  812,  172,  684,  428,  940,
	 108,  620,  364,  876,  236,  748,  492, 1004,   28,  540,  284,  796,
	 156,  668,  412,  924,   92,  604,  348,  860,  220,  732,  476,  988,
	  60,  572,  316,  828,  188,  700,  444,  956,  124,  636,  380,  892,
	 252,  764,  508, 1020,    2,  514,  258,  770,  130,  642,  386,  898,
	  66,  578,  322,  834,  194,  706,  450,  962,   34,  546,  290,  802,
	 162,  674,  418,  930,   98,  610,  354,  866,  226,  738,  482,  994,
	  18,  530,  274,  786,  146,  658,  402,  914,   82,  594,  338,  850,
	 210,  722,  466,  978,   50,  562,  306,  818,  178,  690,  434,  946,
	 114,  626,  370,  882,  242,  754,  498, 1010,   10,  522,  266,  778,
	 138,  650,  394,  906,   74,  586,  330,  842,  202,  714,  458,  970,
	  42,  554,  298,  810,  170,  682,  426,  938,  106,  618,  362,  874,
	 234,  746,  490, 1002,   26,  538,  282,  794,  154,  666,  410,  922,
	  90,  602,  346,  858,  218,  730,  474,  986,   58,  570,  314,  826,
	 186,  698,  442,  954,  122,  634,  378,  890,  250,  762,  506, 1018,
	   6,  518,  262,  774,  134,  646,  390,  902,   70,  582,  326,  838,
	 198,  710,  454,  966,   38,  550,  294,  806,  166,  678,  422,  934,
	 102,  614,  358,  870,  230,  742,  486,  998,   22,  534,  278,  790,
	 150,  662,  406,  918,   86,  598,  342,  854,  214,  726,  470,  982,
	  54,  566,  310,  822,  182,  694,  438,  950,  118,  630,  374,  886,
	 246,  758,  502, 1014,   14,  526,  270,  782,  142,  654,  398,  910,
	  78,  590,  334,  846,  206,  718,  462,  974,   46,  558,  302,  814,
	 174,  686,  430,  942,  110,  622,  366,  878,  238,  750,  494, 1006,
	  30,  542,  286,  798,  158,  670,  414,  926,   94,  606,  350,  862,
	 222,  734,  478,  990,   62,  574,  318,  830,  190,  702,  446,  958,
	 126,  638,  382,  894,  254,  766,  510, 1022,    1,  513,  257,  769,
	 129,  641,  385,  897,   65,  577,  321,  833,  193,  705,  449,  961,
	  33,  545,  289,  801,  161,  673,  417,  929,   97,  609,  353,  865,
	 225,  737,  481,  993,   17,  529,  273,  785,  145,  657,  401,  913,
	  81,  593,  337,  849,  209,  721,  465,  977,   49,  561,  305,  817,
	 177,  689,  433,  945,  113,  625,  369,  881,  241,  753,  497, 1009,
	   9,  521,  265,  777,  137,  649,  393,  905,   73,  585,  329,  841,
	 201,  713,  457,  969,   41,  553,  297,  809,  169,  681,  425,  937,
	 105,  617,  361,  873,  233,  745,  489, 1001,   25,  537,  281,  793,
	 153,  665,  409,  921,   89,  601,  345,  857,  217,  729,  473,  985,
	  57,  569,  313,  825,  185,  697,  441,  953,  121,  633,  377,  889,
	 249,  761,  505, 1017,    5,  517,  261,  773,  133,  645,  389,  901,
	  69,  581,  325,  837,  197,  709,  453,  965,   37,  549,  293,  805,
	 165,  677,  421,  933,  101,  613,  357,  869,  229,  741,  485,  997,
	  21,  533,  277,  789,  149,  661,  405,  917,   85,  597,  341,  853,
	 213,  725,  469,  981,   53,  565,  309,  821,  181,  693,  437,  949,
	 117,  629,  373,  885,  245,  757,  501, 1013,   13,  525,  269,  781,
	 141,  653,  397,  909,   77,  589,  333,  845,  205,  717,  461,  973,
	  45,  557,  301,  813,  173,  685,  429,  941,  109,  621,  365,  877,
	 237,  749,  493, 1005,   29,  541,  285,  797,  157,  669,  413,  925,
	  93,  605,  349,  861,  221,  733,  477,  989,   61,  573,  317,  829,
	 189,  701,  445,  957,  125,  637,  381,  893,  253,  765,  509, 1021,
	   3,  515,  259,  771,  131,  643,  387,  899,   67,  579,  323,  835,
	 195,  707,  451,  963,   35,  547,  291,  803,  163,  675,  419,  931,
	  99,  611,  355,  867,  227,  739,  483,  995,   19,  531,  275,  787,
	 147,  659,  403,  915,   83,  595,  339,  851,  211,  723,  467,  979,
	  51,  563,  307,  819,  179,  691,  435,  947,  115,  627,  371,  883,
	 243,  755,  499, 1011,   11,  523,  267,  779,  139,  651,  395,  907,
	  75,  587,  331,  843,  203,  715,  459,  971,   43,  555,  299,  811,
	 171,  683,  427,  939,  107,  619,  363,  875,  235,  747,  491, 1003,
	  27,  539,  283,  795,  155,  667,  411,  923,   91,  603,  347,  859,
	 219,  731,  475,  987,   59,  571,  315,  827,  187,  699,  443,  955,
	 123,  635,  379,  891,  251,  763,  507, 1019,    7,  519,  263,  775,
	 135,  647,  391,  903,   71,  583,  327,  839,  199,  711,  455,  967,
	  39,  551,  295,  807,  167,  679,  423,  935,  103,  615,  359,  871,
	 231,  743,  487,  999,   23,  535,  279,  791,  151,  663,  407,  919,
	  87,  599,  343,  855,  215,  727,  471,  983,   55,  567,  311,  823,
	 183,  695,  439,  951,  119,  631,  375,  887,  247,  759,  503, 1015,
	  15,  527,  271,  783,  143,  655,  399,  911,   79,  591,  335,  847,
	 207,  719,  463,  975,   47,  559,  303,  815,  175,  687,  431,  943,
	 111,  623,  367,  879,  239,  751,  495, 1007,   31,  543,  287,  799,
	 159,  671,  415,  927,   95,  607,  351,  863,  223,  735,  479,  991,
	  63,  575,  319,  831,  191,  703,  447,  959,  127,  639,  383,  895,
	 255,  767,  511, 1023
};
926
927 /*
928 * Compute the roots for NTT and inverse NTT (binary case). Input
929 * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
930 * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
931 * gm[rev(i)] = g^i mod p
932 * igm[rev(i)] = (1/g)^i mod p
933 * where rev() is the "bit reversal" function over 10 bits. It fills
934 * the arrays only up to N = 2^logn values.
935 *
936 * The values stored in gm[] and igm[] are in Montgomery representation.
937 *
938 * p must be a prime such that p = 1 mod 2048.
939 */
static void
modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
	uint32_t g, uint32_t p, uint32_t p0i)
{
	size_t u, n;
	unsigned k;
	uint32_t ig, x1, x2, R2;

	n = (size_t)1 << logn;

	/*
	 * We want g such that g^(2N) = 1 mod p, but the provided
	 * generator has order 2048. We must square it a few times.
	 */
	R2 = modp_R2(p, p0i);
	/* Convert g to Montgomery representation (g*R mod p). */
	g = modp_montymul(g, R2, p, p0i);
	for (k = logn; k < 10; k ++) {
		g = modp_montymul(g, g, p, p0i);
	}

	/*
	 * ig = 1/g in Montgomery representation: g already carries one
	 * factor R, so modp_div(R2, g, ...) = R^2 / (g*R) = R/g.
	 */
	ig = modp_div(R2, g, p, p0i, modp_R(p));
	/*
	 * Only every 2^k-th entry of the 10-bit reversal table is used
	 * when logn < 10.
	 */
	k = 10 - logn;
	/* Start both running powers at 1 in Montgomery form (i.e. R). */
	x1 = x2 = modp_R(p);
	for (u = 0; u < n; u ++) {
		size_t v;

		v = REV10[u << k];
		gm[v] = x1;
		igm[v] = x2;
		x1 = modp_montymul(x1, g, p, p0i);
		x2 = modp_montymul(x2, ig, p, p0i);
	}
}
973
974 /*
975 * Compute the NTT over a polynomial (binary case). Polynomial elements
976 * are a[0], a[stride], a[2 * stride]...
977 */
static void
modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
	uint32_t p, uint32_t p0i)
{
	size_t t, m, n;

	if (logn == 0) {
		return;
	}
	n = (size_t)1 << logn;
	/*
	 * Iterative decimation: at each level, m is the number of
	 * blocks processed and t the size of each block; blocks are
	 * halved at every level.
	 */
	t = n;
	for (m = 1; m < n; m <<= 1) {
		size_t ht, u, v1;

		ht = t >> 1;
		for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
			uint32_t s;
			size_t v;
			uint32_t *r1, *r2;

			/* Twiddle factor for this block (Montgomery rep). */
			s = gm[m + u];
			r1 = a + v1 * stride;
			r2 = r1 + ht * stride;
			for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
				uint32_t x, y;

				/*
				 * Butterfly: (x, y) <- (x + s*y, x - s*y).
				 */
				x = *r1;
				y = modp_montymul(*r2, s, p, p0i);
				*r1 = modp_add(x, y, p);
				*r2 = modp_sub(x, y, p);
			}
		}
		t = ht;
	}
}
1013
1014 /*
1015 * Compute the inverse NTT over a polynomial (binary case).
1016 */
1017 static void
1018 modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
1019 uint32_t p, uint32_t p0i)
1020 {
1021 size_t t, m, n, k;
1022 uint32_t ni;
1023 uint32_t *r;
1024
1025 if (logn == 0) {
1026 return;
1027 }
1028 n = (size_t)1 << logn;
1029 t = 1;
1030 for (m = n; m > 1; m >>= 1) {
1031 size_t hm, dt, u, v1;
1032
1033 hm = m >> 1;
1034 dt = t << 1;
1035 for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
1036 uint32_t s;
1037 size_t v;
1038 uint32_t *r1, *r2;
1039
1040 s = igm[hm + u];
1041 r1 = a + v1 * stride;
1042 r2 = r1 + t * stride;
1043 for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
1044 uint32_t x, y;
1045
1046 x = *r1;
1047 y = *r2;
1048 *r1 = modp_add(x, y, p);
1049 *r2 = modp_montymul(
1050 modp_sub(x, y, p), s, p, p0i);;
1051 }
1052 }
1053 t = dt;
1054 }
1055
1056 /*
1057 * We need 1/n in Montgomery representation, i.e. R/n. Since
1058 * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p,
1059 * thus a simple shift will do.
1060 */
1061 ni = (uint32_t)1 << (31 - logn);
1062 for (k = 0, r = a; k < n; k ++, r += stride) {
1063 *r = modp_montymul(*r, ni, p, p0i);
1064 }
1065 }
1066
/*
 * Simplified macros for NTT and iNTT (binary case) when the elements
 * are consecutive in RAM (i.e. stride 1).
 */
#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
1073
1074 /*
1075 * Given polynomial f in NTT representation modulo p, compute f' of degree
1076 * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
1077 * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
1078 *
1079 * The new polynomial is written "in place" over the first N/2 elements
1080 * of f.
1081 *
1082 * If applied logn times successively on a given polynomial, the resulting
1083 * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
1084 *
1085 * This function applies only to the binary case; it is invoked from
1086 * solve_NTRU_binary_depth1().
1087 */
1088 static void
1089 modp_poly_rec_res(uint32_t *f, unsigned logn,
1090 uint32_t p, uint32_t p0i, uint32_t R2)
1091 {
1092 size_t hn, u;
1093
1094 hn = (size_t)1 << (logn - 1);
1095 for (u = 0; u < hn; u ++) {
1096 uint32_t w0, w1;
1097
1098 w0 = f[(u << 1) + 0];
1099 w1 = f[(u << 1) + 1];
1100 f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
1101 }
1102 }
1103
1104 /* ==================================================================== */
1105 /*
1106 * Custom bignum implementation.
1107 *
1108 * This is a very reduced set of functionalities. We need to do the
1109 * following operations:
1110 *
1111 * - Rebuild the resultant and the polynomial coefficients from their
1112 * values modulo small primes (of length 31 bits each).
1113 *
1114 * - Compute an extended GCD between the two computed resultants.
1115 *
1116 * - Extract top bits and add scaled values during the successive steps
1117 * of Babai rounding.
1118 *
1119 * When rebuilding values using CRT, we must also recompute the product
1120 * of the small prime factors. We always do it one small factor at a
1121 * time, so the "complicated" operations can be done modulo the small
1122 * prime with the modp_* functions. CRT coefficients (inverses) are
1123 * precomputed.
1124 *
1125 * All values are positive until the last step: when the polynomial
1126 * coefficients have been rebuilt, we normalize them around 0. But then,
1127 * only additions and subtractions on the upper few bits are needed
1128 * afterwards.
1129 *
1130 * We keep big integers as arrays of 31-bit words (in uint32_t values);
1131 * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
1132 * makes it easier to keep track of carries. When negative values are
1133 * used, two's complement is used.
1134 */
1135
1136 /*
1137 * Subtract integer b from integer a. Both integers are supposed to have
1138 * the same size. The carry (0 or 1) is returned. Source arrays a and b
1139 * MUST be distinct.
1140 *
 * The operation is performed as described above if ctl = 1. If
1142 * ctl = 0, the value a[] is unmodified, but all memory accesses are
1143 * still performed, and the carry is computed and returned.
1144 */
1145 static uint32_t
1146 zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
1147 uint32_t ctl)
1148 {
1149 size_t u;
1150 uint32_t cc, m;
1151
1152 cc = 0;
1153 m = -ctl;
1154 for (u = 0; u < len; u ++) {
1155 uint32_t aw, w;
1156
1157 aw = a[u];
1158 w = aw - b[u] - cc;
1159 cc = w >> 31;
1160 aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
1161 a[u] = aw;
1162 }
1163 return cc;
1164 }
1165
1166 /*
 * Multiply the provided big integer m with a small value x.
1168 * This function assumes that x < 2^31. The carry word is returned.
1169 */
1170 static uint32_t
1171 zint_mul_small(uint32_t *m, size_t mlen, uint32_t x)
1172 {
1173 size_t u;
1174 uint32_t cc;
1175
1176 cc = 0;
1177 for (u = 0; u < mlen; u ++) {
1178 uint64_t z;
1179
1180 z = (uint64_t)m[u] * (uint64_t)x + cc;
1181 m[u] = (uint32_t)z & 0x7FFFFFFF;
1182 cc = (uint32_t)(z >> 31);
1183 }
1184 return cc;
1185 }
1186
1187 /*
1188 * Reduce a big integer d modulo a small integer p.
1189 * Rules:
1190 * d is unsigned
1191 * p is prime
1192 * 2^30 < p < 2^31
1193 * p0i = -(1/p) mod 2^31
1194 * R2 = 2^62 mod p
1195 */
static uint32_t
zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
	uint32_t p, uint32_t p0i, uint32_t R2)
{
	uint32_t x;
	size_t u;

	/*
	 * Algorithm: we inject words one by one, starting with the high
	 * word. Each step is:
	 * - multiply x by 2^31
	 * - add new word
	 */
	x = 0;
	u = dlen;
	while (u -- > 0) {
		uint32_t w;

		/*
		 * Montgomery multiplication by R2 = 2^62 mod p divides
		 * by 2^31, so the net effect is a multiplication of x
		 * by 2^31 mod p.
		 */
		x = modp_montymul(x, R2, p, p0i);
		/*
		 * Reduce the next word into 0..p-1 before the modular
		 * addition: d[u] < 2^31 < 2*p, so one conditional
		 * (masked) subtraction of p suffices.
		 */
		w = d[u] - p;
		w += p & -(w >> 31);
		x = modp_add(x, w, p);
	}
	return x;
}
1221
1222 /*
1223 * Similar to zint_mod_small_unsigned(), except that d may be signed.
1224 * Extra parameter is Rx = 2^(31*dlen) mod p.
1225 */
static uint32_t
zint_mod_small_signed(const uint32_t *d, size_t dlen,
	uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx)
{
	uint32_t z;

	if (dlen == 0) {
		return 0;
	}
	/*
	 * First reduce d as if it were unsigned. If d is negative
	 * (sign bit of the top 31-bit word is set), the unsigned
	 * interpretation equals d + 2^(31*dlen), so we subtract
	 * Rx = 2^(31*dlen) mod p to compensate (mask-selected, keeping
	 * the flow constant-time).
	 */
	z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
	z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
	return z;
}
1239
1240 /*
1241 * Add y*s to x. x and y initially have length 'len' words; the new x
1242 * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
1243 * not overlap.
1244 */
1245 static void
1246 zint_add_mul_small(uint32_t *restrict x,
1247 const uint32_t *restrict y, size_t len, uint32_t s)
1248 {
1249 size_t u;
1250 uint32_t cc;
1251
1252 cc = 0;
1253 for (u = 0; u < len; u ++) {
1254 uint32_t xw, yw;
1255 uint64_t z;
1256
1257 xw = x[u];
1258 yw = y[u];
1259 z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
1260 x[u] = (uint32_t)z & 0x7FFFFFFF;
1261 cc = (uint32_t)(z >> 31);
1262 }
1263 x[len] = cc;
1264 }
1265
1266 /*
1267 * Normalize a modular integer around 0: if x > p/2, then x is replaced
1268 * with x - p (signed encoding with two's complement); otherwise, x is
1269 * untouched. The two integers x and p are encoded over the same length.
1270 */
static void
zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len)
{
	size_t u;
	uint32_t r, bb;

	/*
	 * Compare x with p/2. We use the shifted version of p, and p
	 * is odd, so we really compare with (p-1)/2; we want to perform
	 * the subtraction if and only if x > (p-1)/2.
	 */
	r = 0;
	bb = 0;
	u = len;
	while (u -- > 0) {
		uint32_t wx, wp, cc;

		/*
		 * Get the two words to compare in wx and wp (both over
		 * 31 bits exactly). bb carries the dropped low bit of
		 * p[u] down into the next (lower) word of the shifted p.
		 */
		wx = x[u];
		wp = (p[u] >> 1) | (bb << 30);
		bb = p[u] & 1;

		/*
		 * We set cc to -1, 0 or 1, depending on whether wp is
		 * lower than, equal to, or greater than wx.
		 */
		cc = wp - wx;
		cc = ((-cc) >> 31) | -(cc >> 31);

		/*
		 * If r != 0 then it is either 1 or -1, and we keep its
		 * value. Otherwise, if r = 0, then we replace it with cc.
		 * (Keeps the first difference found, scanning from the
		 * most significant word, without branching.)
		 */
		r |= cc & ((r & 1) - 1);
	}

	/*
	 * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
	 * is lower than, equal to, or greater than x. We thus want to
	 * do the subtraction only if r = -1.
	 */
	zint_sub(x, p, len, r >> 31);
}
1317
1318 /*
1319 * Rebuild integers from their RNS representation. There are 'num'
1320 * integers, and each consists in 'xlen' words. 'xx' points at that
1321 * first word of the first integer; subsequent integers are accessed
1322 * by adding 'xstride' repeatedly.
1323 *
1324 * The words of an integer are the RNS representation of that integer,
 * using the provided 'primes' as moduli. This function replaces
1326 * each integer with its multi-word value (little-endian order).
1327 *
1328 * If "normalize_signed" is non-zero, then the returned value is
1329 * normalized to the -m/2..m/2 interval (where m is the product of all
1330 * small prime moduli); two's complement is used for negative values.
1331 */
static void
zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
	size_t num, const small_prime *primes, int normalize_signed,
	uint32_t *restrict tmp)
{
	size_t u;
	uint32_t *x;

	/* tmp[] accumulates the product of the primes processed so far. */
	tmp[0] = primes[0].p;
	for (u = 1; u < xlen; u ++) {
		/*
		 * At the entry of each loop iteration:
		 * - the first u words of each array have been
		 * reassembled;
		 * - the first u words of tmp[] contains the
		 * product of the prime moduli processed so far.
		 *
		 * We call 'q' the product of all previous primes.
		 */
		uint32_t p, p0i, s, R2;
		size_t v;

		/*
		 * s is the precomputed CRT coefficient for this prime
		 * (presumably 1/q mod p in a Montgomery-friendly form,
		 * per the module comment — the caller supplies it).
		 */
		p = primes[u].p;
		s = primes[u].s;
		p0i = modp_ninv31(p);
		R2 = modp_R2(p, p0i);

		for (v = 0, x = xx; v < num; v ++, x += xstride) {
			uint32_t xp, xq, xr;
			/*
			 * xp = the integer x modulo the prime p for this
			 * iteration
			 * xq = (x mod q) mod p
			 */
			xp = x[u];
			xq = zint_mod_small_unsigned(x, u, p, p0i, R2);

			/*
			 * New value is (x mod q) + q * (s * (xp - xq) mod p)
			 */
			xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
			zint_add_mul_small(x, tmp, u, xr);
		}

		/*
		 * Update product of primes in tmp[].
		 */
		tmp[u] = zint_mul_small(tmp, u, p);
	}

	/*
	 * Normalize the reconstructed values around 0. At this point,
	 * tmp[] holds the full modulus (product of all primes).
	 */
	if (normalize_signed) {
		for (u = 0, x = xx; u < num; u ++, x += xstride) {
			zint_norm_zero(x, tmp, xlen);
		}
	}
}
1391
1392 /*
1393 * Negate a big integer conditionally: value a is replaced with -a if
1394 * and only if ctl = 1. Control value ctl must be 0 or 1.
1395 */
1396 static void
1397 zint_negate(uint32_t *a, size_t len, uint32_t ctl)
1398 {
1399 size_t u;
1400 uint32_t cc, m;
1401
1402 /*
1403 * If ctl = 1 then we flip the bits of a by XORing with
1404 * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
1405 * with 0 and add 0, which leaves the value unchanged.
1406 */
1407 cc = ctl;
1408 m = -ctl >> 1;
1409 for (u = 0; u < len; u ++) {
1410 uint32_t aw;
1411
1412 aw = a[u];
1413 aw = (aw ^ m) + cc;
1414 a[u] = aw & 0x7FFFFFFF;
1415 cc = aw >> 31;
1416 }
1417 }
1418
1419 /*
1420 * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
1421 * The low bits are dropped (the caller should compute the coefficients
1422 * such that these dropped bits are all zeros). If either or both
1423 * yields a negative value, then the value is negated.
1424 *
1425 * Returned value is:
1426 * 0 both values were positive
1427 * 1 new a had to be negated
1428 * 2 new b had to be negated
1429 * 3 both new a and new b had to be negated
1430 *
1431 * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
1432 */
static uint32_t
zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
	int64_t xa, int64_t xb, int64_t ya, int64_t yb)
{
	size_t u;
	int64_t cca, ccb;
	uint32_t nega, negb;

	cca = 0;
	ccb = 0;
	for (u = 0; u < len; u ++) {
		uint32_t wa, wb;
		uint64_t za, zb;

		/*
		 * Accumulate the two linear combinations over 64 bits.
		 * Result words are written one position down, which
		 * implements the exact division by 2^31.
		 */
		wa = a[u];
		wb = b[u];
		za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
		zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
		if (u > 0) {
			a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
			b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
		}
		/*
		 * Signed carry propagation: the accumulator is
		 * reinterpreted as signed and arithmetic-shifted by 31.
		 * Accessing a uint64_t through an int64_t lvalue is
		 * permitted (corresponding signed type); the shift of a
		 * negative value relies on arithmetic right shift, which
		 * is implementation-defined but universal in practice.
		 */
		cca = *(int64_t *)&za >> 31;
		ccb = *(int64_t *)&zb >> 31;
	}
	/* The final (possibly negative) carries become the top words. */
	a[len - 1] = (uint32_t)cca;
	b[len - 1] = (uint32_t)ccb;

	/*
	 * If a result came out negative, negate it and report which
	 * one(s) via the return bits.
	 */
	nega = (uint32_t)((uint64_t)cca >> 63);
	negb = (uint32_t)((uint64_t)ccb >> 63);
	zint_negate(a, len, nega);
	zint_negate(b, len, negb);
	return nega | (negb << 1);
}
1467
1468 /*
1469 * Finish modular reduction. Rules on input parameters:
1470 *
1471 * if neg = 1, then -m <= a < 0
1472 * if neg = 0, then 0 <= a < 2*m
1473 *
1474 * If neg = 0, then the top word of a[] is allowed to use 32 bits.
1475 *
1476 * Modulus m must be odd.
1477 */
1478 static void
1479 zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg)
1480 {
1481 size_t u;
1482 uint32_t cc, xm, ym;
1483
1484 /*
1485 * First pass: compare a (assumed nonnegative) with m. Note that
1486 * if the top word uses 32 bits, subtracting m must yield a
1487 * value less than 2^31 since a < 2*m.
1488 */
1489 cc = 0;
1490 for (u = 0; u < len; u ++) {
1491 cc = (a[u] - m[u] - cc) >> 31;
1492 }
1493
1494 /*
1495 * If neg = 1 then we must add m (regardless of cc)
1496 * If neg = 0 and cc = 0 then we must subtract m
1497 * If neg = 0 and cc = 1 then we must do nothing
1498 *
1499 * In the loop below, we conditionally subtract either m or -m
1500 * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
1501 * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
1502 */
1503 xm = -neg >> 1;
1504 ym = -(neg | (1 - cc));
1505 cc = neg;
1506 for (u = 0; u < len; u ++) {
1507 uint32_t aw, mw;
1508
1509 aw = a[u];
1510 mw = (m[u] ^ xm) & ym;
1511 aw = aw - mw - cc;
1512 a[u] = aw & 0x7FFFFFFF;
1513 cc = aw >> 31;
1514 }
1515 }
1516
1517 /*
1518 * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
1519 * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
1520 */
static void
zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
	uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb)
{
	size_t u;
	int64_t cca, ccb;
	uint32_t fa, fb;

	/*
	 * These are actually four combined Montgomery multiplications.
	 * fa and fb are the Montgomery folding factors, chosen (using
	 * m0i = -1/m[0] mod 2^31) so that the low 31 bits of each
	 * accumulated combination cancel, making the division by 2^31
	 * exact.
	 */
	cca = 0;
	ccb = 0;
	fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
	fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
	for (u = 0; u < len; u ++) {
		uint32_t wa, wb;
		uint64_t za, zb;

		wa = a[u];
		wb = b[u];
		za = wa * (uint64_t)xa + wb * (uint64_t)xb
			+ m[u] * (uint64_t)fa + (uint64_t)cca;
		zb = wa * (uint64_t)ya + wb * (uint64_t)yb
			+ m[u] * (uint64_t)fb + (uint64_t)ccb;
		if (u > 0) {
			a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
			b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
		}
		/*
		 * Signed carry: reinterpret the 64-bit accumulator as
		 * signed (access via the corresponding signed type is
		 * permitted) and arithmetic-shift by 31.
		 */
		cca = *(int64_t *)&za >> 31;
		ccb = *(int64_t *)&zb >> 31;
	}
	a[len - 1] = (uint32_t)cca;
	b[len - 1] = (uint32_t)ccb;

	/*
	 * At this point:
	 * -m <= a < 2*m
	 * -m <= b < 2*m
	 * (this is a case of Montgomery reduction)
	 * The top words of 'a' and 'b' may have a 32-th bit set.
	 * We want to add or subtract the modulus, as required.
	 */
	zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
	zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
}
1567
1568 /*
1569 * Compute a GCD between two positive big integers x and y. The two
1570 * integers must be odd. Returned value is 1 if the GCD is 1, 0
1571 * otherwise. When 1 is returned, arrays u and v are filled with values
1572 * such that:
1573 * 0 <= u <= y
1574 * 0 <= v <= x
1575 * x*u - y*v = 1
1576 * x[] and y[] are unmodified. Both input values must have the same
1577 * encoded length. Temporary array must be large enough to accommodate 4
1578 * extra values of that length. Arrays u, v and tmp may not overlap with
1579 * each other, or with either x or y.
1580 */
static int
zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
	const uint32_t *restrict x, const uint32_t *restrict y,
	size_t len, uint32_t *restrict tmp)
{
	/*
	 * Algorithm is an extended binary GCD. We maintain 6 values
	 * a, b, u0, u1, v0 and v1 with the following invariants:
	 *
	 *  a = x*u0 - y*v0
	 *  b = x*u1 - y*v1
	 *  0 <= a <= x
	 *  0 <= b <= y
	 *  0 <= u0 < y
	 *  0 <= v0 < x
	 *  0 <= u1 <= y
	 *  0 <= v1 < x
	 *
	 * Initial values are:
	 *
	 *  a = x   u0 = 1   v0 = 0
	 *  b = y   u1 = y   v1 = x-1
	 *
	 * Each iteration reduces either a or b, and maintains the
	 * invariants. Algorithm stops when a = b, at which point their
	 * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
	 * the values (u,v) we want to return.
	 *
	 * The formal definition of the algorithm is a sequence of steps:
	 *
	 *  - If a is even, then:
	 *        a <- a/2
	 *        u0 <- u0/2 mod y
	 *        v0 <- v0/2 mod x
	 *
	 *  - Otherwise, if b is even, then:
	 *        b <- b/2
	 *        u1 <- u1/2 mod y
	 *        v1 <- v1/2 mod x
	 *
	 *  - Otherwise, if a > b, then:
	 *        a <- (a-b)/2
	 *        u0 <- (u0-u1)/2 mod y
	 *        v0 <- (v0-v1)/2 mod x
	 *
	 *  - Otherwise:
	 *        b <- (b-a)/2
	 *        u1 <- (u1-u0)/2 mod y
	 *        v1 <- (v1-v0)/2 mod x
	 *
	 * We can show that the operations above preserve the invariants:
	 *
	 *  - If a is even, then u0 and v0 are either both even or both
	 *    odd (since a = x*u0 - y*v0, and x and y are both odd).
	 *    If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
	 *    Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
	 *    the a = x*u0 - y*v0 invariant is preserved.
	 *
	 *  - The same holds for the case where b is even.
	 *
	 *  - If a and b are odd, and a > b, then:
	 *
	 *      a-b = x*(u0-u1) - y*(v0-v1)
	 *
	 *    In that situation, if u0 < u1, then x*(u0-u1) < 0, but
	 *    a-b > 0; therefore, it must be that v0 < v1, and the
	 *    first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
	 *    which preserves the invariants. Otherwise, if u0 > u1,
	 *    then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
	 *    b >= 0, hence a-b <= x. It follows that, in that case,
	 *    v0-v1 >= 0. The first part of the update is then:
	 *    (u0,v0) <- (u0-u1,v0-v1), which again preserves the
	 *    invariants.
	 *
	 *    Either way, once the subtraction is done, the new value of
	 *    a, which is the difference of two odd values, is even,
	 *    and the remaining of this step is a subcase of the
	 *    first algorithm case (i.e. when a is even).
	 *
	 *  - If a and b are odd, and b > a, then a similar
	 *    argument holds.
	 *
	 * The values a and b start at x and y, respectively. Since x
	 * and y are odd, their GCD is odd, and it is easily seen that
	 * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
	 * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
	 * or b is reduced by at least one bit at each iteration, so
	 * the algorithm necessarily converges on the case a = b, at
	 * which point the common value is the GCD.
	 *
	 * In the algorithm expressed above, when a = b, the fourth case
	 * applies, and sets b = 0. Since a contains the GCD of x and y,
	 * which are both odd, a must be odd, and subsequent iterations
	 * (if any) will simply divide b by 2 repeatedly, which has no
	 * consequence. Thus, the algorithm can run for more iterations
	 * than necessary; the final GCD will be in a, and the (u,v)
	 * coefficients will be (u0,v0).
	 *
	 *
	 * The presentation above is bit-by-bit. It can be sped up by
	 * noticing that all decisions are taken based on the low bits
	 * and high bits of a and b. We can extract the two top words
	 * and low word of each of a and b, and compute reduction
	 * parameters pa, pb, qa and qb such that the new values for
	 * a and b are:
	 *    a' = (a*pa + b*pb) / (2^31)
	 *    b' = (a*qa + b*qb) / (2^31)
	 * the two divisions being exact. The coefficients are obtained
	 * just from the extracted words, and may be slightly off, requiring
	 * an optional correction: if a' < 0, then we replace pa with -pa
	 * and pb with -pb. Each such step will reduce the total length
	 * (sum of lengths of a and b) by at least 30 bits at each
	 * iteration.
	 */
	uint32_t *u0, *u1, *v0, *v1, *a, *b;
	uint32_t x0i, y0i;
	uint32_t num, rc;
	size_t j;

	if (len == 0) {
		return 0;
	}

	/*
	 * u0 and v0 are the u and v result buffers; the four other
	 * values (u1, v1, a and b) are taken from tmp[].
	 */
	u0 = u;
	v0 = v;
	u1 = tmp;
	v1 = u1 + len;
	a = v1 + len;
	b = a + len;

	/*
	 * We'll need the Montgomery reduction coefficients.
	 */
	x0i = modp_ninv31(x[0]);
	y0i = modp_ninv31(y[0]);

	/*
	 * Initialize a, b, u0, u1, v0 and v1.
	 *  a = x   u0 = 1   v0 = 0
	 *  b = y   u1 = y   v1 = x-1
	 * Note that x is odd, so computing x-1 is easy.
	 */
	memcpy(a, x, len * sizeof *x);
	memcpy(b, y, len * sizeof *y);
	u0[0] = 1;
	memset(u0 + 1, 0, (len - 1) * sizeof *u0);
	memset(v0, 0, len * sizeof *v0);
	memcpy(u1, y, len * sizeof *u1);
	memcpy(v1, x, len * sizeof *v1);
	v1[0] --;

	/*
	 * Each input operand may be as large as 31*len bits, and we
	 * reduce the total length by at least 30 bits at each iteration.
	 */
	for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
		uint32_t c0, c1;
		uint32_t a0, a1, b0, b1;
		uint64_t a_hi, b_hi;
		uint32_t a_lo, b_lo;
		int64_t pa, pb, qa, qb;
		int i;
		uint32_t r;

		/*
		 * Extract the top words of a and b. If j is the highest
		 * index >= 1 such that a[j] != 0 or b[j] != 0, then we
		 * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
		 * If a and b are down to one word each, then we use
		 * a[0] and b[0].
		 */
		c0 = (uint32_t)-1;
		c1 = (uint32_t)-1;
		a0 = 0;
		a1 = 0;
		b0 = 0;
		b1 = 0;
		j = len;
		while (j -- > 0) {
			uint32_t aw, bw;

			aw = a[j];
			bw = b[j];
			/* Masked captures: c0/c1 stay all-ones until a
			   non-zero word has been seen (constant-time). */
			a0 ^= (a0 ^ aw) & c0;
			a1 ^= (a1 ^ aw) & c1;
			b0 ^= (b0 ^ bw) & c0;
			b1 ^= (b1 ^ bw) & c1;
			c1 = c0;
			c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
		}

		/*
		 * If c1 = 0, then we grabbed two words for a and b.
		 * If c1 != 0 but c0 = 0, then we grabbed one word. It
		 * is not possible that c1 != 0 and c0 != 0, because that
		 * would mean that both integers are zero.
		 */
		a1 |= a0 & c1;
		a0 &= ~c1;
		b1 |= b0 & c1;
		b0 &= ~c1;
		a_hi = ((uint64_t)a0 << 31) + a1;
		b_hi = ((uint64_t)b0 << 31) + b1;
		a_lo = a[0];
		b_lo = b[0];

		/*
		 * Compute reduction factors:
		 *
		 *   a' = a*pa + b*pb
		 *   b' = a*qa + b*qb
		 *
		 * such that a' and b' are both multiple of 2^31, but are
		 * only marginally larger than a and b.
		 */
		pa = 1;
		pb = 0;
		qa = 0;
		qb = 1;
		for (i = 0; i < 31; i ++) {
			/*
			 * At each iteration:
			 *
			 *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
			 *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
			 *   a <- a/2 if: a is even
			 *   b <- b/2 if: a is odd, b is even
			 *
			 * We multiply a_lo and b_lo by 2 at each
			 * iteration, thus a division by 2 really is a
			 * non-multiplication by 2.
			 */
			uint32_t rt, oa, ob, cAB, cBA, cA;
			uint64_t rz;

			/*
			 * rt = 1 if a_hi > b_hi, 0 otherwise.
			 */
			rz = b_hi - a_hi;
			rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
				& (a_hi ^ rz))) >> 63);

			/*
			 * cAB = 1 if b must be subtracted from a
			 * cBA = 1 if a must be subtracted from b
			 * cA = 1 if a must be divided by 2
			 *
			 * Rules:
			 *
			 *   cAB and cBA cannot both be 1.
			 *   If a is not divided by 2, b is.
			 */
			oa = (a_lo >> i) & 1;
			ob = (b_lo >> i) & 1;
			cAB = oa & ob & rt;
			cBA = oa & ob & ~rt;
			cA = cAB | (oa ^ 1);

			/*
			 * Conditional subtractions.
			 */
			a_lo -= b_lo & -cAB;
			a_hi -= b_hi & -(uint64_t)cAB;
			pa -= qa & -(int64_t)cAB;
			pb -= qb & -(int64_t)cAB;
			b_lo -= a_lo & -cBA;
			b_hi -= a_hi & -(uint64_t)cBA;
			qa -= pa & -(int64_t)cBA;
			qb -= pb & -(int64_t)cBA;

			/*
			 * Shifting.
			 */
			a_lo += a_lo & (cA - 1);
			pa += pa & ((int64_t)cA - 1);
			pb += pb & ((int64_t)cA - 1);
			a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
			b_lo += b_lo & -cA;
			qa += qa & -(int64_t)cA;
			qb += qb & -(int64_t)cA;
			b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
		}

		/*
		 * Apply the computed parameters to our values. We
		 * may have to correct pa and pb depending on the
		 * returned value of zint_co_reduce() (when a and/or b
		 * had to be negated).
		 */
		r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
		pa -= (pa + pa) & -(int64_t)(r & 1);
		pb -= (pb + pb) & -(int64_t)(r & 1);
		qa -= (qa + qa) & -(int64_t)(r >> 1);
		qb -= (qb + qb) & -(int64_t)(r >> 1);
		zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
		zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
	}

	/*
	 * At that point, array a[] should contain the GCD, and the
	 * results (u,v) should already be set. We check that the GCD
	 * is indeed 1. We also check that the two operands x and y
	 * are odd.
	 */
	rc = a[0] ^ 1;
	for (j = 1; j < len; j ++) {
		rc |= a[j];
	}
	return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
}
1895
1896 /*
1897 * Add k*y*2^sc to x. The result is assumed to fit in the array of
1898 * size xlen (truncation is applied if necessary).
1899 * Scale factor 'sc' is provided as sch and scl, such that:
1900 * sch = sc / 31
1901 * scl = sc % 31
1902 * xlen MUST NOT be lower than ylen.
1903 *
1904 * x[] and y[] are both signed integers, using two's complement for
1905 * negative values.
1906 */
static void
zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
	const uint32_t *restrict y, size_t ylen, int32_t k,
	uint32_t sch, uint32_t scl)
{
	size_t i;
	uint32_t sign_word, spill;
	int32_t cy;

	if (ylen == 0) {
		return;
	}

	/*
	 * sign_word replicates the sign bit of y (bit 30 of its top
	 * limb) over a full 31-bit word, so that y is implicitly
	 * sign-extended past its last limb.
	 */
	sign_word = -(y[ylen - 1] >> 30) >> 1;
	spill = 0;
	cy = 0;
	for (i = sch; i < xlen; i ++) {
		size_t j;
		uint32_t yw, yws, cyu;
		uint64_t z;

		/*
		 * Next limb of y, shifted left by scl bits; 'spill'
		 * carries the bits pushed out of the previous limb.
		 */
		j = i - sch;
		if (j < ylen) {
			yw = y[j];
		} else {
			yw = sign_word;
		}
		yws = ((yw << scl) & 0x7FFFFFFF) | spill;
		spill = yw >> (31 - scl);

		/*
		 * A 31-bit limb times a 32-bit k, plus a limb and a
		 * carry, fits in a signed 64-bit value: no overflow.
		 */
		z = (uint64_t)((int64_t)yws * (int64_t)k
			+ (int64_t)x[i] + cy);
		x[i] = (uint32_t)z & 0x7FFFFFFF;

		/*
		 * The next carry is signed. Right-shifting the signed
		 * value directly would be implementation-defined, so we
		 * shift the unsigned image and reinterpret the low word
		 * as int32_t (two's complement, layout-compatible with
		 * uint32_t).
		 */
		cyu = (uint32_t)(z >> 31);
		cy = *(int32_t *)&cyu;
	}
}
1956
1957 /*
1958 * Subtract y*2^sc from x. The result is assumed to fit in the array of
1959 * size xlen (truncation is applied if necessary).
1960 * Scale factor 'sc' is provided as sch and scl, such that:
1961 * sch = sc / 31
1962 * scl = sc % 31
1963 * xlen MUST NOT be lower than ylen.
1964 *
1965 * x[] and y[] are both signed integers, using two's complement for
1966 * negative values.
1967 */
static void
zint_sub_scaled(uint32_t *restrict x, size_t xlen,
	const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl)
{
	size_t i;
	uint32_t sign_word, spill, borrow;

	if (ylen == 0) {
		return;
	}

	/*
	 * sign_word replicates the sign bit of y over a full 31-bit
	 * word, implicitly sign-extending y past its last limb.
	 */
	sign_word = -(y[ylen - 1] >> 30) >> 1;
	spill = 0;
	borrow = 0;
	for (i = sch; i < xlen; i ++) {
		size_t j;
		uint32_t d, yw, yws;

		/*
		 * Next limb of y, shifted left by scl bits; 'spill'
		 * carries the bits pushed out of the previous limb.
		 */
		j = i - sch;
		if (j < ylen) {
			yw = y[j];
		} else {
			yw = sign_word;
		}
		yws = ((yw << scl) & 0x7FFFFFFF) | spill;
		spill = yw >> (31 - scl);

		/*
		 * Limb-wise subtraction with borrow propagation; bit 31
		 * of the raw difference is the borrow out.
		 */
		d = x[i] - yws - borrow;
		x[i] = d & 0x7FFFFFFF;
		borrow = d >> 31;
	}
}
2000
2001 /*
2002 * Convert a one-word signed big integer into a signed value.
2003 */
static inline int32_t
zint_one_to_plain(const uint32_t *x)
{
	uint32_t v, sign;

	/*
	 * Bit 30 is the sign bit of the 31-bit representation; copy it
	 * into bit 31 so that the 32-bit pattern becomes the correct
	 * two's complement encoding, then reinterpret as signed
	 * (int32_t and uint32_t are layout-compatible).
	 */
	v = x[0];
	sign = v & 0x40000000;
	v |= sign << 1;
	return *(int32_t *)&v;
}
2013
2014 /* ==================================================================== */
2015
2016 /*
2017 * Convert a polynomial to floating-point values.
2018 *
2019 * Each coefficient has length flen words, and starts fstride words after
2020 * the previous.
2021 *
2022 * IEEE-754 binary64 values can represent values in a finite range,
2023 * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
2024 * they should be "trimmed" by pointing not to the lowest word of each,
2025 * but upper.
2026 */
static void
poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
	unsigned logn)
{
	size_t n, u;

	n = MKN(logn);
	if (flen == 0) {
		/*
		 * Zero-length integers are zero by convention.
		 */
		for (u = 0; u < n; u ++) {
			d[u] = fpr_zero;
		}
		return;
	}
	for (u = 0; u < n; u ++, f += fstride) {
		size_t v;
		uint32_t neg, cc, xm;
		fpr x, fsc;

		/*
		 * Get sign of the integer; if it is negative, then we
		 * will load its absolute value instead, and negate the
		 * result.
		 *
		 * neg = all-ones if the value is negative (bit 30 of
		 *       the top limb set), 0 otherwise
		 * xm  = 0x7FFFFFFF if negative: mask used to complement
		 *       each 31-bit limb
		 * cc  = initial carry (1 if negative) for the two's
		 *       complement negation
		 */
		neg = -(f[flen - 1] >> 30);
		xm = neg >> 1;
		cc = neg & 1;
		x = fpr_zero;
		fsc = fpr_one;
		for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
			uint32_t w;

			/*
			 * w = next 31-bit limb of |value|; it is then
			 * re-negated (as a signed word) when the value
			 * is negative, so the accumulated sum
			 * x = sum(w_v * 2^(31*v)) carries the right
			 * sign. fsc tracks the 2^(31*v) scale factor.
			 */
			w = (f[v] ^ xm) + cc;
			cc = w >> 31;
			w &= 0x7FFFFFFF;
			w -= (w << 1) & neg;
			x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
		}
		d[u] = x;
	}
}
2067
2068 /*
2069 * Convert a polynomial to small integers. Source values are supposed
2070 * to be one-word integers, signed over 31 bits. Returned value is 0
2071 * if any of the coefficients exceeds the provided limit (in absolute
2072 * value), or 1 on success.
2073 *
2074 * This is not constant-time; this is not a problem here, because on
2075 * any failure, the NTRU-solving process will be deemed to have failed
2076 * and the (f,g) polynomials will be discarded.
2077 */
static int
poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn)
{
	size_t n, i;

	n = (size_t)1 << logn;
	for (i = 0; i < n; i ++) {
		uint32_t w;
		int32_t z;

		/*
		 * Decode the one-word signed value: propagate bit 30
		 * (sign of the 31-bit representation) into bit 31,
		 * then reinterpret the pattern as a signed integer.
		 */
		w = s[i];
		w |= (w & 0x40000000) << 1;
		z = *(int32_t *)&w;

		/*
		 * Any coefficient beyond the limit (in absolute value)
		 * invalidates the whole conversion.
		 */
		if (z < -lim || z > lim) {
			return 0;
		}
		d[i] = (int8_t)z;
	}
	return 1;
}
2095
2096 /*
2097 * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
2098 * Coefficients of polynomial k are small integers (signed values in the
2099 * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
2100 * and scl = sc % 31.
2101 *
2102 * This function implements the basic quadratic multiplication algorithm,
2103 * which is efficient in space (no extra buffer needed) but slow at
2104 * high degree.
2105 */
static void
poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
	const uint32_t *restrict f, size_t flen, size_t fstride,
	const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn)
{
	size_t n, u;

	n = MKN(logn);
	for (u = 0; u < n; u ++) {
		int32_t kf;
		size_t v;
		uint32_t *x;
		const uint32_t *y;

		/*
		 * Accumulate -k[u] * f * X^u into F, one coefficient
		 * at a time (adding -k[u]*f[v] into F[u+v mod X^N+1]).
		 */
		kf = -k[u];
		x = F + u * Fstride;
		y = f;
		for (v = 0; v < n; v ++) {
			zint_add_scaled_mul_small(
				x, Flen, y, flen, kf, sch, scl);
			if (u + v == n - 1) {
				/*
				 * Reduction modulo X^N+1: X^N = -1, so
				 * wrap back to coefficient 0 and flip
				 * the sign of the multiplier.
				 */
				x = F;
				kf = -kf;
			} else {
				x += Fstride;
			}
			y += fstride;
		}
	}
}
2136
2137 /*
2138 * Subtract k*f from F. Coefficients of polynomial k are small integers
2139 * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
2140 * assumes that the degree is large, and integers relatively small.
2141 * The value sc is provided as sch = sc / 31 and scl = sc % 31.
2142 */
static void
poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
	const uint32_t *restrict f, size_t flen, size_t fstride,
	const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
	uint32_t *restrict tmp)
{
	uint32_t *gm, *igm, *fk, *t1, *x;
	const uint32_t *y;
	size_t n, u, tlen;
	const small_prime *primes;

	n = MKN(logn);
	/*
	 * One extra word for k*f, to absorb the growth from the
	 * multiplication by the (one-word) coefficients of k.
	 */
	tlen = flen + 1;
	/*
	 * Layout in tmp[]: gm (n words), igm (n words), fk (n*tlen
	 * words, tlen words per coefficient), then t1 (n words of
	 * scratch).
	 */
	gm = tmp;
	igm = gm + MKN(logn);
	fk = igm + MKN(logn);
	t1 = fk + n * tlen;

	primes = PRIMES;

	/*
	 * Compute k*f in fk[], in RNS notation.
	 */
	for (u = 0; u < tlen; u ++) {
		uint32_t p, p0i, R2, Rx;
		size_t v;

		p = primes[u].p;
		p0i = modp_ninv31(p);
		R2 = modp_R2(p, p0i);
		Rx = modp_Rx((unsigned)flen, p, p0i, R2);
		modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);

		/*
		 * Reduce k modulo p and move it to NTT representation.
		 */
		for (v = 0; v < n; v ++) {
			t1[v] = modp_set(k[v], p);
		}
		modp_NTT2(t1, gm, logn, p, p0i);
		/*
		 * Reduce f modulo p (coefficients are flen words each,
		 * fstride words apart) into the u-th RNS slot of fk,
		 * then NTT in place.
		 */
		for (v = 0, y = f, x = fk + u;
			v < n; v ++, y += fstride, x += tlen)
		{
			*x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
		}
		modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
		/*
		 * Pointwise product of k and f in the NTT domain; the
		 * extra Montgomery multiplication by R2 cancels the
		 * Montgomery factor introduced by the first one.
		 */
		for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
			*x = modp_montymul(
				modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
		}
		modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
	}

	/*
	 * Rebuild k*f.
	 */
	zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);

	/*
	 * Subtract k*f, scaled, from F.
	 */
	for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
		zint_sub_scaled(x, Flen, y, tlen, sch, scl);
	}
}
2205
2206 /* ==================================================================== */
2207
2208 #if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1
2209
2210 #define RNG_CONTEXT prng
2211 #define get_rng_u64 prng_get_u64
2212
2213 #else // yyyKG_CHACHA20+0
2214
2215 #define RNG_CONTEXT inner_shake256_context
2216
2217 /*
2218 * Get a random 8-byte integer from a SHAKE-based RNG. This function
2219 * ensures consistent interpretation of the SHAKE output so that
2220 * the same values will be obtained over different platforms, in case
2221 * a known seed is used.
2222 */
static inline uint64_t
get_rng_u64(inner_shake256_context *rng)
{
	/*
	 * We enforce little-endian representation.
	 */

#if FALCON_LE // yyyLE+1
	/*
	 * On little-endian systems we just interpret the bytes "as is"
	 * (this is correct because the exact-width types such as
	 * 'uint64_t' are guaranteed to have no padding and no trap
	 * representation).
	 */
	uint64_t r;

	inner_shake256_extract(rng, (uint8_t *)&r, sizeof r);
	return r;
#else // yyyLE+0
	/*
	 * Generic path: assemble the 64-bit value byte by byte in
	 * little-endian order, so that every platform yields the same
	 * stream for a given seed.
	 */
	uint8_t tmp[8];

	inner_shake256_extract(rng, tmp, sizeof tmp);
	return (uint64_t)tmp[0]
		| ((uint64_t)tmp[1] << 8)
		| ((uint64_t)tmp[2] << 16)
		| ((uint64_t)tmp[3] << 24)
		| ((uint64_t)tmp[4] << 32)
		| ((uint64_t)tmp[5] << 40)
		| ((uint64_t)tmp[6] << 48)
		| ((uint64_t)tmp[7] << 56);
#endif // yyyLE-
}
2255
2256 #endif // yyyKG_CHACHA20-
2257
2258 /*
2259 * Table below incarnates a discrete Gaussian distribution:
2260 * D(x) = exp(-(x^2)/(2*sigma^2))
2261 * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
2262 * Element 0 of the table is P(x = 0).
2263 * For k > 0, element k is P(x >= k+1 | x > 0).
2264 * Probabilities are scaled up by 2^63.
2265 */
static const uint64_t gauss_1024_12289[] = {
	/*
	 * 27 entries; the table ends with 0, which bounds the maximum
	 * magnitude sampled by mkgauss() to the table length.
	 */
	1283868770400643928u,  6416574995475331444u,  4078260278032692663u,
	2353523259288686585u,  1227179971273316331u,   575931623374121527u,
	 242543240509105209u,    91437049221049666u,    30799446349977173u,
	   9255276791179340u,     2478152334826140u,      590642893610164u,
	    125206034929641u,       23590435911403u,        3948334035941u,
	       586753615614u,          77391054539u,           9056793210u,
	          940121950u,             86539696u,              7062824u,
	             510971u,                32764u,                 1862u,
	                 94u,                    4u,                    0u
};
2277
2278 /*
2279 * Generate a random value with a Gaussian distribution centered on 0.
2280 * The RNG must be ready for extraction (already flipped).
2281 *
2282 * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
2283 * precomputed table is for N = 1024. Since the sum of two independent
2284 * values of standard deviation sigma has standard deviation
2285 * sigma*sqrt(2), then we can just generate more values and add them
2286 * together for lower dimensions.
2287 */
static int
mkgauss(RNG_CONTEXT *rng, unsigned logn)
{
	unsigned u, g;
	int val;

	/*
	 * Number of elementary samples to add together: 2^(10-logn).
	 * The table is calibrated for N = 1024 (logn = 10); summing
	 * independent samples widens the distribution for smaller
	 * degrees, as explained above. This requires logn <= 10.
	 */
	g = 1U << (10 - logn);
	val = 0;
	for (u = 0; u < g; u ++) {
		/*
		 * Each iteration generates one value with the
		 * Gaussian distribution for N = 1024.
		 *
		 * We use two random 64-bit values. First value
		 * decides on whether the generated value is 0, and,
		 * if not, the sign of the value. Second random 64-bit
		 * word is used to generate the non-zero value.
		 *
		 * For constant-time code we have to read the complete
		 * table. This has negligible cost, compared with the
		 * remainder of the keygen process (solving the NTRU
		 * equation).
		 */
		uint64_t r;
		uint32_t f, v, k, neg;

		/*
		 * First value:
		 * - flag 'neg' is randomly selected to be 0 or 1.
		 * - flag 'f' is set to 1 if the generated value is zero,
		 *   or set to 0 otherwise (f = 1 iff the 63-bit r is
		 *   below gauss_1024_12289[0]).
		 */
		r = get_rng_u64(rng);
		neg = (uint32_t)(r >> 63);
		r &= ~((uint64_t)1 << 63);
		f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);

		/*
		 * We produce a new random 63-bit integer r, and go over
		 * the array, starting at index 1. We store in v the
		 * index of the first array element which is not greater
		 * than r, unless the flag f was already 1.
		 */
		v = 0;
		r = get_rng_u64(rng);
		r &= ~((uint64_t)1 << 63);
		for (k = 1; k < (sizeof gauss_1024_12289)
			/ (sizeof gauss_1024_12289[0]); k ++)
		{
			uint32_t t;

			/*
			 * t = 1 if r >= gauss_1024_12289[k]; v receives
			 * k only on the first such index (while f is
			 * still 0). No early exit: the full table is
			 * always scanned (constant-time).
			 */
			t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
			v |= k & -(t & (f ^ 1));
			f |= t;
		}

		/*
		 * We apply the sign ('neg' flag). If the value is zero,
		 * the sign has no effect. (v ^ -neg) + neg is a
		 * constant-time conditional negation.
		 */
		v = (v ^ -neg) + neg;

		/*
		 * Generated value is added to val; the pointer cast
		 * reinterprets the two's complement pattern as signed.
		 */
		val += *(int32_t *)&v;
	}
	return val;
}
2357
2358 /*
2359 * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit
2360 * words, of intermediate values in the computation:
2361 *
2362 * MAX_BL_SMALL[depth]: length for the input f and g at that depth
2363 * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
2364 *
2365 * Rules:
2366 *
2367 * - Within an array, values grow.
2368 *
2369 * - The 'SMALL' array must have an entry for maximum depth, corresponding
2370 * to the size of values used in the binary GCD. There is no such value
2371 * for the 'LARGE' array (the binary GCD yields already reduced
2372 * coefficients).
2373 *
2374 * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
2375 *
2376 * - Values must be large enough to handle the common cases, with some
2377 * margins.
2378 *
2379 * - Values must not be "too large" either because we will convert some
2380 * integers into floating-point values by considering the top 10 words,
2381 * i.e. 310 bits; hence, for values of length more than 10 words, we
2382 * should take care to have the length centered on the expected size.
2383 *
2384 * The following average lengths, in bits, have been measured on thousands
2385 * of random keys (fg = max length of the absolute value of coefficients
2386 * of f and g at that depth; FG = idem for the unreduced F and G; for the
2387 * maximum depth, F and G are the output of binary GCD, multiplied by q;
2388 * for each value, the average and standard deviation are provided).
2389 *
2390 * Binary case:
2391 * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
2392 * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
2393 * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
2394 * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
2395 * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
2396 * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
2397 * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
2398 * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
2399 * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
2400 * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
2401 * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
2402 *
2403 * Integers are actually represented either in binary notation over
2404 * 31-bit words (signed, using two's complement), or in RNS, modulo
2405 * many small primes. These small primes are close to, but slightly
2406 * lower than, 2^31. Use of RNS loses less than two bits, even for
2407 * the largest values.
2408 *
2409 * IMPORTANT: if these values are modified, then the temporary buffer
2410 * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
2411 * accordingly.
2412 */
2413
static const size_t MAX_BL_SMALL[] = {
	/* indexed by depth (0 to 10); lengths in 31-bit words */
	1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
};
2417
static const size_t MAX_BL_LARGE[] = {
	/* indexed by depth (0 to 9); lengths in 31-bit words */
	2, 2, 5, 7, 12, 21, 40, 78, 157, 308
};
2421
2422 /*
2423 * Average and standard deviation for the maximum size (in bits) of
2424 * coefficients of (f,g), depending on depth. These values are used
2425 * to compute bounds for Babai's reduction.
2426 */
static const struct {
	int avg;	/* average maximum bit length at this depth */
	int std;	/* standard deviation of that bit length */
} BITLENGTH[] = {
	{ 4, 0 },	/* depth = 0 */
	{ 11, 1 },	/* depth = 1 */
	{ 24, 1 },	/* depth = 2 */
	{ 50, 1 },	/* depth = 3 */
	{ 102, 1 },	/* depth = 4 */
	{ 202, 2 },	/* depth = 5 */
	{ 401, 4 },	/* depth = 6 */
	{ 794, 5 },	/* depth = 7 */
	{ 1577, 8 },	/* depth = 8 */
	{ 3138, 13 },	/* depth = 9 */
	{ 6308, 25 }	/* depth = 10 */
};
2443
2444 /*
2445 * Minimal recursion depth at which we rebuild intermediate values
2446 * when reconstructing f and g.
2447 */
2448 #define DEPTH_INT_FG 4
2449
2450 /*
2451 * Compute squared norm of a short vector. Returned value is saturated to
2452 * 2^32-1 if it is not lower than 2^31.
2453 */
static uint32_t
poly_small_sqnorm(const int8_t *f, unsigned logn)
{
	size_t n, i;
	uint32_t acc, ovf;

	n = (size_t)1 << logn;
	acc = 0;
	ovf = 0;
	for (i = 0; i < n; i ++) {
		int32_t c;

		c = f[i];
		acc += (uint32_t)(c * c);
		/*
		 * ovf records (in its top bit) whether the running sum
		 * ever reached 2^31.
		 */
		ovf |= acc;
	}
	/*
	 * Saturate to 2^32-1 if bit 31 of the accumulator was ever set.
	 */
	return acc | -(ovf >> 31);
}
2472
2473 /*
2474 * Align (upwards) the provided 'data' pointer with regards to 'base'
2475 * so that the offset is a multiple of the size of 'fpr'.
2476 */
2477 static fpr *
2478 align_fpr(void *base, void *data)
2479 {
2480 uint8_t *cb, *cd;
2481 size_t k, km;
2482
2483 cb = base;
2484 cd = data;
2485 k = (size_t)(cd - cb);
2486 km = k % sizeof(fpr);
2487 if (km) {
2488 k += (sizeof(fpr)) - km;
2489 }
2490 return (fpr *)(cb + k);
2491 }
2492
2493 /*
2494 * Align (upwards) the provided 'data' pointer with regards to 'base'
2495 * so that the offset is a multiple of the size of 'uint32_t'.
2496 */
static uint32_t *
align_u32(void *base, void *data)
{
	size_t off, rem;

	/*
	 * Round the byte offset of data (relative to base) up to the
	 * next multiple of sizeof(uint32_t).
	 */
	off = (size_t)((uint8_t *)data - (uint8_t *)base);
	rem = off % sizeof(uint32_t);
	if (rem != 0) {
		off += sizeof(uint32_t) - rem;
	}
	return (uint32_t *)((uint8_t *)base + off);
}
2512
2513 /*
2514 * Convert a small vector to floating point.
2515 */
2516 static void
2517 poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn)
2518 {
2519 size_t n, u;
2520
2521 n = MKN(logn);
2522 for (u = 0; u < n; u ++) {
2523 x[u] = fpr_of(f[u]);
2524 }
2525 }
2526
2527 /*
2528 * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
2529 * individual length.
2530 *
2531 * Output: f',g' of degree N/2, with the length for 'depth+1'.
2532 *
2533 * Values are in RNS; input and/or output may also be in NTT.
2534 */
static void
make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
	int in_ntt, int out_ntt)
{
	size_t n, hn, u;
	size_t slen, tlen;
	uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
	const small_prime *primes;

	n = (size_t)1 << logn;
	hn = n >> 1;
	slen = MAX_BL_SMALL[depth];
	tlen = MAX_BL_SMALL[depth + 1];
	primes = PRIMES;

	/*
	 * Prepare room for the result.
	 *
	 * Layout in data[]: fd, gd (outputs, hn coefficients of tlen
	 * words each), then fs, gs (copies of the inputs, n
	 * coefficients of slen words each), then the NTT tables gm and
	 * igm (n words each) and the scratch area t1 (n words).
	 */
	fd = data;
	gd = fd + hn * tlen;
	fs = gd + hn * tlen;
	gs = fs + n * slen;
	gm = gs + n * slen;
	igm = gm + n;
	t1 = igm + n;
	memmove(fs, data, 2 * n * slen * sizeof *data);

	/*
	 * First slen words: we use the input values directly, and apply
	 * inverse NTT as we go.
	 */
	for (u = 0; u < slen; u ++) {
		uint32_t p, p0i, R2;
		size_t v;
		uint32_t *x;

		p = primes[u].p;
		p0i = modp_ninv31(p);
		R2 = modp_R2(p, p0i);
		modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);

		for (v = 0, x = fs + u; v < n; v ++, x += slen) {
			t1[v] = *x;
		}
		if (!in_ntt) {
			modp_NTT2(t1, gm, logn, p, p0i);
		}
		/*
		 * Multiply the two NTT coefficients that share the same
		 * squared root (pairs are consecutive in this NTT
		 * ordering); the extra montymul by R2 fixes the
		 * Montgomery factor. This yields the half-degree fd.
		 */
		for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
			uint32_t w0, w1;

			w0 = t1[(v << 1) + 0];
			w1 = t1[(v << 1) + 1];
			*x = modp_montymul(
				modp_montymul(w0, w1, p, p0i), R2, p, p0i);
		}
		if (in_ntt) {
			modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
		}

		/*
		 * Same processing for g.
		 */
		for (v = 0, x = gs + u; v < n; v ++, x += slen) {
			t1[v] = *x;
		}
		if (!in_ntt) {
			modp_NTT2(t1, gm, logn, p, p0i);
		}
		for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
			uint32_t w0, w1;

			w0 = t1[(v << 1) + 0];
			w1 = t1[(v << 1) + 1];
			*x = modp_montymul(
				modp_montymul(w0, w1, p, p0i), R2, p, p0i);
		}
		if (in_ntt) {
			modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
		}

		if (!out_ntt) {
			modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
			modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
		}
	}

	/*
	 * Since the fs and gs words have been de-NTTized, we can use the
	 * CRT to rebuild the values.
	 */
	zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
	zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);

	/*
	 * Remaining words: use modular reductions to extract the values.
	 * (The outputs use tlen > slen words, so the extra RNS slots
	 * are computed by reducing the rebuilt plain integers.)
	 */
	for (u = slen; u < tlen; u ++) {
		uint32_t p, p0i, R2, Rx;
		size_t v;
		uint32_t *x;

		p = primes[u].p;
		p0i = modp_ninv31(p);
		R2 = modp_R2(p, p0i);
		Rx = modp_Rx((unsigned)slen, p, p0i, R2);
		modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
		for (v = 0, x = fs; v < n; v ++, x += slen) {
			t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
		}
		modp_NTT2(t1, gm, logn, p, p0i);
		for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
			uint32_t w0, w1;

			w0 = t1[(v << 1) + 0];
			w1 = t1[(v << 1) + 1];
			*x = modp_montymul(
				modp_montymul(w0, w1, p, p0i), R2, p, p0i);
		}
		for (v = 0, x = gs; v < n; v ++, x += slen) {
			t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
		}
		modp_NTT2(t1, gm, logn, p, p0i);
		for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
			uint32_t w0, w1;

			w0 = t1[(v << 1) + 0];
			w1 = t1[(v << 1) + 1];
			*x = modp_montymul(
				modp_montymul(w0, w1, p, p0i), R2, p, p0i);
		}

		if (!out_ntt) {
			modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
			modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
		}
	}
}
2669
2670 /*
2671 * Compute f and g at a specific depth, in RNS notation.
2672 *
2673 * Returned values are stored in the data[] array, at slen words per integer.
2674 *
2675 * Conditions:
2676 * 0 <= depth <= logn
2677 *
2678 * Space use in data[]: enough room for any two successive values (f', g',
2679 * f and g).
2680 */
2681 static void
2682 make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
2683 unsigned logn, unsigned depth, int out_ntt)
2684 {
2685 size_t n, u;
2686 uint32_t *ft, *gt, p0;
2687 unsigned d;
2688 const small_prime *primes;
2689
2690 n = MKN(logn);
2691 ft = data;
2692 gt = ft + n;
2693 primes = PRIMES;
2694 p0 = primes[0].p;
2695 for (u = 0; u < n; u ++) {
2696 ft[u] = modp_set(f[u], p0);
2697 gt[u] = modp_set(g[u], p0);
2698 }
2699
2700 if (depth == 0 && out_ntt) {
2701 uint32_t *gm, *igm;
2702 uint32_t p, p0i;
2703
2704 p = primes[0].p;
2705 p0i = modp_ninv31(p);
2706 gm = gt + n;
2707 igm = gm + MKN(logn);
2708 modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
2709 modp_NTT2(ft, gm, logn, p, p0i);
2710 modp_NTT2(gt, gm, logn, p, p0i);
2711 return;
2712 }
2713
2714 for (d = 0; d < depth; d ++) {
2715 make_fg_step(data, logn - d, d,
2716 d != 0, (d + 1) < depth || out_ntt);
2717 }
2718 }
2719
2720 /*
2721 * Solving the NTRU equation, deepest level: compute the resultants of
2722 * f and g with X^N+1, and use binary GCD. The F and G values are
2723 * returned in tmp[].
2724 *
2725 * Returned value: 1 on success, 0 on error.
2726 */
static int
solve_NTRU_deepest(unsigned logn_top,
	const int8_t *f, const int8_t *g, uint32_t *tmp)
{
	size_t len;
	uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
	const small_prime *primes;

	len = MAX_BL_SMALL[logn_top];
	primes = PRIMES;

	/*
	 * Layout in tmp[]: F and G (outputs), then the two resultants
	 * of f and g, then scratch; each value uses len words.
	 */
	Fp = tmp;
	Gp = Fp + len;
	fp = Gp + len;
	gp = fp + len;
	t1 = gp + len;

	/*
	 * Descend to maximum depth: f and g are each reduced to a
	 * single big integer, in RNS representation.
	 */
	make_fg(fp, f, g, logn_top, logn_top, 0);

	/*
	 * We use the CRT to rebuild the resultants as big integers.
	 * There are two such big integers. The resultants are always
	 * nonnegative.
	 */
	zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);

	/*
	 * Apply the binary GCD. The zint_bezout() function works only
	 * if both inputs are odd.
	 *
	 * We can test on the result and return 0 because that would
	 * imply failure of the NTRU solving equation, and the (f,g)
	 * values will be abandoned in that case.
	 */
	if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
		return 0;
	}

	/*
	 * Multiply the two values by the target value q. Values must
	 * fit in the destination arrays.
	 * We can again test on the returned words: a non-zero output
	 * of zint_mul_small() means that we exceeded our array
	 * capacity, and that implies failure and rejection of (f,g).
	 */
	q = 12289;
	if (zint_mul_small(Fp, len, q) != 0
		|| zint_mul_small(Gp, len, q) != 0)
	{
		return 0;
	}

	return 1;
}
2781
2782 /*
2783 * Solving the NTRU equation, intermediate level. Upon entry, the F and G
2784 * from the previous level should be in the tmp[] array.
2785 * This function MAY be invoked for the top-level (in which case depth = 0).
2786 *
2787 * Returned value: 1 on success, 0 on error.
2788 */
2789 static int
2790 solve_NTRU_intermediate(unsigned logn_top,
2791 const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp)
2792 {
2793 /*
2794 * In this function, 'logn' is the log2 of the degree for
2795 * this step. If N = 2^logn, then:
 * - the F and G values already in tmp[] (from the deeper
2797 * levels) have degree N/2;
2798 * - this function should return F and G of degree N.
2799 */
2800 unsigned logn;
2801 size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
2802 uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
2803 fpr *rt1, *rt2, *rt3, *rt4, *rt5;
2804 int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
2805 uint32_t *x, *y;
2806 int32_t *k;
2807 const small_prime *primes;
2808
2809 logn = logn_top - depth;
2810 n = (size_t)1 << logn;
2811 hn = n >> 1;
2812
2813 /*
2814 * slen = size for our input f and g; also size of the reduced
2815 * F and G we return (degree N)
2816 *
2817 * dlen = size of the F and G obtained from the deeper level
2818 * (degree N/2 or N/3)
2819 *
2820 * llen = size for intermediary F and G before reduction (degree N)
2821 *
2822 * We build our non-reduced F and G as two independent halves each,
2823 * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
2824 */
2825 slen = MAX_BL_SMALL[depth];
2826 dlen = MAX_BL_SMALL[depth + 1];
2827 llen = MAX_BL_LARGE[depth];
2828 primes = PRIMES;
2829
2830 /*
2831 * Fd and Gd are the F and G from the deeper level.
2832 */
2833 Fd = tmp;
2834 Gd = Fd + dlen * hn;
2835
2836 /*
2837 * Compute the input f and g for this level. Note that we get f
2838 * and g in RNS + NTT representation.
2839 */
2840 ft = Gd + dlen * hn;
2841 make_fg(ft, f, g, logn_top, depth, 1);
2842
2843 /*
2844 * Move the newly computed f and g to make room for our candidate
2845 * F and G (unreduced).
2846 */
2847 Ft = tmp;
2848 Gt = Ft + n * llen;
2849 t1 = Gt + n * llen;
2850 memmove(t1, ft, 2 * n * slen * sizeof *ft);
2851 ft = t1;
2852 gt = ft + slen * n;
2853 t1 = gt + slen * n;
2854
2855 /*
2856 * Move Fd and Gd _after_ f and g.
2857 */
2858 memmove(t1, Fd, 2 * hn * dlen * sizeof *Fd);
2859 Fd = t1;
2860 Gd = Fd + hn * dlen;
2861
2862 /*
2863 * We reduce Fd and Gd modulo all the small primes we will need,
2864 * and store the values in Ft and Gt (only n/2 values in each).
2865 */
2866 for (u = 0; u < llen; u ++) {
2867 uint32_t p, p0i, R2, Rx;
2868 size_t v;
2869 uint32_t *xs, *ys, *xd, *yd;
2870
2871 p = primes[u].p;
2872 p0i = modp_ninv31(p);
2873 R2 = modp_R2(p, p0i);
2874 Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
2875 for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
2876 v < hn;
2877 v ++, xs += dlen, ys += dlen, xd += llen, yd += llen)
2878 {
2879 *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
2880 *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
2881 }
2882 }
2883
2884 /*
2885 * We do not need Fd and Gd after that point.
2886 */
2887
2888 /*
2889 * Compute our F and G modulo sufficiently many small primes.
2890 */
2891 for (u = 0; u < llen; u ++) {
2892 uint32_t p, p0i, R2;
2893 uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
2894 size_t v;
2895
2896 /*
2897 * All computations are done modulo p.
2898 */
2899 p = primes[u].p;
2900 p0i = modp_ninv31(p);
2901 R2 = modp_R2(p, p0i);
2902
2903 /*
2904 * If we processed slen words, then f and g have been
2905 * de-NTTized, and are in RNS; we can rebuild them.
2906 */
2907 if (u == slen) {
2908 zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
2909 zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
2910 }
2911
2912 gm = t1;
2913 igm = gm + n;
2914 fx = igm + n;
2915 gx = fx + n;
2916
2917 modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
2918
2919 if (u < slen) {
2920 for (v = 0, x = ft + u, y = gt + u;
2921 v < n; v ++, x += slen, y += slen)
2922 {
2923 fx[v] = *x;
2924 gx[v] = *y;
2925 }
2926 modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
2927 modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
2928 } else {
2929 uint32_t Rx;
2930
2931 Rx = modp_Rx((unsigned)slen, p, p0i, R2);
2932 for (v = 0, x = ft, y = gt;
2933 v < n; v ++, x += slen, y += slen)
2934 {
2935 fx[v] = zint_mod_small_signed(x, slen,
2936 p, p0i, R2, Rx);
2937 gx[v] = zint_mod_small_signed(y, slen,
2938 p, p0i, R2, Rx);
2939 }
2940 modp_NTT2(fx, gm, logn, p, p0i);
2941 modp_NTT2(gx, gm, logn, p, p0i);
2942 }
2943
2944 /*
2945 * Get F' and G' modulo p and in NTT representation
2946 * (they have degree n/2). These values were computed in
2947 * a previous step, and stored in Ft and Gt.
2948 */
2949 Fp = gx + n;
2950 Gp = Fp + hn;
2951 for (v = 0, x = Ft + u, y = Gt + u;
2952 v < hn; v ++, x += llen, y += llen)
2953 {
2954 Fp[v] = *x;
2955 Gp[v] = *y;
2956 }
2957 modp_NTT2(Fp, gm, logn - 1, p, p0i);
2958 modp_NTT2(Gp, gm, logn - 1, p, p0i);
2959
2960 /*
2961 * Compute our F and G modulo p.
2962 *
2963 * General case:
2964 *
2965 * we divide degree by d = 2 or 3
2966 * f'(x^d) = N(f)(x^d) = f * adj(f)
2967 * g'(x^d) = N(g)(x^d) = g * adj(g)
2968 * f'*G' - g'*F' = q
2969 * F = F'(x^d) * adj(g)
2970 * G = G'(x^d) * adj(f)
2971 *
2972 * We compute things in the NTT. We group roots of phi
2973 * such that all roots x in a group share the same x^d.
2974 * If the roots in a group are x_1, x_2... x_d, then:
2975 *
2976 * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
2977 *
2978 * Thus, we have:
2979 *
2980 * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
2981 * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
2982 * ...
2983 * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
2984 *
2985 * In all cases, we can thus compute F and G in NTT
2986 * representation by a few simple multiplications.
2987 * Moreover, in our chosen NTT representation, roots
2988 * from the same group are consecutive in RAM.
2989 */
2990 for (v = 0, x = Ft + u, y = Gt + u; v < hn;
2991 v ++, x += (llen << 1), y += (llen << 1))
2992 {
2993 uint32_t ftA, ftB, gtA, gtB;
2994 uint32_t mFp, mGp;
2995
2996 ftA = fx[(v << 1) + 0];
2997 ftB = fx[(v << 1) + 1];
2998 gtA = gx[(v << 1) + 0];
2999 gtB = gx[(v << 1) + 1];
3000 mFp = modp_montymul(Fp[v], R2, p, p0i);
3001 mGp = modp_montymul(Gp[v], R2, p, p0i);
3002 x[0] = modp_montymul(gtB, mFp, p, p0i);
3003 x[llen] = modp_montymul(gtA, mFp, p, p0i);
3004 y[0] = modp_montymul(ftB, mGp, p, p0i);
3005 y[llen] = modp_montymul(ftA, mGp, p, p0i);
3006 }
3007 modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
3008 modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
3009 }
3010
3011 /*
3012 * Rebuild F and G with the CRT.
3013 */
3014 zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
3015 zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
3016
3017 /*
3018 * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
3019 * order).
3020 */
3021
3022 /*
3023 * Apply Babai reduction to bring back F and G to size slen.
3024 *
3025 * We use the FFT to compute successive approximations of the
3026 * reduction coefficient. We first isolate the top bits of
3027 * the coefficients of f and g, and convert them to floating
3028 * point; with the FFT, we compute adj(f), adj(g), and
3029 * 1/(f*adj(f)+g*adj(g)).
3030 *
3031 * Then, we repeatedly apply the following:
3032 *
3033 * - Get the top bits of the coefficients of F and G into
3034 * floating point, and use the FFT to compute:
3035 * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
3036 *
3037 * - Convert back that value into normal representation, and
3038 * round it to the nearest integers, yielding a polynomial k.
3039 * Proper scaling is applied to f, g, F and G so that the
3040 * coefficients fit on 32 bits (signed).
3041 *
3042 * - Subtract k*f from F and k*g from G.
3043 *
3044 * Under normal conditions, this process reduces the size of F
3045 * and G by some bits at each iteration. For constant-time
3046 * operation, we do not want to measure the actual length of
3047 * F and G; instead, we do the following:
3048 *
3049 * - f and g are converted to floating-point, with some scaling
3050 * if necessary to keep values in the representable range.
3051 *
3052 * - For each iteration, we _assume_ a maximum size for F and G,
3053 * and use the values at that size. If we overreach, then
3054 * we get zeros, which is harmless: the resulting coefficients
3055 * of k will be 0 and the value won't be reduced.
3056 *
3057 * - We conservatively assume that F and G will be reduced by
3058 * at least 25 bits at each iteration.
3059 *
3060 * Even when reaching the bottom of the reduction, reduction
3061 * coefficient will remain low. If it goes out-of-range, then
3062 * something wrong occurred and the whole NTRU solving fails.
3063 */
3064
3065 /*
3066 * Memory layout:
3067 * - We need to compute and keep adj(f), adj(g), and
3068 * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
3069 * respectively).
3070 * - At each iteration we need two extra fp buffer (N fp values),
3071 * and produce a k (N 32-bit words). k will be shared with one
3072 * of the fp buffers.
3073 * - To compute k*f and k*g efficiently (with the NTT), we need
3074 * some extra room; we reuse the space of the temporary buffers.
3075 *
3076 * Arrays of 'fpr' are obtained from the temporary array itself.
3077 * We ensure that the base is at a properly aligned offset (the
3078 * source array tmp[] is supposed to be already aligned).
3079 */
3080
3081 rt3 = align_fpr(tmp, t1);
3082 rt4 = rt3 + n;
3083 rt5 = rt4 + n;
3084 rt1 = rt5 + (n >> 1);
3085 k = (int32_t *)align_u32(tmp, rt1);
3086 rt2 = align_fpr(tmp, k + n);
3087 if (rt2 < (rt1 + n)) {
3088 rt2 = rt1 + n;
3089 }
3090 t1 = (uint32_t *)k + n;
3091
3092 /*
3093 * Get f and g into rt3 and rt4 as floating-point approximations.
3094 *
3095 * We need to "scale down" the floating-point representation of
3096 * coefficients when they are too big. We want to keep the value
3097 * below 2^310 or so. Thus, when values are larger than 10 words,
3098 * we consider only the top 10 words. Array lengths have been
3099 * computed so that average maximum length will fall in the
3100 * middle or the upper half of these top 10 words.
3101 */
3102 rlen = (slen > 10) ? 10 : slen;
3103 poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
3104 poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
3105
3106 /*
3107 * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
3108 */
3109 scale_fg = 31 * (int)(slen - rlen);
3110
3111 /*
3112 * Estimated boundaries for the maximum size (in bits) of the
3113 * coefficients of (f,g). We use the measured average, and
3114 * allow for a deviation of at most six times the standard
3115 * deviation.
3116 */
3117 minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
3118 maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
3119
3120 /*
3121 * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
3122 * and adj(g) in rt3 and rt4, respectively.
3123 */
3124 Zf(FFT)(rt3, logn);
3125 Zf(FFT)(rt4, logn);
3126 Zf(poly_invnorm2_fft)(rt5, rt3, rt4, logn);
3127 Zf(poly_adj_fft)(rt3, logn);
3128 Zf(poly_adj_fft)(rt4, logn);
3129
3130 /*
3131 * Reduce F and G repeatedly.
3132 *
3133 * The expected maximum bit length of coefficients of F and G
3134 * is kept in maxbl_FG, with the corresponding word length in
3135 * FGlen.
3136 */
3137 FGlen = llen;
3138 maxbl_FG = 31 * (int)llen;
3139
3140 /*
3141 * Each reduction operation computes the reduction polynomial
3142 * "k". We need that polynomial to have coefficients that fit
3143 * on 32-bit signed integers, with some scaling; thus, we use
3144 * a descending sequence of scaling values, down to zero.
3145 *
3146 * The size of the coefficients of k is (roughly) the difference
3147 * between the size of the coefficients of (F,G) and the size
3148 * of the coefficients of (f,g). Thus, the maximum size of the
3149 * coefficients of k is, at the start, maxbl_FG - minbl_fg;
3150 * this is our starting scale value for k.
3151 *
3152 * We need to estimate the size of (F,G) during the execution of
3153 * the algorithm; we are allowed some overestimation but not too
3154 * much (poly_big_to_fp() uses a 310-bit window). Generally
3155 * speaking, after applying a reduction with k scaled to
3156 * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
3157 * where 'dd' is a few bits to account for the fact that the
3158 * reduction is never perfect (intuitively, dd is on the order
3159 * of sqrt(N), so at most 5 bits; we here allow for 10 extra
3160 * bits).
3161 *
3162 * The size of (f,g) is not known exactly, but maxbl_fg is an
3163 * upper bound.
3164 */
3165 scale_k = maxbl_FG - minbl_fg;
3166
3167 for (;;) {
3168 int scale_FG, dc, new_maxbl_FG;
3169 uint32_t scl, sch;
3170 fpr pdc, pt;
3171
3172 /*
3173 * Convert current F and G into floating-point. We apply
3174 * scaling if the current length is more than 10 words.
3175 */
3176 rlen = (FGlen > 10) ? 10 : FGlen;
3177 scale_FG = 31 * (int)(FGlen - rlen);
3178 poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
3179 poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
3180
3181 /*
3182 * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
3183 */
3184 Zf(FFT)(rt1, logn);
3185 Zf(FFT)(rt2, logn);
3186 Zf(poly_mul_fft)(rt1, rt3, logn);
3187 Zf(poly_mul_fft)(rt2, rt4, logn);
3188 Zf(poly_add)(rt2, rt1, logn);
3189 Zf(poly_mul_autoadj_fft)(rt2, rt5, logn);
3190 Zf(iFFT)(rt2, logn);
3191
3192 /*
3193 * (f,g) are scaled by 'scale_fg', meaning that the
3194 * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
3195 * to have their true mathematical value.
3196 *
3197 * (F,G) are similarly scaled by 'scale_FG'. Therefore,
3198 * the value we computed in rt2 is scaled by
3199 * 'scale_FG-scale_fg'.
3200 *
3201 * We want that value to be scaled by 'scale_k', hence we
3202 * apply a corrective scaling. After scaling, the values
3203 * should fit in -2^31-1..+2^31-1.
3204 */
3205 dc = scale_k - scale_FG + scale_fg;
3206
3207 /*
3208 * We will need to multiply values by 2^(-dc). The value
3209 * 'dc' is not secret, so we can compute 2^(-dc) with a
3210 * non-constant-time process.
3211 * (We could use ldexp(), but we prefer to avoid any
3212 * dependency on libm. When using FP emulation, we could
3213 * use our fpr_ldexp(), which is constant-time.)
3214 */
3215 if (dc < 0) {
3216 dc = -dc;
3217 pt = fpr_two;
3218 } else {
3219 pt = fpr_onehalf;
3220 }
3221 pdc = fpr_one;
3222 while (dc != 0) {
3223 if ((dc & 1) != 0) {
3224 pdc = fpr_mul(pdc, pt);
3225 }
3226 dc >>= 1;
3227 pt = fpr_sqr(pt);
3228 }
3229
3230 for (u = 0; u < n; u ++) {
3231 fpr xv;
3232
3233 xv = fpr_mul(rt2[u], pdc);
3234
3235 /*
3236 * Sometimes the values can be out-of-bounds if
3237 * the algorithm fails; we must not call
3238 * fpr_rint() (and cast to int32_t) if the value
3239 * is not in-bounds. Note that the test does not
3240 * break constant-time discipline, since any
3241 * failure here implies that we discard the current
3242 * secret key (f,g).
3243 */
3244 if (!fpr_lt(fpr_mtwo31m1, xv)
3245 || !fpr_lt(xv, fpr_ptwo31m1))
3246 {
3247 return 0;
3248 }
3249 k[u] = (int32_t)fpr_rint(xv);
3250 }
3251
3252 /*
3253 * Values in k[] are integers. They really are scaled
3254 * down by maxbl_FG - minbl_fg bits.
3255 *
3256 * If we are at low depth, then we use the NTT to
3257 * compute k*f and k*g.
3258 */
3259 sch = (uint32_t)(scale_k / 31);
3260 scl = (uint32_t)(scale_k % 31);
3261 if (depth <= DEPTH_INT_FG) {
3262 poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
3263 k, sch, scl, logn, t1);
3264 poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
3265 k, sch, scl, logn, t1);
3266 } else {
3267 poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
3268 k, sch, scl, logn);
3269 poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
3270 k, sch, scl, logn);
3271 }
3272
3273 /*
3274 * We compute the new maximum size of (F,G), assuming that
3275 * (f,g) has _maximal_ length (i.e. that reduction is
3276 * "late" instead of "early". We also adjust FGlen
3277 * accordingly.
3278 */
3279 new_maxbl_FG = scale_k + maxbl_fg + 10;
3280 if (new_maxbl_FG < maxbl_FG) {
3281 maxbl_FG = new_maxbl_FG;
3282 if ((int)FGlen * 31 >= maxbl_FG + 31) {
3283 FGlen --;
3284 }
3285 }
3286
3287 /*
3288 * We suppose that scaling down achieves a reduction by
3289 * at least 25 bits per iteration. We stop when we have
3290 * done the loop with an unscaled k.
3291 */
3292 if (scale_k <= 0) {
3293 break;
3294 }
3295 scale_k -= 25;
3296 if (scale_k < 0) {
3297 scale_k = 0;
3298 }
3299 }
3300
3301 /*
3302 * If (F,G) length was lowered below 'slen', then we must take
3303 * care to re-extend the sign.
3304 */
3305 if (FGlen < slen) {
3306 for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
3307 size_t v;
3308 uint32_t sw;
3309
3310 sw = -(Ft[FGlen - 1] >> 30) >> 1;
3311 for (v = FGlen; v < slen; v ++) {
3312 Ft[v] = sw;
3313 }
3314 sw = -(Gt[FGlen - 1] >> 30) >> 1;
3315 for (v = FGlen; v < slen; v ++) {
3316 Gt[v] = sw;
3317 }
3318 }
3319 }
3320
3321 /*
3322 * Compress encoding of all values to 'slen' words (this is the
3323 * expected output format).
3324 */
3325 for (u = 0, x = tmp, y = tmp;
3326 u < (n << 1); u ++, x += slen, y += llen)
3327 {
3328 memmove(x, y, slen * sizeof *y);
3329 }
3330 return 1;
3331 }
3332
3333 /*
3334 * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
3335 * F and G from the previous level should be in the tmp[] array.
3336 *
3337 * Returned value: 1 on success, 0 on error.
3338 */
static int
solve_NTRU_binary_depth1(unsigned logn_top,
	const int8_t *f, const int8_t *g, uint32_t *tmp)
{
	/*
	 * The first half of this function is a copy of the corresponding
	 * part in solve_NTRU_intermediate(), for the reconstruction of
	 * the unreduced F and G. The second half (Babai reduction) is
	 * done differently, because the unreduced F and G fit in 53 bits
	 * of precision, allowing a much simpler process with lower RAM
	 * usage.
	 */
	unsigned depth, logn;
	size_t n_top, n, hn, slen, dlen, llen, u;
	uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
	fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
	uint32_t *x, *y;

	/*
	 * This function is specialized for depth = 1: polynomials here
	 * have degree n = n_top / 2.
	 */
	depth = 1;
	n_top = (size_t)1 << logn_top;
	logn = logn_top - depth;
	n = (size_t)1 << logn;
	hn = n >> 1;

	/*
	 * Equations are:
	 *
	 *   f' = f0^2 - X^2*f1^2
	 *   g' = g0^2 - X^2*g1^2
	 *   F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
	 *   F = F'*(g0 - X*g1)
	 *   G = G'*(f0 - X*f1)
	 *
	 * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
	 * degree N/2 (their odd-indexed coefficients are all zero).
	 */

	/*
	 * slen = size for our input f and g; also size of the reduced
	 *        F and G we return (degree N)
	 *
	 * dlen = size of the F and G obtained from the deeper level
	 *        (degree N/2)
	 *
	 * llen = size for intermediary F and G before reduction (degree N)
	 *
	 * We build our non-reduced F and G as two independent halves each,
	 * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
	 */
	slen = MAX_BL_SMALL[depth];
	dlen = MAX_BL_SMALL[depth + 1];
	llen = MAX_BL_LARGE[depth];

	/*
	 * Fd and Gd are the F and G from the deeper level. Ft and Gt
	 * are the destination arrays for the unreduced F and G.
	 */
	Fd = tmp;
	Gd = Fd + dlen * hn;
	Ft = Gd + dlen * hn;
	Gt = Ft + llen * n;

	/*
	 * We reduce Fd and Gd modulo all the small primes we will need,
	 * and store the values in Ft and Gt.
	 */
	for (u = 0; u < llen; u ++) {
		uint32_t p, p0i, R2, Rx;
		size_t v;
		uint32_t *xs, *ys, *xd, *yd;

		p = PRIMES[u].p;
		p0i = modp_ninv31(p);
		R2 = modp_R2(p, p0i);
		/* Rx = R^dlen mod p, used to fold the multi-word values. */
		Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
		for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
			v < hn;
			v ++, xs += dlen, ys += dlen, xd += llen, yd += llen)
		{
			*xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
			*yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
		}
	}

	/*
	 * Now Fd and Gd are not needed anymore; we can squeeze them out.
	 */
	memmove(tmp, Ft, llen * n * sizeof(uint32_t));
	Ft = tmp;
	memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
	Gt = Ft + llen * n;
	ft = Gt + llen * n;
	gt = ft + slen * n;

	/* t1 = scratch area for NTT tables and per-prime polynomials. */
	t1 = gt + slen * n;

	/*
	 * Compute our F and G modulo sufficiently many small primes.
	 */
	for (u = 0; u < llen; u ++) {
		uint32_t p, p0i, R2;
		uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
		unsigned e;
		size_t v;

		/*
		 * All computations are done modulo p.
		 */
		p = PRIMES[u].p;
		p0i = modp_ninv31(p);
		R2 = modp_R2(p, p0i);

		/*
		 * We recompute things from the source f and g, of full
		 * degree. However, we will need only the n first elements
		 * of the inverse NTT table (igm); the call to modp_mkgm()
		 * below will fill n_top elements in igm[] (thus overflowing
		 * into fx[]) but later code will overwrite these extra
		 * elements.
		 */
		gm = t1;
		igm = gm + n_top;
		fx = igm + n;
		gx = fx + n_top;
		modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);

		/*
		 * Set ft and gt to f and g modulo p, respectively.
		 */
		for (v = 0; v < n_top; v ++) {
			fx[v] = modp_set(f[v], p);
			gx[v] = modp_set(g[v], p);
		}

		/*
		 * Convert to NTT and compute our f and g.
		 */
		modp_NTT2(fx, gm, logn_top, p, p0i);
		modp_NTT2(gx, gm, logn_top, p, p0i);
		/* Each rec_res call halves the degree (field norm). */
		for (e = logn_top; e > logn; e --) {
			modp_poly_rec_res(fx, e, p, p0i, R2);
			modp_poly_rec_res(gx, e, p, p0i, R2);
		}

		/*
		 * From that point onward, we only need tables for
		 * degree n, so we can save some space.
		 */
		if (depth > 0) { /* always true */
			memmove(gm + n, igm, n * sizeof *igm);
			igm = gm + n;
			memmove(igm + n, fx, n * sizeof *ft);
			fx = igm + n;
			memmove(fx + n, gx, n * sizeof *gt);
			gx = fx + n;
		}

		/*
		 * Get F' and G' modulo p and in NTT representation
		 * (they have degree n/2). These values were computed
		 * in a previous step, and stored in Ft and Gt.
		 */
		Fp = gx + n;
		Gp = Fp + hn;
		for (v = 0, x = Ft + u, y = Gt + u;
			v < hn; v ++, x += llen, y += llen)
		{
			Fp[v] = *x;
			Gp[v] = *y;
		}
		modp_NTT2(Fp, gm, logn - 1, p, p0i);
		modp_NTT2(Gp, gm, logn - 1, p, p0i);

		/*
		 * Compute our F and G modulo p.
		 *
		 * Equations are:
		 *
		 *   f'(x^2) = N(f)(x^2) = f * adj(f)
		 *   g'(x^2) = N(g)(x^2) = g * adj(g)
		 *
		 *   f'*G' - g'*F' = q
		 *
		 *   F = F'(x^2) * adj(g)
		 *   G = G'(x^2) * adj(f)
		 *
		 * The NTT representation of f is f(w) for all w which
		 * are roots of phi. In the binary case, as well as in
		 * the ternary case for all depth except the deepest,
		 * these roots can be grouped in pairs (w,-w), and we
		 * then have:
		 *
		 *   f(w) = adj(f)(-w)
		 *   f(-w) = adj(f)(w)
		 *
		 * and w^2 is then a root for phi at the half-degree.
		 *
		 * At the deepest level in the ternary case, this still
		 * holds, in the following sense: the roots of x^2-x+1
		 * are (w,-w^2) (for w^3 = -1, and w != -1), and we
		 * have:
		 *
		 *   f(w) = adj(f)(-w^2)
		 *   f(-w^2) = adj(f)(w)
		 *
		 * In all cases, we can thus compute F and G in NTT
		 * representation by a few simple multiplications.
		 * Moreover, the two roots for each pair are consecutive
		 * in our bit-reversal encoding.
		 */
		for (v = 0, x = Ft + u, y = Gt + u;
			v < hn; v ++, x += (llen << 1), y += (llen << 1))
		{
			uint32_t ftA, ftB, gtA, gtB;
			uint32_t mFp, mGp;

			ftA = fx[(v << 1) + 0];
			ftB = fx[(v << 1) + 1];
			gtA = gx[(v << 1) + 0];
			gtB = gx[(v << 1) + 1];
			/* Multiply by R2 to get Montgomery representation. */
			mFp = modp_montymul(Fp[v], R2, p, p0i);
			mGp = modp_montymul(Gp[v], R2, p, p0i);
			x[0] = modp_montymul(gtB, mFp, p, p0i);
			x[llen] = modp_montymul(gtA, mFp, p, p0i);
			y[0] = modp_montymul(ftB, mGp, p, p0i);
			y[llen] = modp_montymul(ftA, mGp, p, p0i);
		}
		modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
		modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);

		/*
		 * Also save ft and gt (only up to size slen).
		 */
		if (u < slen) {
			modp_iNTT2(fx, igm, logn, p, p0i);
			modp_iNTT2(gx, igm, logn, p, p0i);
			for (v = 0, x = ft + u, y = gt + u;
				v < n; v ++, x += slen, y += slen)
			{
				*x = fx[v];
				*y = gx[v];
			}
		}
	}

	/*
	 * Rebuild f, g, F and G with the CRT. Note that the elements of F
	 * and G are consecutive, and thus can be rebuilt in a single
	 * loop; similarly, the elements of f and g are consecutive.
	 */
	zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
	zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);

	/*
	 * Here starts the Babai reduction, specialized for depth = 1.
	 *
	 * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
	 * are converted to floating point. There is no scaling, and a
	 * single pass is sufficient.
	 */

	/*
	 * Convert F and G into floating point (rt1 and rt2).
	 */
	rt1 = align_fpr(tmp, gt + slen * n);
	rt2 = rt1 + n;
	poly_big_to_fp(rt1, Ft, llen, llen, logn);
	poly_big_to_fp(rt2, Gt, llen, llen, logn);

	/*
	 * Integer representation of F and G is no longer needed, we
	 * can remove it.
	 */
	memmove(tmp, ft, 2 * slen * n * sizeof *ft);
	ft = tmp;
	gt = ft + slen * n;
	rt3 = align_fpr(tmp, gt + slen * n);
	/* Move the fp F and G down right after the integer f and g. */
	memmove(rt3, rt1, 2 * n * sizeof *rt1);
	rt1 = rt3;
	rt2 = rt1 + n;
	rt3 = rt2 + n;
	rt4 = rt3 + n;

	/*
	 * Convert f and g into floating point (rt3 and rt4).
	 */
	poly_big_to_fp(rt3, ft, slen, slen, logn);
	poly_big_to_fp(rt4, gt, slen, slen, logn);

	/*
	 * Remove unneeded ft and gt.
	 */
	memmove(tmp, rt1, 4 * n * sizeof *rt1);
	rt1 = (fpr *)tmp;
	rt2 = rt1 + n;
	rt3 = rt2 + n;
	rt4 = rt3 + n;

	/*
	 * We now have:
	 *   rt1 = F
	 *   rt2 = G
	 *   rt3 = f
	 *   rt4 = g
	 * in that order in RAM. We convert all of them to FFT.
	 */
	Zf(FFT)(rt1, logn);
	Zf(FFT)(rt2, logn);
	Zf(FFT)(rt3, logn);
	Zf(FFT)(rt4, logn);

	/*
	 * Compute:
	 *   rt5 = F*adj(f) + G*adj(g)
	 *   rt6 = 1 / (f*adj(f) + g*adj(g))
	 * (Note that rt6 is half-length.)
	 */
	rt5 = rt4 + n;
	rt6 = rt5 + n;
	Zf(poly_add_muladj_fft)(rt5, rt1, rt2, rt3, rt4, logn);
	Zf(poly_invnorm2_fft)(rt6, rt3, rt4, logn);

	/*
	 * Compute:
	 *   rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
	 */
	Zf(poly_mul_autoadj_fft)(rt5, rt6, logn);

	/*
	 * Compute k as the rounded version of rt5. Check that none of
	 * the values is larger than 2^63-1 (in absolute value)
	 * because that would make the fpr_rint() do something undefined;
	 * note that any out-of-bounds value here implies a failure and
	 * (f,g) will be discarded, so we can make a simple test.
	 */
	Zf(iFFT)(rt5, logn);
	for (u = 0; u < n; u ++) {
		fpr z;

		z = rt5[u];
		if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
			return 0;
		}
		rt5[u] = fpr_of(fpr_rint(z));
	}
	/* k is reused in FFT representation for the multiplications. */
	Zf(FFT)(rt5, logn);

	/*
	 * Subtract k*f from F, and k*g from G.
	 */
	Zf(poly_mul_fft)(rt3, rt5, logn);
	Zf(poly_mul_fft)(rt4, rt5, logn);
	Zf(poly_sub)(rt1, rt3, logn);
	Zf(poly_sub)(rt2, rt4, logn);
	Zf(iFFT)(rt1, logn);
	Zf(iFFT)(rt2, logn);

	/*
	 * Convert back F and G to integers, and return.
	 * The reduced coefficients are expected to fit in one 32-bit
	 * word each (signed, stored as two's complement in uint32_t).
	 */
	Ft = tmp;
	Gt = Ft + n;
	rt3 = align_fpr(tmp, Gt + n);
	memmove(rt3, rt1, 2 * n * sizeof *rt1);
	rt1 = rt3;
	rt2 = rt1 + n;
	for (u = 0; u < n; u ++) {
		Ft[u] = (uint32_t)fpr_rint(rt1[u]);
		Gt[u] = (uint32_t)fpr_rint(rt2[u]);
	}

	return 1;
}
3712
3713 /*
3714 * Solving the NTRU equation, top level. Upon entry, the F and G
3715 * from the previous level should be in the tmp[] array.
3716 *
3717 * Returned value: 1 on success, 0 on error.
3718 */
static int
solve_NTRU_binary_depth0(unsigned logn,
	const int8_t *f, const int8_t *g, uint32_t *tmp)
{
	size_t n, hn, u;
	uint32_t p, p0i, R2;
	uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
	uint32_t *gm, *igm, *ft, *gt;
	fpr *rt2, *rt3;

	n = (size_t)1 << logn;
	hn = n >> 1;

	/*
	 * Equations are:
	 *
	 *   f' = f0^2 - X^2*f1^2
	 *   g' = g0^2 - X^2*g1^2
	 *   F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
	 *   F = F'*(g0 - X*g1)
	 *   G = G'*(f0 - X*f1)
	 *
	 * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
	 * degree N/2 (their odd-indexed coefficients are all zero).
	 *
	 * Everything should fit in 31-bit integers, hence we can just use
	 * the first small prime p = 2147473409.
	 */
	p = PRIMES[0].p;
	p0i = modp_ninv31(p);
	R2 = modp_R2(p, p0i);

	/* Memory layout within tmp[]: F', G' (half degree), then the
	   unreduced F, G (full degree), then the NTT tables. */
	Fp = tmp;
	Gp = Fp + hn;
	ft = Gp + hn;
	gt = ft + n;
	gm = gt + n;
	igm = gm + n;

	modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);

	/*
	 * Convert F' and G' in NTT representation.
	 */
	for (u = 0; u < hn; u ++) {
		Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
		Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
	}
	modp_NTT2(Fp, gm, logn - 1, p, p0i);
	modp_NTT2(Gp, gm, logn - 1, p, p0i);

	/*
	 * Load f and g and convert them to NTT representation.
	 */
	for (u = 0; u < n; u ++) {
		ft[u] = modp_set(f[u], p);
		gt[u] = modp_set(g[u], p);
	}
	modp_NTT2(ft, gm, logn, p, p0i);
	modp_NTT2(gt, gm, logn, p, p0i);

	/*
	 * Build the unreduced F,G in ft and gt.
	 * Roots come in (w,-w) pairs at consecutive indices, sharing
	 * the same half-degree value F'(w^2) / G'(w^2).
	 */
	for (u = 0; u < n; u += 2) {
		uint32_t ftA, ftB, gtA, gtB;
		uint32_t mFp, mGp;

		ftA = ft[u + 0];
		ftB = ft[u + 1];
		gtA = gt[u + 0];
		gtB = gt[u + 1];
		mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
		mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
		ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
		ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
		gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
		gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
	}
	modp_iNTT2(ft, igm, logn, p, p0i);
	modp_iNTT2(gt, igm, logn, p, p0i);

	/* Relocate: F and G now occupy one full-degree word array each
	   at the start of tmp[]; t1 is the scratch right after them. */
	Gp = Fp + n;
	t1 = Gp + n;
	memmove(Fp, ft, 2 * n * sizeof *ft);

	/*
	 * We now need to apply the Babai reduction. At that point,
	 * we have F and G in two n-word arrays.
	 *
	 * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
	 * modulo p, using the NTT. We still move memory around in
	 * order to save RAM.
	 */
	t2 = t1 + n;
	t3 = t2 + n;
	t4 = t3 + n;
	t5 = t4 + n;

	/*
	 * Compute the NTT tables in t1 and t2. We do not keep t2
	 * (we'll recompute it later on).
	 */
	modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);

	/*
	 * Convert F and G to NTT.
	 */
	modp_NTT2(Fp, t1, logn, p, p0i);
	modp_NTT2(Gp, t1, logn, p, p0i);

	/*
	 * Load f and adj(f) in t4 and t5, and convert them to NTT
	 * representation. adj(f) has coefficients f[0], -f[n-1], ...
	 * -f[1], hence the reversed, negated fill of t5.
	 */
	t4[0] = t5[0] = modp_set(f[0], p);
	for (u = 1; u < n; u ++) {
		t4[u] = modp_set(f[u], p);
		t5[n - u] = modp_set(-f[u], p);
	}
	modp_NTT2(t4, t1, logn, p, p0i);
	modp_NTT2(t5, t1, logn, p, p0i);

	/*
	 * Compute F*adj(f) in t2, and f*adj(f) in t3.
	 */
	for (u = 0; u < n; u ++) {
		uint32_t w;

		w = modp_montymul(t5[u], R2, p, p0i);
		t2[u] = modp_montymul(w, Fp[u], p, p0i);
		t3[u] = modp_montymul(w, t4[u], p, p0i);
	}

	/*
	 * Load g and adj(g) in t4 and t5, and convert them to NTT
	 * representation.
	 */
	t4[0] = t5[0] = modp_set(g[0], p);
	for (u = 1; u < n; u ++) {
		t4[u] = modp_set(g[u], p);
		t5[n - u] = modp_set(-g[u], p);
	}
	modp_NTT2(t4, t1, logn, p, p0i);
	modp_NTT2(t5, t1, logn, p, p0i);

	/*
	 * Add G*adj(g) to t2, and g*adj(g) to t3.
	 */
	for (u = 0; u < n; u ++) {
		uint32_t w;

		w = modp_montymul(t5[u], R2, p, p0i);
		t2[u] = modp_add(t2[u],
			modp_montymul(w, Gp[u], p, p0i), p);
		t3[u] = modp_add(t3[u],
			modp_montymul(w, t4[u], p, p0i), p);
	}

	/*
	 * Convert back t2 and t3 to normal representation (normalized
	 * around 0), and then
	 * move them to t1 and t2. We first need to recompute the
	 * inverse table for NTT.
	 */
	modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
	modp_iNTT2(t2, t4, logn, p, p0i);
	modp_iNTT2(t3, t4, logn, p, p0i);
	for (u = 0; u < n; u ++) {
		t1[u] = (uint32_t)modp_norm(t2[u], p);
		t2[u] = (uint32_t)modp_norm(t3[u], p);
	}

	/*
	 * At that point, array contents are:
	 *
	 *   F (NTT representation) (Fp)
	 *   G (NTT representation) (Gp)
	 *   F*adj(f)+G*adj(g) (t1)
	 *   f*adj(f)+g*adj(g) (t2)
	 *
	 * We want to divide t1 by t2. The result is not integral; it
	 * must be rounded. We thus need to use the FFT.
	 */

	/*
	 * Get f*adj(f)+g*adj(g) in FFT representation. Since this
	 * polynomial is auto-adjoint, all its coordinates in FFT
	 * representation are actually real, so we can truncate off
	 * the imaginary parts.
	 */
	rt3 = align_fpr(tmp, t3);
	for (u = 0; u < n; u ++) {
		rt3[u] = fpr_of(((int32_t *)t2)[u]);
	}
	Zf(FFT)(rt3, logn);
	rt2 = align_fpr(tmp, t2);
	/* Keep only the hn real parts of the auto-adjoint polynomial. */
	memmove(rt2, rt3, hn * sizeof *rt3);

	/*
	 * Convert F*adj(f)+G*adj(g) in FFT representation.
	 */
	rt3 = rt2 + hn;
	for (u = 0; u < n; u ++) {
		rt3[u] = fpr_of(((int32_t *)t1)[u]);
	}
	Zf(FFT)(rt3, logn);

	/*
	 * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
	 * its rounded normal representation in t1.
	 */
	Zf(poly_div_autoadj_fft)(rt3, rt2, logn);
	Zf(iFFT)(rt3, logn);
	for (u = 0; u < n; u ++) {
		t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
	}

	/*
	 * RAM contents are now:
	 *
	 *   F (NTT representation) (Fp)
	 *   G (NTT representation) (Gp)
	 *   k (t1)
	 *
	 * We want to compute F-k*f, and G-k*g.
	 */
	t2 = t1 + n;
	t3 = t2 + n;
	t4 = t3 + n;
	t5 = t4 + n;
	modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
	for (u = 0; u < n; u ++) {
		t4[u] = modp_set(f[u], p);
		t5[u] = modp_set(g[u], p);
	}
	modp_NTT2(t1, t2, logn, p, p0i);
	modp_NTT2(t4, t2, logn, p, p0i);
	modp_NTT2(t5, t2, logn, p, p0i);
	for (u = 0; u < n; u ++) {
		uint32_t kw;

		kw = modp_montymul(t1[u], R2, p, p0i);
		Fp[u] = modp_sub(Fp[u],
			modp_montymul(kw, t4[u], p, p0i), p);
		Gp[u] = modp_sub(Gp[u],
			modp_montymul(kw, t5[u], p, p0i), p);
	}
	modp_iNTT2(Fp, t3, logn, p, p0i);
	modp_iNTT2(Gp, t3, logn, p, p0i);
	/* Normalize around 0 so that the one-word signed output format
	   expected by the caller is produced. */
	for (u = 0; u < n; u ++) {
		Fp[u] = (uint32_t)modp_norm(Fp[u], p);
		Gp[u] = (uint32_t)modp_norm(Gp[u], p);
	}

	return 1;
}
3976
3977 /*
3978 * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
3979 * G can be NULL, in which case that value is computed but not returned.
3980 * If any of the coefficients of F and G exceeds lim (in absolute value),
3981 * then 0 is returned.
3982 */
static int
solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
	const int8_t *f, const int8_t *g, int lim, uint32_t *tmp)
{
	size_t n, u;
	uint32_t *ft, *gt, *Ft, *Gt, *gm;
	uint32_t p, p0i, r;
	const small_prime *primes;

	n = MKN(logn);

	/* Start the recursion at the deepest level (degree 1). */
	if (!solve_NTRU_deepest(logn, f, g, tmp)) {
		return 0;
	}

	/*
	 * For logn <= 2, we need to use solve_NTRU_intermediate()
	 * directly, because coefficients are a bit too large and
	 * do not fit the hypotheses in solve_NTRU_binary_depth0().
	 */
	if (logn <= 2) {
		unsigned depth;

		depth = logn;
		while (depth -- > 0) {
			if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
				return 0;
			}
		}
	} else {
		unsigned depth;

		/* Generic path down to depth 2, then the two specialized
		   (faster, lower-RAM) variants for depths 1 and 0. */
		depth = logn;
		while (depth -- > 2) {
			if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
				return 0;
			}
		}
		if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
			return 0;
		}
		if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
			return 0;
		}
	}

	/*
	 * If no buffer has been provided for G, use a temporary one.
	 * (tmp + 2*n is past the solver's F and G outputs.)
	 */
	if (G == NULL) {
		G = (int8_t *)(tmp + 2 * n);
	}

	/*
	 * Final F and G are in fk->tmp, one word per coefficient
	 * (signed value over 31 bits).
	 */
	if (!poly_big_to_small(F, tmp, lim, logn)
		|| !poly_big_to_small(G, tmp + n, lim, logn))
	{
		return 0;
	}

	/*
	 * Verify that the NTRU equation is fulfilled. Since all elements
	 * have short lengths, verifying modulo a small prime p works, and
	 * allows using the NTT.
	 *
	 * We put Gt[] first in tmp[], and process it first, so that it does
	 * not overlap with G[] in case we allocated it ourselves.
	 */
	Gt = tmp;
	ft = Gt + n;
	gt = ft + n;
	Ft = gt + n;
	gm = Ft + n;

	primes = PRIMES;
	p = primes[0].p;
	p0i = modp_ninv31(p);
	/* The inverse NTT table is built into tmp[0..n-1] (Gt's slot)
	   but Gt has not been filled yet at this point; only gm is kept. */
	modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
	for (u = 0; u < n; u ++) {
		Gt[u] = modp_set(G[u], p);
	}
	for (u = 0; u < n; u ++) {
		ft[u] = modp_set(f[u], p);
		gt[u] = modp_set(g[u], p);
		Ft[u] = modp_set(F[u], p);
	}
	modp_NTT2(ft, gm, logn, p, p0i);
	modp_NTT2(gt, gm, logn, p, p0i);
	modp_NTT2(Ft, gm, logn, p, p0i);
	modp_NTT2(Gt, gm, logn, p, p0i);
	/*
	 * r = q/R mod p (q = 12289). Each modp_montymul() below carries
	 * an implicit 1/R factor, so f*G - g*F computed that way equals
	 * q/R when the NTRU equation f*G - g*F = q holds.
	 */
	r = modp_montymul(12289, 1, p, p0i);
	for (u = 0; u < n; u ++) {
		uint32_t z;

		z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
			modp_montymul(gt[u], Ft[u], p, p0i), p);
		if (z != r) {
			return 0;
		}
	}

	return 1;
}
4089
4090 /*
4091 * Generate a random polynomial with a Gaussian distribution. This function
4092 * also makes sure that the resultant of the polynomial with phi is odd.
4093 */
4094 static void
4095 poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn)
4096 {
4097 size_t n, u;
4098 unsigned mod2;
4099
4100 n = MKN(logn);
4101 mod2 = 0;
4102 for (u = 0; u < n; u ++) {
4103 int s;
4104
4105 restart:
4106 s = mkgauss(rng, logn);
4107
4108 /*
4109 * We need the coefficient to fit within -127..+127;
4110 * realistically, this is always the case except for
4111 * the very low degrees (N = 2 or 4), for which there
4112 * is no real security anyway.
4113 */
4114 if (s < -127 || s > 127) {
4115 goto restart;
4116 }
4117
4118 /*
4119 * We need the sum of all coefficients to be 1; otherwise,
4120 * the resultant of the polynomial with X^N+1 will be even,
4121 * and the binary GCD will fail.
4122 */
4123 if (u == n - 1) {
4124 if ((mod2 ^ (unsigned)(s & 1)) == 0) {
4125 goto restart;
4126 }
4127 } else {
4128 mod2 ^= (unsigned)(s & 1);
4129 }
4130 f[u] = (int8_t)s;
4131 }
4132 }
4133
4134 /* see falcon.h */
4135 void
4136 Zf(keygen)(inner_shake256_context *rng,
4137 int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
4138 unsigned logn, uint8_t *tmp)
4139 {
4140 /*
4141 * Algorithm is the following:
4142 *
4143 * - Generate f and g with the Gaussian distribution.
4144 *
4145 * - If either Res(f,phi) or Res(g,phi) is even, try again.
4146 *
4147 * - If ||(f,g)|| is too large, try again.
4148 *
4149 * - If ||B~_{f,g}|| is too large, try again.
4150 *
4151 * - If f is not invertible mod phi mod q, try again.
4152 *
4153 * - Compute h = g/f mod phi mod q.
4154 *
4155 * - Solve the NTRU equation fG - gF = q; if the solving fails,
4156 * try again. Usual failure condition is when Res(f,phi)
4157 * and Res(g,phi) are not prime to each other.
4158 */
4159 size_t n, u;
4160 uint16_t *h2, *tmp2;
4161 RNG_CONTEXT *rc;
4162 #if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1
4163 prng p;
4164 #endif // yyyKG_CHACHA20-
4165
4166 n = MKN(logn);
4167 #if FALCON_KG_CHACHA20 // yyyKG_CHACHA20+1
4168 Zf(prng_init)(&p, rng);
4169 rc = &p;
4170 #else // yyyKG_CHACHA20+0
4171 rc = rng;
4172 #endif // yyyKG_CHACHA20-
4173
4174 /*
4175 * We need to generate f and g randomly, until we find values
4176 * such that the norm of (g,-f), and of the orthogonalized
4177 * vector, are satisfying. The orthogonalized vector is:
4178 * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
4179 * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
4180 *
4181 * In the binary case, coefficients of f and g are generated
4182 * independently of each other, with a discrete Gaussian
4183 * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
4184 * the two vectors have expected norm 1.17*sqrt(q), which is
4185 * also our acceptance bound: we require both vectors to be no
4186 * larger than that (this will be satisfied about 1/4th of the
4187 * time, thus we expect sampling new (f,g) about 4 times for that
4188 * step).
4189 *
4190 * We require that Res(f,phi) and Res(g,phi) are both odd (the
4191 * NTRU equation solver requires it).
4192 */
4193 for (;;) {
4194 fpr *rt1, *rt2, *rt3;
4195 fpr bnorm;
4196 uint32_t normf, normg, norm;
4197 int lim;
4198
4199 /*
4200 * The poly_small_mkgauss() function makes sure
4201 * that the sum of coefficients is 1 modulo 2
4202 * (i.e. the resultant of the polynomial with phi
4203 * will be odd).
4204 */
4205 poly_small_mkgauss(rc, f, logn);
4206 poly_small_mkgauss(rc, g, logn);
4207
4208 /*
4209 * Verify that all coefficients are within the bounds
4210 * defined in max_fg_bits. This is the case with
4211 * overwhelming probability; this guarantees that the
4212 * key will be encodable with FALCON_COMP_TRIM.
4213 */
4214 lim = 1 << (Zf(max_fg_bits)[logn] - 1);
4215 for (u = 0; u < n; u ++) {
4216 /*
4217 * We can use non-CT tests since on any failure
4218 * we will discard f and g.
4219 */
4220 if (f[u] >= lim || f[u] <= -lim
4221 || g[u] >= lim || g[u] <= -lim)
4222 {
4223 lim = -1;
4224 break;
4225 }
4226 }
4227 if (lim < 0) {
4228 continue;
4229 }
4230
4231 /*
4232 * Bound is 1.17*sqrt(q). We compute the squared
4233 * norms. With q = 12289, the squared bound is:
4234 * (1.17^2)* 12289 = 16822.4121
4235 * Since f and g are integral, the squared norm
4236 * of (g,-f) is an integer.
4237 */
4238 normf = poly_small_sqnorm(f, logn);
4239 normg = poly_small_sqnorm(g, logn);
4240 norm = (normf + normg) | -((normf | normg) >> 31);
4241 if (norm >= 16823) {
4242 continue;
4243 }
4244
4245 /*
4246 * We compute the orthogonalized vector norm.
4247 */
4248 rt1 = (fpr *)tmp;
4249 rt2 = rt1 + n;
4250 rt3 = rt2 + n;
4251 poly_small_to_fp(rt1, f, logn);
4252 poly_small_to_fp(rt2, g, logn);
4253 Zf(FFT)(rt1, logn);
4254 Zf(FFT)(rt2, logn);
4255 Zf(poly_invnorm2_fft)(rt3, rt1, rt2, logn);
4256 Zf(poly_adj_fft)(rt1, logn);
4257 Zf(poly_adj_fft)(rt2, logn);
4258 Zf(poly_mulconst)(rt1, fpr_q, logn);
4259 Zf(poly_mulconst)(rt2, fpr_q, logn);
4260 Zf(poly_mul_autoadj_fft)(rt1, rt3, logn);
4261 Zf(poly_mul_autoadj_fft)(rt2, rt3, logn);
4262 Zf(iFFT)(rt1, logn);
4263 Zf(iFFT)(rt2, logn);
4264 bnorm = fpr_zero;
4265 for (u = 0; u < n; u ++) {
4266 bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
4267 bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
4268 }
4269 if (!fpr_lt(bnorm, fpr_bnorm_max)) {
4270 continue;
4271 }
4272
4273 /*
4274 * Compute public key h = g/f mod X^N+1 mod q. If this
4275 * fails, we must restart.
4276 */
4277 if (h == NULL) {
4278 h2 = (uint16_t *)tmp;
4279 tmp2 = h2 + n;
4280 } else {
4281 h2 = h;
4282 tmp2 = (uint16_t *)tmp;
4283 }
4284 if (!Zf(compute_public)(h2, f, g, logn, (uint8_t *)tmp2)) {
4285 continue;
4286 }
4287
4288 /*
4289 * Solve the NTRU equation to get F and G.
4290 */
4291 lim = (1 << (Zf(max_FG_bits)[logn] - 1)) - 1;
4292 if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
4293 continue;
4294 }
4295
4296 /*
4297 * Key pair is generated.
4298 */
4299 break;
4300 }
4301 }