rng.c
1 /*
2 * PRNG and interface to the system RNG.
3 *
4 * ==========================(LICENSE BEGIN)============================
5 *
6 * Copyright (c) 2017-2019 Falcon Project
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining
9 * a copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be
17 * included in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * ===========================(LICENSE END)=============================
28 *
29 * @author Thomas Pornin <thomas.pornin@nccgroup.com>
30 */
31
32 #include <assert.h>
33
34 #include "inner.h"
35
36 // yyyNIST+0 yyyPQCLEAN+0
37 /*
38 * Include relevant system header files. For Win32, this will also need
39 * linking with advapi32.dll, which we trigger with an appropriate #pragma.
40 */
41 #if FALCON_RAND_GETENTROPY
42 #include <unistd.h>
43 #endif
44 #if FALCON_RAND_URANDOM
45 #include <sys/types.h>
46 #if !FALCON_RAND_GETENTROPY
47 #include <unistd.h>
48 #endif
49 #include <fcntl.h>
50 #include <errno.h>
51 #endif
52 #if FALCON_RAND_WIN32
53 #include <windows.h>
54 #include <wincrypt.h>
55 #pragma comment(lib, "advapi32")
56 #endif
57
58 /* see inner.h */
59 int
60 Zf(get_seed)(void *seed, size_t len)
61 {
62 (void)seed;
63 if (len == 0) {
64 return 1;
65 }
66 #if FALCON_RAND_GETENTROPY
67 if (getentropy(seed, len) == 0) {
68 return 1;
69 }
70 #endif
71 #if FALCON_RAND_URANDOM
72 {
73 int f;
74
75 f = open("/dev/urandom", O_RDONLY);
76 if (f >= 0) {
77 while (len > 0) {
78 ssize_t rlen;
79
80 rlen = read(f, seed, len);
81 if (rlen < 0) {
82 if (errno == EINTR) {
83 continue;
84 }
85 break;
86 }
87 seed = (uint8_t *)seed + rlen;
88 len -= (size_t)rlen;
89 }
90 close(f);
91 if (len == 0) {
92 return 1;
93 }
94 }
95 }
96 #endif
97 #if FALCON_RAND_WIN32
98 {
99 HCRYPTPROV hp;
100
101 if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL,
102 CRYPT_VERIFYCONTEXT | CRYPT_SILENT))
103 {
104 BOOL r;
105
106 r = CryptGenRandom(hp, (DWORD)len, seed);
107 CryptReleaseContext(hp, 0);
108 if (r) {
109 return 1;
110 }
111 }
112 }
113 #endif
114 return 0;
115 }
116 // yyyNIST- yyyPQCLEAN-
117
/* see inner.h */
void
Zf(prng_init)(prng *p, inner_shake256_context *src)
{
	/*
	 * Seed the PRNG: extract 56 bytes from the SHAKE256 source to
	 * fill the state (key + IV + block counter, per the layout
	 * described above prng_refill), then produce a first output
	 * buffer so that the PRNG is immediately usable.
	 */
#if FALCON_LE // yyyLE+1
	inner_shake256_extract(src, p->state.d, 56);
#else // yyyLE+0
	/*
	 * To ensure reproducibility for a given seed, we
	 * must enforce little-endian interpretation of
	 * the state words.
	 */
	uint8_t tmp[56];
	uint64_t th, tl;
	int i;

	inner_shake256_extract(src, tmp, 56);
	/* Decode the 14 state words as little-endian 32-bit values. */
	for (i = 0; i < 14; i ++) {
		uint32_t w;

		w = (uint32_t)tmp[(i << 2) + 0]
			| ((uint32_t)tmp[(i << 2) + 1] << 8)
			| ((uint32_t)tmp[(i << 2) + 2] << 16)
			| ((uint32_t)tmp[(i << 2) + 3] << 24);
		*(uint32_t *)(p->state.d + (i << 2)) = w;
	}
	/*
	 * Repack the last two 32-bit words (bytes 48..55) as a single
	 * native 64-bit counter, low word first, so that prng_refill's
	 * 64-bit counter reads/writes see a little-endian-consistent
	 * value on big-endian machines too.
	 */
	tl = *(uint32_t *)(p->state.d + 48);
	th = *(uint32_t *)(p->state.d + 52);
	*(uint64_t *)(p->state.d + 48) = tl + (th << 32);
#endif // yyyLE-
	Zf(prng_refill)(p);
}
150
151 /*
152 * PRNG based on ChaCha20.
153 *
154 * State consists in key (32 bytes) then IV (16 bytes) and block counter
155 * (8 bytes). Normally, we should not care about local endianness (this
156 * is for a PRNG), but for the NIST competition we need reproducible KAT
157 * vectors that work across architectures, so we enforce little-endian
158 * interpretation where applicable. Moreover, output words are "spread
159 * out" over the output buffer with the interleaving pattern that is
160 * naturally obtained from the AVX2 implementation that runs eight
161 * ChaCha20 instances in parallel.
162 *
163 * The block counter is XORed into the first 8 bytes of the IV.
164 */
/*
 * Refill the PRNG output buffer: compute eight ChaCha20 blocks (for
 * counter values cc to cc+7) from the current state, write the 512
 * output bytes into p->buf.d, advance the stored counter by 8, and
 * reset the read pointer p->ptr to the start of the buffer.
 */
TARGET_AVX2
void
Zf(prng_refill)(prng *p)
{
#if FALCON_AVX2 // yyyAVX2+1

	/* ChaCha20 constant words ("expand 32-byte k"). */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	uint64_t cc;
	size_t u;
	int i;
	uint32_t *sw;
	union {
		uint32_t w[16];
		__m256i y[2]; /* for alignment */
	} t;
	__m256i state[16], init[16];

	sw = (uint32_t *)p->state.d;

	/*
	 * XOR next counter values into state. Lane u of each vector holds
	 * ChaCha20 instance u; t.w[u] is the low counter word for lane u,
	 * t.w[u + 8] the high word.
	 */
	cc = *(uint64_t *)(p->state.d + 48);
	for (u = 0; u < 8; u ++) {
		t.w[u] = (uint32_t)(cc + u);
		t.w[u + 8] = (uint32_t)((cc + u) >> 32);
	}
	*(uint64_t *)(p->state.d + 48) = cc + 8;

	/*
	 * Load state. Each of the 16 ChaCha20 state words is broadcast
	 * over the eight lanes; state words 14 and 15 additionally get
	 * the per-lane counter values XORed in. 'init' keeps a copy for
	 * the final feed-forward addition.
	 */
	for (u = 0; u < 4; u ++) {
		state[u] = init[u] =
			_mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u]));
	}
	for (u = 0; u < 10; u ++) {
		state[u + 4] = init[u + 4] =
			_mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u]));
	}
	state[14] = init[14] = _mm256_xor_si256(
		_mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])),
		_mm256_loadu_si256((__m256i *)&t.w[0]));
	state[15] = init[15] = _mm256_xor_si256(
		_mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])),
		_mm256_loadu_si256((__m256i *)&t.w[8]));

	/*
	 * Do all rounds: 10 iterations of 4 column quarter-rounds plus
	 * 4 diagonal quarter-rounds, i.e. 20 ChaCha20 rounds.
	 */
	for (i = 0; i < 10; i ++) {

#define QROUND(a, b, c, d) do { \
		state[a] = _mm256_add_epi32(state[a], state[b]); \
		state[d] = _mm256_xor_si256(state[d], state[a]); \
		state[d] = _mm256_or_si256( \
			_mm256_slli_epi32(state[d], 16), \
			_mm256_srli_epi32(state[d], 16)); \
		state[c] = _mm256_add_epi32(state[c], state[d]); \
		state[b] = _mm256_xor_si256(state[b], state[c]); \
		state[b] = _mm256_or_si256( \
			_mm256_slli_epi32(state[b], 12), \
			_mm256_srli_epi32(state[b], 20)); \
		state[a] = _mm256_add_epi32(state[a], state[b]); \
		state[d] = _mm256_xor_si256(state[d], state[a]); \
		state[d] = _mm256_or_si256( \
			_mm256_slli_epi32(state[d], 8), \
			_mm256_srli_epi32(state[d], 24)); \
		state[c] = _mm256_add_epi32(state[c], state[d]); \
		state[b] = _mm256_xor_si256(state[b], state[c]); \
		state[b] = _mm256_or_si256( \
			_mm256_slli_epi32(state[b], 7), \
			_mm256_srli_epi32(state[b], 25)); \
	} while (0)

		QROUND( 0, 4, 8, 12);
		QROUND( 1, 5, 9, 13);
		QROUND( 2, 6, 10, 14);
		QROUND( 3, 7, 11, 15);
		QROUND( 0, 5, 10, 15);
		QROUND( 1, 6, 11, 12);
		QROUND( 2, 7, 8, 13);
		QROUND( 3, 4, 9, 14);

#undef QROUND

	}

	/*
	 * Add initial state back and encode the result in the destination
	 * buffer. We can dump the AVX2 values "as is" because the non-AVX2
	 * code uses a compatible order of values.
	 */
	for (u = 0; u < 16; u ++) {
		_mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
			_mm256_add_epi32(state[u], init[u]));
	}

#else // yyyAVX2+0

	/* ChaCha20 constant words ("expand 32-byte k"). */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	uint64_t cc;
	size_t u;

	/*
	 * State uses local endianness. Only the output bytes must be
	 * converted to little endian (if used on a big-endian machine).
	 */
	cc = *(uint64_t *)(p->state.d + 48);
	for (u = 0; u < 8; u ++) {
		uint32_t state[16];
		size_t v;
		int i;

		memcpy(&state[0], CW, sizeof CW);
		memcpy(&state[4], p->state.d, 48);
		/* XOR the 64-bit block counter into state words 14 and 15. */
		state[14] ^= (uint32_t)cc;
		state[15] ^= (uint32_t)(cc >> 32);
		/* 10 double-rounds = 20 ChaCha20 rounds. */
		for (i = 0; i < 10; i ++) {

#define QROUND(a, b, c, d) do { \
		state[a] += state[b]; \
		state[d] ^= state[a]; \
		state[d] = (state[d] << 16) | (state[d] >> 16); \
		state[c] += state[d]; \
		state[b] ^= state[c]; \
		state[b] = (state[b] << 12) | (state[b] >> 20); \
		state[a] += state[b]; \
		state[d] ^= state[a]; \
		state[d] = (state[d] << 8) | (state[d] >> 24); \
		state[c] += state[d]; \
		state[b] ^= state[c]; \
		state[b] = (state[b] << 7) | (state[b] >> 25); \
	} while (0)

			QROUND( 0, 4, 8, 12);
			QROUND( 1, 5, 9, 13);
			QROUND( 2, 6, 10, 14);
			QROUND( 3, 7, 11, 15);
			QROUND( 0, 5, 10, 15);
			QROUND( 1, 6, 11, 12);
			QROUND( 2, 7, 8, 13);
			QROUND( 3, 4, 9, 14);

#undef QROUND

		}

		/* Feed-forward: add the initial state words back in. */
		for (v = 0; v < 4; v ++) {
			state[v] += CW[v];
		}
		for (v = 4; v < 14; v ++) {
			state[v] += ((uint32_t *)p->state.d)[v - 4];
		}
		state[14] += ((uint32_t *)p->state.d)[10]
			^ (uint32_t)cc;
		state[15] += ((uint32_t *)p->state.d)[11]
			^ (uint32_t)(cc >> 32);
		cc ++;

		/*
		 * We mimic the interleaving that is used in the AVX2
		 * implementation.
		 */
		for (v = 0; v < 16; v ++) {
#if FALCON_LE // yyyLE+1
			((uint32_t *)p->buf.d)[u + (v << 3)] = state[v];
#else // yyyLE+0
			p->buf.d[(u << 2) + (v << 5) + 0] =
				(uint8_t)state[v];
			p->buf.d[(u << 2) + (v << 5) + 1] =
				(uint8_t)(state[v] >> 8);
			p->buf.d[(u << 2) + (v << 5) + 2] =
				(uint8_t)(state[v] >> 16);
			p->buf.d[(u << 2) + (v << 5) + 3] =
				(uint8_t)(state[v] >> 24);
#endif // yyyLE-
		}
	}
	*(uint64_t *)(p->state.d + 48) = cc;

#endif // yyyAVX2-

	p->ptr = 0;
}
356
357 /* see inner.h */
358 void
359 Zf(prng_get_bytes)(prng *p, void *dst, size_t len)
360 {
361 uint8_t *buf;
362
363 buf = dst;
364 while (len > 0) {
365 size_t clen;
366
367 clen = (sizeof p->buf.d) - p->ptr;
368 if (clen > len) {
369 clen = len;
370 }
371 memcpy(buf, p->buf.d, clen);
372 buf += clen;
373 len -= clen;
374 p->ptr += clen;
375 if (p->ptr == sizeof p->buf.d) {
376 Zf(prng_refill)(p);
377 }
378 }
379 }