Falcon source files (reference implementation)


rng.c

    1 /*
    2  * PRNG and interface to the system RNG.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017-2019  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
   30  */
   31 
   32 #include <assert.h>
   33 
   34 #include "inner.h"
   35 
   36 // yyyNIST+0 yyyPQCLEAN+0
   37 /*
   38  * Include relevant system header files. For Win32, this will also need
   39  * linking with advapi32.dll, which we trigger with an appropriate #pragma.
   40  */
   41 #if FALCON_RAND_GETENTROPY
   42 #include <unistd.h>
   43 #endif
   44 #if FALCON_RAND_URANDOM
   45 #include <sys/types.h>
   46 #if !FALCON_RAND_GETENTROPY
   47 #include <unistd.h>
   48 #endif
   49 #include <fcntl.h>
   50 #include <errno.h>
   51 #endif
   52 #if FALCON_RAND_WIN32
   53 #include <windows.h>
   54 #include <wincrypt.h>
   55 #pragma comment(lib, "advapi32")
   56 #endif
   57 
   58 /* see inner.h */
   59 int
   60 Zf(get_seed)(void *seed, size_t len)
   61 {
   62         (void)seed;
   63         if (len == 0) {
   64                 return 1;
   65         }
   66 #if FALCON_RAND_GETENTROPY
   67         if (getentropy(seed, len) == 0) {
   68                 return 1;
   69         }
   70 #endif
   71 #if FALCON_RAND_URANDOM
   72         {
   73                 int f;
   74 
   75                 f = open("/dev/urandom", O_RDONLY);
   76                 if (f >= 0) {
   77                         while (len > 0) {
   78                                 ssize_t rlen;
   79 
   80                                 rlen = read(f, seed, len);
   81                                 if (rlen < 0) {
   82                                         if (errno == EINTR) {
   83                                                 continue;
   84                                         }
   85                                         break;
   86                                 }
   87                                 seed = (uint8_t *)seed + rlen;
   88                                 len -= (size_t)rlen;
   89                         }
   90                         close(f);
   91                         if (len == 0) {
   92                                 return 1;
   93                         }
   94                 }
   95         }
   96 #endif
   97 #if FALCON_RAND_WIN32
   98         {
   99                 HCRYPTPROV hp;
  100 
  101                 if (CryptAcquireContext(&hp, 0, 0, PROV_RSA_FULL,
  102                         CRYPT_VERIFYCONTEXT | CRYPT_SILENT))
  103                 {
  104                         BOOL r;
  105 
  106                         r = CryptGenRandom(hp, (DWORD)len, seed);
  107                         CryptReleaseContext(hp, 0);
  108                         if (r) {
  109                                 return 1;
  110                         }
  111                 }
  112         }
  113 #endif
  114         return 0;
  115 }
  116 // yyyNIST- yyyPQCLEAN-
  117 
  118 /* see inner.h */
  119 void
  120 Zf(prng_init)(prng *p, inner_shake256_context *src)
  121 {
  122 #if FALCON_LE  // yyyLE+1
  123         inner_shake256_extract(src, p->state.d, 56);
  124 #else  // yyyLE+0
  125         /*
  126          * To ensure reproducibility for a given seed, we
  127          * must enforce little-endian interpretation of
  128          * the state words.
  129          */
  130         uint8_t tmp[56];
  131         uint64_t th, tl;
  132         int i;
  133 
  134         inner_shake256_extract(src, tmp, 56);
  135         for (i = 0; i < 14; i ++) {
  136                 uint32_t w;
  137 
  138                 w = (uint32_t)tmp[(i << 2) + 0]
  139                         | ((uint32_t)tmp[(i << 2) + 1] << 8)
  140                         | ((uint32_t)tmp[(i << 2) + 2] << 16)
  141                         | ((uint32_t)tmp[(i << 2) + 3] << 24);
  142                 *(uint32_t *)(p->state.d + (i << 2)) = w;
  143         }
  144         tl = *(uint32_t *)(p->state.d + 48);
  145         th = *(uint32_t *)(p->state.d + 52);
  146         *(uint64_t *)(p->state.d + 48) = tl + (th << 32);
  147 #endif  // yyyLE-
  148         Zf(prng_refill)(p);
  149 }
  150 
  151 /*
  152  * PRNG based on ChaCha20.
  153  *
  154  * State consists in key (32 bytes) then IV (16 bytes) and block counter
  155  * (8 bytes). Normally, we should not care about local endianness (this
  156  * is for a PRNG), but for the NIST competition we need reproducible KAT
  157  * vectors that work across architectures, so we enforce little-endian
  158  * interpretation where applicable. Moreover, output words are "spread
  159  * out" over the output buffer with the interleaving pattern that is
  160  * naturally obtained from the AVX2 implementation that runs eight
  161  * ChaCha20 instances in parallel.
  162  *
  163  * The block counter is XORed into the first 8 bytes of the IV.
  164  */
  165 TARGET_AVX2
  166 void
  167 Zf(prng_refill)(prng *p)
  168 {
  169 #if FALCON_AVX2 // yyyAVX2+1
  170 
  171         static const uint32_t CW[] = {
  172                 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
  173         };
  174 
  175         uint64_t cc;
  176         size_t u;
  177         int i;
  178         uint32_t *sw;
  179         union {
  180                 uint32_t w[16];
  181                 __m256i y[2];  /* for alignment */
  182         } t;
  183         __m256i state[16], init[16];
  184 
  185         sw = (uint32_t *)p->state.d;
  186 
  187         /*
  188          * XOR next counter values into state.
  189          */
  190         cc = *(uint64_t *)(p->state.d + 48);
  191         for (u = 0; u < 8; u ++) {
  192                 t.w[u] = (uint32_t)(cc + u);
  193                 t.w[u + 8] = (uint32_t)((cc + u) >> 32);
  194         }
  195         *(uint64_t *)(p->state.d + 48) = cc + 8;
  196 
  197         /*
  198          * Load state.
  199          */
  200         for (u = 0; u < 4; u ++) {
  201                 state[u] = init[u] =
  202                         _mm256_broadcastd_epi32(_mm_cvtsi32_si128(CW[u]));
  203         }
  204         for (u = 0; u < 10; u ++) {
  205                 state[u + 4] = init[u + 4] =
  206                         _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[u]));
  207         }
  208         state[14] = init[14] = _mm256_xor_si256(
  209                 _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[10])),
  210                 _mm256_loadu_si256((__m256i *)&t.w[0]));
  211         state[15] = init[15] = _mm256_xor_si256(
  212                 _mm256_broadcastd_epi32(_mm_cvtsi32_si128(sw[11])),
  213                 _mm256_loadu_si256((__m256i *)&t.w[8]));
  214 
  215         /*
  216          * Do all rounds.
  217          */
  218         for (i = 0; i < 10; i ++) {
  219 
  220 #define QROUND(a, b, c, d)   do { \
  221                 state[a] = _mm256_add_epi32(state[a], state[b]); \
  222                 state[d] = _mm256_xor_si256(state[d], state[a]); \
  223                 state[d] = _mm256_or_si256( \
  224                         _mm256_slli_epi32(state[d], 16), \
  225                         _mm256_srli_epi32(state[d], 16)); \
  226                 state[c] = _mm256_add_epi32(state[c], state[d]); \
  227                 state[b] = _mm256_xor_si256(state[b], state[c]); \
  228                 state[b] = _mm256_or_si256( \
  229                         _mm256_slli_epi32(state[b], 12), \
  230                         _mm256_srli_epi32(state[b], 20)); \
  231                 state[a] = _mm256_add_epi32(state[a], state[b]); \
  232                 state[d] = _mm256_xor_si256(state[d], state[a]); \
  233                 state[d] = _mm256_or_si256( \
  234                         _mm256_slli_epi32(state[d],  8), \
  235                         _mm256_srli_epi32(state[d], 24)); \
  236                 state[c] = _mm256_add_epi32(state[c], state[d]); \
  237                 state[b] = _mm256_xor_si256(state[b], state[c]); \
  238                 state[b] = _mm256_or_si256( \
  239                         _mm256_slli_epi32(state[b], 7), \
  240                         _mm256_srli_epi32(state[b], 25)); \
  241         } while (0)
  242 
  243                 QROUND( 0,  4,  8, 12);
  244                 QROUND( 1,  5,  9, 13);
  245                 QROUND( 2,  6, 10, 14);
  246                 QROUND( 3,  7, 11, 15);
  247                 QROUND( 0,  5, 10, 15);
  248                 QROUND( 1,  6, 11, 12);
  249                 QROUND( 2,  7,  8, 13);
  250                 QROUND( 3,  4,  9, 14);
  251 
  252 #undef QROUND
  253 
  254         }
  255 
  256         /*
  257          * Add initial state back and encode the result in the destination
  258          * buffer. We can dump the AVX2 values "as is" because the non-AVX2
  259          * code uses a compatible order of values.
  260          */
  261         for (u = 0; u < 16; u ++) {
  262                 _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
  263                         _mm256_add_epi32(state[u], init[u]));
  264         }
  265 
  266 #else // yyyAVX2+0
  267 
  268         static const uint32_t CW[] = {
  269                 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
  270         };
  271 
  272         uint64_t cc;
  273         size_t u;
  274 
  275         /*
  276          * State uses local endianness. Only the output bytes must be
  277          * converted to little endian (if used on a big-endian machine).
  278          */
  279         cc = *(uint64_t *)(p->state.d + 48);
  280         for (u = 0; u < 8; u ++) {
  281                 uint32_t state[16];
  282                 size_t v;
  283                 int i;
  284 
  285                 memcpy(&state[0], CW, sizeof CW);
  286                 memcpy(&state[4], p->state.d, 48);
  287                 state[14] ^= (uint32_t)cc;
  288                 state[15] ^= (uint32_t)(cc >> 32);
  289                 for (i = 0; i < 10; i ++) {
  290 
  291 #define QROUND(a, b, c, d)   do { \
  292                 state[a] += state[b]; \
  293                 state[d] ^= state[a]; \
  294                 state[d] = (state[d] << 16) | (state[d] >> 16); \
  295                 state[c] += state[d]; \
  296                 state[b] ^= state[c]; \
  297                 state[b] = (state[b] << 12) | (state[b] >> 20); \
  298                 state[a] += state[b]; \
  299                 state[d] ^= state[a]; \
  300                 state[d] = (state[d] <<  8) | (state[d] >> 24); \
  301                 state[c] += state[d]; \
  302                 state[b] ^= state[c]; \
  303                 state[b] = (state[b] <<  7) | (state[b] >> 25); \
  304         } while (0)
  305 
  306                         QROUND( 0,  4,  8, 12);
  307                         QROUND( 1,  5,  9, 13);
  308                         QROUND( 2,  6, 10, 14);
  309                         QROUND( 3,  7, 11, 15);
  310                         QROUND( 0,  5, 10, 15);
  311                         QROUND( 1,  6, 11, 12);
  312                         QROUND( 2,  7,  8, 13);
  313                         QROUND( 3,  4,  9, 14);
  314 
  315 #undef QROUND
  316 
  317                 }
  318 
  319                 for (v = 0; v < 4; v ++) {
  320                         state[v] += CW[v];
  321                 }
  322                 for (v = 4; v < 14; v ++) {
  323                         state[v] += ((uint32_t *)p->state.d)[v - 4];
  324                 }
  325                 state[14] += ((uint32_t *)p->state.d)[10]
  326                         ^ (uint32_t)cc;
  327                 state[15] += ((uint32_t *)p->state.d)[11]
  328                         ^ (uint32_t)(cc >> 32);
  329                 cc ++;
  330 
  331                 /*
  332                  * We mimic the interleaving that is used in the AVX2
  333                  * implementation.
  334                  */
  335                 for (v = 0; v < 16; v ++) {
  336 #if FALCON_LE  // yyyLE+1
  337                         ((uint32_t *)p->buf.d)[u + (v << 3)] = state[v];
  338 #else  // yyyLE+0
  339                         p->buf.d[(u << 2) + (v << 5) + 0] =
  340                                 (uint8_t)state[v];
  341                         p->buf.d[(u << 2) + (v << 5) + 1] =
  342                                 (uint8_t)(state[v] >> 8);
  343                         p->buf.d[(u << 2) + (v << 5) + 2] =
  344                                 (uint8_t)(state[v] >> 16);
  345                         p->buf.d[(u << 2) + (v << 5) + 3] =
  346                                 (uint8_t)(state[v] >> 24);
  347 #endif  // yyyLE-
  348                 }
  349         }
  350         *(uint64_t *)(p->state.d + 48) = cc;
  351 
  352 #endif // yyyAVX2-
  353 
  354         p->ptr = 0;
  355 }
  356 
  357 /* see inner.h */
  358 void
  359 Zf(prng_get_bytes)(prng *p, void *dst, size_t len)
  360 {
  361         uint8_t *buf;
  362 
  363         buf = dst;
  364         while (len > 0) {
  365                 size_t clen;
  366 
  367                 clen = (sizeof p->buf.d) - p->ptr;
  368                 if (clen > len) {
  369                         clen = len;
  370                 }
  371                 memcpy(buf, p->buf.d, clen);
  372                 buf += clen;
  373                 len -= clen;
  374                 p->ptr += clen;
  375                 if (p->ptr == sizeof p->buf.d) {
  376                         Zf(prng_refill)(p);
  377                 }
  378         }
  379 }