Falcon source files (reference implementation)


shake.c

    1 /*
    2  * SHAKE implementation.
    3  *
    4  * ==========================(LICENSE BEGIN)============================
    5  *
    6  * Copyright (c) 2017-2019  Falcon Project
    7  *
    8  * Permission is hereby granted, free of charge, to any person obtaining
    9  * a copy of this software and associated documentation files (the
   10  * "Software"), to deal in the Software without restriction, including
   11  * without limitation the rights to use, copy, modify, merge, publish,
   12  * distribute, sublicense, and/or sell copies of the Software, and to
   13  * permit persons to whom the Software is furnished to do so, subject to
   14  * the following conditions:
   15  *
   16  * The above copyright notice and this permission notice shall be
   17  * included in all copies or substantial portions of the Software.
   18  *
   19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   22  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   23  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   26  *
   27  * ===========================(LICENSE END)=============================
   28  *
   29  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
   30  */
   31 
   32 #include <string.h>
   33 
   34 #include "inner.h"
   35 
   36 #if FALCON_ASM_CORTEXM4  // yyyASM_CORTEXM4+1
   37 
   38 __attribute__((naked))
   39 static void
   40 process_block(uint64_t *A __attribute__((unused)))
   41 {
              /*
               * Cortex-M4 assembly variant of process_block(), compiled
               * only when FALCON_ASM_CORTEXM4 is nonzero. It runs 24
               * rounds, in place, over the 25 64-bit words of the state
               * A[].
               *
               * The function is naked: the assembly below is the whole
               * function, including register save/restore and the return
               * (the saved lr is popped straight into pc). The parameter
               * is tagged unused because the C code never names it; the
               * assembly takes the state pointer from r0.
               * NOTE(review): this relies on the first argument arriving
               * in r0, as in the ARM procedure call standard -- the file
               * itself does not state the ABI.
               *
               * Registers: r1-r8, r10-r12 and lr are pushed on entry and
               * popped on exit; r0 is advanced and rewound inside every
               * round and ends each round pointing at A[0] again; r9 is
               * never referenced. Flag-setting instruction forms are used
               * throughout, so the condition flags are modified.
               *
               * Stack frame (232 bytes, below the pushed registers):
               *   sp+0    8 * round index (0, 8, ..., 184)
               *   sp+4    first a scratch slot for the high half of
               *           xor(A[5*i+4]); then t2, t3, t4 (sp+4 .. sp+27)
               *   sp+32   25 entries of 8 bytes: the XORed and rotated
               *           words of the current round (layout listed in
               *           the assembly comment before KHI_LOAD)
               *
               * Each 64-bit word is handled as two 32-bit halves; the half
               * at the lower address is treated as the low 32 bits.
               *
               * One round:
               *   1. XOR together A[c], A[c+5], A[c+10], A[c+15], A[c+20]
               *      for each c in 0..4, and derive t0..t4 from those five
               *      values (formulas are in the assembly comments).
               *   2. XOR t(c) into every word A[5*i+c], rotate the result
               *      left by that word's own amount, and store it in the
               *      stack buffer.
               *   3. KHI_LOAD / KHI_STEP combine five buffered words at a
               *      time and write the 25 results back to A[0..24].
               *   4. XOR the round constant for this round (taken from
               *      .process_block_RC) into A[0].
               *
               * INVERT_WORDS complements A[1], A[2], A[8], A[12], A[17]
               * and A[20] once before the first round and once after the
               * last, so the caller sees the state in its usual form
               * while the rounds work on the alternate representation.
               */
   42         __asm__ (
   43         "push   { r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, r12, lr }\n\t"
   44         "sub    sp, sp, #232\n\t"
   45         "\n\t"
   46         "@ Invert some words (alternate internal representation, which\n\t"
   47         "@ saves some operations).\n\t"
   48         "\n\t"
   49 
              /*
               * INVERT_WORDS: assembly fragment that complements, in
               * place, the words at byte offsets 8 and 16 (A[1], A[2]),
               * 64 (A[8]), 96 (A[12]), 136 (A[17]) and 160 (A[20]) from
               * r0. Uses r1 as address register and r2-r5 as scratch.
               */
   50 #define INVERT_WORDS \
   51         "@ Invert A[1] and A[2].\n\t" \
   52         "adds   r1, r0, #8\n\t" \
   53         "ldm    r1, { r2, r3, r4, r5 }\n\t" \
   54         "mvns   r2, r2\n\t" \
   55         "mvns   r3, r3\n\t" \
   56         "mvns   r4, r4\n\t" \
   57         "mvns   r5, r5\n\t" \
   58         "stm    r1!, { r2, r3, r4, r5 }\n\t" \
   59         "@ Invert A[8]\n\t" \
   60         "adds   r1, r0, #64\n\t" \
   61         "ldm    r1, { r2, r3 }\n\t" \
   62         "mvns   r2, r2\n\t" \
   63         "mvns   r3, r3\n\t" \
   64         "stm    r1!, { r2, r3 }\n\t" \
   65         "@ Invert A[12]\n\t" \
   66         "adds   r1, r0, #96\n\t" \
   67         "ldm    r1, { r2, r3 }\n\t" \
   68         "mvns   r2, r2\n\t" \
   69         "mvns   r3, r3\n\t" \
   70         "stm    r1!, { r2, r3 }\n\t" \
   71         "@ Invert A[17]\n\t" \
   72         "adds   r1, r0, #136\n\t" \
   73         "ldm    r1, { r2, r3 }\n\t" \
   74         "mvns   r2, r2\n\t" \
   75         "mvns   r3, r3\n\t" \
   76         "stm    r1!, { r2, r3 }\n\t" \
   77         "@ Invert A[20]\n\t" \
   78         "adds   r1, r0, #160\n\t" \
   79         "ldm    r1, { r2, r3 }\n\t" \
   80         "mvns   r2, r2\n\t" \
   81         "mvns   r3, r3\n\t" \
   82         "stm    r1!, { r2, r3 }\n\t" \
   83         "\n\t"
   84 
   85         INVERT_WORDS
   86 
   87         "@ Do 24 rounds. Each loop iteration performs one rounds. We\n\t"
   88         "@ keep eight times the current round counter in [sp] (i.e.\n\t"
   89         "@ a multiple of 8, from 0 to 184).\n\t"
   90         "\n\t"
   91         "eors   r1, r1\n\t"
   92         "str    r1, [sp, #0]\n\t"
   93 ".process_block_loop:\n\t"
   94         "\n\t"
   95         "@ xor(A[5*i+0]) -> r1:r2\n\t"
   96         "@ xor(A[5*i+1]) -> r3:r4\n\t"
   97         "@ xor(A[5*i+2]) -> r5:r6\n\t"
   98         "@ xor(A[5*i+3]) -> r7:r8\n\t"
   99         "@ xor(A[5*i+4]) -> r10:r11\n\t"
  100         "ldm    r0!, { r1, r2, r3, r4, r5, r6, r7, r8 }\n\t"
  101         "adds   r0, #8\n\t"
  102         "ldm    r0!, { r10, r11, r12 }\n\t"
  103         "eors   r1, r10\n\t"
  104         "eors   r2, r11\n\t"
  105         "eors   r3, r12\n\t"
  106         "ldm    r0!, { r10, r11, r12 }\n\t"
  107         "eors   r4, r10\n\t"
  108         "eors   r5, r11\n\t"
  109         "eors   r6, r12\n\t"
  110         "ldm    r0!, { r10, r11 }\n\t"
  111         "eors   r7, r10\n\t"
  112         "eors   r8, r11\n\t"
  113         "adds   r0, #8\n\t"
  114         "ldm    r0!, { r10, r11, r12 }\n\t"
  115         "eors   r1, r10\n\t"
  116         "eors   r2, r11\n\t"
  117         "eors   r3, r12\n\t"
  118         "ldm    r0!, { r10, r11, r12 }\n\t"
  119         "eors   r4, r10\n\t"
  120         "eors   r5, r11\n\t"
  121         "eors   r6, r12\n\t"
  122         "ldm    r0!, { r10, r11 }\n\t"
  123         "eors   r7, r10\n\t"
  124         "eors   r8, r11\n\t"
  125         "adds   r0, #8\n\t"
  126         "ldm    r0!, { r10, r11, r12 }\n\t"
  127         "eors   r1, r10\n\t"
  128         "eors   r2, r11\n\t"
  129         "eors   r3, r12\n\t"
  130         "ldm    r0!, { r10, r11, r12 }\n\t"
  131         "eors   r4, r10\n\t"
  132         "eors   r5, r11\n\t"
  133         "eors   r6, r12\n\t"
  134         "ldm    r0!, { r10, r11 }\n\t"
  135         "eors   r7, r10\n\t"
  136         "eors   r8, r11\n\t"
  137         "adds   r0, #8\n\t"
  138         "ldm    r0!, { r10, r11, r12 }\n\t"
  139         "eors   r1, r10\n\t"
  140         "eors   r2, r11\n\t"
  141         "eors   r3, r12\n\t"
  142         "ldm    r0!, { r10, r11, r12 }\n\t"
  143         "eors   r4, r10\n\t"
  144         "eors   r5, r11\n\t"
  145         "eors   r6, r12\n\t"
  146         "ldm    r0!, { r10, r11 }\n\t"
  147         "eors   r7, r10\n\t"
  148         "eors   r8, r11\n\t"
  149         "ldm    r0!, { r10, r11 }\n\t"
  150         "subs   r0, #200\n\t"
  151         "ldr    r12, [r0, #32]\n\t"
  152         "eors   r10, r12\n\t"
  153         "ldr    r12, [r0, #36]\n\t"
  154         "eors   r11, r12\n\t"
  155         "ldr    r12, [r0, #72]\n\t"
  156         "eors   r10, r12\n\t"
  157         "ldr    r12, [r0, #76]\n\t"
  158         "eors   r11, r12\n\t"
  159         "ldr    r12, [r0, #112]\n\t"
  160         "eors   r10, r12\n\t"
  161         "ldr    r12, [r0, #116]\n\t"
  162         "eors   r11, r12\n\t"
  163         "ldr    r12, [r0, #152]\n\t"
  164         "eors   r10, r12\n\t"
  165         "ldr    r12, [r0, #156]\n\t"
  166         "eors   r11, r12\n\t"
  167         "\n\t"
  168         "@ t0 = xor(A[5*i+4]) ^ rotl1(xor(A[5*i+1])) -> r10:r11\n\t"
  169         "@ t1 = xor(A[5*i+0]) ^ rotl1(xor(A[5*i+2])) -> r1:r2\n\t"
  170         "@ t2 = xor(A[5*i+1]) ^ rotl1(xor(A[5*i+3])) -> r3:r4\n\t"
  171         "@ t3 = xor(A[5*i+2]) ^ rotl1(xor(A[5*i+4])) -> r5:r6\n\t"
  172         "@ t4 = xor(A[5*i+3]) ^ rotl1(xor(A[5*i+0])) -> r7:r8\n\t"
  173         "str    r11, [sp, #4]\n\t"
  174         "mov    r12, r10\n\t"
  175         "eors   r10, r10, r3, lsl #1\n\t"
  176         "eors   r10, r10, r4, lsr #31\n\t"
  177         "eors   r11, r11, r4, lsl #1\n\t"
  178         "eors   r11, r11, r3, lsr #31\n\t"
  179         "eors   r3, r3, r7, lsl #1\n\t"
  180         "eors   r3, r3, r8, lsr #31\n\t"
  181         "eors   r4, r4, r8, lsl #1\n\t"
  182         "eors   r4, r4, r7, lsr #31\n\t"
  183         "eors   r7, r7, r1, lsl #1\n\t"
  184         "eors   r7, r7, r2, lsr #31\n\t"
  185         "eors   r8, r8, r2, lsl #1\n\t"
  186         "eors   r8, r8, r1, lsr #31\n\t"
  187         "eors   r1, r1, r5, lsl #1\n\t"
  188         "eors   r1, r1, r6, lsr #31\n\t"
  189         "eors   r2, r2, r6, lsl #1\n\t"
  190         "eors   r2, r2, r5, lsr #31\n\t"
  191         "eors   r5, r5, r12, lsl #1\n\t"
  192         "eors   r6, r6, r12, lsr #31\n\t"
  193         "ldr    r12, [sp, #4]\n\t"
  194         "eors   r5, r5, r12, lsr #31\n\t"
  195         "eors   r6, r6, r12, lsl #1\n\t"
  196         "\n\t"
  197         "@ Save t2, t3 and t4 on the stack.\n\t"
  198         "addw   r12, sp, #4\n\t"
  199         "stm    r12, { r3, r4, r5, r6, r7, r8 }\n\t"
  200         "\n\t"
  201         "@ We XOR one of the t0..t4 values into each A[] word, and\n\t"
  202         "@ rotate the result by some amount (each word has its own\n\t"
  203         "@ amount). The results are written back into a stack buffer\n\t"
  204         "@ that starts at sp+32\n\t"
  205         "addw   r12, sp, #32\n\t"
  206         "\n\t"
  207         "@ XOR t0 into A[5*i+0] and t1 into A[5*i+1]; each A[i] is also\n\t"
  208         "@ rotated left by some amount.\n\t"
  209         "\n\t"
  210         "@ A[0] and A[1]\n\t"
  211         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  212         "eors   r5, r10\n\t"
  213         "eors   r6, r11\n\t"
  214         "eors   r3, r7, r1\n\t"
  215         "eors   r4, r8, r2\n\t"
  216         "lsl    r7, r3, #1\n\t"
  217         "orr    r7, r7, r4, lsr #31\n\t"
  218         "lsl    r8, r4, #1\n\t"
  219         "orr    r8, r8, r3, lsr #31\n\t"
  220         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  221         "\n\t"
  222         "@ A[5] and A[6]\n\t"
  223         "adds   r0, #24\n\t"
  224         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  225         "eors   r3, r5, r10\n\t"
  226         "eors   r4, r6, r11\n\t"
  227         "lsl    r5, r4, #4\n\t"
  228         "orr    r5, r5, r3, lsr #28\n\t"
  229         "lsl    r6, r3, #4\n\t"
  230         "orr    r6, r6, r4, lsr #28\n\t"
  231         "eors   r3, r7, r1\n\t"
  232         "eors   r4, r8, r2\n\t"
  233         "lsl    r7, r4, #12\n\t"
  234         "orr    r7, r7, r3, lsr #20\n\t"
  235         "lsl    r8, r3, #12\n\t"
  236         "orr    r8, r8, r4, lsr #20\n\t"
  237         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  238         "\n\t"
  239         "@ A[10] and A[11]\n\t"
  240         "adds   r0, #24\n\t"
  241         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  242         "eors   r3, r5, r10\n\t"
  243         "eors   r4, r6, r11\n\t"
  244         "lsl    r5, r3, #3\n\t"
  245         "orr    r5, r5, r4, lsr #29\n\t"
  246         "lsl    r6, r4, #3\n\t"
  247         "orr    r6, r6, r3, lsr #29\n\t"
  248         "eors   r3, r7, r1\n\t"
  249         "eors   r4, r8, r2\n\t"
  250         "lsl    r7, r3, #10\n\t"
  251         "orr    r7, r7, r4, lsr #22\n\t"
  252         "lsl    r8, r4, #10\n\t"
  253         "orr    r8, r8, r3, lsr #22\n\t"
  254         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  255         "\n\t"
  256         "@ A[15] and A[16]\n\t"
  257         "adds   r0, #24\n\t"
  258         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  259         "eors   r3, r5, r10\n\t"
  260         "eors   r4, r6, r11\n\t"
  261         "lsl    r5, r4, #9\n\t"
  262         "orr    r5, r5, r3, lsr #23\n\t"
  263         "lsl    r6, r3, #9\n\t"
  264         "orr    r6, r6, r4, lsr #23\n\t"
  265         "eors   r3, r7, r1\n\t"
  266         "eors   r4, r8, r2\n\t"
  267         "lsl    r7, r4, #13\n\t"
  268         "orr    r7, r7, r3, lsr #19\n\t"
  269         "lsl    r8, r3, #13\n\t"
  270         "orr    r8, r8, r4, lsr #19\n\t"
  271         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  272         "\n\t"
  273         "@ A[20] and A[21]\n\t"
  274         "adds   r0, #24\n\t"
  275         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  276         "eors   r3, r5, r10\n\t"
  277         "eors   r4, r6, r11\n\t"
  278         "lsl    r5, r3, #18\n\t"
  279         "orr    r5, r5, r4, lsr #14\n\t"
  280         "lsl    r6, r4, #18\n\t"
  281         "orr    r6, r6, r3, lsr #14\n\t"
  282         "eors   r3, r7, r1\n\t"
  283         "eors   r4, r8, r2\n\t"
  284         "lsl    r7, r3, #2\n\t"
  285         "orr    r7, r7, r4, lsr #30\n\t"
  286         "lsl    r8, r4, #2\n\t"
  287         "orr    r8, r8, r3, lsr #30\n\t"
  288         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  289         "\n\t"
  290         "@ XOR t2 into A[5*i+2] and t3 into A[5*i+3]; each A[i] is also\n\t"
  291         "@ rotated left by some amount. We reload t2 into r1:r2 and t3\n\t"
  292         "@ into r3:r4.\n\t"
  293         "addw   r5, sp, #4\n\t"
  294         "ldm    r5!, { r1, r2, r3, r4 }\n\t"
  295         "\n\t"
  296         "@ A[2] and A[3]\n\t"
  297         "subs   r0, #160\n\t"
  298         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  299         "eors   r10, r5, r1\n\t"
  300         "eors   r11, r6, r2\n\t"
  301         "lsl    r5, r11, #30\n\t"
  302         "orr    r5, r5, r10, lsr #2\n\t"
  303         "lsl    r6, r10, #30\n\t"
  304         "orr    r6, r6, r11, lsr #2\n\t"
  305         "eors   r10, r7, r3\n\t"
  306         "eors   r11, r8, r4\n\t"
  307         "lsl    r7, r10, #28\n\t"
  308         "orr    r7, r7, r11, lsr #4\n\t"
  309         "lsl    r8, r11, #28\n\t"
  310         "orr    r8, r8, r10, lsr #4\n\t"
  311         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  312         "\n\t"
  313         "@ A[7] and A[8]\n\t"
  314         "adds   r0, #24\n\t"
  315         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  316         "eors   r10, r5, r1\n\t"
  317         "eors   r11, r6, r2\n\t"
  318         "lsl    r5, r10, #6\n\t"
  319         "orr    r5, r5, r11, lsr #26\n\t"
  320         "lsl    r6, r11, #6\n\t"
  321         "orr    r6, r6, r10, lsr #26\n\t"
  322         "eors   r10, r7, r3\n\t"
  323         "eors   r11, r8, r4\n\t"
  324         "lsl    r7, r11, #23\n\t"
  325         "orr    r7, r7, r10, lsr #9\n\t"
  326         "lsl    r8, r10, #23\n\t"
  327         "orr    r8, r8, r11, lsr #9\n\t"
  328         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  329         "\n\t"
  330         "@ A[12] and A[13]\n\t"
  331         "adds   r0, #24\n\t"
  332         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  333         "eors   r10, r5, r1\n\t"
  334         "eors   r11, r6, r2\n\t"
  335         "lsl    r5, r11, #11\n\t"
  336         "orr    r5, r5, r10, lsr #21\n\t"
  337         "lsl    r6, r10, #11\n\t"
  338         "orr    r6, r6, r11, lsr #21\n\t"
  339         "eors   r10, r7, r3\n\t"
  340         "eors   r11, r8, r4\n\t"
  341         "lsl    r7, r10, #25\n\t"
  342         "orr    r7, r7, r11, lsr #7\n\t"
  343         "lsl    r8, r11, #25\n\t"
  344         "orr    r8, r8, r10, lsr #7\n\t"
  345         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  346         "\n\t"
  347         "@ A[17] and A[18]\n\t"
  348         "adds   r0, #24\n\t"
  349         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  350         "eors   r10, r5, r1\n\t"
  351         "eors   r11, r6, r2\n\t"
  352         "lsl    r5, r10, #15\n\t"
  353         "orr    r5, r5, r11, lsr #17\n\t"
  354         "lsl    r6, r11, #15\n\t"
  355         "orr    r6, r6, r10, lsr #17\n\t"
  356         "eors   r10, r7, r3\n\t"
  357         "eors   r11, r8, r4\n\t"
  358         "lsl    r7, r10, #21\n\t"
  359         "orr    r7, r7, r11, lsr #11\n\t"
  360         "lsl    r8, r11, #21\n\t"
  361         "orr    r8, r8, r10, lsr #11\n\t"
  362         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  363         "\n\t"
  364         "@ A[22] and A[23]\n\t"
  365         "adds   r0, #24\n\t"
  366         "ldm    r0!, { r5, r6, r7, r8 }\n\t"
  367         "eors   r10, r5, r1\n\t"
  368         "eors   r11, r6, r2\n\t"
  369         "lsl    r5, r11, #29\n\t"
  370         "orr    r5, r5, r10, lsr #3\n\t"
  371         "lsl    r6, r10, #29\n\t"
  372         "orr    r6, r6, r11, lsr #3\n\t"
  373         "eors   r10, r7, r3\n\t"
  374         "eors   r11, r8, r4\n\t"
  375         "lsl    r7, r11, #24\n\t"
  376         "orr    r7, r7, r10, lsr #8\n\t"
  377         "lsl    r8, r10, #24\n\t"
  378         "orr    r8, r8, r11, lsr #8\n\t"
  379         "stm    r12!, { r5, r6, r7, r8 }\n\t"
  380         "\n\t"
  381         "@ XOR t4 into A[5*i+4]; each A[i] is also rotated left by some\n\t"
  382         "@ amount. We reload t4 into r1:r2.\n\t"
  383         "ldr    r1, [sp, #20]\n\t"
  384         "ldr    r2, [sp, #24]\n\t"
  385         "\n\t"
  386         "@ A[4]\n\t"
  387         "subs   r0, #160\n\t"
  388         "ldm    r0!, { r5, r6 }\n\t"
  389         "eors   r3, r5, r1\n\t"
  390         "eors   r4, r6, r2\n\t"
  391         "lsl    r5, r3, #27\n\t"
  392         "orr    r5, r5, r4, lsr #5\n\t"
  393         "lsl    r6, r4, #27\n\t"
  394         "orr    r6, r6, r3, lsr #5\n\t"
  395         "stm    r12!, { r5, r6 }\n\t"
  396         "\n\t"
  397         "@ A[9]\n\t"
  398         "adds   r0, #32\n\t"
  399         "ldm    r0!, { r5, r6 }\n\t"
  400         "eors   r3, r5, r1\n\t"
  401         "eors   r4, r6, r2\n\t"
  402         "lsl    r5, r3, #20\n\t"
  403         "orr    r5, r5, r4, lsr #12\n\t"
  404         "lsl    r6, r4, #20\n\t"
  405         "orr    r6, r6, r3, lsr #12\n\t"
  406         "stm    r12!, { r5, r6 }\n\t"
  407         "\n\t"
  408         "@ A[14]\n\t"
  409         "adds   r0, #32\n\t"
  410         "ldm    r0!, { r5, r6 }\n\t"
  411         "eors   r3, r5, r1\n\t"
  412         "eors   r4, r6, r2\n\t"
  413         "lsl    r5, r4, #7\n\t"
  414         "orr    r5, r5, r3, lsr #25\n\t"
  415         "lsl    r6, r3, #7\n\t"
  416         "orr    r6, r6, r4, lsr #25\n\t"
  417         "stm    r12!, { r5, r6 }\n\t"
  418         "\n\t"
  419         "@ A[19]\n\t"
  420         "adds   r0, #32\n\t"
  421         "ldm    r0!, { r5, r6 }\n\t"
  422         "eors   r3, r5, r1\n\t"
  423         "eors   r4, r6, r2\n\t"
  424         "lsl    r5, r3, #8\n\t"
  425         "orr    r5, r5, r4, lsr #24\n\t"
  426         "lsl    r6, r4, #8\n\t"
  427         "orr    r6, r6, r3, lsr #24\n\t"
  428         "stm    r12!, { r5, r6 }\n\t"
  429         "\n\t"
  430         "@ A[24]\n\t"
  431         "adds   r0, #32\n\t"
  432         "ldm    r0!, { r5, r6 }\n\t"
  433         "eors   r3, r5, r1\n\t"
  434         "eors   r4, r6, r2\n\t"
  435         "lsl    r5, r3, #14\n\t"
  436         "orr    r5, r5, r4, lsr #18\n\t"
  437         "lsl    r6, r4, #14\n\t"
  438         "orr    r6, r6, r3, lsr #18\n\t"
  439         "stm    r12!, { r5, r6 }\n\t"
  440         "\n\t"
  441         "subs   r0, #200\n\t"
  442         "\n\t"
  443         "@ At that point, the stack buffer at sp+32 contains the words\n\t"
  444         "@ at the following indexes (0 to 24) and offsets (from sp)\n\t"
  445         "@   A[ 0]    0      32\n\t"
  446         "@   A[ 1]    1      40\n\t"
  447         "@   A[ 2]   10     112\n\t"
  448         "@   A[ 3]   11     120\n\t"
  449         "@   A[ 4]   20     192\n\t"
  450         "@   A[ 5]    2      48\n\t"
  451         "@   A[ 6]    3      56\n\t"
  452         "@   A[ 7]   12     128\n\t"
  453         "@   A[ 8]   13     136\n\t"
  454         "@   A[ 9]   21     200\n\t"
  455         "@   A[10]    4      64\n\t"
  456         "@   A[11]    5      72\n\t"
  457         "@   A[12]   14     144\n\t"
  458         "@   A[13]   15     152\n\t"
  459         "@   A[14]   22     208\n\t"
  460         "@   A[15]    6      80\n\t"
  461         "@   A[16]    7      88\n\t"
  462         "@   A[17]   16     160\n\t"
  463         "@   A[18]   17     168\n\t"
  464         "@   A[19]   23     216\n\t"
  465         "@   A[20]    8      96\n\t"
  466         "@   A[21]    9     104\n\t"
  467         "@   A[22]   18     176\n\t"
  468         "@   A[23]   19     184\n\t"
  469         "@   A[24]   24     224\n\t"
  470 
              /*
               * KHI_LOAD(s0, s1, s2, s3, s4): load five 64-bit entries of
               * the stack buffer into r1:r2, r3:r4, r5:r6, r7:r8 and
               * r10:r11, in that order. Each sN is a buffer index from
               * the table above (byte offset 32 + 8 * sN from sp).
               */
  471 #define KHI_LOAD(s0, s1, s2, s3, s4) \
  472         "ldr    r1, [sp, #(32 + 8 * " #s0 ")]\n\t" \
  473         "ldr    r2, [sp, #(36 + 8 * " #s0 ")]\n\t" \
  474         "ldr    r3, [sp, #(32 + 8 * " #s1 ")]\n\t" \
  475         "ldr    r4, [sp, #(36 + 8 * " #s1 ")]\n\t" \
  476         "ldr    r5, [sp, #(32 + 8 * " #s2 ")]\n\t" \
  477         "ldr    r6, [sp, #(36 + 8 * " #s2 ")]\n\t" \
  478         "ldr    r7, [sp, #(32 + 8 * " #s3 ")]\n\t" \
  479         "ldr    r8, [sp, #(36 + 8 * " #s3 ")]\n\t" \
  480         "ldr    r10, [sp, #(32 + 8 * " #s4 ")]\n\t" \
  481         "ldr    r11, [sp, #(36 + 8 * " #s4 ")]\n\t"
  482 
              /*
               * KHI_STEP(op, x0, x1, x2, x3, x4, x5, d): compute
               *   A[d] = (x0:x1 op x2:x3) ^ x4:x5
               * one 32-bit half at a time, with r12 as scratch: the low
               * half goes to [r0 + 8*d], the high half to [r0 + 8*d + 4].
               * 'op' is the mnemonic of a three-register logical
               * instruction (orrs, ands, orns or bics below); with orns
               * and bics it is the second operand (x2:x3) that the
               * instruction complements, per the ARM definitions of ORN
               * and BIC. The source registers are left unchanged, so
               * successive steps can reuse the values loaded by KHI_LOAD.
               */
  483 #define KHI_STEP(op, x0, x1, x2, x3, x4, x5, d) \
  484         #op "   r12, " #x0 ", " #x2 "\n\t" \
  485         "eors   r12, " #x4 "\n\t" \
  486         "str    r12, [r0, #(8 * " #d ")]\n\t" \
  487         #op "   r12, " #x1 ", " #x3 "\n\t" \
  488         "eors   r12, " #x5 "\n\t" \
  489         "str    r12, [r0, #(4 + 8 * " #d ")]\n\t"
  490 
  491         "@ A[0], A[6], A[12], A[18] and A[24]\n\t"
  492         KHI_LOAD(0, 3, 14, 17, 24)
  493         KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 0)
  494         KHI_STEP(orns, r7, r8, r5, r6, r3, r4, 1)
  495         KHI_STEP(ands, r7, r8, r10, r11, r5, r6, 2)
  496         KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 3)
  497         KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 4)
  498         "\n\t"
  499 
  500         "@ A[3], A[9], A[10], A[16] and A[22]\n\t"
  501         KHI_LOAD(11, 21, 4, 7, 18)
  502         KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 5)
  503         KHI_STEP(ands, r7, r8, r5, r6, r3, r4, 6)
  504         KHI_STEP(orns, r7, r8, r10, r11, r5, r6, 7)
  505         KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 8)
  506         KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 9)
  507         "\n\t"
  508 
  509         "@ A[1], A[7], A[13], A[19] and A[20]\n\t"
  510         KHI_LOAD(1, 12, 15, 23, 8)
  511         KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 10)
  512         KHI_STEP(ands, r7, r8, r5, r6, r3, r4, 11)
  513         KHI_STEP(bics, r10, r11, r7, r8, r5, r6, 12)
  514         "mvns   r7, r7\n\t"
  515         "mvns   r8, r8\n\t"
  516         KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 13)
  517         KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 14)
  518         "\n\t"
  519 
  520         "@ A[4], A[5], A[11], A[17] and A[23]\n\t"
  521         KHI_LOAD(20, 2, 5, 16, 19)
  522         KHI_STEP(ands, r3, r4, r5, r6, r1, r2, 15)
  523         KHI_STEP(orrs, r7, r8, r5, r6, r3, r4, 16)
  524         KHI_STEP(orns, r10, r11, r7, r8, r5, r6, 17)
  525         "mvns   r7, r7\n\t"
  526         "mvns   r8, r8\n\t"
  527         KHI_STEP(ands, r1, r2, r10, r11, r7, r8, 18)
  528         KHI_STEP(orrs, r1, r2, r3, r4, r10, r11, 19)
  529         "\n\t"
  530 
  531         "@ A[2], A[8], A[14], A[15] and A[21]\n\t"
  532         KHI_LOAD(10, 13, 22, 6, 9)
  533         KHI_STEP(bics, r5, r6, r3, r4, r1, r2, 20)
  534         KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 24)
  535         "mvns   r3, r3\n\t"
  536         "mvns   r4, r4\n\t"
  537         KHI_STEP(orrs, r7, r8, r5, r6, r3, r4, 21)
  538         KHI_STEP(ands, r7, r8, r10, r11, r5, r6, 22)
  539         KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 23)
  540         "\n\t"
  541 
  542         "@ Get round counter XOR round constant into A[0]\n\t"
  543         "ldr    r1, [sp, #0]\n\t"
  544         "adr    r2, .process_block_RC\n\t"
  545         "adds   r2, r1\n\t"
  546         "ldm    r2, { r3, r4 }\n\t"
  547         "ldm    r0, { r5, r6 }\n\t"
  548         "eors   r5, r3\n\t"
  549         "eors   r6, r4\n\t"
  550         "stm    r0, { r5, r6 }\n\t"
  551         "\n\t"
  552         "@ Increment round counter, loop until all 24 rounds are done.\n\t"
  553         "\n\t"
  554         "adds   r1, #8\n\t"
  555         "str    r1, [sp, #0]\n\t"
  556         "cmp    r1, #192\n\t"
  557         "blo    .process_block_loop\n\t"
  558 
  559         INVERT_WORDS
  560 
  561         "add    sp, sp, #232\n\t"
  562         "pop    { r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, r12, pc }\n\t"
  563         "\n\t"
              /*
               * Round-constant table: 24 entries of two 32-bit words, low
               * word first, addressed as .process_block_RC plus the value
               * kept at [sp, #0] (8 * round index). It follows the
               * returning pop, so it is data only and is never executed.
               */
  564 ".process_block_RC:\n\t"
  565         ".word  0x00000001\n\t"
  566         ".word  0x00000000\n\t"
  567         ".word  0x00008082\n\t"
  568         ".word  0x00000000\n\t"
  569         ".word  0x0000808A\n\t"
  570         ".word  0x80000000\n\t"
  571         ".word  0x80008000\n\t"
  572         ".word  0x80000000\n\t"
  573         ".word  0x0000808B\n\t"
  574         ".word  0x00000000\n\t"
  575         ".word  0x80000001\n\t"
  576         ".word  0x00000000\n\t"
  577         ".word  0x80008081\n\t"
  578         ".word  0x80000000\n\t"
  579         ".word  0x00008009\n\t"
  580         ".word  0x80000000\n\t"
  581         ".word  0x0000008A\n\t"
  582         ".word  0x00000000\n\t"
  583         ".word  0x00000088\n\t"
  584         ".word  0x00000000\n\t"
  585         ".word  0x80008009\n\t"
  586         ".word  0x00000000\n\t"
  587         ".word  0x8000000A\n\t"
  588         ".word  0x00000000\n\t"
  589         ".word  0x8000808B\n\t"
  590         ".word  0x00000000\n\t"
  591         ".word  0x0000008B\n\t"
  592         ".word  0x80000000\n\t"
  593         ".word  0x00008089\n\t"
  594         ".word  0x80000000\n\t"
  595         ".word  0x00008003\n\t"
  596         ".word  0x80000000\n\t"
  597         ".word  0x00008002\n\t"
  598         ".word  0x80000000\n\t"
  599         ".word  0x00000080\n\t"
  600         ".word  0x80000000\n\t"
  601         ".word  0x0000800A\n\t"
  602         ".word  0x00000000\n\t"
  603         ".word  0x8000000A\n\t"
  604         ".word  0x80000000\n\t"
  605         ".word  0x80008081\n\t"
  606         ".word  0x80000000\n\t"
  607         ".word  0x00008080\n\t"
  608         ".word  0x80000000\n\t"
  609         ".word  0x80000001\n\t"
  610         ".word  0x00000000\n\t"
  611         ".word  0x80008008\n\t"
  612         ".word  0x80000000\n\t"
  613 
  614 #undef INVERT_WORDS
  615 #undef KHI_LOAD
  616 #undef KHI_STEP
  617 
  618         );
  619 }
  620 
  621 #else  // yyyASM_CORTEXM4+0
  622 
  623 /*
  624  * Round constants.
       *
       * 24 64-bit values, two per line in index order. This table is
       * compiled only when FALCON_ASM_CORTEXM4 is zero (the #else branch);
       * the Cortex-M4 assembly variant of process_block() carries the
       * same 24 values as pairs of 32-bit words, low word first, in its
       * .process_block_RC table, and XORs entry j into A[0] at the end of
       * round j.
       * NOTE(review): presumably the portable process_block() indexes
       * this array by round number in the same way -- confirm against its
       * loop body.
  625  */
  626 static const uint64_t RC[] = {
  627         0x0000000000000001, 0x0000000000008082,  /* RC[ 0], RC[ 1] */
  628         0x800000000000808A, 0x8000000080008000,  /* RC[ 2], RC[ 3] */
  629         0x000000000000808B, 0x0000000080000001,  /* RC[ 4], RC[ 5] */
  630         0x8000000080008081, 0x8000000000008009,  /* RC[ 6], RC[ 7] */
  631         0x000000000000008A, 0x0000000000000088,  /* RC[ 8], RC[ 9] */
  632         0x0000000080008009, 0x000000008000000A,  /* RC[10], RC[11] */
  633         0x000000008000808B, 0x800000000000008B,  /* RC[12], RC[13] */
  634         0x8000000000008089, 0x8000000000008003,  /* RC[14], RC[15] */
  635         0x8000000000008002, 0x8000000000000080,  /* RC[16], RC[17] */
  636         0x000000000000800A, 0x800000008000000A,  /* RC[18], RC[19] */
  637         0x8000000080008081, 0x8000000000008080,  /* RC[20], RC[21] */
  638         0x0000000080000001, 0x8000000080008008   /* RC[22], RC[23] */
  639 };
  640 
  641 /*
  642  * Process the provided state.
  643  */
  644 static void
  645 process_block(uint64_t *A)
  646 {
  647         uint64_t t0, t1, t2, t3, t4;
  648         uint64_t tt0, tt1, tt2, tt3;
  649         uint64_t t, kt;
  650         uint64_t c0, c1, c2, c3, c4, bnn;
  651         int j;
  652 
  653         /*
  654          * Invert some words (alternate internal representation, which
  655          * saves some operations).
  656          */
  657         A[ 1] = ~A[ 1];
  658         A[ 2] = ~A[ 2];
  659         A[ 8] = ~A[ 8];
  660         A[12] = ~A[12];
  661         A[17] = ~A[17];
  662         A[20] = ~A[20];
  663 
  664         /*
  665          * Compute the 24 rounds. This loop is partially unrolled (each
  666          * iteration computes two rounds).
  667          */
  668         for (j = 0; j < 24; j += 2) {
  669 
  670                 tt0 = A[ 1] ^ A[ 6];
  671                 tt1 = A[11] ^ A[16];
  672                 tt0 ^= A[21] ^ tt1;
  673                 tt0 = (tt0 << 1) | (tt0 >> 63);
  674                 tt2 = A[ 4] ^ A[ 9];
  675                 tt3 = A[14] ^ A[19];
  676                 tt0 ^= A[24];
  677                 tt2 ^= tt3;
  678                 t0 = tt0 ^ tt2;
  679 
  680                 tt0 = A[ 2] ^ A[ 7];
  681                 tt1 = A[12] ^ A[17];
  682                 tt0 ^= A[22] ^ tt1;
  683                 tt0 = (tt0 << 1) | (tt0 >> 63);
  684                 tt2 = A[ 0] ^ A[ 5];
  685                 tt3 = A[10] ^ A[15];
  686                 tt0 ^= A[20];
  687                 tt2 ^= tt3;
  688                 t1 = tt0 ^ tt2;
  689 
  690                 tt0 = A[ 3] ^ A[ 8];
  691                 tt1 = A[13] ^ A[18];
  692                 tt0 ^= A[23] ^ tt1;
  693                 tt0 = (tt0 << 1) | (tt0 >> 63);
  694                 tt2 = A[ 1] ^ A[ 6];
  695                 tt3 = A[11] ^ A[16];
  696                 tt0 ^= A[21];
  697                 tt2 ^= tt3;
  698                 t2 = tt0 ^ tt2;
  699 
  700                 tt0 = A[ 4] ^ A[ 9];
  701                 tt1 = A[14] ^ A[19];
  702                 tt0 ^= A[24] ^ tt1;
  703                 tt0 = (tt0 << 1) | (tt0 >> 63);
  704                 tt2 = A[ 2] ^ A[ 7];
  705                 tt3 = A[12] ^ A[17];
  706                 tt0 ^= A[22];
  707                 tt2 ^= tt3;
  708                 t3 = tt0 ^ tt2;
  709 
  710                 tt0 = A[ 0] ^ A[ 5];
  711                 tt1 = A[10] ^ A[15];
  712                 tt0 ^= A[20] ^ tt1;
  713                 tt0 = (tt0 << 1) | (tt0 >> 63);
  714                 tt2 = A[ 3] ^ A[ 8];
  715                 tt3 = A[13] ^ A[18];
  716                 tt0 ^= A[23];
  717                 tt2 ^= tt3;
  718                 t4 = tt0 ^ tt2;
  719 
  720                 A[ 0] = A[ 0] ^ t0;
  721                 A[ 5] = A[ 5] ^ t0;
  722                 A[10] = A[10] ^ t0;
  723                 A[15] = A[15] ^ t0;
  724                 A[20] = A[20] ^ t0;
  725                 A[ 1] = A[ 1] ^ t1;
  726                 A[ 6] = A[ 6] ^ t1;
  727                 A[11] = A[11] ^ t1;
  728                 A[16] = A[16] ^ t1;
  729                 A[21] = A[21] ^ t1;
  730                 A[ 2] = A[ 2] ^ t2;
  731                 A[ 7] = A[ 7] ^ t2;
  732                 A[12] = A[12] ^ t2;
  733                 A[17] = A[17] ^ t2;
  734                 A[22] = A[22] ^ t2;
  735                 A[ 3] = A[ 3] ^ t3;
  736                 A[ 8] = A[ 8] ^ t3;
  737                 A[13] = A[13] ^ t3;
  738                 A[18] = A[18] ^ t3;
  739                 A[23] = A[23] ^ t3;
  740                 A[ 4] = A[ 4] ^ t4;
  741                 A[ 9] = A[ 9] ^ t4;
  742                 A[14] = A[14] ^ t4;
  743                 A[19] = A[19] ^ t4;
  744                 A[24] = A[24] ^ t4;
  745                 A[ 5] = (A[ 5] << 36) | (A[ 5] >> (64 - 36));
  746                 A[10] = (A[10] <<  3) | (A[10] >> (64 -  3));
  747                 A[15] = (A[15] << 41) | (A[15] >> (64 - 41));
  748                 A[20] = (A[20] << 18) | (A[20] >> (64 - 18));
  749                 A[ 1] = (A[ 1] <<  1) | (A[ 1] >> (64 -  1));
  750                 A[ 6] = (A[ 6] << 44) | (A[ 6] >> (64 - 44));
  751                 A[11] = (A[11] << 10) | (A[11] >> (64 - 10));
  752                 A[16] = (A[16] << 45) | (A[16] >> (64 - 45));
  753                 A[21] = (A[21] <<  2) | (A[21] >> (64 - 2));
  754                 A[ 2] = (A[ 2] << 62) | (A[ 2] >> (64 - 62));
  755                 A[ 7] = (A[ 7] <<  6) | (A[ 7] >> (64 -  6));
  756                 A[12] = (A[12] << 43) | (A[12] >> (64 - 43));
  757                 A[17] = (A[17] << 15) | (A[17] >> (64 - 15));
  758                 A[22] = (A[22] << 61) | (A[22] >> (64 - 61));
  759                 A[ 3] = (A[ 3] << 28) | (A[ 3] >> (64 - 28));
  760                 A[ 8] = (A[ 8] << 55) | (A[ 8] >> (64 - 55));
  761                 A[13] = (A[13] << 25) | (A[13] >> (64 - 25));
  762                 A[18] = (A[18] << 21) | (A[18] >> (64 - 21));
  763                 A[23] = (A[23] << 56) | (A[23] >> (64 - 56));
  764                 A[ 4] = (A[ 4] << 27) | (A[ 4] >> (64 - 27));
  765                 A[ 9] = (A[ 9] << 20) | (A[ 9] >> (64 - 20));
  766                 A[14] = (A[14] << 39) | (A[14] >> (64 - 39));
  767                 A[19] = (A[19] <<  8) | (A[19] >> (64 -  8));
  768                 A[24] = (A[24] << 14) | (A[24] >> (64 - 14));
  769 
  770                 bnn = ~A[12];
  771                 kt = A[ 6] | A[12];
  772                 c0 = A[ 0] ^ kt;
  773                 kt = bnn | A[18];
  774                 c1 = A[ 6] ^ kt;
  775                 kt = A[18] & A[24];
  776                 c2 = A[12] ^ kt;
  777                 kt = A[24] | A[ 0];
  778                 c3 = A[18] ^ kt;
  779                 kt = A[ 0] & A[ 6];
  780                 c4 = A[24] ^ kt;
  781                 A[ 0] = c0;
  782                 A[ 6] = c1;
  783                 A[12] = c2;
  784                 A[18] = c3;
  785                 A[24] = c4;
  786                 bnn = ~A[22];
  787                 kt = A[ 9] | A[10];
  788                 c0 = A[ 3] ^ kt;
  789                 kt = A[10] & A[16];
  790                 c1 = A[ 9] ^ kt;
  791                 kt = A[16] | bnn;
  792                 c2 = A[10] ^ kt;
  793                 kt = A[22] | A[ 3];
  794                 c3 = A[16] ^ kt;
  795                 kt = A[ 3] & A[ 9];
  796                 c4 = A[22] ^ kt;
  797                 A[ 3] = c0;
  798                 A[ 9] = c1;
  799                 A[10] = c2;
  800                 A[16] = c3;
  801                 A[22] = c4;
  802                 bnn = ~A[19];
  803                 kt = A[ 7] | A[13];
  804                 c0 = A[ 1] ^ kt;
  805                 kt = A[13] & A[19];
  806                 c1 = A[ 7] ^ kt;
  807                 kt = bnn & A[20];
  808                 c2 = A[13] ^ kt;
  809                 kt = A[20] | A[ 1];
  810                 c3 = bnn ^ kt;
  811                 kt = A[ 1] & A[ 7];
  812                 c4 = A[20] ^ kt;
  813                 A[ 1] = c0;
  814                 A[ 7] = c1;
  815                 A[13] = c2;
  816                 A[19] = c3;
  817                 A[20] = c4;
  818                 bnn = ~A[17];
  819                 kt = A[ 5] & A[11];
  820                 c0 = A[ 4] ^ kt;
  821                 kt = A[11] | A[17];
  822                 c1 = A[ 5] ^ kt;
  823                 kt = bnn | A[23];
  824                 c2 = A[11] ^ kt;
  825                 kt = A[23] & A[ 4];
  826                 c3 = bnn ^ kt;
  827                 kt = A[ 4] | A[ 5];
  828                 c4 = A[23] ^ kt;
  829                 A[ 4] = c0;
  830                 A[ 5] = c1;
  831                 A[11] = c2;
  832                 A[17] = c3;
  833                 A[23] = c4;
  834                 bnn = ~A[ 8];
  835                 kt = bnn & A[14];
  836                 c0 = A[ 2] ^ kt;
  837                 kt = A[14] | A[15];
  838                 c1 = bnn ^ kt;
  839                 kt = A[15] & A[21];
  840                 c2 = A[14] ^ kt;
  841                 kt = A[21] | A[ 2];
  842                 c3 = A[15] ^ kt;
  843                 kt = A[ 2] & A[ 8];
  844                 c4 = A[21] ^ kt;
  845                 A[ 2] = c0;
  846                 A[ 8] = c1;
  847                 A[14] = c2;
  848                 A[15] = c3;
  849                 A[21] = c4;
  850                 A[ 0] = A[ 0] ^ RC[j + 0];
  851 
  852                 tt0 = A[ 6] ^ A[ 9];
  853                 tt1 = A[ 7] ^ A[ 5];
  854                 tt0 ^= A[ 8] ^ tt1;
  855                 tt0 = (tt0 << 1) | (tt0 >> 63);
  856                 tt2 = A[24] ^ A[22];
  857                 tt3 = A[20] ^ A[23];
  858                 tt0 ^= A[21];
  859                 tt2 ^= tt3;
  860                 t0 = tt0 ^ tt2;
  861 
  862                 tt0 = A[12] ^ A[10];
  863                 tt1 = A[13] ^ A[11];
  864                 tt0 ^= A[14] ^ tt1;
  865                 tt0 = (tt0 << 1) | (tt0 >> 63);
  866                 tt2 = A[ 0] ^ A[ 3];
  867                 tt3 = A[ 1] ^ A[ 4];
  868                 tt0 ^= A[ 2];
  869                 tt2 ^= tt3;
  870                 t1 = tt0 ^ tt2;
  871 
  872                 tt0 = A[18] ^ A[16];
  873                 tt1 = A[19] ^ A[17];
  874                 tt0 ^= A[15] ^ tt1;
  875                 tt0 = (tt0 << 1) | (tt0 >> 63);
  876                 tt2 = A[ 6] ^ A[ 9];
  877                 tt3 = A[ 7] ^ A[ 5];
  878                 tt0 ^= A[ 8];
  879                 tt2 ^= tt3;
  880                 t2 = tt0 ^ tt2;
  881 
  882                 tt0 = A[24] ^ A[22];
  883                 tt1 = A[20] ^ A[23];
  884                 tt0 ^= A[21] ^ tt1;
  885                 tt0 = (tt0 << 1) | (tt0 >> 63);
  886                 tt2 = A[12] ^ A[10];
  887                 tt3 = A[13] ^ A[11];
  888                 tt0 ^= A[14];
  889                 tt2 ^= tt3;
  890                 t3 = tt0 ^ tt2;
  891 
  892                 tt0 = A[ 0] ^ A[ 3];
  893                 tt1 = A[ 1] ^ A[ 4];
  894                 tt0 ^= A[ 2] ^ tt1;
  895                 tt0 = (tt0 << 1) | (tt0 >> 63);
  896                 tt2 = A[18] ^ A[16];
  897                 tt3 = A[19] ^ A[17];
  898                 tt0 ^= A[15];
  899                 tt2 ^= tt3;
  900                 t4 = tt0 ^ tt2;
  901 
  902                 A[ 0] = A[ 0] ^ t0;
  903                 A[ 3] = A[ 3] ^ t0;
  904                 A[ 1] = A[ 1] ^ t0;
  905                 A[ 4] = A[ 4] ^ t0;
  906                 A[ 2] = A[ 2] ^ t0;
  907                 A[ 6] = A[ 6] ^ t1;
  908                 A[ 9] = A[ 9] ^ t1;
  909                 A[ 7] = A[ 7] ^ t1;
  910                 A[ 5] = A[ 5] ^ t1;
  911                 A[ 8] = A[ 8] ^ t1;
  912                 A[12] = A[12] ^ t2;
  913                 A[10] = A[10] ^ t2;
  914                 A[13] = A[13] ^ t2;
  915                 A[11] = A[11] ^ t2;
  916                 A[14] = A[14] ^ t2;
  917                 A[18] = A[18] ^ t3;
  918                 A[16] = A[16] ^ t3;
  919                 A[19] = A[19] ^ t3;
  920                 A[17] = A[17] ^ t3;
  921                 A[15] = A[15] ^ t3;
  922                 A[24] = A[24] ^ t4;
  923                 A[22] = A[22] ^ t4;
  924                 A[20] = A[20] ^ t4;
  925                 A[23] = A[23] ^ t4;
  926                 A[21] = A[21] ^ t4;
  927                 A[ 3] = (A[ 3] << 36) | (A[ 3] >> (64 - 36));
  928                 A[ 1] = (A[ 1] <<  3) | (A[ 1] >> (64 -  3));
  929                 A[ 4] = (A[ 4] << 41) | (A[ 4] >> (64 - 41));
  930                 A[ 2] = (A[ 2] << 18) | (A[ 2] >> (64 - 18));
  931                 A[ 6] = (A[ 6] <<  1) | (A[ 6] >> (64 -  1));
  932                 A[ 9] = (A[ 9] << 44) | (A[ 9] >> (64 - 44));
  933                 A[ 7] = (A[ 7] << 10) | (A[ 7] >> (64 - 10));
  934                 A[ 5] = (A[ 5] << 45) | (A[ 5] >> (64 - 45));
  935                 A[ 8] = (A[ 8] <<  2) | (A[ 8] >> (64 - 2));
  936                 A[12] = (A[12] << 62) | (A[12] >> (64 - 62));
  937                 A[10] = (A[10] <<  6) | (A[10] >> (64 -  6));
  938                 A[13] = (A[13] << 43) | (A[13] >> (64 - 43));
  939                 A[11] = (A[11] << 15) | (A[11] >> (64 - 15));
  940                 A[14] = (A[14] << 61) | (A[14] >> (64 - 61));
  941                 A[18] = (A[18] << 28) | (A[18] >> (64 - 28));
  942                 A[16] = (A[16] << 55) | (A[16] >> (64 - 55));
  943                 A[19] = (A[19] << 25) | (A[19] >> (64 - 25));
  944                 A[17] = (A[17] << 21) | (A[17] >> (64 - 21));
  945                 A[15] = (A[15] << 56) | (A[15] >> (64 - 56));
  946                 A[24] = (A[24] << 27) | (A[24] >> (64 - 27));
  947                 A[22] = (A[22] << 20) | (A[22] >> (64 - 20));
  948                 A[20] = (A[20] << 39) | (A[20] >> (64 - 39));
  949                 A[23] = (A[23] <<  8) | (A[23] >> (64 -  8));
  950                 A[21] = (A[21] << 14) | (A[21] >> (64 - 14));
  951 
  952                 bnn = ~A[13];
  953                 kt = A[ 9] | A[13];
  954                 c0 = A[ 0] ^ kt;
  955                 kt = bnn | A[17];
  956                 c1 = A[ 9] ^ kt;
  957                 kt = A[17] & A[21];
  958                 c2 = A[13] ^ kt;
  959                 kt = A[21] | A[ 0];
  960                 c3 = A[17] ^ kt;
  961                 kt = A[ 0] & A[ 9];
  962                 c4 = A[21] ^ kt;
  963                 A[ 0] = c0;
  964                 A[ 9] = c1;
  965                 A[13] = c2;
  966                 A[17] = c3;
  967                 A[21] = c4;
  968                 bnn = ~A[14];
  969                 kt = A[22] | A[ 1];
  970                 c0 = A[18] ^ kt;
  971                 kt = A[ 1] & A[ 5];
  972                 c1 = A[22] ^ kt;
  973                 kt = A[ 5] | bnn;
  974                 c2 = A[ 1] ^ kt;
  975                 kt = A[14] | A[18];
  976                 c3 = A[ 5] ^ kt;
  977                 kt = A[18] & A[22];
  978                 c4 = A[14] ^ kt;
  979                 A[18] = c0;
  980                 A[22] = c1;
  981                 A[ 1] = c2;
  982                 A[ 5] = c3;
  983                 A[14] = c4;
  984                 bnn = ~A[23];
  985                 kt = A[10] | A[19];
  986                 c0 = A[ 6] ^ kt;
  987                 kt = A[19] & A[23];
  988                 c1 = A[10] ^ kt;
  989                 kt = bnn & A[ 2];
  990                 c2 = A[19] ^ kt;
  991                 kt = A[ 2] | A[ 6];
  992                 c3 = bnn ^ kt;
  993                 kt = A[ 6] & A[10];
  994                 c4 = A[ 2] ^ kt;
  995                 A[ 6] = c0;
  996                 A[10] = c1;
  997                 A[19] = c2;
  998                 A[23] = c3;
  999                 A[ 2] = c4;
 1000                 bnn = ~A[11];
 1001                 kt = A[ 3] & A[ 7];
 1002                 c0 = A[24] ^ kt;
 1003                 kt = A[ 7] | A[11];
 1004                 c1 = A[ 3] ^ kt;
 1005                 kt = bnn | A[15];
 1006                 c2 = A[ 7] ^ kt;
 1007                 kt = A[15] & A[24];
 1008                 c3 = bnn ^ kt;
 1009                 kt = A[24] | A[ 3];
 1010                 c4 = A[15] ^ kt;
 1011                 A[24] = c0;
 1012                 A[ 3] = c1;
 1013                 A[ 7] = c2;
 1014                 A[11] = c3;
 1015                 A[15] = c4;
 1016                 bnn = ~A[16];
 1017                 kt = bnn & A[20];
 1018                 c0 = A[12] ^ kt;
 1019                 kt = A[20] | A[ 4];
 1020                 c1 = bnn ^ kt;
 1021                 kt = A[ 4] & A[ 8];
 1022                 c2 = A[20] ^ kt;
 1023                 kt = A[ 8] | A[12];
 1024                 c3 = A[ 4] ^ kt;
 1025                 kt = A[12] & A[16];
 1026                 c4 = A[ 8] ^ kt;
 1027                 A[12] = c0;
 1028                 A[16] = c1;
 1029                 A[20] = c2;
 1030                 A[ 4] = c3;
 1031                 A[ 8] = c4;
 1032                 A[ 0] = A[ 0] ^ RC[j + 1];
 1033                 t = A[ 5];
 1034                 A[ 5] = A[18];
 1035                 A[18] = A[11];
 1036                 A[11] = A[10];
 1037                 A[10] = A[ 6];
 1038                 A[ 6] = A[22];
 1039                 A[22] = A[20];
 1040                 A[20] = A[12];
 1041                 A[12] = A[19];
 1042                 A[19] = A[15];
 1043                 A[15] = A[24];
 1044                 A[24] = A[ 8];
 1045                 A[ 8] = t;
 1046                 t = A[ 1];
 1047                 A[ 1] = A[ 9];
 1048                 A[ 9] = A[14];
 1049                 A[14] = A[ 2];
 1050                 A[ 2] = A[13];
 1051                 A[13] = A[23];
 1052                 A[23] = A[ 4];
 1053                 A[ 4] = A[21];
 1054                 A[21] = A[16];
 1055                 A[16] = A[ 3];
 1056                 A[ 3] = A[17];
 1057                 A[17] = A[ 7];
 1058                 A[ 7] = t;
 1059         }
 1060 
 1061         /*
 1062          * Invert some words back to normal representation.
 1063          */
 1064         A[ 1] = ~A[ 1];
 1065         A[ 2] = ~A[ 2];
 1066         A[ 8] = ~A[ 8];
 1067         A[12] = ~A[12];
 1068         A[17] = ~A[17];
 1069         A[20] = ~A[20];
 1070 }
 1071 
 1072 #endif  // yyyASM_CORTEXM4-
 1073 
 1074 /* see inner.h */
 1075 void
 1076 Zf(i_shake256_init)(inner_shake256_context *sc)
 1077 {
 1078         sc->dptr = 0;
 1079 
 1080         /*
 1081          * Representation of an all-ones uint64_t is the same regardless
 1082          * of local endianness.
 1083          */
 1084         memset(sc->st.A, 0, sizeof sc->st.A);
 1085 }
 1086 
 1087 /* see inner.h */
 1088 void
 1089 Zf(i_shake256_inject)(inner_shake256_context *sc, const uint8_t *in, size_t len)
 1090 {
 1091         size_t dptr;
 1092 
 1093         dptr = (size_t)sc->dptr;
 1094         while (len > 0) {
 1095                 size_t clen, u;
 1096 
 1097                 clen = 136 - dptr;
 1098                 if (clen > len) {
 1099                         clen = len;
 1100                 }
 1101 #if FALCON_LE  // yyyLE+1
 1102                 for (u = 0; u < clen; u ++) {
 1103                         sc->st.dbuf[dptr + u] ^= in[u];
 1104                 }
 1105 #else  // yyyLE+0
 1106                 for (u = 0; u < clen; u ++) {
 1107                         size_t v;
 1108 
 1109                         v = u + dptr;
 1110                         sc->st.A[v >> 3] ^= (uint64_t)in[u] << ((v & 7) << 3);
 1111                 }
 1112 #endif  // yyyLE-
 1113                 dptr += clen;
 1114                 in += clen;
 1115                 len -= clen;
 1116                 if (dptr == 136) {
 1117                         process_block(sc->st.A);
 1118                         dptr = 0;
 1119                 }
 1120         }
 1121         sc->dptr = dptr;
 1122 }
 1123 
 1124 /* see falcon.h */
 1125 void
 1126 Zf(i_shake256_flip)(inner_shake256_context *sc)
 1127 {
 1128         /*
 1129          * We apply padding and pre-XOR the value into the state. We
 1130          * set dptr to the end of the buffer, so that first call to
 1131          * shake_extract() will process the block.
 1132          */
 1133 #if FALCON_LE  // yyyLE+1
 1134         sc->st.dbuf[sc->dptr] ^= 0x1F;
 1135         sc->st.dbuf[135] ^= 0x80;
 1136 #else  // yyyLE+0
 1137         unsigned v;
 1138 
 1139         v = sc->dptr;
 1140         sc->st.A[v >> 3] ^= (uint64_t)0x1F << ((v & 7) << 3);
 1141         sc->st.A[16] ^= (uint64_t)0x80 << 56;
 1142 #endif  // yyyLE-
 1143         sc->dptr = 136;
 1144 }
 1145 
 1146 /* see falcon.h */
 1147 void
 1148 Zf(i_shake256_extract)(inner_shake256_context *sc, uint8_t *out, size_t len)
 1149 {
 1150         size_t dptr;
 1151 
 1152         dptr = (size_t)sc->dptr;
 1153         while (len > 0) {
 1154                 size_t clen;
 1155 
 1156                 if (dptr == 136) {
 1157                         process_block(sc->st.A);
 1158                         dptr = 0;
 1159                 }
 1160                 clen = 136 - dptr;
 1161                 if (clen > len) {
 1162                         clen = len;
 1163                 }
 1164                 len -= clen;
 1165 #if FALCON_LE  // yyyLE+1
 1166                 memcpy(out, sc->st.dbuf + dptr, clen);
 1167                 dptr += clen;
 1168                 out += clen;
 1169 #else  // yyyLE+0
 1170                 while (clen -- > 0) {
 1171                         *out ++ = sc->st.A[dptr >> 3] >> ((dptr & 7) << 3);
 1172                         dptr ++;
 1173                 }
 1174 #endif  // yyyLE-
 1175         }
 1176         sc->dptr = dptr;
 1177 }