shake.c
1 /*
2 * SHAKE implementation.
3 *
4 * ==========================(LICENSE BEGIN)============================
5 *
6 * Copyright (c) 2017-2019 Falcon Project
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining
9 * a copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be
17 * included in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * ===========================(LICENSE END)=============================
28 *
29 * @author Thomas Pornin <thomas.pornin@nccgroup.com>
30 */
31
32 #include <string.h>
33
34 #include "inner.h"
35
36 #if FALCON_ASM_CORTEXM4 // yyyASM_CORTEXM4+1
37
/*
 * Hand-written ARM (Cortex-M4) assembly version of process_block(),
 * compiled when FALCON_ASM_CORTEXM4 is non-zero. It runs 24 rounds over
 * the 25 64-bit state words and writes the result back in place; the
 * assembly addresses the state through r0 (A[i] is at byte offset 8*i).
 *
 * The function is declared 'naked', so the assembly does its own
 * register saving (r1-r8, r10-r12 and lr are pushed on entry and popped
 * on exit, with the saved lr popped into pc to return) and reserves its
 * own 232-byte stack frame:
 *   sp+0        round counter, multiplied by 8 (0 to 184)
 *   sp+4..27    saved t2, t3 and t4 values (sp+4 is also used as a
 *               short-lived spill slot while t0..t4 are computed)
 *   sp+32..231  200-byte buffer receiving the XORed-and-rotated words
 * r9 is never used by this code.
 *
 * The parameter is marked 'unused' because the C body never names it;
 * only the assembly reads it.
 */
38 __attribute__((naked))
39 static void
40 process_block(uint64_t *A __attribute__((unused)))
41 {
42 __asm__ (
43 "push { r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, r12, lr }\n\t"
44 "sub sp, sp, #232\n\t"
45 "\n\t"
46 "@ Invert some words (alternate internal representation, which\n\t"
47 "@ saves some operations).\n\t"
48 "\n\t"
49
/*
 * INVERT_WORDS: complement (bitwise NOT) the state words A[1], A[2],
 * A[8], A[12], A[17] and A[20] in place, using r1-r5 as scratch. It is
 * expanded once before the round loop and once after it, so the
 * caller-visible state is in normal representation on entry and exit.
 */
50 #define INVERT_WORDS \
51 "@ Invert A[1] and A[2].\n\t" \
52 "adds r1, r0, #8\n\t" \
53 "ldm r1, { r2, r3, r4, r5 }\n\t" \
54 "mvns r2, r2\n\t" \
55 "mvns r3, r3\n\t" \
56 "mvns r4, r4\n\t" \
57 "mvns r5, r5\n\t" \
58 "stm r1!, { r2, r3, r4, r5 }\n\t" \
59 "@ Invert A[8]\n\t" \
60 "adds r1, r0, #64\n\t" \
61 "ldm r1, { r2, r3 }\n\t" \
62 "mvns r2, r2\n\t" \
63 "mvns r3, r3\n\t" \
64 "stm r1!, { r2, r3 }\n\t" \
65 "@ Invert A[12]\n\t" \
66 "adds r1, r0, #96\n\t" \
67 "ldm r1, { r2, r3 }\n\t" \
68 "mvns r2, r2\n\t" \
69 "mvns r3, r3\n\t" \
70 "stm r1!, { r2, r3 }\n\t" \
71 "@ Invert A[17]\n\t" \
72 "adds r1, r0, #136\n\t" \
73 "ldm r1, { r2, r3 }\n\t" \
74 "mvns r2, r2\n\t" \
75 "mvns r3, r3\n\t" \
76 "stm r1!, { r2, r3 }\n\t" \
77 "@ Invert A[20]\n\t" \
78 "adds r1, r0, #160\n\t" \
79 "ldm r1, { r2, r3 }\n\t" \
80 "mvns r2, r2\n\t" \
81 "mvns r3, r3\n\t" \
82 "stm r1!, { r2, r3 }\n\t" \
83 "\n\t"
84
85 INVERT_WORDS
86
87 "@ Do 24 rounds. Each loop iteration performs one rounds. We\n\t"
88 "@ keep eight times the current round counter in [sp] (i.e.\n\t"
89 "@ a multiple of 8, from 0 to 184).\n\t"
90 "\n\t"
91 "eors r1, r1\n\t"
92 "str r1, [sp, #0]\n\t"
93 ".process_block_loop:\n\t"
94 "\n\t"
95 "@ xor(A[5*i+0]) -> r1:r2\n\t"
96 "@ xor(A[5*i+1]) -> r3:r4\n\t"
97 "@ xor(A[5*i+2]) -> r5:r6\n\t"
98 "@ xor(A[5*i+3]) -> r7:r8\n\t"
99 "@ xor(A[5*i+4]) -> r10:r11\n\t"
100 "ldm r0!, { r1, r2, r3, r4, r5, r6, r7, r8 }\n\t"
101 "adds r0, #8\n\t"
102 "ldm r0!, { r10, r11, r12 }\n\t"
103 "eors r1, r10\n\t"
104 "eors r2, r11\n\t"
105 "eors r3, r12\n\t"
106 "ldm r0!, { r10, r11, r12 }\n\t"
107 "eors r4, r10\n\t"
108 "eors r5, r11\n\t"
109 "eors r6, r12\n\t"
110 "ldm r0!, { r10, r11 }\n\t"
111 "eors r7, r10\n\t"
112 "eors r8, r11\n\t"
113 "adds r0, #8\n\t"
114 "ldm r0!, { r10, r11, r12 }\n\t"
115 "eors r1, r10\n\t"
116 "eors r2, r11\n\t"
117 "eors r3, r12\n\t"
118 "ldm r0!, { r10, r11, r12 }\n\t"
119 "eors r4, r10\n\t"
120 "eors r5, r11\n\t"
121 "eors r6, r12\n\t"
122 "ldm r0!, { r10, r11 }\n\t"
123 "eors r7, r10\n\t"
124 "eors r8, r11\n\t"
125 "adds r0, #8\n\t"
126 "ldm r0!, { r10, r11, r12 }\n\t"
127 "eors r1, r10\n\t"
128 "eors r2, r11\n\t"
129 "eors r3, r12\n\t"
130 "ldm r0!, { r10, r11, r12 }\n\t"
131 "eors r4, r10\n\t"
132 "eors r5, r11\n\t"
133 "eors r6, r12\n\t"
134 "ldm r0!, { r10, r11 }\n\t"
135 "eors r7, r10\n\t"
136 "eors r8, r11\n\t"
137 "adds r0, #8\n\t"
138 "ldm r0!, { r10, r11, r12 }\n\t"
139 "eors r1, r10\n\t"
140 "eors r2, r11\n\t"
141 "eors r3, r12\n\t"
142 "ldm r0!, { r10, r11, r12 }\n\t"
143 "eors r4, r10\n\t"
144 "eors r5, r11\n\t"
145 "eors r6, r12\n\t"
146 "ldm r0!, { r10, r11 }\n\t"
147 "eors r7, r10\n\t"
148 "eors r8, r11\n\t"
149 "ldm r0!, { r10, r11 }\n\t"
150 "subs r0, #200\n\t"
151 "ldr r12, [r0, #32]\n\t"
152 "eors r10, r12\n\t"
153 "ldr r12, [r0, #36]\n\t"
154 "eors r11, r12\n\t"
155 "ldr r12, [r0, #72]\n\t"
156 "eors r10, r12\n\t"
157 "ldr r12, [r0, #76]\n\t"
158 "eors r11, r12\n\t"
159 "ldr r12, [r0, #112]\n\t"
160 "eors r10, r12\n\t"
161 "ldr r12, [r0, #116]\n\t"
162 "eors r11, r12\n\t"
163 "ldr r12, [r0, #152]\n\t"
164 "eors r10, r12\n\t"
165 "ldr r12, [r0, #156]\n\t"
166 "eors r11, r12\n\t"
167 "\n\t"
168 "@ t0 = xor(A[5*i+4]) ^ rotl1(xor(A[5*i+1])) -> r10:r11\n\t"
169 "@ t1 = xor(A[5*i+0]) ^ rotl1(xor(A[5*i+2])) -> r1:r2\n\t"
170 "@ t2 = xor(A[5*i+1]) ^ rotl1(xor(A[5*i+3])) -> r3:r4\n\t"
171 "@ t3 = xor(A[5*i+2]) ^ rotl1(xor(A[5*i+4])) -> r5:r6\n\t"
172 "@ t4 = xor(A[5*i+3]) ^ rotl1(xor(A[5*i+0])) -> r7:r8\n\t"
/*
 * The unmodified xor(A[5*i+4]) is still needed for t3 after r10:r11 has
 * been overwritten with t0, so its two halves are kept aside in r12 and
 * at [sp, #4].
 */
173 "str r11, [sp, #4]\n\t"
174 "mov r12, r10\n\t"
175 "eors r10, r10, r3, lsl #1\n\t"
176 "eors r10, r10, r4, lsr #31\n\t"
177 "eors r11, r11, r4, lsl #1\n\t"
178 "eors r11, r11, r3, lsr #31\n\t"
179 "eors r3, r3, r7, lsl #1\n\t"
180 "eors r3, r3, r8, lsr #31\n\t"
181 "eors r4, r4, r8, lsl #1\n\t"
182 "eors r4, r4, r7, lsr #31\n\t"
183 "eors r7, r7, r1, lsl #1\n\t"
184 "eors r7, r7, r2, lsr #31\n\t"
185 "eors r8, r8, r2, lsl #1\n\t"
186 "eors r8, r8, r1, lsr #31\n\t"
187 "eors r1, r1, r5, lsl #1\n\t"
188 "eors r1, r1, r6, lsr #31\n\t"
189 "eors r2, r2, r6, lsl #1\n\t"
190 "eors r2, r2, r5, lsr #31\n\t"
191 "eors r5, r5, r12, lsl #1\n\t"
192 "eors r6, r6, r12, lsr #31\n\t"
193 "ldr r12, [sp, #4]\n\t"
194 "eors r5, r5, r12, lsr #31\n\t"
195 "eors r6, r6, r12, lsl #1\n\t"
196 "\n\t"
197 "@ Save t2, t3 and t4 on the stack.\n\t"
198 "addw r12, sp, #4\n\t"
199 "stm r12, { r3, r4, r5, r6, r7, r8 }\n\t"
200 "\n\t"
201 "@ We XOR one of the t0..t4 values into each A[] word, and\n\t"
202 "@ rotate the result by some amount (each word has its own\n\t"
203 "@ amount). The results are written back into a stack buffer\n\t"
204 "@ that starts at sp+32\n\t"
205 "addw r12, sp, #32\n\t"
206 "\n\t"
207 "@ XOR t0 into A[5*i+0] and t1 into A[5*i+1]; each A[i] is also\n\t"
208 "@ rotated left by some amount.\n\t"
209 "\n\t"
/*
 * In the rotation sequences below, a 64-bit rotation by 32 or more is
 * done by swapping the roles of the two 32-bit halves and shifting by
 * the amount minus 32 (e.g. the "#4 / #28" pair on A[5] is a 64-bit
 * rotation by 36).
 */
210 "@ A[0] and A[1]\n\t"
211 "ldm r0!, { r5, r6, r7, r8 }\n\t"
212 "eors r5, r10\n\t"
213 "eors r6, r11\n\t"
214 "eors r3, r7, r1\n\t"
215 "eors r4, r8, r2\n\t"
216 "lsl r7, r3, #1\n\t"
217 "orr r7, r7, r4, lsr #31\n\t"
218 "lsl r8, r4, #1\n\t"
219 "orr r8, r8, r3, lsr #31\n\t"
220 "stm r12!, { r5, r6, r7, r8 }\n\t"
221 "\n\t"
222 "@ A[5] and A[6]\n\t"
223 "adds r0, #24\n\t"
224 "ldm r0!, { r5, r6, r7, r8 }\n\t"
225 "eors r3, r5, r10\n\t"
226 "eors r4, r6, r11\n\t"
227 "lsl r5, r4, #4\n\t"
228 "orr r5, r5, r3, lsr #28\n\t"
229 "lsl r6, r3, #4\n\t"
230 "orr r6, r6, r4, lsr #28\n\t"
231 "eors r3, r7, r1\n\t"
232 "eors r4, r8, r2\n\t"
233 "lsl r7, r4, #12\n\t"
234 "orr r7, r7, r3, lsr #20\n\t"
235 "lsl r8, r3, #12\n\t"
236 "orr r8, r8, r4, lsr #20\n\t"
237 "stm r12!, { r5, r6, r7, r8 }\n\t"
238 "\n\t"
239 "@ A[10] and A[11]\n\t"
240 "adds r0, #24\n\t"
241 "ldm r0!, { r5, r6, r7, r8 }\n\t"
242 "eors r3, r5, r10\n\t"
243 "eors r4, r6, r11\n\t"
244 "lsl r5, r3, #3\n\t"
245 "orr r5, r5, r4, lsr #29\n\t"
246 "lsl r6, r4, #3\n\t"
247 "orr r6, r6, r3, lsr #29\n\t"
248 "eors r3, r7, r1\n\t"
249 "eors r4, r8, r2\n\t"
250 "lsl r7, r3, #10\n\t"
251 "orr r7, r7, r4, lsr #22\n\t"
252 "lsl r8, r4, #10\n\t"
253 "orr r8, r8, r3, lsr #22\n\t"
254 "stm r12!, { r5, r6, r7, r8 }\n\t"
255 "\n\t"
256 "@ A[15] and A[16]\n\t"
257 "adds r0, #24\n\t"
258 "ldm r0!, { r5, r6, r7, r8 }\n\t"
259 "eors r3, r5, r10\n\t"
260 "eors r4, r6, r11\n\t"
261 "lsl r5, r4, #9\n\t"
262 "orr r5, r5, r3, lsr #23\n\t"
263 "lsl r6, r3, #9\n\t"
264 "orr r6, r6, r4, lsr #23\n\t"
265 "eors r3, r7, r1\n\t"
266 "eors r4, r8, r2\n\t"
267 "lsl r7, r4, #13\n\t"
268 "orr r7, r7, r3, lsr #19\n\t"
269 "lsl r8, r3, #13\n\t"
270 "orr r8, r8, r4, lsr #19\n\t"
271 "stm r12!, { r5, r6, r7, r8 }\n\t"
272 "\n\t"
273 "@ A[20] and A[21]\n\t"
274 "adds r0, #24\n\t"
275 "ldm r0!, { r5, r6, r7, r8 }\n\t"
276 "eors r3, r5, r10\n\t"
277 "eors r4, r6, r11\n\t"
278 "lsl r5, r3, #18\n\t"
279 "orr r5, r5, r4, lsr #14\n\t"
280 "lsl r6, r4, #18\n\t"
281 "orr r6, r6, r3, lsr #14\n\t"
282 "eors r3, r7, r1\n\t"
283 "eors r4, r8, r2\n\t"
284 "lsl r7, r3, #2\n\t"
285 "orr r7, r7, r4, lsr #30\n\t"
286 "lsl r8, r4, #2\n\t"
287 "orr r8, r8, r3, lsr #30\n\t"
288 "stm r12!, { r5, r6, r7, r8 }\n\t"
289 "\n\t"
290 "@ XOR t2 into A[5*i+2] and t3 into A[5*i+3]; each A[i] is also\n\t"
291 "@ rotated left by some amount. We reload t2 into r1:r2 and t3\n\t"
292 "@ into r3:r4.\n\t"
293 "addw r5, sp, #4\n\t"
294 "ldm r5!, { r1, r2, r3, r4 }\n\t"
295 "\n\t"
296 "@ A[2] and A[3]\n\t"
297 "subs r0, #160\n\t"
298 "ldm r0!, { r5, r6, r7, r8 }\n\t"
299 "eors r10, r5, r1\n\t"
300 "eors r11, r6, r2\n\t"
301 "lsl r5, r11, #30\n\t"
302 "orr r5, r5, r10, lsr #2\n\t"
303 "lsl r6, r10, #30\n\t"
304 "orr r6, r6, r11, lsr #2\n\t"
305 "eors r10, r7, r3\n\t"
306 "eors r11, r8, r4\n\t"
307 "lsl r7, r10, #28\n\t"
308 "orr r7, r7, r11, lsr #4\n\t"
309 "lsl r8, r11, #28\n\t"
310 "orr r8, r8, r10, lsr #4\n\t"
311 "stm r12!, { r5, r6, r7, r8 }\n\t"
312 "\n\t"
313 "@ A[7] and A[8]\n\t"
314 "adds r0, #24\n\t"
315 "ldm r0!, { r5, r6, r7, r8 }\n\t"
316 "eors r10, r5, r1\n\t"
317 "eors r11, r6, r2\n\t"
318 "lsl r5, r10, #6\n\t"
319 "orr r5, r5, r11, lsr #26\n\t"
320 "lsl r6, r11, #6\n\t"
321 "orr r6, r6, r10, lsr #26\n\t"
322 "eors r10, r7, r3\n\t"
323 "eors r11, r8, r4\n\t"
324 "lsl r7, r11, #23\n\t"
325 "orr r7, r7, r10, lsr #9\n\t"
326 "lsl r8, r10, #23\n\t"
327 "orr r8, r8, r11, lsr #9\n\t"
328 "stm r12!, { r5, r6, r7, r8 }\n\t"
329 "\n\t"
330 "@ A[12] and A[13]\n\t"
331 "adds r0, #24\n\t"
332 "ldm r0!, { r5, r6, r7, r8 }\n\t"
333 "eors r10, r5, r1\n\t"
334 "eors r11, r6, r2\n\t"
335 "lsl r5, r11, #11\n\t"
336 "orr r5, r5, r10, lsr #21\n\t"
337 "lsl r6, r10, #11\n\t"
338 "orr r6, r6, r11, lsr #21\n\t"
339 "eors r10, r7, r3\n\t"
340 "eors r11, r8, r4\n\t"
341 "lsl r7, r10, #25\n\t"
342 "orr r7, r7, r11, lsr #7\n\t"
343 "lsl r8, r11, #25\n\t"
344 "orr r8, r8, r10, lsr #7\n\t"
345 "stm r12!, { r5, r6, r7, r8 }\n\t"
346 "\n\t"
347 "@ A[17] and A[18]\n\t"
348 "adds r0, #24\n\t"
349 "ldm r0!, { r5, r6, r7, r8 }\n\t"
350 "eors r10, r5, r1\n\t"
351 "eors r11, r6, r2\n\t"
352 "lsl r5, r10, #15\n\t"
353 "orr r5, r5, r11, lsr #17\n\t"
354 "lsl r6, r11, #15\n\t"
355 "orr r6, r6, r10, lsr #17\n\t"
356 "eors r10, r7, r3\n\t"
357 "eors r11, r8, r4\n\t"
358 "lsl r7, r10, #21\n\t"
359 "orr r7, r7, r11, lsr #11\n\t"
360 "lsl r8, r11, #21\n\t"
361 "orr r8, r8, r10, lsr #11\n\t"
362 "stm r12!, { r5, r6, r7, r8 }\n\t"
363 "\n\t"
364 "@ A[22] and A[23]\n\t"
365 "adds r0, #24\n\t"
366 "ldm r0!, { r5, r6, r7, r8 }\n\t"
367 "eors r10, r5, r1\n\t"
368 "eors r11, r6, r2\n\t"
369 "lsl r5, r11, #29\n\t"
370 "orr r5, r5, r10, lsr #3\n\t"
371 "lsl r6, r10, #29\n\t"
372 "orr r6, r6, r11, lsr #3\n\t"
373 "eors r10, r7, r3\n\t"
374 "eors r11, r8, r4\n\t"
375 "lsl r7, r11, #24\n\t"
376 "orr r7, r7, r10, lsr #8\n\t"
377 "lsl r8, r10, #24\n\t"
378 "orr r8, r8, r11, lsr #8\n\t"
379 "stm r12!, { r5, r6, r7, r8 }\n\t"
380 "\n\t"
381 "@ XOR t4 into A[5*i+4]; each A[i] is also rotated left by some\n\t"
382 "@ amount. We reload t4 into r1:r2.\n\t"
383 "ldr r1, [sp, #20]\n\t"
384 "ldr r2, [sp, #24]\n\t"
385 "\n\t"
386 "@ A[4]\n\t"
387 "subs r0, #160\n\t"
388 "ldm r0!, { r5, r6 }\n\t"
389 "eors r3, r5, r1\n\t"
390 "eors r4, r6, r2\n\t"
391 "lsl r5, r3, #27\n\t"
392 "orr r5, r5, r4, lsr #5\n\t"
393 "lsl r6, r4, #27\n\t"
394 "orr r6, r6, r3, lsr #5\n\t"
395 "stm r12!, { r5, r6 }\n\t"
396 "\n\t"
397 "@ A[9]\n\t"
398 "adds r0, #32\n\t"
399 "ldm r0!, { r5, r6 }\n\t"
400 "eors r3, r5, r1\n\t"
401 "eors r4, r6, r2\n\t"
402 "lsl r5, r3, #20\n\t"
403 "orr r5, r5, r4, lsr #12\n\t"
404 "lsl r6, r4, #20\n\t"
405 "orr r6, r6, r3, lsr #12\n\t"
406 "stm r12!, { r5, r6 }\n\t"
407 "\n\t"
408 "@ A[14]\n\t"
409 "adds r0, #32\n\t"
410 "ldm r0!, { r5, r6 }\n\t"
411 "eors r3, r5, r1\n\t"
412 "eors r4, r6, r2\n\t"
413 "lsl r5, r4, #7\n\t"
414 "orr r5, r5, r3, lsr #25\n\t"
415 "lsl r6, r3, #7\n\t"
416 "orr r6, r6, r4, lsr #25\n\t"
417 "stm r12!, { r5, r6 }\n\t"
418 "\n\t"
419 "@ A[19]\n\t"
420 "adds r0, #32\n\t"
421 "ldm r0!, { r5, r6 }\n\t"
422 "eors r3, r5, r1\n\t"
423 "eors r4, r6, r2\n\t"
424 "lsl r5, r3, #8\n\t"
425 "orr r5, r5, r4, lsr #24\n\t"
426 "lsl r6, r4, #8\n\t"
427 "orr r6, r6, r3, lsr #24\n\t"
428 "stm r12!, { r5, r6 }\n\t"
429 "\n\t"
430 "@ A[24]\n\t"
431 "adds r0, #32\n\t"
432 "ldm r0!, { r5, r6 }\n\t"
433 "eors r3, r5, r1\n\t"
434 "eors r4, r6, r2\n\t"
435 "lsl r5, r3, #14\n\t"
436 "orr r5, r5, r4, lsr #18\n\t"
437 "lsl r6, r4, #14\n\t"
438 "orr r6, r6, r3, lsr #18\n\t"
439 "stm r12!, { r5, r6 }\n\t"
440 "\n\t"
441 "subs r0, #200\n\t"
442 "\n\t"
443 "@ At that point, the stack buffer at sp+32 contains the words\n\t"
444 "@ at the following indexes (0 to 24) and offsets (from sp)\n\t"
445 "@ A[ 0] 0 32\n\t"
446 "@ A[ 1] 1 40\n\t"
447 "@ A[ 2] 10 112\n\t"
448 "@ A[ 3] 11 120\n\t"
449 "@ A[ 4] 20 192\n\t"
450 "@ A[ 5] 2 48\n\t"
451 "@ A[ 6] 3 56\n\t"
452 "@ A[ 7] 12 128\n\t"
453 "@ A[ 8] 13 136\n\t"
454 "@ A[ 9] 21 200\n\t"
455 "@ A[10] 4 64\n\t"
456 "@ A[11] 5 72\n\t"
457 "@ A[12] 14 144\n\t"
458 "@ A[13] 15 152\n\t"
459 "@ A[14] 22 208\n\t"
460 "@ A[15] 6 80\n\t"
461 "@ A[16] 7 88\n\t"
462 "@ A[17] 16 160\n\t"
463 "@ A[18] 17 168\n\t"
464 "@ A[19] 23 216\n\t"
465 "@ A[20] 8 96\n\t"
466 "@ A[21] 9 104\n\t"
467 "@ A[22] 18 176\n\t"
468 "@ A[23] 19 184\n\t"
469 "@ A[24] 24 224\n\t"
470
/*
 * KHI_LOAD(s0, s1, s2, s3, s4): load five 64-bit words from the stack
 * buffer (slot s lives at sp + 32 + 8*s, see the table above) into the
 * register pairs r1:r2, r3:r4, r5:r6, r7:r8 and r10:r11, in that order.
 */
471 #define KHI_LOAD(s0, s1, s2, s3, s4) \
472 "ldr r1, [sp, #(32 + 8 * " #s0 ")]\n\t" \
473 "ldr r2, [sp, #(36 + 8 * " #s0 ")]\n\t" \
474 "ldr r3, [sp, #(32 + 8 * " #s1 ")]\n\t" \
475 "ldr r4, [sp, #(36 + 8 * " #s1 ")]\n\t" \
476 "ldr r5, [sp, #(32 + 8 * " #s2 ")]\n\t" \
477 "ldr r6, [sp, #(36 + 8 * " #s2 ")]\n\t" \
478 "ldr r7, [sp, #(32 + 8 * " #s3 ")]\n\t" \
479 "ldr r8, [sp, #(36 + 8 * " #s3 ")]\n\t" \
480 "ldr r10, [sp, #(32 + 8 * " #s4 ")]\n\t" \
481 "ldr r11, [sp, #(36 + 8 * " #s4 ")]\n\t"
482
/*
 * KHI_STEP(op, x0, x1, x2, x3, x4, x5, d): compute (x0 op x2) ^ x4 and
 * (x1 op x3) ^ x5, with r12 as scratch, and store the two 32-bit
 * results (lower address first) into the state word at offset 8*d from
 * r0. Where an input register must be complemented, this is done with
 * orns/bics or with an explicit mvns between two steps, so the order
 * of the steps within a group matters.
 */
483 #define KHI_STEP(op, x0, x1, x2, x3, x4, x5, d) \
484 #op " r12, " #x0 ", " #x2 "\n\t" \
485 "eors r12, " #x4 "\n\t" \
486 "str r12, [r0, #(8 * " #d ")]\n\t" \
487 #op " r12, " #x1 ", " #x3 "\n\t" \
488 "eors r12, " #x5 "\n\t" \
489 "str r12, [r0, #(4 + 8 * " #d ")]\n\t"
490
491 "@ A[0], A[6], A[12], A[18] and A[24]\n\t"
492 KHI_LOAD(0, 3, 14, 17, 24)
493 KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 0)
494 KHI_STEP(orns, r7, r8, r5, r6, r3, r4, 1)
495 KHI_STEP(ands, r7, r8, r10, r11, r5, r6, 2)
496 KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 3)
497 KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 4)
498 "\n\t"
499
500 "@ A[3], A[9], A[10], A[16] and A[22]\n\t"
501 KHI_LOAD(11, 21, 4, 7, 18)
502 KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 5)
503 KHI_STEP(ands, r7, r8, r5, r6, r3, r4, 6)
504 KHI_STEP(orns, r7, r8, r10, r11, r5, r6, 7)
505 KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 8)
506 KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 9)
507 "\n\t"
508
509 "@ A[1], A[7], A[13], A[19] and A[20]\n\t"
510 KHI_LOAD(1, 12, 15, 23, 8)
511 KHI_STEP(orrs, r3, r4, r5, r6, r1, r2, 10)
512 KHI_STEP(ands, r7, r8, r5, r6, r3, r4, 11)
513 KHI_STEP(bics, r10, r11, r7, r8, r5, r6, 12)
514 "mvns r7, r7\n\t"
515 "mvns r8, r8\n\t"
516 KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 13)
517 KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 14)
518 "\n\t"
519
520 "@ A[4], A[5], A[11], A[17] and A[23]\n\t"
521 KHI_LOAD(20, 2, 5, 16, 19)
522 KHI_STEP(ands, r3, r4, r5, r6, r1, r2, 15)
523 KHI_STEP(orrs, r7, r8, r5, r6, r3, r4, 16)
524 KHI_STEP(orns, r10, r11, r7, r8, r5, r6, 17)
525 "mvns r7, r7\n\t"
526 "mvns r8, r8\n\t"
527 KHI_STEP(ands, r1, r2, r10, r11, r7, r8, 18)
528 KHI_STEP(orrs, r1, r2, r3, r4, r10, r11, 19)
529 "\n\t"
530
531 "@ A[2], A[8], A[14], A[15] and A[21]\n\t"
532 KHI_LOAD(10, 13, 22, 6, 9)
533 KHI_STEP(bics, r5, r6, r3, r4, r1, r2, 20)
534 KHI_STEP(ands, r1, r2, r3, r4, r10, r11, 24)
535 "mvns r3, r3\n\t"
536 "mvns r4, r4\n\t"
537 KHI_STEP(orrs, r7, r8, r5, r6, r3, r4, 21)
538 KHI_STEP(ands, r7, r8, r10, r11, r5, r6, 22)
539 KHI_STEP(orrs, r1, r2, r10, r11, r7, r8, 23)
540 "\n\t"
541
542 "@ Get round counter XOR round constant into A[0]\n\t"
543 "ldr r1, [sp, #0]\n\t"
544 "adr r2, .process_block_RC\n\t"
545 "adds r2, r1\n\t"
546 "ldm r2, { r3, r4 }\n\t"
547 "ldm r0, { r5, r6 }\n\t"
548 "eors r5, r3\n\t"
549 "eors r6, r4\n\t"
550 "stm r0, { r5, r6 }\n\t"
551 "\n\t"
552 "@ Increment round counter, loop until all 24 rounds are done.\n\t"
553 "\n\t"
554 "adds r1, #8\n\t"
555 "str r1, [sp, #0]\n\t"
556 "cmp r1, #192\n\t"
557 "blo .process_block_loop\n\t"
558
559 INVERT_WORDS
560
561 "add sp, sp, #232\n\t"
562 "pop { r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, r12, pc }\n\t"
563 "\n\t"
/*
 * Round constant table: 24 entries of two 32-bit words each (low word
 * first), placed right after the code and addressed with 'adr' plus
 * the byte offset kept in [sp]. The values are the same as in the RC[]
 * array of the portable C implementation.
 */
564 ".process_block_RC:\n\t"
565 ".word 0x00000001\n\t"
566 ".word 0x00000000\n\t"
567 ".word 0x00008082\n\t"
568 ".word 0x00000000\n\t"
569 ".word 0x0000808A\n\t"
570 ".word 0x80000000\n\t"
571 ".word 0x80008000\n\t"
572 ".word 0x80000000\n\t"
573 ".word 0x0000808B\n\t"
574 ".word 0x00000000\n\t"
575 ".word 0x80000001\n\t"
576 ".word 0x00000000\n\t"
577 ".word 0x80008081\n\t"
578 ".word 0x80000000\n\t"
579 ".word 0x00008009\n\t"
580 ".word 0x80000000\n\t"
581 ".word 0x0000008A\n\t"
582 ".word 0x00000000\n\t"
583 ".word 0x00000088\n\t"
584 ".word 0x00000000\n\t"
585 ".word 0x80008009\n\t"
586 ".word 0x00000000\n\t"
587 ".word 0x8000000A\n\t"
588 ".word 0x00000000\n\t"
589 ".word 0x8000808B\n\t"
590 ".word 0x00000000\n\t"
591 ".word 0x0000008B\n\t"
592 ".word 0x80000000\n\t"
593 ".word 0x00008089\n\t"
594 ".word 0x80000000\n\t"
595 ".word 0x00008003\n\t"
596 ".word 0x80000000\n\t"
597 ".word 0x00008002\n\t"
598 ".word 0x80000000\n\t"
599 ".word 0x00000080\n\t"
600 ".word 0x80000000\n\t"
601 ".word 0x0000800A\n\t"
602 ".word 0x00000000\n\t"
603 ".word 0x8000000A\n\t"
604 ".word 0x80000000\n\t"
605 ".word 0x80008081\n\t"
606 ".word 0x80000000\n\t"
607 ".word 0x00008080\n\t"
608 ".word 0x80000000\n\t"
609 ".word 0x80000001\n\t"
610 ".word 0x00000000\n\t"
611 ".word 0x80008008\n\t"
612 ".word 0x80000000\n\t"
613
614 #undef INVERT_WORDS
615 #undef KHI_LOAD
616 #undef KHI_STEP
617
618 );
619 }
620
621 #else // yyyASM_CORTEXM4+0
622
/*
 * Round constants: RC[j] is XORed into A[0] at the end of round j
 * (24 rounds in total).
 */
static const uint64_t RC[] = {
	0x0000000000000001, 0x0000000000008082,
	0x800000000000808A, 0x8000000080008000,
	0x000000000000808B, 0x0000000080000001,
	0x8000000080008081, 0x8000000000008009,
	0x000000000000008A, 0x0000000000000088,
	0x0000000080008009, 0x000000008000000A,
	0x000000008000808B, 0x800000000000008B,
	0x8000000000008089, 0x8000000000008003,
	0x8000000000008002, 0x8000000000000080,
	0x000000000000800A, 0x800000008000000A,
	0x8000000080008081, 0x8000000000008080,
	0x0000000080000001, 0x8000000080008008
};

/*
 * Rotate a 64-bit word left by n bits. Only called with constant
 * amounts in the 1..63 range, so neither shift count reaches 64.
 */
static inline uint64_t
shake_rotl64(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64 - n));
}

/*
 * Apply the 24-round permutation, in place, to the 25-word state A[].
 *
 * Six words (A[1], A[2], A[8], A[12], A[17] and A[20]) are kept in
 * complemented form for the duration of the rounds, which removes most
 * of the NOT operations from the non-linear step; they are complemented
 * on entry and again on exit, so the caller only ever sees the normal
 * representation.
 *
 * Each loop iteration performs two rounds. The first round leaves the
 * words at permuted positions; the second round is written for that
 * layout, and the lane shuffle at the end of the iteration puts every
 * word back at its nominal index.
 */
static void
process_block(uint64_t *A)
{
	uint64_t p0, p1, p2, p3, p4;	/* column parities */
	uint64_t d0, d1, d2, d3, d4;	/* per-column XOR masks */
	uint64_t n0, n1, n2, n3, n4;	/* new values for one row */
	uint64_t inv, save;
	int r;

	/* Switch to the lane-complemented representation. */
	A[ 1] = ~A[ 1];
	A[ 2] = ~A[ 2];
	A[ 8] = ~A[ 8];
	A[12] = ~A[12];
	A[17] = ~A[17];
	A[20] = ~A[20];

	for (r = 0; r < 24; r += 2) {

		/*
		 * ---- First round of the pair (nominal layout) ----
		 */

		/* Column parities, then the five masks. */
		p0 = A[ 0] ^ A[ 5] ^ A[10] ^ A[15] ^ A[20];
		p1 = A[ 1] ^ A[ 6] ^ A[11] ^ A[16] ^ A[21];
		p2 = A[ 2] ^ A[ 7] ^ A[12] ^ A[17] ^ A[22];
		p3 = A[ 3] ^ A[ 8] ^ A[13] ^ A[18] ^ A[23];
		p4 = A[ 4] ^ A[ 9] ^ A[14] ^ A[19] ^ A[24];
		d0 = p4 ^ shake_rotl64(p1, 1);
		d1 = p0 ^ shake_rotl64(p2, 1);
		d2 = p1 ^ shake_rotl64(p3, 1);
		d3 = p2 ^ shake_rotl64(p4, 1);
		d4 = p3 ^ shake_rotl64(p0, 1);

		/* XOR the mask into each word and rotate it (A[0] is not rotated). */
		A[ 0] ^= d0;
		A[ 5] = shake_rotl64(A[ 5] ^ d0, 36);
		A[10] = shake_rotl64(A[10] ^ d0,  3);
		A[15] = shake_rotl64(A[15] ^ d0, 41);
		A[20] = shake_rotl64(A[20] ^ d0, 18);
		A[ 1] = shake_rotl64(A[ 1] ^ d1,  1);
		A[ 6] = shake_rotl64(A[ 6] ^ d1, 44);
		A[11] = shake_rotl64(A[11] ^ d1, 10);
		A[16] = shake_rotl64(A[16] ^ d1, 45);
		A[21] = shake_rotl64(A[21] ^ d1,  2);
		A[ 2] = shake_rotl64(A[ 2] ^ d2, 62);
		A[ 7] = shake_rotl64(A[ 7] ^ d2,  6);
		A[12] = shake_rotl64(A[12] ^ d2, 43);
		A[17] = shake_rotl64(A[17] ^ d2, 15);
		A[22] = shake_rotl64(A[22] ^ d2, 61);
		A[ 3] = shake_rotl64(A[ 3] ^ d3, 28);
		A[ 8] = shake_rotl64(A[ 8] ^ d3, 55);
		A[13] = shake_rotl64(A[13] ^ d3, 25);
		A[18] = shake_rotl64(A[18] ^ d3, 21);
		A[23] = shake_rotl64(A[23] ^ d3, 56);
		A[ 4] = shake_rotl64(A[ 4] ^ d4, 27);
		A[ 9] = shake_rotl64(A[ 9] ^ d4, 20);
		A[14] = shake_rotl64(A[14] ^ d4, 39);
		A[19] = shake_rotl64(A[19] ^ d4,  8);
		A[24] = shake_rotl64(A[24] ^ d4, 14);

		/*
		 * Non-linear step, one group of five words at a time. All
		 * five new values are computed before any is stored, since
		 * each one depends on the old values of its neighbours.
		 */

		/* Words 0, 6, 12, 18, 24. */
		inv = ~A[12];
		n0 = A[ 0] ^ (A[ 6] | A[12]);
		n1 = A[ 6] ^ (inv | A[18]);
		n2 = A[12] ^ (A[18] & A[24]);
		n3 = A[18] ^ (A[24] | A[ 0]);
		n4 = A[24] ^ (A[ 0] & A[ 6]);
		A[ 0] = n0;
		A[ 6] = n1;
		A[12] = n2;
		A[18] = n3;
		A[24] = n4;

		/* Words 3, 9, 10, 16, 22. */
		inv = ~A[22];
		n0 = A[ 3] ^ (A[ 9] | A[10]);
		n1 = A[ 9] ^ (A[10] & A[16]);
		n2 = A[10] ^ (A[16] | inv);
		n3 = A[16] ^ (A[22] | A[ 3]);
		n4 = A[22] ^ (A[ 3] & A[ 9]);
		A[ 3] = n0;
		A[ 9] = n1;
		A[10] = n2;
		A[16] = n3;
		A[22] = n4;

		/* Words 1, 7, 13, 19, 20. */
		inv = ~A[19];
		n0 = A[ 1] ^ (A[ 7] | A[13]);
		n1 = A[ 7] ^ (A[13] & A[19]);
		n2 = A[13] ^ (inv & A[20]);
		n3 = inv ^ (A[20] | A[ 1]);
		n4 = A[20] ^ (A[ 1] & A[ 7]);
		A[ 1] = n0;
		A[ 7] = n1;
		A[13] = n2;
		A[19] = n3;
		A[20] = n4;

		/* Words 4, 5, 11, 17, 23. */
		inv = ~A[17];
		n0 = A[ 4] ^ (A[ 5] & A[11]);
		n1 = A[ 5] ^ (A[11] | A[17]);
		n2 = A[11] ^ (inv | A[23]);
		n3 = inv ^ (A[23] & A[ 4]);
		n4 = A[23] ^ (A[ 4] | A[ 5]);
		A[ 4] = n0;
		A[ 5] = n1;
		A[11] = n2;
		A[17] = n3;
		A[23] = n4;

		/* Words 2, 8, 14, 15, 21. */
		inv = ~A[ 8];
		n0 = A[ 2] ^ (inv & A[14]);
		n1 = inv ^ (A[14] | A[15]);
		n2 = A[14] ^ (A[15] & A[21]);
		n3 = A[15] ^ (A[21] | A[ 2]);
		n4 = A[21] ^ (A[ 2] & A[ 8]);
		A[ 2] = n0;
		A[ 8] = n1;
		A[14] = n2;
		A[15] = n3;
		A[21] = n4;

		A[ 0] ^= RC[r + 0];

		/*
		 * ---- Second round of the pair (permuted layout) ----
		 */

		p0 = A[ 0] ^ A[ 3] ^ A[ 1] ^ A[ 4] ^ A[ 2];
		p1 = A[ 6] ^ A[ 9] ^ A[ 7] ^ A[ 5] ^ A[ 8];
		p2 = A[12] ^ A[10] ^ A[13] ^ A[11] ^ A[14];
		p3 = A[18] ^ A[16] ^ A[19] ^ A[17] ^ A[15];
		p4 = A[24] ^ A[22] ^ A[20] ^ A[23] ^ A[21];
		d0 = p4 ^ shake_rotl64(p1, 1);
		d1 = p0 ^ shake_rotl64(p2, 1);
		d2 = p1 ^ shake_rotl64(p3, 1);
		d3 = p2 ^ shake_rotl64(p4, 1);
		d4 = p3 ^ shake_rotl64(p0, 1);

		A[ 0] ^= d0;
		A[ 3] = shake_rotl64(A[ 3] ^ d0, 36);
		A[ 1] = shake_rotl64(A[ 1] ^ d0,  3);
		A[ 4] = shake_rotl64(A[ 4] ^ d0, 41);
		A[ 2] = shake_rotl64(A[ 2] ^ d0, 18);
		A[ 6] = shake_rotl64(A[ 6] ^ d1,  1);
		A[ 9] = shake_rotl64(A[ 9] ^ d1, 44);
		A[ 7] = shake_rotl64(A[ 7] ^ d1, 10);
		A[ 5] = shake_rotl64(A[ 5] ^ d1, 45);
		A[ 8] = shake_rotl64(A[ 8] ^ d1,  2);
		A[12] = shake_rotl64(A[12] ^ d2, 62);
		A[10] = shake_rotl64(A[10] ^ d2,  6);
		A[13] = shake_rotl64(A[13] ^ d2, 43);
		A[11] = shake_rotl64(A[11] ^ d2, 15);
		A[14] = shake_rotl64(A[14] ^ d2, 61);
		A[18] = shake_rotl64(A[18] ^ d3, 28);
		A[16] = shake_rotl64(A[16] ^ d3, 55);
		A[19] = shake_rotl64(A[19] ^ d3, 25);
		A[17] = shake_rotl64(A[17] ^ d3, 21);
		A[15] = shake_rotl64(A[15] ^ d3, 56);
		A[24] = shake_rotl64(A[24] ^ d4, 27);
		A[22] = shake_rotl64(A[22] ^ d4, 20);
		A[20] = shake_rotl64(A[20] ^ d4, 39);
		A[23] = shake_rotl64(A[23] ^ d4,  8);
		A[21] = shake_rotl64(A[21] ^ d4, 14);

		/* Words 0, 9, 13, 17, 21. */
		inv = ~A[13];
		n0 = A[ 0] ^ (A[ 9] | A[13]);
		n1 = A[ 9] ^ (inv | A[17]);
		n2 = A[13] ^ (A[17] & A[21]);
		n3 = A[17] ^ (A[21] | A[ 0]);
		n4 = A[21] ^ (A[ 0] & A[ 9]);
		A[ 0] = n0;
		A[ 9] = n1;
		A[13] = n2;
		A[17] = n3;
		A[21] = n4;

		/* Words 18, 22, 1, 5, 14. */
		inv = ~A[14];
		n0 = A[18] ^ (A[22] | A[ 1]);
		n1 = A[22] ^ (A[ 1] & A[ 5]);
		n2 = A[ 1] ^ (A[ 5] | inv);
		n3 = A[ 5] ^ (A[14] | A[18]);
		n4 = A[14] ^ (A[18] & A[22]);
		A[18] = n0;
		A[22] = n1;
		A[ 1] = n2;
		A[ 5] = n3;
		A[14] = n4;

		/* Words 6, 10, 19, 23, 2. */
		inv = ~A[23];
		n0 = A[ 6] ^ (A[10] | A[19]);
		n1 = A[10] ^ (A[19] & A[23]);
		n2 = A[19] ^ (inv & A[ 2]);
		n3 = inv ^ (A[ 2] | A[ 6]);
		n4 = A[ 2] ^ (A[ 6] & A[10]);
		A[ 6] = n0;
		A[10] = n1;
		A[19] = n2;
		A[23] = n3;
		A[ 2] = n4;

		/* Words 24, 3, 7, 11, 15. */
		inv = ~A[11];
		n0 = A[24] ^ (A[ 3] & A[ 7]);
		n1 = A[ 3] ^ (A[ 7] | A[11]);
		n2 = A[ 7] ^ (inv | A[15]);
		n3 = inv ^ (A[15] & A[24]);
		n4 = A[15] ^ (A[24] | A[ 3]);
		A[24] = n0;
		A[ 3] = n1;
		A[ 7] = n2;
		A[11] = n3;
		A[15] = n4;

		/* Words 12, 16, 20, 4, 8. */
		inv = ~A[16];
		n0 = A[12] ^ (inv & A[20]);
		n1 = inv ^ (A[20] | A[ 4]);
		n2 = A[20] ^ (A[ 4] & A[ 8]);
		n3 = A[ 4] ^ (A[ 8] | A[12]);
		n4 = A[ 8] ^ (A[12] & A[16]);
		A[12] = n0;
		A[16] = n1;
		A[20] = n2;
		A[ 4] = n3;
		A[ 8] = n4;

		A[ 0] ^= RC[r + 1];

		/*
		 * Lane shuffle: two 12-element cycles move every word back
		 * to its nominal index (A[0] stays where it is).
		 */
		save = A[ 5];
		A[ 5] = A[18];
		A[18] = A[11];
		A[11] = A[10];
		A[10] = A[ 6];
		A[ 6] = A[22];
		A[22] = A[20];
		A[20] = A[12];
		A[12] = A[19];
		A[19] = A[15];
		A[15] = A[24];
		A[24] = A[ 8];
		A[ 8] = save;

		save = A[ 1];
		A[ 1] = A[ 9];
		A[ 9] = A[14];
		A[14] = A[ 2];
		A[ 2] = A[13];
		A[13] = A[23];
		A[23] = A[ 4];
		A[ 4] = A[21];
		A[21] = A[16];
		A[16] = A[ 3];
		A[ 3] = A[17];
		A[17] = A[ 7];
		A[ 7] = save;
	}

	/* Back to the normal representation. */
	A[ 1] = ~A[ 1];
	A[ 2] = ~A[ 2];
	A[ 8] = ~A[ 8];
	A[12] = ~A[12];
	A[17] = ~A[17];
	A[20] = ~A[20];
}
1071
1072 #endif // yyyASM_CORTEXM4-
1073
1074 /* see inner.h */
1075 void
1076 Zf(i_shake256_init)(inner_shake256_context *sc)
1077 {
1078 sc->dptr = 0;
1079
1080 /*
1081 * Representation of an all-ones uint64_t is the same regardless
1082 * of local endianness.
1083 */
1084 memset(sc->st.A, 0, sizeof sc->st.A);
1085 }
1086
1087 /* see inner.h */
1088 void
1089 Zf(i_shake256_inject)(inner_shake256_context *sc, const uint8_t *in, size_t len)
1090 {
1091 size_t dptr;
1092
1093 dptr = (size_t)sc->dptr;
1094 while (len > 0) {
1095 size_t clen, u;
1096
1097 clen = 136 - dptr;
1098 if (clen > len) {
1099 clen = len;
1100 }
1101 #if FALCON_LE // yyyLE+1
1102 for (u = 0; u < clen; u ++) {
1103 sc->st.dbuf[dptr + u] ^= in[u];
1104 }
1105 #else // yyyLE+0
1106 for (u = 0; u < clen; u ++) {
1107 size_t v;
1108
1109 v = u + dptr;
1110 sc->st.A[v >> 3] ^= (uint64_t)in[u] << ((v & 7) << 3);
1111 }
1112 #endif // yyyLE-
1113 dptr += clen;
1114 in += clen;
1115 len -= clen;
1116 if (dptr == 136) {
1117 process_block(sc->st.A);
1118 dptr = 0;
1119 }
1120 }
1121 sc->dptr = dptr;
1122 }
1123
1124 /* see falcon.h */
1125 void
1126 Zf(i_shake256_flip)(inner_shake256_context *sc)
1127 {
1128 /*
1129 * We apply padding and pre-XOR the value into the state. We
1130 * set dptr to the end of the buffer, so that first call to
1131 * shake_extract() will process the block.
1132 */
1133 #if FALCON_LE // yyyLE+1
1134 sc->st.dbuf[sc->dptr] ^= 0x1F;
1135 sc->st.dbuf[135] ^= 0x80;
1136 #else // yyyLE+0
1137 unsigned v;
1138
1139 v = sc->dptr;
1140 sc->st.A[v >> 3] ^= (uint64_t)0x1F << ((v & 7) << 3);
1141 sc->st.A[16] ^= (uint64_t)0x80 << 56;
1142 #endif // yyyLE-
1143 sc->dptr = 136;
1144 }
1145
1146 /* see falcon.h */
1147 void
1148 Zf(i_shake256_extract)(inner_shake256_context *sc, uint8_t *out, size_t len)
1149 {
1150 size_t dptr;
1151
1152 dptr = (size_t)sc->dptr;
1153 while (len > 0) {
1154 size_t clen;
1155
1156 if (dptr == 136) {
1157 process_block(sc->st.A);
1158 dptr = 0;
1159 }
1160 clen = 136 - dptr;
1161 if (clen > len) {
1162 clen = len;
1163 }
1164 len -= clen;
1165 #if FALCON_LE // yyyLE+1
1166 memcpy(out, sc->st.dbuf + dptr, clen);
1167 dptr += clen;
1168 out += clen;
1169 #else // yyyLE+0
1170 while (clen -- > 0) {
1171 *out ++ = sc->st.A[dptr >> 3] >> ((dptr & 7) << 3);
1172 dptr ++;
1173 }
1174 #endif // yyyLE-
1175 }
1176 sc->dptr = dptr;
1177 }