Skinny-C
skinny128-parallel-vec128.c
/*
 * Copyright (C) 2017 Southern Storm Software, Pty Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "skinny128-parallel.h"
#include "skinny-internal.h"

#if SKINNY_VEC128_MATH

STATIC_INLINE SkinnyVector4x32_t skinny128_rotate_right
    (SkinnyVector4x32_t x, unsigned count)
{
    /* Note: we are rotating the cells right, which actually moves the
       cell values up closer to the MSB.  That is, we perform a left
       shift on the word to rotate its cells to the right. */
    return (x << count) | (x >> (32 - count));
}
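
/* Illustrative example (ours, not part of the original source), assuming
   cell 0 sits in the least significant byte as produced by the
   little-endian loads below.  Rotating the cells right by one position
   is a *left* rotate of the word by 8 bits:

       x                            = 0x44332211   (cells 3,2,1,0)
       skinny128_rotate_right(x, 8) = 0x33221144   (cells 2,1,0,3)

   so each cell moves one position to the right within the row. */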

#if SKINNY_64BIT

/* This function evaluates the S-box on four 128-bit vectors in parallel
   by interleaving the operations.  This tends to make better use of the
   XMM registers on x86-64 CPUs with SSE2 support or better, as the CPU
   can schedule the unrelated operations to execute in parallel. */
STATIC_INLINE void skinny128_sbox_four
    (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v,
     SkinnyVector4x32_t *s, SkinnyVector4x32_t *t)
{
    SkinnyVector4x32_t x1 = *u;
    SkinnyVector4x32_t y1;
    SkinnyVector4x32_t x2 = *v;
    SkinnyVector4x32_t y2;
    SkinnyVector4x32_t x3 = *s;
    SkinnyVector4x32_t y3;
    SkinnyVector4x32_t x4 = *t;
    SkinnyVector4x32_t y4;

    x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x11111111U);
    x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x11111111U);
    x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x11111111U);
    x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x11111111U);

    y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
    y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
    y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
    y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);

    x1 ^= ((~((x1 << 5) | (x1 << 4))) & 0x40404040U) ^ y1;
    x2 ^= ((~((x2 << 5) | (x2 << 4))) & 0x40404040U) ^ y2;
    x3 ^= ((~((x3 << 5) | (x3 << 4))) & 0x40404040U) ^ y3;
    x4 ^= ((~((x4 << 5) | (x4 << 4))) & 0x40404040U) ^ y4;

    y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
    y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
    y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
    y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);

    x1 ^= ((~((x1 >> 2) | (x1 << 1))) & 0x02020202U) ^ y1;
    x2 ^= ((~((x2 >> 2) | (x2 << 1))) & 0x02020202U) ^ y2;
    x3 ^= ((~((x3 >> 2) | (x3 << 1))) & 0x02020202U) ^ y3;
    x4 ^= ((~((x4 >> 2) | (x4 << 1))) & 0x02020202U) ^ y4;

    y1 = ((~((x1 >> 5) | (x1 << 1))) & 0x04040404U);
    y2 = ((~((x2 >> 5) | (x2 << 1))) & 0x04040404U);
    y3 = ((~((x3 >> 5) | (x3 << 1))) & 0x04040404U);
    y4 = ((~((x4 >> 5) | (x4 << 1))) & 0x04040404U);

    x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
    x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
    x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;

    *u = ((x1 & 0x08080808U) << 1) |
         ((x1 & 0x32323232U) << 2) |
         ((x1 & 0x01010101U) << 5) |
         ((x1 & 0x80808080U) >> 6) |
         ((x1 & 0x40404040U) >> 4) |
         ((x1 & 0x04040404U) >> 2);

    *v = ((x2 & 0x08080808U) << 1) |
         ((x2 & 0x32323232U) << 2) |
         ((x2 & 0x01010101U) << 5) |
         ((x2 & 0x80808080U) >> 6) |
         ((x2 & 0x40404040U) >> 4) |
         ((x2 & 0x04040404U) >> 2);

    *s = ((x3 & 0x08080808U) << 1) |
         ((x3 & 0x32323232U) << 2) |
         ((x3 & 0x01010101U) << 5) |
         ((x3 & 0x80808080U) >> 6) |
         ((x3 & 0x40404040U) >> 4) |
         ((x3 & 0x04040404U) >> 2);

    *t = ((x4 & 0x08080808U) << 1) |
         ((x4 & 0x32323232U) << 2) |
         ((x4 & 0x01010101U) << 5) |
         ((x4 & 0x80808080U) >> 6) |
         ((x4 & 0x40404040U) >> 4) |
         ((x4 & 0x04040404U) >> 2);
}
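
/* For reference, a minimal scalar sketch of the same bit-sliced S-box
   applied to one 32-bit word (four cells).  It mirrors the x1 data path
   above one step at a time; the name skinny128_sbox_scalar is ours and
   the block is excluded from the build. */
#if 0
static uint32_t skinny128_sbox_scalar(uint32_t x)
{
    uint32_t y;
    x ^= (~((x >> 2) | (x >> 3))) & 0x11111111U;
    y  = (~((x << 5) | (x << 1))) & 0x20202020U;
    x ^= ((~((x << 5) | (x << 4))) & 0x40404040U) ^ y;
    y  = (~((x << 2) | (x << 1))) & 0x80808080U;
    x ^= ((~((x >> 2) | (x << 1))) & 0x02020202U) ^ y;
    y  = (~((x >> 5) | (x << 1))) & 0x04040404U;
    x ^= ((~((x >> 1) | (x >> 2))) & 0x08080808U) ^ y;
    /* Final bit permutation within each byte */
    return ((x & 0x08080808U) << 1) |
           ((x & 0x32323232U) << 2) |
           ((x & 0x01010101U) << 5) |
           ((x & 0x80808080U) >> 6) |
           ((x & 0x40404040U) >> 4) |
           ((x & 0x04040404U) >> 2);
}
#endif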

STATIC_INLINE void skinny128_inv_sbox_four
    (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v,
     SkinnyVector4x32_t *s, SkinnyVector4x32_t *t)
{
    SkinnyVector4x32_t x1 = *u;
    SkinnyVector4x32_t y1;
    SkinnyVector4x32_t x2 = *v;
    SkinnyVector4x32_t y2;
    SkinnyVector4x32_t x3 = *s;
    SkinnyVector4x32_t y3;
    SkinnyVector4x32_t x4 = *t;
    SkinnyVector4x32_t y4;

    y1 = ((~((x1 >> 1) | (x1 >> 3))) & 0x01010101U);
    y2 = ((~((x2 >> 1) | (x2 >> 3))) & 0x01010101U);
    y3 = ((~((x3 >> 1) | (x3 >> 3))) & 0x01010101U);
    y4 = ((~((x4 >> 1) | (x4 >> 3))) & 0x01010101U);

    x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x10101010U) ^ y1;
    x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x10101010U) ^ y2;
    x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x10101010U) ^ y3;
    x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x10101010U) ^ y4;

    y1 = ((~((x1 >> 6) | (x1 >> 1))) & 0x02020202U);
    y2 = ((~((x2 >> 6) | (x2 >> 1))) & 0x02020202U);
    y3 = ((~((x3 >> 6) | (x3 >> 1))) & 0x02020202U);
    y4 = ((~((x4 >> 6) | (x4 >> 1))) & 0x02020202U);

    x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
    x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
    x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;

    y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
    y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
    y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
    y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);

    x1 ^= ((~((x1 >> 1) | (x1 << 2))) & 0x04040404U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 << 2))) & 0x04040404U) ^ y2;
    x3 ^= ((~((x3 >> 1) | (x3 << 2))) & 0x04040404U) ^ y3;
    x4 ^= ((~((x4 >> 1) | (x4 << 2))) & 0x04040404U) ^ y4;

    y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
    y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
    y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
    y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);

    x1 ^= ((~((x1 << 4) | (x1 << 5))) & 0x40404040U) ^ y1;
    x2 ^= ((~((x2 << 4) | (x2 << 5))) & 0x40404040U) ^ y2;
    x3 ^= ((~((x3 << 4) | (x3 << 5))) & 0x40404040U) ^ y3;
    x4 ^= ((~((x4 << 4) | (x4 << 5))) & 0x40404040U) ^ y4;

    *u = ((x1 & 0x01010101U) << 2) |
         ((x1 & 0x04040404U) << 4) |
         ((x1 & 0x02020202U) << 6) |
         ((x1 & 0x20202020U) >> 5) |
         ((x1 & 0xC8C8C8C8U) >> 2) |
         ((x1 & 0x10101010U) >> 1);

    *v = ((x2 & 0x01010101U) << 2) |
         ((x2 & 0x04040404U) << 4) |
         ((x2 & 0x02020202U) << 6) |
         ((x2 & 0x20202020U) >> 5) |
         ((x2 & 0xC8C8C8C8U) >> 2) |
         ((x2 & 0x10101010U) >> 1);

    *s = ((x3 & 0x01010101U) << 2) |
         ((x3 & 0x04040404U) << 4) |
         ((x3 & 0x02020202U) << 6) |
         ((x3 & 0x20202020U) >> 5) |
         ((x3 & 0xC8C8C8C8U) >> 2) |
         ((x3 & 0x10101010U) >> 1);

    *t = ((x4 & 0x01010101U) << 2) |
         ((x4 & 0x04040404U) << 4) |
         ((x4 & 0x02020202U) << 6) |
         ((x4 & 0x20202020U) >> 5) |
         ((x4 & 0xC8C8C8C8U) >> 2) |
         ((x4 & 0x10101010U) >> 1);
}
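
/* skinny128_inv_sbox_four undoes skinny128_sbox_four exactly: it applies
   the inverse of each XOR step in reverse order, then the inverse bit
   permutation.  A quick sanity check would be a round trip (sketch only):

       SkinnyVector4x32_t a = ..., b = ..., c = ..., d = ...;
       skinny128_sbox_four(&a, &b, &c, &d);
       skinny128_inv_sbox_four(&a, &b, &c, &d);
       // a, b, c, d should now equal their original values
*/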

#else

/* 32-bit x86 CPUs have eight 128-bit registers instead of the 16
   registers on x86-64 CPUs.  Since we need some intermediate temporary
   values below, we perform the operations two at a time instead of
   four at a time.  This alleviates register pressure. */
STATIC_INLINE void skinny128_sbox_two
    (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v)
{
    SkinnyVector4x32_t x1 = *u;
    SkinnyVector4x32_t y1;
    SkinnyVector4x32_t x2 = *v;
    SkinnyVector4x32_t y2;

    x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x11111111U);
    x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x11111111U);

    y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
    y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);

    x1 ^= ((~((x1 << 5) | (x1 << 4))) & 0x40404040U) ^ y1;
    x2 ^= ((~((x2 << 5) | (x2 << 4))) & 0x40404040U) ^ y2;

    y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
    y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);

    x1 ^= ((~((x1 >> 2) | (x1 << 1))) & 0x02020202U) ^ y1;
    x2 ^= ((~((x2 >> 2) | (x2 << 1))) & 0x02020202U) ^ y2;

    y1 = ((~((x1 >> 5) | (x1 << 1))) & 0x04040404U);
    y2 = ((~((x2 >> 5) | (x2 << 1))) & 0x04040404U);

    x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;

    *u = ((x1 & 0x08080808U) << 1) |
         ((x1 & 0x32323232U) << 2) |
         ((x1 & 0x01010101U) << 5) |
         ((x1 & 0x80808080U) >> 6) |
         ((x1 & 0x40404040U) >> 4) |
         ((x1 & 0x04040404U) >> 2);

    *v = ((x2 & 0x08080808U) << 1) |
         ((x2 & 0x32323232U) << 2) |
         ((x2 & 0x01010101U) << 5) |
         ((x2 & 0x80808080U) >> 6) |
         ((x2 & 0x40404040U) >> 4) |
         ((x2 & 0x04040404U) >> 2);
}

STATIC_INLINE void skinny128_inv_sbox_two
    (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v)
{
    SkinnyVector4x32_t x1 = *u;
    SkinnyVector4x32_t y1;
    SkinnyVector4x32_t x2 = *v;
    SkinnyVector4x32_t y2;

    y1 = ((~((x1 >> 1) | (x1 >> 3))) & 0x01010101U);
    y2 = ((~((x2 >> 1) | (x2 >> 3))) & 0x01010101U);

    x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x10101010U) ^ y1;
    x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x10101010U) ^ y2;

    y1 = ((~((x1 >> 6) | (x1 >> 1))) & 0x02020202U);
    y2 = ((~((x2 >> 6) | (x2 >> 1))) & 0x02020202U);

    x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;

    y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
    y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);

    x1 ^= ((~((x1 >> 1) | (x1 << 2))) & 0x04040404U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 << 2))) & 0x04040404U) ^ y2;

    y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
    y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);

    x1 ^= ((~((x1 << 4) | (x1 << 5))) & 0x40404040U) ^ y1;
    x2 ^= ((~((x2 << 4) | (x2 << 5))) & 0x40404040U) ^ y2;

    *u = ((x1 & 0x01010101U) << 2) |
         ((x1 & 0x04040404U) << 4) |
         ((x1 & 0x02020202U) << 6) |
         ((x1 & 0x20202020U) >> 5) |
         ((x1 & 0xC8C8C8C8U) >> 2) |
         ((x1 & 0x10101010U) >> 1);

    *v = ((x2 & 0x01010101U) << 2) |
         ((x2 & 0x04040404U) << 4) |
         ((x2 & 0x02020202U) << 6) |
         ((x2 & 0x20202020U) >> 5) |
         ((x2 & 0xC8C8C8C8U) >> 2) |
         ((x2 & 0x10101010U) >> 1);
}

#endif

void _skinny128_parallel_encrypt_vec128
    (void *output, const void *input, const Skinny128Key_t *ks)
{
    SkinnyVector4x32_t row0;
    SkinnyVector4x32_t row1;
    SkinnyVector4x32_t row2;
    SkinnyVector4x32_t row3;
    const Skinny128HalfCells_t *schedule;
    unsigned index;
    SkinnyVector4x32_t temp;

    /* Load the rows of all four blocks into vector registers */
    row0 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 0), READ_WORD32(input, 16),
         READ_WORD32(input, 32), READ_WORD32(input, 48)};
    row1 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 4), READ_WORD32(input, 20),
         READ_WORD32(input, 36), READ_WORD32(input, 52)};
    row2 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 8), READ_WORD32(input, 24),
         READ_WORD32(input, 40), READ_WORD32(input, 56)};
    row3 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 12), READ_WORD32(input, 28),
         READ_WORD32(input, 44), READ_WORD32(input, 60)};
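
    /* Lane layout: lane j of each rowN vector holds row N of block j,
       i.e. the 32-bit word at byte offset 16*j + 4*N of the 64-byte
       input.  The four blocks are thus transposed into row-major
       vectors for the rounds below. */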

    /* Perform all encryption rounds on the four blocks in parallel */
    schedule = ks->schedule;
    for (index = ks->rounds; index > 0; --index, ++schedule) {
        /* Apply the S-box to all bytes in the state */
#if SKINNY_64BIT
        skinny128_sbox_four(&row0, &row1, &row2, &row3);
#else
        skinny128_sbox_two(&row0, &row1);
        skinny128_sbox_two(&row2, &row3);
#endif

        /* Apply the subkey for this round */
        row0 ^= schedule->row[0];
        row1 ^= schedule->row[1];
        row2 ^= 0x02;

        /* Shift the rows */
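        /* Rows 1, 2, and 3 rotate right by 1, 2, and 3 cells
           respectively (8, 16, and 24 bits of each 32-bit row word);
           row 0 is not rotated. */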
        row1 = skinny128_rotate_right(row1, 8);
        row2 = skinny128_rotate_right(row2, 16);
        row3 = skinny128_rotate_right(row3, 24);

        /* Mix the columns */
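        /* The in-place XOR/rename sequence below computes the SKINNY
           MixColumns with a single temporary:
               row0' = row0 ^ row2 ^ row3
               row1' = row0
               row2' = row1 ^ row2
               row3' = row0 ^ row2 */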
        row1 ^= row2;
        row2 ^= row0;
        temp = row3 ^ row2;
        row3 = row2;
        row2 = row1;
        row1 = row0;
        row0 = temp;
    }

    /* Write the rows of all four blocks back to memory */
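    /* The stores transpose the row-major vectors back into per-block
       order.  The fast path writes each block with one unaligned
       128-bit store, which matches the byte order only on little-endian
       targets; otherwise each word goes through WRITE_WORD32. */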
#if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
    *((SkinnyVector4x32U_t *)output) =
        (SkinnyVector4x32_t){row0[0], row1[0], row2[0], row3[0]};
    *((SkinnyVector4x32U_t *)(output + 16)) =
        (SkinnyVector4x32_t){row0[1], row1[1], row2[1], row3[1]};
    *((SkinnyVector4x32U_t *)(output + 32)) =
        (SkinnyVector4x32_t){row0[2], row1[2], row2[2], row3[2]};
    *((SkinnyVector4x32U_t *)(output + 48)) =
        (SkinnyVector4x32_t){row0[3], row1[3], row2[3], row3[3]};
#else
    WRITE_WORD32(output, 0, row0[0]);
    WRITE_WORD32(output, 4, row1[0]);
    WRITE_WORD32(output, 8, row2[0]);
    WRITE_WORD32(output, 12, row3[0]);
    WRITE_WORD32(output, 16, row0[1]);
    WRITE_WORD32(output, 20, row1[1]);
    WRITE_WORD32(output, 24, row2[1]);
    WRITE_WORD32(output, 28, row3[1]);
    WRITE_WORD32(output, 32, row0[2]);
    WRITE_WORD32(output, 36, row1[2]);
    WRITE_WORD32(output, 40, row2[2]);
    WRITE_WORD32(output, 44, row3[2]);
    WRITE_WORD32(output, 48, row0[3]);
    WRITE_WORD32(output, 52, row1[3]);
    WRITE_WORD32(output, 56, row2[3]);
    WRITE_WORD32(output, 60, row3[3]);
#endif
}
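
/* Usage sketch (ours, not from the library): this internal entry point
   encrypts four consecutive 16-byte blocks per call; callers normally
   reach it through the higher-level skinny128-parallel API rather than
   directly.  Assuming an initialised key schedule `ks`:

       uint8_t in[64], out[64];
       // ... fill `in` with four plaintext blocks ...
       _skinny128_parallel_encrypt_vec128(out, in, &ks);
*/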

void _skinny128_parallel_decrypt_vec128
    (void *output, const void *input, const Skinny128Key_t *ks)
{
    SkinnyVector4x32_t row0;
    SkinnyVector4x32_t row1;
    SkinnyVector4x32_t row2;
    SkinnyVector4x32_t row3;
    const Skinny128HalfCells_t *schedule;
    unsigned index;
    SkinnyVector4x32_t temp;

    /* Load the rows of all four blocks into vector registers */
    row0 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 0), READ_WORD32(input, 16),
         READ_WORD32(input, 32), READ_WORD32(input, 48)};
    row1 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 4), READ_WORD32(input, 20),
         READ_WORD32(input, 36), READ_WORD32(input, 52)};
    row2 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 8), READ_WORD32(input, 24),
         READ_WORD32(input, 40), READ_WORD32(input, 56)};
    row3 = (SkinnyVector4x32_t)
        {READ_WORD32(input, 12), READ_WORD32(input, 28),
         READ_WORD32(input, 44), READ_WORD32(input, 60)};

    /* Perform all decryption rounds on the four blocks in parallel */
    schedule = &(ks->schedule[ks->rounds - 1]);
    for (index = ks->rounds; index > 0; --index, --schedule) {
        /* Inverse mix of the columns */
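        /* This is the exact inverse of the encrypt-side MixColumns:
           given a0 = r0^r2^r3, a1 = r0, a2 = r1^r2, a3 = r0^r2, the
           renames and XORs below recover r0..r3, again using a single
           temporary. */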
        temp = row3;
        row3 = row0;
        row0 = row1;
        row1 = row2;
        row3 ^= temp;
        row2 = temp ^ row0;
        row1 ^= row2;

        /* Inverse shift of the rows */
        row1 = skinny128_rotate_right(row1, 24);
        row2 = skinny128_rotate_right(row2, 16);
        row3 = skinny128_rotate_right(row3, 8);

        /* Apply the subkey for this round */
        row0 ^= schedule->row[0];
        row1 ^= schedule->row[1];
        row2 ^= 0x02;

        /* Apply the inverse S-box to all bytes in the state */
#if SKINNY_64BIT
        skinny128_inv_sbox_four(&row0, &row1, &row2, &row3);
#else
        skinny128_inv_sbox_two(&row0, &row1);
        skinny128_inv_sbox_two(&row2, &row3);
#endif
    }

    /* Write the rows of all four blocks back to memory */
#if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
    *((SkinnyVector4x32U_t *)output) =
        (SkinnyVector4x32_t){row0[0], row1[0], row2[0], row3[0]};
    *((SkinnyVector4x32U_t *)(output + 16)) =
        (SkinnyVector4x32_t){row0[1], row1[1], row2[1], row3[1]};
    *((SkinnyVector4x32U_t *)(output + 32)) =
        (SkinnyVector4x32_t){row0[2], row1[2], row2[2], row3[2]};
    *((SkinnyVector4x32U_t *)(output + 48)) =
        (SkinnyVector4x32_t){row0[3], row1[3], row2[3], row3[3]};
#else
    WRITE_WORD32(output, 0, row0[0]);
    WRITE_WORD32(output, 4, row1[0]);
    WRITE_WORD32(output, 8, row2[0]);
    WRITE_WORD32(output, 12, row3[0]);
    WRITE_WORD32(output, 16, row0[1]);
    WRITE_WORD32(output, 20, row1[1]);
    WRITE_WORD32(output, 24, row2[1]);
    WRITE_WORD32(output, 28, row3[1]);
    WRITE_WORD32(output, 32, row0[2]);
    WRITE_WORD32(output, 36, row1[2]);
    WRITE_WORD32(output, 40, row2[2]);
    WRITE_WORD32(output, 44, row3[2]);
    WRITE_WORD32(output, 48, row0[3]);
    WRITE_WORD32(output, 52, row1[3]);
    WRITE_WORD32(output, 56, row2[3]);
    WRITE_WORD32(output, 60, row3[3]);
#endif
}

#else /* !SKINNY_VEC128_MATH */

/* Stubbed out: 128-bit vector math is not available in this build */

void _skinny128_parallel_encrypt_vec128
    (void *output, const void *input, const Skinny128Key_t *ks)
{
    (void)output;
    (void)input;
    (void)ks;
}

void _skinny128_parallel_decrypt_vec128
    (void *output, const void *input, const Skinny128Key_t *ks)
{
    (void)output;
    (void)input;
    (void)ks;
}

#endif /* !SKINNY_VEC128_MATH */