23 #include "skinny128-parallel.h"
24 #include "skinny-internal.h"
26 #if SKINNY_VEC128_MATH
28 STATIC_INLINE SkinnyVector4x32_t skinny128_rotate_right
29 (SkinnyVector4x32_t x,
unsigned count)
34 return (x << count) | (x >> (32 - count));
43 STATIC_INLINE
void skinny128_sbox_four
44 (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v,
45 SkinnyVector4x32_t *s, SkinnyVector4x32_t *t)
47 SkinnyVector4x32_t x1 = *u;
48 SkinnyVector4x32_t y1;
49 SkinnyVector4x32_t x2 = *v;
50 SkinnyVector4x32_t y2;
51 SkinnyVector4x32_t x3 = *s;
52 SkinnyVector4x32_t y3;
53 SkinnyVector4x32_t x4 = *t;
54 SkinnyVector4x32_t y4;
56 x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x11111111U);
57 x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x11111111U);
58 x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x11111111U);
59 x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x11111111U);
61 y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
62 y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
63 y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
64 y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);
66 x1 ^= ((~((x1 << 5) | (x1 << 4))) & 0x40404040U) ^ y1;
67 x2 ^= ((~((x2 << 5) | (x2 << 4))) & 0x40404040U) ^ y2;
68 x3 ^= ((~((x3 << 5) | (x3 << 4))) & 0x40404040U) ^ y3;
69 x4 ^= ((~((x4 << 5) | (x4 << 4))) & 0x40404040U) ^ y4;
71 y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
72 y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
73 y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
74 y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);
76 x1 ^= ((~((x1 >> 2) | (x1 << 1))) & 0x02020202U) ^ y1;
77 x2 ^= ((~((x2 >> 2) | (x2 << 1))) & 0x02020202U) ^ y2;
78 x3 ^= ((~((x3 >> 2) | (x3 << 1))) & 0x02020202U) ^ y3;
79 x4 ^= ((~((x4 >> 2) | (x4 << 1))) & 0x02020202U) ^ y4;
81 y1 = ((~((x1 >> 5) | (x1 << 1))) & 0x04040404U);
82 y2 = ((~((x2 >> 5) | (x2 << 1))) & 0x04040404U);
83 y3 = ((~((x3 >> 5) | (x3 << 1))) & 0x04040404U);
84 y4 = ((~((x4 >> 5) | (x4 << 1))) & 0x04040404U);
86 x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
87 x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
88 x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
89 x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;
91 *u = ((x1 & 0x08080808U) << 1) |
92 ((x1 & 0x32323232U) << 2) |
93 ((x1 & 0x01010101U) << 5) |
94 ((x1 & 0x80808080U) >> 6) |
95 ((x1 & 0x40404040U) >> 4) |
96 ((x1 & 0x04040404U) >> 2);
98 *v = ((x2 & 0x08080808U) << 1) |
99 ((x2 & 0x32323232U) << 2) |
100 ((x2 & 0x01010101U) << 5) |
101 ((x2 & 0x80808080U) >> 6) |
102 ((x2 & 0x40404040U) >> 4) |
103 ((x2 & 0x04040404U) >> 2);
105 *s = ((x3 & 0x08080808U) << 1) |
106 ((x3 & 0x32323232U) << 2) |
107 ((x3 & 0x01010101U) << 5) |
108 ((x3 & 0x80808080U) >> 6) |
109 ((x3 & 0x40404040U) >> 4) |
110 ((x3 & 0x04040404U) >> 2);
112 *t = ((x4 & 0x08080808U) << 1) |
113 ((x4 & 0x32323232U) << 2) |
114 ((x4 & 0x01010101U) << 5) |
115 ((x4 & 0x80808080U) >> 6) |
116 ((x4 & 0x40404040U) >> 4) |
117 ((x4 & 0x04040404U) >> 2);
120 STATIC_INLINE
void skinny128_inv_sbox_four
121 (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v,
122 SkinnyVector4x32_t *s, SkinnyVector4x32_t *t)
124 SkinnyVector4x32_t x1 = *u;
125 SkinnyVector4x32_t y1;
126 SkinnyVector4x32_t x2 = *v;
127 SkinnyVector4x32_t y2;
128 SkinnyVector4x32_t x3 = *s;
129 SkinnyVector4x32_t y3;
130 SkinnyVector4x32_t x4 = *t;
131 SkinnyVector4x32_t y4;
133 y1 = ((~((x1 >> 1) | (x1 >> 3))) & 0x01010101U);
134 y2 = ((~((x2 >> 1) | (x2 >> 3))) & 0x01010101U);
135 y3 = ((~((x3 >> 1) | (x3 >> 3))) & 0x01010101U);
136 y4 = ((~((x4 >> 1) | (x4 >> 3))) & 0x01010101U);
138 x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x10101010U) ^ y1;
139 x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x10101010U) ^ y2;
140 x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x10101010U) ^ y3;
141 x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x10101010U) ^ y4;
143 y1 = ((~((x1 >> 6) | (x1 >> 1))) & 0x02020202U);
144 y2 = ((~((x2 >> 6) | (x2 >> 1))) & 0x02020202U);
145 y3 = ((~((x3 >> 6) | (x3 >> 1))) & 0x02020202U);
146 y4 = ((~((x4 >> 6) | (x4 >> 1))) & 0x02020202U);
148 x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
149 x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
150 x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
151 x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;
153 y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
154 y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
155 y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
156 y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);
158 x1 ^= ((~((x1 >> 1) | (x1 << 2))) & 0x04040404U) ^ y1;
159 x2 ^= ((~((x2 >> 1) | (x2 << 2))) & 0x04040404U) ^ y2;
160 x3 ^= ((~((x3 >> 1) | (x3 << 2))) & 0x04040404U) ^ y3;
161 x4 ^= ((~((x4 >> 1) | (x4 << 2))) & 0x04040404U) ^ y4;
163 y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
164 y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
165 y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
166 y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);
168 x1 ^= ((~((x1 << 4) | (x1 << 5))) & 0x40404040U) ^ y1;
169 x2 ^= ((~((x2 << 4) | (x2 << 5))) & 0x40404040U) ^ y2;
170 x3 ^= ((~((x3 << 4) | (x3 << 5))) & 0x40404040U) ^ y3;
171 x4 ^= ((~((x4 << 4) | (x4 << 5))) & 0x40404040U) ^ y4;
173 *u = ((x1 & 0x01010101U) << 2) |
174 ((x1 & 0x04040404U) << 4) |
175 ((x1 & 0x02020202U) << 6) |
176 ((x1 & 0x20202020U) >> 5) |
177 ((x1 & 0xC8C8C8C8U) >> 2) |
178 ((x1 & 0x10101010U) >> 1);
180 *v = ((x2 & 0x01010101U) << 2) |
181 ((x2 & 0x04040404U) << 4) |
182 ((x2 & 0x02020202U) << 6) |
183 ((x2 & 0x20202020U) >> 5) |
184 ((x2 & 0xC8C8C8C8U) >> 2) |
185 ((x2 & 0x10101010U) >> 1);
187 *s = ((x3 & 0x01010101U) << 2) |
188 ((x3 & 0x04040404U) << 4) |
189 ((x3 & 0x02020202U) << 6) |
190 ((x3 & 0x20202020U) >> 5) |
191 ((x3 & 0xC8C8C8C8U) >> 2) |
192 ((x3 & 0x10101010U) >> 1);
194 *t = ((x4 & 0x01010101U) << 2) |
195 ((x4 & 0x04040404U) << 4) |
196 ((x4 & 0x02020202U) << 6) |
197 ((x4 & 0x20202020U) >> 5) |
198 ((x4 & 0xC8C8C8C8U) >> 2) |
199 ((x4 & 0x10101010U) >> 1);
208 STATIC_INLINE
void skinny128_sbox_two
209 (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v)
211 SkinnyVector4x32_t x1 = *u;
212 SkinnyVector4x32_t y1;
213 SkinnyVector4x32_t x2 = *v;
214 SkinnyVector4x32_t y2;
216 x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x11111111U);
217 x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x11111111U);
219 y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
220 y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
222 x1 ^= ((~((x1 << 5) | (x1 << 4))) & 0x40404040U) ^ y1;
223 x2 ^= ((~((x2 << 5) | (x2 << 4))) & 0x40404040U) ^ y2;
225 y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
226 y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
228 x1 ^= ((~((x1 >> 2) | (x1 << 1))) & 0x02020202U) ^ y1;
229 x2 ^= ((~((x2 >> 2) | (x2 << 1))) & 0x02020202U) ^ y2;
231 y1 = ((~((x1 >> 5) | (x1 << 1))) & 0x04040404U);
232 y2 = ((~((x2 >> 5) | (x2 << 1))) & 0x04040404U);
234 x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
235 x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
237 *u = ((x1 & 0x08080808U) << 1) |
238 ((x1 & 0x32323232U) << 2) |
239 ((x1 & 0x01010101U) << 5) |
240 ((x1 & 0x80808080U) >> 6) |
241 ((x1 & 0x40404040U) >> 4) |
242 ((x1 & 0x04040404U) >> 2);
244 *v = ((x2 & 0x08080808U) << 1) |
245 ((x2 & 0x32323232U) << 2) |
246 ((x2 & 0x01010101U) << 5) |
247 ((x2 & 0x80808080U) >> 6) |
248 ((x2 & 0x40404040U) >> 4) |
249 ((x2 & 0x04040404U) >> 2);
252 STATIC_INLINE
void skinny128_inv_sbox_two
253 (SkinnyVector4x32_t *u, SkinnyVector4x32_t *v)
255 SkinnyVector4x32_t x1 = *u;
256 SkinnyVector4x32_t y1;
257 SkinnyVector4x32_t x2 = *v;
258 SkinnyVector4x32_t y2;
260 y1 = ((~((x1 >> 1) | (x1 >> 3))) & 0x01010101U);
261 y2 = ((~((x2 >> 1) | (x2 >> 3))) & 0x01010101U);
263 x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x10101010U) ^ y1;
264 x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x10101010U) ^ y2;
266 y1 = ((~((x1 >> 6) | (x1 >> 1))) & 0x02020202U);
267 y2 = ((~((x2 >> 6) | (x2 >> 1))) & 0x02020202U);
269 x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
270 x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
272 y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
273 y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
275 x1 ^= ((~((x1 >> 1) | (x1 << 2))) & 0x04040404U) ^ y1;
276 x2 ^= ((~((x2 >> 1) | (x2 << 2))) & 0x04040404U) ^ y2;
278 y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
279 y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
281 x1 ^= ((~((x1 << 4) | (x1 << 5))) & 0x40404040U) ^ y1;
282 x2 ^= ((~((x2 << 4) | (x2 << 5))) & 0x40404040U) ^ y2;
284 *u = ((x1 & 0x01010101U) << 2) |
285 ((x1 & 0x04040404U) << 4) |
286 ((x1 & 0x02020202U) << 6) |
287 ((x1 & 0x20202020U) >> 5) |
288 ((x1 & 0xC8C8C8C8U) >> 2) |
289 ((x1 & 0x10101010U) >> 1);
291 *v = ((x2 & 0x01010101U) << 2) |
292 ((x2 & 0x04040404U) << 4) |
293 ((x2 & 0x02020202U) << 6) |
294 ((x2 & 0x20202020U) >> 5) |
295 ((x2 & 0xC8C8C8C8U) >> 2) |
296 ((x2 & 0x10101010U) >> 1);
/*
 * Encryption routine that works on four 16-byte blocks at once: it reads
 * 64 bytes from `input`, runs `ks->rounds` rounds over four row vectors,
 * and writes 64 bytes to `output`.
 *
 * NOTE(review): this listing is incomplete.  The parameter list, the
 * opening and closing braces, the declarations of `index` and `schedule`,
 * every statement that uses `temp`, and some preprocessor lines are not
 * visible here - recover them from the real source before relying on
 * these comments.
 */
301 void _skinny128_parallel_encrypt_vec128
304 SkinnyVector4x32_t row0;
305 SkinnyVector4x32_t row1;
306 SkinnyVector4x32_t row2;
307 SkinnyVector4x32_t row3;
310 SkinnyVector4x32_t temp;
/* Gather the input: rowK receives the 32-bit word at byte offset 4*K of
 * each of the four 16-byte blocks (block offsets 0, 16, 32 and 48), one
 * block per vector lane. */
313 row0 = (SkinnyVector4x32_t)
314 {READ_WORD32(input, 0), READ_WORD32(input, 16),
315 READ_WORD32(input, 32), READ_WORD32(input, 48)};
316 row1 = (SkinnyVector4x32_t)
317 {READ_WORD32(input, 4), READ_WORD32(input, 20),
318 READ_WORD32(input, 36), READ_WORD32(input, 52)};
319 row2 = (SkinnyVector4x32_t)
320 {READ_WORD32(input, 8), READ_WORD32(input, 24),
321 READ_WORD32(input, 40), READ_WORD32(input, 56)};
322 row3 = (SkinnyVector4x32_t)
323 {READ_WORD32(input, 12), READ_WORD32(input, 28),
324 READ_WORD32(input, 44), READ_WORD32(input, 60)};
/* One iteration per round; `schedule` advances by one entry each round */
328 for (index = ks->
rounds; index > 0; --index, ++schedule) {
/* Forward S-box over all four rows.  NOTE(review): both the four-way call
 * and the pair of two-way calls appear below; presumably they are
 * alternatives selected by a preprocessor conditional that this listing
 * dropped - confirm. */
331 skinny128_sbox_four(&row0, &row1, &row2, &row3);
333 skinny128_sbox_two(&row0, &row1);
334 skinny128_sbox_two(&row2, &row3);
/* XOR the first two words of the current schedule entry into rows 0 and 1 */
338 row0 ^= schedule->
row[0];
339 row1 ^= schedule->
row[1];
/* Rotate rows 1, 2 and 3 by 8, 16 and 24 bits respectively; row 0 is
 * left as is */
343 row1 = skinny128_rotate_right(row1, 8);
344 row2 = skinny128_rotate_right(row2, 16);
345 row3 = skinny128_rotate_right(row3, 24);
/* NOTE(review): the remainder of the round body and the end of the loop
 * are not visible in this listing. */
/* Scatter the result: output block J (byte offset 16*J) is built from
 * lane J of row0..row3.  When this condition holds each block is stored
 * with a single vector write through SkinnyVector4x32U_t. */
358 #if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
359 *((SkinnyVector4x32U_t *)output) =
360 (SkinnyVector4x32_t){row0[0], row1[0], row2[0], row3[0]};
361 *((SkinnyVector4x32U_t *)(output + 16)) =
362 (SkinnyVector4x32_t){row0[1], row1[1], row2[1], row3[1]};
363 *((SkinnyVector4x32U_t *)(output + 32)) =
364 (SkinnyVector4x32_t){row0[2], row1[2], row2[2], row3[2]};
365 *((SkinnyVector4x32U_t *)(output + 48)) =
366 (SkinnyVector4x32_t){row0[3], row1[3], row2[3], row3[3]};
/* Word-by-word stores producing the same layout.  NOTE(review): presumably
 * this is the alternative branch of the conditional above; the lines that
 * separate and close the two branches are not visible here. */
368 WRITE_WORD32(output, 0, row0[0]);
369 WRITE_WORD32(output, 4, row1[0]);
370 WRITE_WORD32(output, 8, row2[0]);
371 WRITE_WORD32(output, 12, row3[0]);
372 WRITE_WORD32(output, 16, row0[1]);
373 WRITE_WORD32(output, 20, row1[1]);
374 WRITE_WORD32(output, 24, row2[1]);
375 WRITE_WORD32(output, 28, row3[1]);
376 WRITE_WORD32(output, 32, row0[2]);
377 WRITE_WORD32(output, 36, row1[2]);
378 WRITE_WORD32(output, 40, row2[2]);
379 WRITE_WORD32(output, 44, row3[2]);
380 WRITE_WORD32(output, 48, row0[3]);
381 WRITE_WORD32(output, 52, row1[3]);
382 WRITE_WORD32(output, 56, row2[3]);
383 WRITE_WORD32(output, 60, row3[3]);
/*
 * Decryption routine that works on four 16-byte blocks at once: it reads
 * 64 bytes from `input`, runs `ks->rounds` rounds over four row vectors
 * while walking the schedule backwards, and writes 64 bytes to `output`.
 *
 * NOTE(review): this listing is incomplete.  The parameter list, the
 * opening and closing braces, the declarations and initial values of
 * `index` and `schedule`, every statement that uses `temp`, and some
 * preprocessor lines are not visible here - recover them from the real
 * source before relying on these comments.
 */
387 void _skinny128_parallel_decrypt_vec128
390 SkinnyVector4x32_t row0;
391 SkinnyVector4x32_t row1;
392 SkinnyVector4x32_t row2;
393 SkinnyVector4x32_t row3;
396 SkinnyVector4x32_t temp;
/* Gather the input: rowK receives the 32-bit word at byte offset 4*K of
 * each of the four 16-byte blocks (block offsets 0, 16, 32 and 48), one
 * block per vector lane. */
399 row0 = (SkinnyVector4x32_t)
400 {READ_WORD32(input, 0), READ_WORD32(input, 16),
401 READ_WORD32(input, 32), READ_WORD32(input, 48)};
402 row1 = (SkinnyVector4x32_t)
403 {READ_WORD32(input, 4), READ_WORD32(input, 20),
404 READ_WORD32(input, 36), READ_WORD32(input, 52)};
405 row2 = (SkinnyVector4x32_t)
406 {READ_WORD32(input, 8), READ_WORD32(input, 24),
407 READ_WORD32(input, 40), READ_WORD32(input, 56)};
408 row3 = (SkinnyVector4x32_t)
409 {READ_WORD32(input, 12), READ_WORD32(input, 28),
410 READ_WORD32(input, 44), READ_WORD32(input, 60)};
/* One iteration per round; `schedule` steps back by one entry each round */
414 for (index = ks->
rounds; index > 0; --index, --schedule) {
/* NOTE(review): the statements that open the round body are not visible
 * in this listing. */
/* Rotate rows 1, 2 and 3 by 24, 16 and 8 bits respectively; together with
 * the 8/16/24-bit rotations used on the encryption side each row ends up
 * rotated by a full 32 bits, i.e. back in place */
425 row1 = skinny128_rotate_right(row1, 24);
426 row2 = skinny128_rotate_right(row2, 16);
427 row3 = skinny128_rotate_right(row3, 8);
/* XOR the first two words of the current schedule entry into rows 0 and 1 */
430 row0 ^= schedule->
row[0];
431 row1 ^= schedule->
row[1];
/* Inverse S-box over all four rows.  NOTE(review): both the four-way call
 * and the pair of two-way calls appear below; presumably they are
 * alternatives selected by a preprocessor conditional that this listing
 * dropped - confirm. */
436 skinny128_inv_sbox_four(&row0, &row1, &row2, &row3);
438 skinny128_inv_sbox_two(&row0, &row1);
439 skinny128_inv_sbox_two(&row2, &row3);
/* Scatter the result: output block J (byte offset 16*J) is built from
 * lane J of row0..row3.  When this condition holds each block is stored
 * with a single vector write through SkinnyVector4x32U_t. */
444 #if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
445 *((SkinnyVector4x32U_t *)output) =
446 (SkinnyVector4x32_t){row0[0], row1[0], row2[0], row3[0]};
447 *((SkinnyVector4x32U_t *)(output + 16)) =
448 (SkinnyVector4x32_t){row0[1], row1[1], row2[1], row3[1]};
449 *((SkinnyVector4x32U_t *)(output + 32)) =
450 (SkinnyVector4x32_t){row0[2], row1[2], row2[2], row3[2]};
451 *((SkinnyVector4x32U_t *)(output + 48)) =
452 (SkinnyVector4x32_t){row0[3], row1[3], row2[3], row3[3]};
/* Word-by-word stores producing the same layout.  NOTE(review): presumably
 * this is the alternative branch of the conditional above; the lines that
 * separate and close the two branches are not visible here. */
454 WRITE_WORD32(output, 0, row0[0]);
455 WRITE_WORD32(output, 4, row1[0]);
456 WRITE_WORD32(output, 8, row2[0]);
457 WRITE_WORD32(output, 12, row3[0]);
458 WRITE_WORD32(output, 16, row0[1]);
459 WRITE_WORD32(output, 20, row1[1]);
460 WRITE_WORD32(output, 24, row2[1]);
461 WRITE_WORD32(output, 28, row3[1]);
462 WRITE_WORD32(output, 32, row0[2]);
463 WRITE_WORD32(output, 36, row1[2]);
464 WRITE_WORD32(output, 40, row2[2]);
465 WRITE_WORD32(output, 44, row3[2]);
466 WRITE_WORD32(output, 48, row0[3]);
467 WRITE_WORD32(output, 52, row1[3]);
468 WRITE_WORD32(output, 56, row2[3]);
469 WRITE_WORD32(output, 60, row3[3]);
477 void _skinny128_parallel_encrypt_vec128
485 void _skinny128_parallel_decrypt_vec128
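/*
 * Cross-reference notes carried over from the source listing (these are
 * descriptions of declarations made elsewhere, not code):
 *   - Union that describes a 64-bit 2x4 array of cells.
 *   - Skinny128HalfCells_t schedule[SKINNY128_MAX_ROUNDS]
 *   - Key schedule for Skinny128 block ciphers.
 */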