23 #include "skinny128-parallel.h"
24 #include "skinny-internal.h"
26 #if SKINNY_VEC256_MATH
28 STATIC_INLINE SkinnyVector8x32_t skinny128_rotate_right
29 (SkinnyVector8x32_t x,
unsigned count)
34 return (x << count) | (x >> (32 - count));
41 STATIC_INLINE
void skinny128_sbox_four
42 (SkinnyVector8x32_t *u, SkinnyVector8x32_t *v,
43 SkinnyVector8x32_t *s, SkinnyVector8x32_t *t)
45 SkinnyVector8x32_t x1 = *u;
46 SkinnyVector8x32_t y1;
47 SkinnyVector8x32_t x2 = *v;
48 SkinnyVector8x32_t y2;
49 SkinnyVector8x32_t x3 = *s;
50 SkinnyVector8x32_t y3;
51 SkinnyVector8x32_t x4 = *t;
52 SkinnyVector8x32_t y4;
54 x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x11111111U);
55 x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x11111111U);
56 x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x11111111U);
57 x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x11111111U);
59 y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
60 y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
61 y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
62 y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);
64 x1 ^= ((~((x1 << 5) | (x1 << 4))) & 0x40404040U) ^ y1;
65 x2 ^= ((~((x2 << 5) | (x2 << 4))) & 0x40404040U) ^ y2;
66 x3 ^= ((~((x3 << 5) | (x3 << 4))) & 0x40404040U) ^ y3;
67 x4 ^= ((~((x4 << 5) | (x4 << 4))) & 0x40404040U) ^ y4;
69 y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
70 y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
71 y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
72 y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);
74 x1 ^= ((~((x1 >> 2) | (x1 << 1))) & 0x02020202U) ^ y1;
75 x2 ^= ((~((x2 >> 2) | (x2 << 1))) & 0x02020202U) ^ y2;
76 x3 ^= ((~((x3 >> 2) | (x3 << 1))) & 0x02020202U) ^ y3;
77 x4 ^= ((~((x4 >> 2) | (x4 << 1))) & 0x02020202U) ^ y4;
79 y1 = ((~((x1 >> 5) | (x1 << 1))) & 0x04040404U);
80 y2 = ((~((x2 >> 5) | (x2 << 1))) & 0x04040404U);
81 y3 = ((~((x3 >> 5) | (x3 << 1))) & 0x04040404U);
82 y4 = ((~((x4 >> 5) | (x4 << 1))) & 0x04040404U);
84 x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
85 x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
86 x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
87 x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;
89 *u = ((x1 & 0x08080808U) << 1) |
90 ((x1 & 0x32323232U) << 2) |
91 ((x1 & 0x01010101U) << 5) |
92 ((x1 & 0x80808080U) >> 6) |
93 ((x1 & 0x40404040U) >> 4) |
94 ((x1 & 0x04040404U) >> 2);
96 *v = ((x2 & 0x08080808U) << 1) |
97 ((x2 & 0x32323232U) << 2) |
98 ((x2 & 0x01010101U) << 5) |
99 ((x2 & 0x80808080U) >> 6) |
100 ((x2 & 0x40404040U) >> 4) |
101 ((x2 & 0x04040404U) >> 2);
103 *s = ((x3 & 0x08080808U) << 1) |
104 ((x3 & 0x32323232U) << 2) |
105 ((x3 & 0x01010101U) << 5) |
106 ((x3 & 0x80808080U) >> 6) |
107 ((x3 & 0x40404040U) >> 4) |
108 ((x3 & 0x04040404U) >> 2);
110 *t = ((x4 & 0x08080808U) << 1) |
111 ((x4 & 0x32323232U) << 2) |
112 ((x4 & 0x01010101U) << 5) |
113 ((x4 & 0x80808080U) >> 6) |
114 ((x4 & 0x40404040U) >> 4) |
115 ((x4 & 0x04040404U) >> 2);
118 STATIC_INLINE
void skinny128_inv_sbox_four
119 (SkinnyVector8x32_t *u, SkinnyVector8x32_t *v,
120 SkinnyVector8x32_t *s, SkinnyVector8x32_t *t)
122 SkinnyVector8x32_t x1 = *u;
123 SkinnyVector8x32_t y1;
124 SkinnyVector8x32_t x2 = *v;
125 SkinnyVector8x32_t y2;
126 SkinnyVector8x32_t x3 = *s;
127 SkinnyVector8x32_t y3;
128 SkinnyVector8x32_t x4 = *t;
129 SkinnyVector8x32_t y4;
131 y1 = ((~((x1 >> 1) | (x1 >> 3))) & 0x01010101U);
132 y2 = ((~((x2 >> 1) | (x2 >> 3))) & 0x01010101U);
133 y3 = ((~((x3 >> 1) | (x3 >> 3))) & 0x01010101U);
134 y4 = ((~((x4 >> 1) | (x4 >> 3))) & 0x01010101U);
136 x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x10101010U) ^ y1;
137 x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x10101010U) ^ y2;
138 x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x10101010U) ^ y3;
139 x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x10101010U) ^ y4;
141 y1 = ((~((x1 >> 6) | (x1 >> 1))) & 0x02020202U);
142 y2 = ((~((x2 >> 6) | (x2 >> 1))) & 0x02020202U);
143 y3 = ((~((x3 >> 6) | (x3 >> 1))) & 0x02020202U);
144 y4 = ((~((x4 >> 6) | (x4 >> 1))) & 0x02020202U);
146 x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
147 x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
148 x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
149 x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;
151 y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
152 y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
153 y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
154 y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);
156 x1 ^= ((~((x1 >> 1) | (x1 << 2))) & 0x04040404U) ^ y1;
157 x2 ^= ((~((x2 >> 1) | (x2 << 2))) & 0x04040404U) ^ y2;
158 x3 ^= ((~((x3 >> 1) | (x3 << 2))) & 0x04040404U) ^ y3;
159 x4 ^= ((~((x4 >> 1) | (x4 << 2))) & 0x04040404U) ^ y4;
161 y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
162 y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
163 y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
164 y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);
166 x1 ^= ((~((x1 << 4) | (x1 << 5))) & 0x40404040U) ^ y1;
167 x2 ^= ((~((x2 << 4) | (x2 << 5))) & 0x40404040U) ^ y2;
168 x3 ^= ((~((x3 << 4) | (x3 << 5))) & 0x40404040U) ^ y3;
169 x4 ^= ((~((x4 << 4) | (x4 << 5))) & 0x40404040U) ^ y4;
171 *u = ((x1 & 0x01010101U) << 2) |
172 ((x1 & 0x04040404U) << 4) |
173 ((x1 & 0x02020202U) << 6) |
174 ((x1 & 0x20202020U) >> 5) |
175 ((x1 & 0xC8C8C8C8U) >> 2) |
176 ((x1 & 0x10101010U) >> 1);
178 *v = ((x2 & 0x01010101U) << 2) |
179 ((x2 & 0x04040404U) << 4) |
180 ((x2 & 0x02020202U) << 6) |
181 ((x2 & 0x20202020U) >> 5) |
182 ((x2 & 0xC8C8C8C8U) >> 2) |
183 ((x2 & 0x10101010U) >> 1);
185 *s = ((x3 & 0x01010101U) << 2) |
186 ((x3 & 0x04040404U) << 4) |
187 ((x3 & 0x02020202U) << 6) |
188 ((x3 & 0x20202020U) >> 5) |
189 ((x3 & 0xC8C8C8C8U) >> 2) |
190 ((x3 & 0x10101010U) >> 1);
192 *t = ((x4 & 0x01010101U) << 2) |
193 ((x4 & 0x04040404U) << 4) |
194 ((x4 & 0x02020202U) << 6) |
195 ((x4 & 0x20202020U) >> 5) |
196 ((x4 & 0xC8C8C8C8U) >> 2) |
197 ((x4 & 0x10101010U) >> 1);
/*
 * Encryption routine of the 256-bit vector code path (per its name).
 * The visible statements load 32 words from `input` into four row
 * vectors, run `ks->rounds` rounds over them, and store 32 words to
 * `output`.
 *
 * NOTE(review): this listing is incomplete.  The parameter list, the
 * braces, the declarations and initialisation of `index` and `schedule`,
 * and the #else/#endif of the output section are not visible.  `temp`
 * is declared but not used by any visible statement, and the listing
 * numbers skip ahead between the row rotations and the output section,
 * so further per-round statements are presumably missing.  Recover the
 * full text before changing any logic.
 */
200 void _skinny128_parallel_encrypt_vec256
203 SkinnyVector8x32_t row0;
204 SkinnyVector8x32_t row1;
205 SkinnyVector8x32_t row2;
206 SkinnyVector8x32_t row3;
209 SkinnyVector8x32_t temp;
/* Gather the input: lane k of rowN is read from byte offset 16*k + 4*N,
 * i.e. word N of the k-th 16-byte group, for k = 0..7. */
212 row0 = (SkinnyVector8x32_t)
213 {READ_WORD32(input, 0), READ_WORD32(input, 16),
214 READ_WORD32(input, 32), READ_WORD32(input, 48),
215 READ_WORD32(input, 64), READ_WORD32(input, 80),
216 READ_WORD32(input, 96), READ_WORD32(input, 112)};
217 row1 = (SkinnyVector8x32_t)
218 {READ_WORD32(input, 4), READ_WORD32(input, 20),
219 READ_WORD32(input, 36), READ_WORD32(input, 52),
220 READ_WORD32(input, 68), READ_WORD32(input, 84),
221 READ_WORD32(input, 100), READ_WORD32(input, 116)};
222 row2 = (SkinnyVector8x32_t)
223 {READ_WORD32(input, 8), READ_WORD32(input, 24),
224 READ_WORD32(input, 40), READ_WORD32(input, 56),
225 READ_WORD32(input, 72), READ_WORD32(input, 88),
226 READ_WORD32(input, 104), READ_WORD32(input, 120)};
227 row3 = (SkinnyVector8x32_t)
228 {READ_WORD32(input, 12), READ_WORD32(input, 28),
229 READ_WORD32(input, 44), READ_WORD32(input, 60),
230 READ_WORD32(input, 76), READ_WORD32(input, 92),
231 READ_WORD32(input, 108), READ_WORD32(input, 124)};
/* One iteration per round, counting down from ks->rounds while the
 * schedule pointer moves forward one entry per round. */
235 for (index = ks->
rounds; index > 0; --index, ++schedule) {
/* S-box over all four rows at once */
237 skinny128_sbox_four(&row0, &row1, &row2, &row3);
/* Round key material is XORed into row0 and row1 only */
240 row0 ^= schedule->
row[0];
241 row1 ^= schedule->
row[1];
/* Rotate row1, row2 and row3 by 8, 16 and 24 bits; row0 is not rotated */
245 row1 = skinny128_rotate_right(row1, 8);
246 row2 = skinny128_rotate_right(row2, 16);
247 row3 = skinny128_rotate_right(row3, 24);
/* Scatter the output.  Under this condition each store writes 32 bytes
 * (two 16-byte groups) through a SkinnyVector8x32U_t pointer, taking
 * lane k of row0..row3 as the four words of group k.
 * NOTE(review): SkinnyVector8x32U_t is presumably the unaligned-access
 * variant of the vector type - confirm in the header. */
260 #if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
261 *((SkinnyVector8x32U_t *)output) =
262 (SkinnyVector8x32_t){row0[0], row1[0], row2[0], row3[0],
263 row0[1], row1[1], row2[1], row3[1]};
264 *((SkinnyVector8x32U_t *)(output + 32)) =
265 (SkinnyVector8x32_t){row0[2], row1[2], row2[2], row3[2],
266 row0[3], row1[3], row2[3], row3[3]};
267 *((SkinnyVector8x32U_t *)(output + 64)) =
268 (SkinnyVector8x32_t){row0[4], row1[4], row2[4], row3[4],
269 row0[5], row1[5], row2[5], row3[5]};
270 *((SkinnyVector8x32U_t *)(output + 96)) =
271 (SkinnyVector8x32_t){row0[6], row1[6], row2[6], row3[6],
272 row0[7], row1[7], row2[7], row3[7]};
/* Word-at-a-time stores using the same lane-to-offset mapping as the
 * vector stores above (lane k of rowN goes to offset 16*k + 4*N).
 * NOTE(review): presumably the #else branch of the condition above; the
 * #else and #endif lines are not visible in this listing. */
274 WRITE_WORD32(output, 0, row0[0]);
275 WRITE_WORD32(output, 4, row1[0]);
276 WRITE_WORD32(output, 8, row2[0]);
277 WRITE_WORD32(output, 12, row3[0]);
278 WRITE_WORD32(output, 16, row0[1]);
279 WRITE_WORD32(output, 20, row1[1]);
280 WRITE_WORD32(output, 24, row2[1]);
281 WRITE_WORD32(output, 28, row3[1]);
282 WRITE_WORD32(output, 32, row0[2]);
283 WRITE_WORD32(output, 36, row1[2]);
284 WRITE_WORD32(output, 40, row2[2]);
285 WRITE_WORD32(output, 44, row3[2]);
286 WRITE_WORD32(output, 48, row0[3]);
287 WRITE_WORD32(output, 52, row1[3]);
288 WRITE_WORD32(output, 56, row2[3]);
289 WRITE_WORD32(output, 60, row3[3]);
290 WRITE_WORD32(output, 64, row0[4]);
291 WRITE_WORD32(output, 68, row1[4]);
292 WRITE_WORD32(output, 72, row2[4]);
293 WRITE_WORD32(output, 76, row3[4]);
294 WRITE_WORD32(output, 80, row0[5]);
295 WRITE_WORD32(output, 84, row1[5]);
296 WRITE_WORD32(output, 88, row2[5]);
297 WRITE_WORD32(output, 92, row3[5]);
298 WRITE_WORD32(output, 96, row0[6]);
299 WRITE_WORD32(output, 100, row1[6]);
300 WRITE_WORD32(output, 104, row2[6]);
301 WRITE_WORD32(output, 108, row3[6]);
302 WRITE_WORD32(output, 112, row0[7]);
303 WRITE_WORD32(output, 116, row1[7]);
304 WRITE_WORD32(output, 120, row2[7]);
305 WRITE_WORD32(output, 124, row3[7]);
/*
 * Decryption routine of the 256-bit vector code path (per its name).
 * The visible statements load 32 words from `input` into four row
 * vectors, run `ks->rounds` rounds over them with the schedule walked
 * backwards, and store 32 words to `output`.
 *
 * NOTE(review): this listing is incomplete.  The parameter list, the
 * braces, the declarations and initialisation of `index` and `schedule`,
 * and the #else/#endif of the output section are not visible.  `temp`
 * is declared but not used by any visible statement, and the listing
 * numbers skip ahead between the loop header and the row rotations, so
 * further per-round statements are presumably missing.  Recover the
 * full text before changing any logic.
 */
309 void _skinny128_parallel_decrypt_vec256
312 SkinnyVector8x32_t row0;
313 SkinnyVector8x32_t row1;
314 SkinnyVector8x32_t row2;
315 SkinnyVector8x32_t row3;
318 SkinnyVector8x32_t temp;
/* Gather the input: lane k of rowN is read from byte offset 16*k + 4*N,
 * i.e. word N of the k-th 16-byte group, for k = 0..7. */
321 row0 = (SkinnyVector8x32_t)
322 {READ_WORD32(input, 0), READ_WORD32(input, 16),
323 READ_WORD32(input, 32), READ_WORD32(input, 48),
324 READ_WORD32(input, 64), READ_WORD32(input, 80),
325 READ_WORD32(input, 96), READ_WORD32(input, 112)};
326 row1 = (SkinnyVector8x32_t)
327 {READ_WORD32(input, 4), READ_WORD32(input, 20),
328 READ_WORD32(input, 36), READ_WORD32(input, 52),
329 READ_WORD32(input, 68), READ_WORD32(input, 84),
330 READ_WORD32(input, 100), READ_WORD32(input, 116)};
331 row2 = (SkinnyVector8x32_t)
332 {READ_WORD32(input, 8), READ_WORD32(input, 24),
333 READ_WORD32(input, 40), READ_WORD32(input, 56),
334 READ_WORD32(input, 72), READ_WORD32(input, 88),
335 READ_WORD32(input, 104), READ_WORD32(input, 120)};
336 row3 = (SkinnyVector8x32_t)
337 {READ_WORD32(input, 12), READ_WORD32(input, 28),
338 READ_WORD32(input, 44), READ_WORD32(input, 60),
339 READ_WORD32(input, 76), READ_WORD32(input, 92),
340 READ_WORD32(input, 108), READ_WORD32(input, 124)};
/* One iteration per round, counting down from ks->rounds while the
 * schedule pointer moves backward one entry per round. */
344 for (index = ks->
rounds; index > 0; --index, --schedule) {
/* Rotate row1, row2 and row3 by 24, 16 and 8 bits; row0 is not rotated.
 * Each count is 32 minus the count used for the same row in the
 * encryption routine. */
355 row1 = skinny128_rotate_right(row1, 24);
356 row2 = skinny128_rotate_right(row2, 16);
357 row3 = skinny128_rotate_right(row3, 8);
/* Round key material is XORed into row0 and row1 only */
360 row0 ^= schedule->
row[0];
361 row1 ^= schedule->
row[1];
/* Inverse S-box over all four rows at once, after the key XOR */
365 skinny128_inv_sbox_four(&row0, &row1, &row2, &row3);
/* Scatter the output.  Under this condition each store writes 32 bytes
 * (two 16-byte groups) through a SkinnyVector8x32U_t pointer, taking
 * lane k of row0..row3 as the four words of group k.
 * NOTE(review): SkinnyVector8x32U_t is presumably the unaligned-access
 * variant of the vector type - confirm in the header. */
369 #if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
370 *((SkinnyVector8x32U_t *)output) =
371 (SkinnyVector8x32_t){row0[0], row1[0], row2[0], row3[0],
372 row0[1], row1[1], row2[1], row3[1]};
373 *((SkinnyVector8x32U_t *)(output + 32)) =
374 (SkinnyVector8x32_t){row0[2], row1[2], row2[2], row3[2],
375 row0[3], row1[3], row2[3], row3[3]};
376 *((SkinnyVector8x32U_t *)(output + 64)) =
377 (SkinnyVector8x32_t){row0[4], row1[4], row2[4], row3[4],
378 row0[5], row1[5], row2[5], row3[5]};
379 *((SkinnyVector8x32U_t *)(output + 96)) =
380 (SkinnyVector8x32_t){row0[6], row1[6], row2[6], row3[6],
381 row0[7], row1[7], row2[7], row3[7]};
/* Word-at-a-time stores using the same lane-to-offset mapping as the
 * vector stores above (lane k of rowN goes to offset 16*k + 4*N).
 * NOTE(review): presumably the #else branch of the condition above; the
 * #else and #endif lines are not visible in this listing. */
383 WRITE_WORD32(output, 0, row0[0]);
384 WRITE_WORD32(output, 4, row1[0]);
385 WRITE_WORD32(output, 8, row2[0]);
386 WRITE_WORD32(output, 12, row3[0]);
387 WRITE_WORD32(output, 16, row0[1]);
388 WRITE_WORD32(output, 20, row1[1]);
389 WRITE_WORD32(output, 24, row2[1]);
390 WRITE_WORD32(output, 28, row3[1]);
391 WRITE_WORD32(output, 32, row0[2]);
392 WRITE_WORD32(output, 36, row1[2]);
393 WRITE_WORD32(output, 40, row2[2]);
394 WRITE_WORD32(output, 44, row3[2]);
395 WRITE_WORD32(output, 48, row0[3]);
396 WRITE_WORD32(output, 52, row1[3]);
397 WRITE_WORD32(output, 56, row2[3]);
398 WRITE_WORD32(output, 60, row3[3]);
399 WRITE_WORD32(output, 64, row0[4]);
400 WRITE_WORD32(output, 68, row1[4]);
401 WRITE_WORD32(output, 72, row2[4]);
402 WRITE_WORD32(output, 76, row3[4]);
403 WRITE_WORD32(output, 80, row0[5]);
404 WRITE_WORD32(output, 84, row1[5]);
405 WRITE_WORD32(output, 88, row2[5]);
406 WRITE_WORD32(output, 92, row3[5]);
407 WRITE_WORD32(output, 96, row0[6]);
408 WRITE_WORD32(output, 100, row1[6]);
409 WRITE_WORD32(output, 104, row2[6]);
410 WRITE_WORD32(output, 108, row3[6]);
411 WRITE_WORD32(output, 112, row0[7]);
412 WRITE_WORD32(output, 116, row1[7]);
413 WRITE_WORD32(output, 120, row2[7]);
414 WRITE_WORD32(output, 124, row3[7]);
422 void _skinny128_parallel_encrypt_vec256
430 void _skinny128_parallel_decrypt_vec256
Union that describes a 64-bit 2x4 array of cells.
Skinny128HalfCells_t schedule[SKINNY128_MAX_ROUNDS]
Key schedule for Skinny128 block ciphers.