Skinny-C
 All Data Structures Files Functions Variables Groups Pages
skinny128-parallel-vec256.c
1 /*
2  * Copyright (C) 2017 Southern Storm Software, Pty Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "skinny128-parallel.h"
24 #include "skinny-internal.h"
25 
26 #if SKINNY_VEC256_MATH
27 
28 STATIC_INLINE SkinnyVector8x32_t skinny128_rotate_right
29  (SkinnyVector8x32_t x, unsigned count)
30 {
31  /* Note: we are rotating the cells right, which actually moves
32  the values up closer to the MSB. That is, we do a left shift
33  on the word to rotate the cells in the word right */
34  return (x << count) | (x >> (32 - count));
35 }
36 
37 /* This function evaluates the S-box on four 256-bit vectors in parallel
38  by interleaving the operations. This tends to make better use of YMM
39  registers on x86-64 CPU's that have AVX2 support or better as the CPU
40  can schedule unrelated operations to operate in parallel. */
STATIC_INLINE void skinny128_sbox_four
    (SkinnyVector8x32_t *u, SkinnyVector8x32_t *v,
     SkinnyVector8x32_t *s, SkinnyVector8x32_t *t)
{
    /* Load the four state rows into locals.  Each 32-bit lane packs
       four 8-bit S-box cells, so every mask below repeats one byte
       pattern four times (e.g. 0x11111111 picks bit 0 of each cell). */
    SkinnyVector8x32_t x1 = *u;
    SkinnyVector8x32_t y1;
    SkinnyVector8x32_t x2 = *v;
    SkinnyVector8x32_t y2;
    SkinnyVector8x32_t x3 = *s;
    SkinnyVector8x32_t y3;
    SkinnyVector8x32_t x4 = *t;
    SkinnyVector8x32_t y4;

    /* The S-box is evaluated bit-sliced: each step NOR's two shifted
       copies of the state, masks the result down to a single bit
       position per cell, and XOR's it back in.  Every step is written
       out four times (x1..x4) so the compiler can interleave the four
       independent dependency chains across the YMM registers. */
    x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x11111111U);
    x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x11111111U);
    x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x11111111U);
    x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x11111111U);

    y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
    y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
    y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
    y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);

    x1 ^= ((~((x1 << 5) | (x1 << 4))) & 0x40404040U) ^ y1;
    x2 ^= ((~((x2 << 5) | (x2 << 4))) & 0x40404040U) ^ y2;
    x3 ^= ((~((x3 << 5) | (x3 << 4))) & 0x40404040U) ^ y3;
    x4 ^= ((~((x4 << 5) | (x4 << 4))) & 0x40404040U) ^ y4;

    y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
    y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
    y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
    y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);

    x1 ^= ((~((x1 >> 2) | (x1 << 1))) & 0x02020202U) ^ y1;
    x2 ^= ((~((x2 >> 2) | (x2 << 1))) & 0x02020202U) ^ y2;
    x3 ^= ((~((x3 >> 2) | (x3 << 1))) & 0x02020202U) ^ y3;
    x4 ^= ((~((x4 >> 2) | (x4 << 1))) & 0x02020202U) ^ y4;

    y1 = ((~((x1 >> 5) | (x1 << 1))) & 0x04040404U);
    y2 = ((~((x2 >> 5) | (x2 << 1))) & 0x04040404U);
    y3 = ((~((x3 >> 5) | (x3 << 1))) & 0x04040404U);
    y4 = ((~((x4 >> 5) | (x4 << 1))) & 0x04040404U);

    x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
    x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
    x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;

    /* Finish with the S-box's fixed bit permutation: every bit of
       each cell is moved to its output position with a mask + shift,
       and the six disjoint pieces are OR'ed back together.  (0x32
       covers bits 1, 4, and 5, which all shift left by 2.) */
    *u = ((x1 & 0x08080808U) << 1) |
         ((x1 & 0x32323232U) << 2) |
         ((x1 & 0x01010101U) << 5) |
         ((x1 & 0x80808080U) >> 6) |
         ((x1 & 0x40404040U) >> 4) |
         ((x1 & 0x04040404U) >> 2);

    *v = ((x2 & 0x08080808U) << 1) |
         ((x2 & 0x32323232U) << 2) |
         ((x2 & 0x01010101U) << 5) |
         ((x2 & 0x80808080U) >> 6) |
         ((x2 & 0x40404040U) >> 4) |
         ((x2 & 0x04040404U) >> 2);

    *s = ((x3 & 0x08080808U) << 1) |
         ((x3 & 0x32323232U) << 2) |
         ((x3 & 0x01010101U) << 5) |
         ((x3 & 0x80808080U) >> 6) |
         ((x3 & 0x40404040U) >> 4) |
         ((x3 & 0x04040404U) >> 2);

    *t = ((x4 & 0x08080808U) << 1) |
         ((x4 & 0x32323232U) << 2) |
         ((x4 & 0x01010101U) << 5) |
         ((x4 & 0x80808080U) >> 6) |
         ((x4 & 0x40404040U) >> 4) |
         ((x4 & 0x04040404U) >> 2);
}
117 
/* Inverse of skinny128_sbox_four: evaluates the inverse S-box on four
   256-bit vectors in parallel.  The bit-sliced NOR/XOR steps of the
   forward S-box are undone in reverse order, followed by the inverse
   of the forward bit permutation. */
STATIC_INLINE void skinny128_inv_sbox_four
    (SkinnyVector8x32_t *u, SkinnyVector8x32_t *v,
     SkinnyVector8x32_t *s, SkinnyVector8x32_t *t)
{
    /* Load the four state rows into locals; each 32-bit lane packs
       four 8-bit cells, so the masks repeat one byte pattern. */
    SkinnyVector8x32_t x1 = *u;
    SkinnyVector8x32_t y1;
    SkinnyVector8x32_t x2 = *v;
    SkinnyVector8x32_t y2;
    SkinnyVector8x32_t x3 = *s;
    SkinnyVector8x32_t y3;
    SkinnyVector8x32_t x4 = *t;
    SkinnyVector8x32_t y4;

    /* Undo the forward NOR/XOR steps in reverse order.  As in the
       forward direction, each step is repeated for all four vectors
       so the CPU can schedule the independent chains in parallel. */
    y1 = ((~((x1 >> 1) | (x1 >> 3))) & 0x01010101U);
    y2 = ((~((x2 >> 1) | (x2 >> 3))) & 0x01010101U);
    y3 = ((~((x3 >> 1) | (x3 >> 3))) & 0x01010101U);
    y4 = ((~((x4 >> 1) | (x4 >> 3))) & 0x01010101U);

    x1 ^= ((~((x1 >> 2) | (x1 >> 3))) & 0x10101010U) ^ y1;
    x2 ^= ((~((x2 >> 2) | (x2 >> 3))) & 0x10101010U) ^ y2;
    x3 ^= ((~((x3 >> 2) | (x3 >> 3))) & 0x10101010U) ^ y3;
    x4 ^= ((~((x4 >> 2) | (x4 >> 3))) & 0x10101010U) ^ y4;

    y1 = ((~((x1 >> 6) | (x1 >> 1))) & 0x02020202U);
    y2 = ((~((x2 >> 6) | (x2 >> 1))) & 0x02020202U);
    y3 = ((~((x3 >> 6) | (x3 >> 1))) & 0x02020202U);
    y4 = ((~((x4 >> 6) | (x4 >> 1))) & 0x02020202U);

    x1 ^= ((~((x1 >> 1) | (x1 >> 2))) & 0x08080808U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 >> 2))) & 0x08080808U) ^ y2;
    x3 ^= ((~((x3 >> 1) | (x3 >> 2))) & 0x08080808U) ^ y3;
    x4 ^= ((~((x4 >> 1) | (x4 >> 2))) & 0x08080808U) ^ y4;

    y1 = ((~((x1 << 2) | (x1 << 1))) & 0x80808080U);
    y2 = ((~((x2 << 2) | (x2 << 1))) & 0x80808080U);
    y3 = ((~((x3 << 2) | (x3 << 1))) & 0x80808080U);
    y4 = ((~((x4 << 2) | (x4 << 1))) & 0x80808080U);

    x1 ^= ((~((x1 >> 1) | (x1 << 2))) & 0x04040404U) ^ y1;
    x2 ^= ((~((x2 >> 1) | (x2 << 2))) & 0x04040404U) ^ y2;
    x3 ^= ((~((x3 >> 1) | (x3 << 2))) & 0x04040404U) ^ y3;
    x4 ^= ((~((x4 >> 1) | (x4 << 2))) & 0x04040404U) ^ y4;

    y1 = ((~((x1 << 5) | (x1 << 1))) & 0x20202020U);
    y2 = ((~((x2 << 5) | (x2 << 1))) & 0x20202020U);
    y3 = ((~((x3 << 5) | (x3 << 1))) & 0x20202020U);
    y4 = ((~((x4 << 5) | (x4 << 1))) & 0x20202020U);

    x1 ^= ((~((x1 << 4) | (x1 << 5))) & 0x40404040U) ^ y1;
    x2 ^= ((~((x2 << 4) | (x2 << 5))) & 0x40404040U) ^ y2;
    x3 ^= ((~((x3 << 4) | (x3 << 5))) & 0x40404040U) ^ y3;
    x4 ^= ((~((x4 << 4) | (x4 << 5))) & 0x40404040U) ^ y4;

    /* Apply the inverse of the forward S-box's bit permutation:
       every bit is routed back to its original position and the six
       disjoint pieces OR'ed together (0xC8 covers bits 3, 6, and 7,
       which all shift right by 2). */
    *u = ((x1 & 0x01010101U) << 2) |
         ((x1 & 0x04040404U) << 4) |
         ((x1 & 0x02020202U) << 6) |
         ((x1 & 0x20202020U) >> 5) |
         ((x1 & 0xC8C8C8C8U) >> 2) |
         ((x1 & 0x10101010U) >> 1);

    *v = ((x2 & 0x01010101U) << 2) |
         ((x2 & 0x04040404U) << 4) |
         ((x2 & 0x02020202U) << 6) |
         ((x2 & 0x20202020U) >> 5) |
         ((x2 & 0xC8C8C8C8U) >> 2) |
         ((x2 & 0x10101010U) >> 1);

    *s = ((x3 & 0x01010101U) << 2) |
         ((x3 & 0x04040404U) << 4) |
         ((x3 & 0x02020202U) << 6) |
         ((x3 & 0x20202020U) >> 5) |
         ((x3 & 0xC8C8C8C8U) >> 2) |
         ((x3 & 0x10101010U) >> 1);

    *t = ((x4 & 0x01010101U) << 2) |
         ((x4 & 0x04040404U) << 4) |
         ((x4 & 0x02020202U) << 6) |
         ((x4 & 0x20202020U) >> 5) |
         ((x4 & 0xC8C8C8C8U) >> 2) |
         ((x4 & 0x10101010U) >> 1);
}
199 
/* Encrypt eight consecutive 16-byte Skinny-128 blocks in parallel
   using 256-bit vector math.  "input" and "output" are 128 bytes
   each; "ks" holds the expanded key schedule and round count. */
void _skinny128_parallel_encrypt_vec256
    (void *output, const void *input, const Skinny128Key_t *ks)
{
    /* rowN holds word N (byte offset 4*N) of all eight blocks, one
       block per 32-bit vector lane. */
    SkinnyVector8x32_t row0;
    SkinnyVector8x32_t row1;
    SkinnyVector8x32_t row2;
    SkinnyVector8x32_t row3;
    const Skinny128HalfCells_t *schedule;
    unsigned index;
    SkinnyVector8x32_t temp;

    /* Read the rows of all eight blocks into memory */
    row0 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 0), READ_WORD32(input, 16),
         READ_WORD32(input, 32), READ_WORD32(input, 48),
         READ_WORD32(input, 64), READ_WORD32(input, 80),
         READ_WORD32(input, 96), READ_WORD32(input, 112)};
    row1 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 4), READ_WORD32(input, 20),
         READ_WORD32(input, 36), READ_WORD32(input, 52),
         READ_WORD32(input, 68), READ_WORD32(input, 84),
         READ_WORD32(input, 100), READ_WORD32(input, 116)};
    row2 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 8), READ_WORD32(input, 24),
         READ_WORD32(input, 40), READ_WORD32(input, 56),
         READ_WORD32(input, 72), READ_WORD32(input, 88),
         READ_WORD32(input, 104), READ_WORD32(input, 120)};
    row3 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 12), READ_WORD32(input, 28),
         READ_WORD32(input, 44), READ_WORD32(input, 60),
         READ_WORD32(input, 76), READ_WORD32(input, 92),
         READ_WORD32(input, 108), READ_WORD32(input, 124)};

    /* Perform all encryption rounds on the eight blocks in parallel */
    schedule = ks->schedule;
    for (index = ks->rounds; index > 0; --index, ++schedule) {
        /* Apply the S-box to all bytes in the state */
        skinny128_sbox_four(&row0, &row1, &row2, &row3);

        /* Apply the subkey for this round.  Only the top two rows
           are keyed; row2 absorbs the fixed constant 0x02.
           NOTE(review): the per-round constants appear to be folded
           into schedule->row[0..1] by the key setup -- confirm there. */
        row0 ^= schedule->row[0];
        row1 ^= schedule->row[1];
        row2 ^= 0x02;

        /* Shift the rows: row N rotates right by N cells (8*N bits) */
        row1 = skinny128_rotate_right(row1, 8);
        row2 = skinny128_rotate_right(row2, 16);
        row3 = skinny128_rotate_right(row3, 24);

        /* Mix the columns: XOR rows together, then rotate the row
           registers so row0 picks up the combined value */
        row1 ^= row2;
        row2 ^= row0;
        temp = row3 ^ row2;
        row3 = row2;
        row2 = row1;
        row1 = row0;
        row0 = temp;
    }

    /* Write the rows of all eight blocks back to memory */
#if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
    /* Fast path: shuffle the rows back into block order with four
       unaligned 256-bit stores.  NOTE(review): "output + 32" is
       arithmetic on a void pointer, a GNU C extension. */
    *((SkinnyVector8x32U_t *)output) =
        (SkinnyVector8x32_t){row0[0], row1[0], row2[0], row3[0],
                             row0[1], row1[1], row2[1], row3[1]};
    *((SkinnyVector8x32U_t *)(output + 32)) =
        (SkinnyVector8x32_t){row0[2], row1[2], row2[2], row3[2],
                             row0[3], row1[3], row2[3], row3[3]};
    *((SkinnyVector8x32U_t *)(output + 64)) =
        (SkinnyVector8x32_t){row0[4], row1[4], row2[4], row3[4],
                             row0[5], row1[5], row2[5], row3[5]};
    *((SkinnyVector8x32U_t *)(output + 96)) =
        (SkinnyVector8x32_t){row0[6], row1[6], row2[6], row3[6],
                             row0[7], row1[7], row2[7], row3[7]};
#else
    /* Portable path: scatter each lane back to its block word by word */
    WRITE_WORD32(output, 0, row0[0]);
    WRITE_WORD32(output, 4, row1[0]);
    WRITE_WORD32(output, 8, row2[0]);
    WRITE_WORD32(output, 12, row3[0]);
    WRITE_WORD32(output, 16, row0[1]);
    WRITE_WORD32(output, 20, row1[1]);
    WRITE_WORD32(output, 24, row2[1]);
    WRITE_WORD32(output, 28, row3[1]);
    WRITE_WORD32(output, 32, row0[2]);
    WRITE_WORD32(output, 36, row1[2]);
    WRITE_WORD32(output, 40, row2[2]);
    WRITE_WORD32(output, 44, row3[2]);
    WRITE_WORD32(output, 48, row0[3]);
    WRITE_WORD32(output, 52, row1[3]);
    WRITE_WORD32(output, 56, row2[3]);
    WRITE_WORD32(output, 60, row3[3]);
    WRITE_WORD32(output, 64, row0[4]);
    WRITE_WORD32(output, 68, row1[4]);
    WRITE_WORD32(output, 72, row2[4]);
    WRITE_WORD32(output, 76, row3[4]);
    WRITE_WORD32(output, 80, row0[5]);
    WRITE_WORD32(output, 84, row1[5]);
    WRITE_WORD32(output, 88, row2[5]);
    WRITE_WORD32(output, 92, row3[5]);
    WRITE_WORD32(output, 96, row0[6]);
    WRITE_WORD32(output, 100, row1[6]);
    WRITE_WORD32(output, 104, row2[6]);
    WRITE_WORD32(output, 108, row3[6]);
    WRITE_WORD32(output, 112, row0[7]);
    WRITE_WORD32(output, 116, row1[7]);
    WRITE_WORD32(output, 120, row2[7]);
    WRITE_WORD32(output, 124, row3[7]);
#endif
}
308 
/* Decrypt eight consecutive 16-byte Skinny-128 blocks in parallel
   using 256-bit vector math.  Exact inverse of
   _skinny128_parallel_encrypt_vec256: the round steps run in reverse
   order and the key schedule is walked backwards. */
void _skinny128_parallel_decrypt_vec256
    (void *output, const void *input, const Skinny128Key_t *ks)
{
    /* rowN holds word N (byte offset 4*N) of all eight blocks, one
       block per 32-bit vector lane. */
    SkinnyVector8x32_t row0;
    SkinnyVector8x32_t row1;
    SkinnyVector8x32_t row2;
    SkinnyVector8x32_t row3;
    const Skinny128HalfCells_t *schedule;
    unsigned index;
    SkinnyVector8x32_t temp;

    /* Read the rows of all eight blocks into memory */
    row0 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 0), READ_WORD32(input, 16),
         READ_WORD32(input, 32), READ_WORD32(input, 48),
         READ_WORD32(input, 64), READ_WORD32(input, 80),
         READ_WORD32(input, 96), READ_WORD32(input, 112)};
    row1 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 4), READ_WORD32(input, 20),
         READ_WORD32(input, 36), READ_WORD32(input, 52),
         READ_WORD32(input, 68), READ_WORD32(input, 84),
         READ_WORD32(input, 100), READ_WORD32(input, 116)};
    row2 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 8), READ_WORD32(input, 24),
         READ_WORD32(input, 40), READ_WORD32(input, 56),
         READ_WORD32(input, 72), READ_WORD32(input, 88),
         READ_WORD32(input, 104), READ_WORD32(input, 120)};
    row3 = (SkinnyVector8x32_t)
        {READ_WORD32(input, 12), READ_WORD32(input, 28),
         READ_WORD32(input, 44), READ_WORD32(input, 60),
         READ_WORD32(input, 76), READ_WORD32(input, 92),
         READ_WORD32(input, 108), READ_WORD32(input, 124)};

    /* Perform all decryption rounds on the eight blocks in parallel,
       starting from the last round key and walking the schedule back */
    schedule = &(ks->schedule[ks->rounds - 1]);
    for (index = ks->rounds; index > 0; --index, --schedule) {
        /* Inverse mix of the columns: rotate the row registers back
           and XOR in the reverse of the forward mixing pattern */
        temp = row3;
        row3 = row0;
        row0 = row1;
        row1 = row2;
        row3 ^= temp;
        row2 = temp ^ row0;
        row1 ^= row2;

        /* Inverse shift of the rows: row N rotates LEFT by N cells,
           expressed as a right rotation by (4 - N) cells */
        row1 = skinny128_rotate_right(row1, 24);
        row2 = skinny128_rotate_right(row2, 16);
        row3 = skinny128_rotate_right(row3, 8);

        /* Remove the subkey for this round (XOR is self-inverse).
           Only the top two rows are keyed; row2 absorbs the fixed
           constant 0x02, matching the encrypt direction. */
        row0 ^= schedule->row[0];
        row1 ^= schedule->row[1];
        row2 ^= 0x02;

        /* Apply the inverse S-box to all bytes in the state */
        skinny128_inv_sbox_four(&row0, &row1, &row2, &row3);
    }

    /* Write the rows of all eight blocks back to memory */
#if SKINNY_LITTLE_ENDIAN && SKINNY_UNALIGNED
    /* Fast path: shuffle the rows back into block order with four
       unaligned 256-bit stores.  NOTE(review): "output + 32" is
       arithmetic on a void pointer, a GNU C extension. */
    *((SkinnyVector8x32U_t *)output) =
        (SkinnyVector8x32_t){row0[0], row1[0], row2[0], row3[0],
                             row0[1], row1[1], row2[1], row3[1]};
    *((SkinnyVector8x32U_t *)(output + 32)) =
        (SkinnyVector8x32_t){row0[2], row1[2], row2[2], row3[2],
                             row0[3], row1[3], row2[3], row3[3]};
    *((SkinnyVector8x32U_t *)(output + 64)) =
        (SkinnyVector8x32_t){row0[4], row1[4], row2[4], row3[4],
                             row0[5], row1[5], row2[5], row3[5]};
    *((SkinnyVector8x32U_t *)(output + 96)) =
        (SkinnyVector8x32_t){row0[6], row1[6], row2[6], row3[6],
                             row0[7], row1[7], row2[7], row3[7]};
#else
    /* Portable path: scatter each lane back to its block word by word */
    WRITE_WORD32(output, 0, row0[0]);
    WRITE_WORD32(output, 4, row1[0]);
    WRITE_WORD32(output, 8, row2[0]);
    WRITE_WORD32(output, 12, row3[0]);
    WRITE_WORD32(output, 16, row0[1]);
    WRITE_WORD32(output, 20, row1[1]);
    WRITE_WORD32(output, 24, row2[1]);
    WRITE_WORD32(output, 28, row3[1]);
    WRITE_WORD32(output, 32, row0[2]);
    WRITE_WORD32(output, 36, row1[2]);
    WRITE_WORD32(output, 40, row2[2]);
    WRITE_WORD32(output, 44, row3[2]);
    WRITE_WORD32(output, 48, row0[3]);
    WRITE_WORD32(output, 52, row1[3]);
    WRITE_WORD32(output, 56, row2[3]);
    WRITE_WORD32(output, 60, row3[3]);
    WRITE_WORD32(output, 64, row0[4]);
    WRITE_WORD32(output, 68, row1[4]);
    WRITE_WORD32(output, 72, row2[4]);
    WRITE_WORD32(output, 76, row3[4]);
    WRITE_WORD32(output, 80, row0[5]);
    WRITE_WORD32(output, 84, row1[5]);
    WRITE_WORD32(output, 88, row2[5]);
    WRITE_WORD32(output, 92, row3[5]);
    WRITE_WORD32(output, 96, row0[6]);
    WRITE_WORD32(output, 100, row1[6]);
    WRITE_WORD32(output, 104, row2[6]);
    WRITE_WORD32(output, 108, row3[6]);
    WRITE_WORD32(output, 112, row0[7]);
    WRITE_WORD32(output, 116, row1[7]);
    WRITE_WORD32(output, 120, row2[7]);
    WRITE_WORD32(output, 124, row3[7]);
#endif
}
417 
418 #else /* !SKINNY_VEC256_MATH */
419 
420 /* Stubbed out */
421 
422 void _skinny128_parallel_encrypt_vec256
423  (void *output, const void *input, const Skinny128Key_t *ks)
424 {
425  (void)output;
426  (void)input;
427  (void)ks;
428 }
429 
430 void _skinny128_parallel_decrypt_vec256
431  (void *output, const void *input, const Skinny128Key_t *ks)
432 {
433  (void)output;
434  (void)input;
435  (void)ks;
436 }
437 
438 #endif /* !SKINNY_VEC256_MATH */
Union that describes a 64-bit 2x4 array of cells.
Skinny128HalfCells_t schedule[SKINNY128_MAX_ROUNDS]
Key schedule for Skinny128 block ciphers.