23 #include "skinny64-parallel.h"
24 #include "skinny-internal.h"
26 #if SKINNY_VEC128_MATH
/*
 * Rotates x right by count bits within a 16-bit width: the result is
 * (x >> count) OR'd with (x << (16 - count)).
 *
 * NOTE(review): presumably the shifts act independently on each element
 * of SkinnyVector8x16_t - confirm against the type's definition, which
 * is not part of this listing.
 * NOTE(review): the callers visible in this listing only pass 4, 8 and
 * 12.  A count of 0 would shift left by the full 16 bits; confirm that
 * is well-defined for the vector type before relying on it.
 * NOTE(review): the embedded listing numbers skip 30, 32 and 33, so the
 * function's braces are not present in this view.
 */
28 STATIC_INLINE SkinnyVector8x16_t skinny64_rotate_right
29 (SkinnyVector8x16_t x,
unsigned count)
31 return (x >> count) | (x << (16 - count));
/*
 * Forward S-box helper operating on a whole SkinnyVector8x16_t at once.
 *
 * Visible steps: invert x, take copies of the inverted value shifted
 * right by 1, 2 and 3 bits, then rebuild x by shifting the copies back
 * up and masking each one to a single bit position of every 4-bit group
 * (0x8888 = bit 3, 0x2222 = bit 1, 0x4444 = bit 2).
 *
 * NOTE(review): listing lines 40-43, 45 and 48-49 are absent from this
 * view.  The step that combines bit0..bit3 with each other, the OR term
 * that would use bit1, and the return statement are therefore not shown
 * here; consult the complete source before changing this function.
 */
34 STATIC_INLINE SkinnyVector8x16_t skinny64_sbox(SkinnyVector8x16_t x)
/* Work on the complement of the input; bitN has input bit N of each
 * 4-bit group moved down to that group's lowest position. */
36 SkinnyVector8x16_t bit0 = ~x;
37 SkinnyVector8x16_t bit1 = bit0 >> 1;
38 SkinnyVector8x16_t bit2 = bit0 >> 2;
39 SkinnyVector8x16_t bit3 = bit0 >> 3;
/* Reassemble: bit0 lands in position 3, bit2 in position 1 and bit3 in
 * position 2 of every 4-bit group. */
44 x = ((bit0 << 3) & 0x8888U) |
46 ((bit2 << 1) & 0x2222U) |
47 ((bit3 << 2) & 0x4444U);
/*
 * Inverse S-box helper operating on a whole SkinnyVector8x16_t at once.
 *
 * Visible steps: invert x, take copies of the inverted value shifted
 * right by 1, 2 and 3 bits, then rebuild x by shifting the copies back
 * up and masking each one to a single bit position of every 4-bit group
 * (0x2222 = bit 1, 0x4444 = bit 2, 0x8888 = bit 3).
 *
 * NOTE(review): listing lines 57-60 and 64-66 are absent from this view.
 * The step that combines bit0..bit3 with each other, the final OR term
 * (the expression below ends on a dangling '|') and the return statement
 * are therefore not shown here; consult the complete source before
 * changing this function.
 */
51 STATIC_INLINE SkinnyVector8x16_t skinny64_inv_sbox(SkinnyVector8x16_t x)
/* Work on the complement of the input; bitN has input bit N of each
 * 4-bit group moved down to that group's lowest position. */
53 SkinnyVector8x16_t bit0 = ~x;
54 SkinnyVector8x16_t bit1 = bit0 >> 1;
55 SkinnyVector8x16_t bit2 = bit0 >> 2;
56 SkinnyVector8x16_t bit3 = bit0 >> 3;
/* Reassemble: bit0 lands in position 1, bit1 in position 2 and bit2 in
 * position 3 of every 4-bit group. */
61 x = ((bit0 << 1) & 0x2222U) |
62 ((bit1 << 2) & 0x4444U) |
63 ((bit2 << 3) & 0x8888U) |
/*
 * Encrypts 64 bytes from input into output using four 8-element vectors
 * of 16-bit words (row0..row3).
 *
 * Data layout established by the loads and stores below: the 16-bit word
 * at byte offset 8*k + 2*r is element k of row r, for k = 0..7 and
 * r = 0..3.  Each 8-byte group of the buffer therefore supplies one
 * element to each of the four rows.
 *
 * NOTE(review): listing lines 69-70, 75-76, 78-79 and 100-102 are absent
 * from this view, so the parameter list, the declarations of index and
 * schedule, and the initial value of schedule are not shown.  The names
 * input, output and ks are used below and are presumably parameters -
 * confirm against the complete source.
 * NOTE(review): the byte order applied by READ_WORD16 / WRITE_WORD16 is
 * defined elsewhere and cannot be determined from this listing.
 */
68 void _skinny64_parallel_encrypt_vec128
71 SkinnyVector8x16_t row0;
72 SkinnyVector8x16_t row1;
73 SkinnyVector8x16_t row2;
74 SkinnyVector8x16_t row3;
/* temp is not referenced on any line visible in this listing; presumably
 * it is used by the absent lines 119-130 - TODO confirm. */
77 SkinnyVector8x16_t temp;
/* Gather row r from byte offsets 2*r, 2*r + 8, ..., 2*r + 56. */
80 row0 = (SkinnyVector8x16_t)
81 {READ_WORD16(input, 0), READ_WORD16(input, 8),
82 READ_WORD16(input, 16), READ_WORD16(input, 24),
83 READ_WORD16(input, 32), READ_WORD16(input, 40),
84 READ_WORD16(input, 48), READ_WORD16(input, 56)};
85 row1 = (SkinnyVector8x16_t)
86 {READ_WORD16(input, 2), READ_WORD16(input, 10),
87 READ_WORD16(input, 18), READ_WORD16(input, 26),
88 READ_WORD16(input, 34), READ_WORD16(input, 42),
89 READ_WORD16(input, 50), READ_WORD16(input, 58)};
90 row2 = (SkinnyVector8x16_t)
91 {READ_WORD16(input, 4), READ_WORD16(input, 12),
92 READ_WORD16(input, 20), READ_WORD16(input, 28),
93 READ_WORD16(input, 36), READ_WORD16(input, 44),
94 READ_WORD16(input, 52), READ_WORD16(input, 60)};
95 row3 = (SkinnyVector8x16_t)
96 {READ_WORD16(input, 6), READ_WORD16(input, 14),
97 READ_WORD16(input, 22), READ_WORD16(input, 30),
98 READ_WORD16(input, 38), READ_WORD16(input, 46),
99 READ_WORD16(input, 54), READ_WORD16(input, 62)};
/* One iteration per round: index counts down from ks->rounds while
 * schedule advances by one entry each time. */
103 for (index = ks->
rounds; index > 0; --index, ++schedule) {
/* Apply the S-box helper to all four rows. */
105 row0 = skinny64_sbox(row0);
106 row1 = skinny64_sbox(row1);
107 row2 = skinny64_sbox(row2);
108 row3 = skinny64_sbox(row3);
/* XOR the current schedule entry into row0 and row1.  Listing lines
 * 113-115 are absent, so anything further done at this step is not
 * visible here. */
111 row0 ^= schedule->
row[0];
112 row1 ^= schedule->
row[1];
/* Rotate row1, row2 and row3 right by 4, 8 and 12 bits; row0 is not
 * rotated on the visible lines. */
116 row1 = skinny64_rotate_right(row1, 4);
117 row2 = skinny64_rotate_right(row2, 8);
118 row3 = skinny64_rotate_right(row3, 12);
/* NOTE(review): listing lines 119-130 (the remainder of the round body
 * and the end of the loop) are absent from this view. */
/* Scatter the rows back using the same layout as the loads: element k of
 * row r is written at byte offset 8*k + 2*r. */
131 WRITE_WORD16(output, 0, row0[0]);
132 WRITE_WORD16(output, 2, row1[0]);
133 WRITE_WORD16(output, 4, row2[0]);
134 WRITE_WORD16(output, 6, row3[0]);
135 WRITE_WORD16(output, 8, row0[1]);
136 WRITE_WORD16(output, 10, row1[1]);
137 WRITE_WORD16(output, 12, row2[1]);
138 WRITE_WORD16(output, 14, row3[1]);
139 WRITE_WORD16(output, 16, row0[2]);
140 WRITE_WORD16(output, 18, row1[2]);
141 WRITE_WORD16(output, 20, row2[2]);
142 WRITE_WORD16(output, 22, row3[2]);
143 WRITE_WORD16(output, 24, row0[3]);
144 WRITE_WORD16(output, 26, row1[3]);
145 WRITE_WORD16(output, 28, row2[3]);
146 WRITE_WORD16(output, 30, row3[3]);
147 WRITE_WORD16(output, 32, row0[4]);
148 WRITE_WORD16(output, 34, row1[4]);
149 WRITE_WORD16(output, 36, row2[4]);
150 WRITE_WORD16(output, 38, row3[4]);
151 WRITE_WORD16(output, 40, row0[5]);
152 WRITE_WORD16(output, 42, row1[5]);
153 WRITE_WORD16(output, 44, row2[5]);
154 WRITE_WORD16(output, 46, row3[5]);
155 WRITE_WORD16(output, 48, row0[6]);
156 WRITE_WORD16(output, 50, row1[6]);
157 WRITE_WORD16(output, 52, row2[6]);
158 WRITE_WORD16(output, 54, row3[6]);
159 WRITE_WORD16(output, 56, row0[7]);
160 WRITE_WORD16(output, 58, row1[7]);
161 WRITE_WORD16(output, 60, row2[7]);
162 WRITE_WORD16(output, 62, row3[7]);
/*
 * Decrypts 64 bytes from input into output using four 8-element vectors
 * of 16-bit words (row0..row3).
 *
 * Data layout established by the loads and stores below: the 16-bit word
 * at byte offset 8*k + 2*r is element k of row r, for k = 0..7 and
 * r = 0..3.
 *
 * The visible round steps run in the reverse order of the encrypt
 * routine in this file: rotate rows, XOR the schedule entry, then apply
 * the inverse S-box helper, while schedule is walked backwards.
 *
 * NOTE(review): listing lines 166-167, 172-173, 175-176 and 197-199 are
 * absent from this view, so the parameter list, the declarations of
 * index and schedule, and the starting value of schedule are not shown.
 * The names input, output and ks are used below and are presumably
 * parameters - confirm against the complete source.
 * NOTE(review): the byte order applied by READ_WORD16 / WRITE_WORD16 is
 * defined elsewhere and cannot be determined from this listing.
 */
165 void _skinny64_parallel_decrypt_vec128
168 SkinnyVector8x16_t row0;
169 SkinnyVector8x16_t row1;
170 SkinnyVector8x16_t row2;
171 SkinnyVector8x16_t row3;
/* temp is not referenced on any line visible in this listing; presumably
 * it is used by the absent lines 201-210 - TODO confirm. */
174 SkinnyVector8x16_t temp;
/* Gather row r from byte offsets 2*r, 2*r + 8, ..., 2*r + 56. */
177 row0 = (SkinnyVector8x16_t)
178 {READ_WORD16(input, 0), READ_WORD16(input, 8),
179 READ_WORD16(input, 16), READ_WORD16(input, 24),
180 READ_WORD16(input, 32), READ_WORD16(input, 40),
181 READ_WORD16(input, 48), READ_WORD16(input, 56)};
182 row1 = (SkinnyVector8x16_t)
183 {READ_WORD16(input, 2), READ_WORD16(input, 10),
184 READ_WORD16(input, 18), READ_WORD16(input, 26),
185 READ_WORD16(input, 34), READ_WORD16(input, 42),
186 READ_WORD16(input, 50), READ_WORD16(input, 58)};
187 row2 = (SkinnyVector8x16_t)
188 {READ_WORD16(input, 4), READ_WORD16(input, 12),
189 READ_WORD16(input, 20), READ_WORD16(input, 28),
190 READ_WORD16(input, 36), READ_WORD16(input, 44),
191 READ_WORD16(input, 52), READ_WORD16(input, 60)};
192 row3 = (SkinnyVector8x16_t)
193 {READ_WORD16(input, 6), READ_WORD16(input, 14),
194 READ_WORD16(input, 22), READ_WORD16(input, 30),
195 READ_WORD16(input, 38), READ_WORD16(input, 46),
196 READ_WORD16(input, 54), READ_WORD16(input, 62)};
/* One iteration per round: index counts down from ks->rounds while
 * schedule steps back by one entry each time. */
200 for (index = ks->
rounds; index > 0; --index, --schedule) {
/* NOTE(review): listing lines 201-210 (the start of the round body) are
 * absent from this view. */
/* Rotate row1, row2 and row3 right by 12, 8 and 4 bits.  In a 16-bit
 * rotation these amounts undo right-rotations by 4, 8 and 12. */
211 row1 = skinny64_rotate_right(row1, 12);
212 row2 = skinny64_rotate_right(row2, 8);
213 row3 = skinny64_rotate_right(row3, 4);
/* XOR the current schedule entry into row0 and row1.  Listing lines
 * 218-220 are absent, so anything further done at this step is not
 * visible here. */
216 row0 ^= schedule->
row[0];
217 row1 ^= schedule->
row[1];
/* Apply the inverse S-box helper to all four rows. */
221 row0 = skinny64_inv_sbox(row0);
222 row1 = skinny64_inv_sbox(row1);
223 row2 = skinny64_inv_sbox(row2);
224 row3 = skinny64_inv_sbox(row3);
/* NOTE(review): listing lines 225-227 (presumably the end of the loop)
 * are absent from this view. */
/* Scatter the rows back using the same layout as the loads: element k of
 * row r is written at byte offset 8*k + 2*r. */
228 WRITE_WORD16(output, 0, row0[0]);
229 WRITE_WORD16(output, 2, row1[0]);
230 WRITE_WORD16(output, 4, row2[0]);
231 WRITE_WORD16(output, 6, row3[0]);
232 WRITE_WORD16(output, 8, row0[1]);
233 WRITE_WORD16(output, 10, row1[1]);
234 WRITE_WORD16(output, 12, row2[1]);
235 WRITE_WORD16(output, 14, row3[1]);
236 WRITE_WORD16(output, 16, row0[2]);
237 WRITE_WORD16(output, 18, row1[2]);
238 WRITE_WORD16(output, 20, row2[2]);
239 WRITE_WORD16(output, 22, row3[2]);
240 WRITE_WORD16(output, 24, row0[3]);
241 WRITE_WORD16(output, 26, row1[3]);
242 WRITE_WORD16(output, 28, row2[3]);
243 WRITE_WORD16(output, 30, row3[3]);
244 WRITE_WORD16(output, 32, row0[4]);
245 WRITE_WORD16(output, 34, row1[4]);
246 WRITE_WORD16(output, 36, row2[4]);
247 WRITE_WORD16(output, 38, row3[4]);
248 WRITE_WORD16(output, 40, row0[5]);
249 WRITE_WORD16(output, 42, row1[5]);
250 WRITE_WORD16(output, 44, row2[5]);
251 WRITE_WORD16(output, 46, row3[5]);
252 WRITE_WORD16(output, 48, row0[6]);
253 WRITE_WORD16(output, 50, row1[6]);
254 WRITE_WORD16(output, 52, row2[6]);
255 WRITE_WORD16(output, 54, row3[6]);
256 WRITE_WORD16(output, 56, row0[7]);
257 WRITE_WORD16(output, 58, row1[7]);
258 WRITE_WORD16(output, 60, row2[7]);
259 WRITE_WORD16(output, 62, row3[7]);
266 void _skinny64_parallel_encrypt_vec128
274 void _skinny64_parallel_decrypt_vec128
Skinny64HalfCells_t schedule[SKINNY64_MAX_ROUNDS]
Key schedule for Skinny64 block ciphers.
Union that describes a 32-bit 2x4 array of cells.