Arduino Cryptography Library
GF128.cpp
1 /*
2  * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "GF128.h"
24 #include "utility/EndianUtil.h"
25 #include <string.h>
26 
58 void GF128::mulInit(uint32_t H[4], const void *key)
59 {
60 #if defined(__AVR__)
61  // Copy the key into H but leave it in big endian order because
62  // we can correct for the byte order in mul() below.
63  memcpy(H, key, 16);
64 #else
65  // Copy the key into H and convert from big endian to host order.
66  memcpy(H, key, 16);
67 #if defined(CRYPTO_LITTLE_ENDIAN)
68  H[0] = be32toh(H[0]);
69  H[1] = be32toh(H[1]);
70  H[2] = be32toh(H[2]);
71  H[3] = be32toh(H[3]);
72 #endif
73 #endif
74 }
75 
90 void GF128::mul(uint32_t Y[4], const uint32_t H[4])
91 {
92 #if defined(__AVR__)
93  uint32_t Z[4] = {0, 0, 0, 0}; // Z = 0
94  uint32_t V0 = H[0]; // V = H
95  uint32_t V1 = H[1];
96  uint32_t V2 = H[2];
97  uint32_t V3 = H[3];
98 
99  // Multiply Z by V for the set bits in Y, starting at the top.
100  // This is a very simple bit by bit version that may not be very
101  // fast but it should be resistant to cache timing attacks.
102  for (uint8_t posn = 0; posn < 16; ++posn) {
103  uint8_t value = ((const uint8_t *)Y)[posn];
104  for (uint8_t bit = 0; bit < 8; ++bit) {
105  __asm__ __volatile__ (
106  // Extract the high bit of "value" and turn it into a mask.
107  "ldd r24,%8\n"
108  "lsl r24\n"
109  "std %8,r24\n"
110  "mov __tmp_reg__,__zero_reg__\n"
111  "sbc __tmp_reg__,__zero_reg__\n"
112 
113  // XOR V with Z if the bit is 1.
114  "mov r24,%D0\n" // Z0 ^= (V0 & mask)
115  "and r24,__tmp_reg__\n"
116  "ldd r25,%D4\n"
117  "eor r25,r24\n"
118  "std %D4,r25\n"
119  "mov r24,%C0\n"
120  "and r24,__tmp_reg__\n"
121  "ldd r25,%C4\n"
122  "eor r25,r24\n"
123  "std %C4,r25\n"
124  "mov r24,%B0\n"
125  "and r24,__tmp_reg__\n"
126  "ldd r25,%B4\n"
127  "eor r25,r24\n"
128  "std %B4,r25\n"
129  "mov r24,%A0\n"
130  "and r24,__tmp_reg__\n"
131  "ldd r25,%A4\n"
132  "eor r25,r24\n"
133  "std %A4,r25\n"
134  "mov r24,%D1\n" // Z1 ^= (V1 & mask)
135  "and r24,__tmp_reg__\n"
136  "ldd r25,%D5\n"
137  "eor r25,r24\n"
138  "std %D5,r25\n"
139  "mov r24,%C1\n"
140  "and r24,__tmp_reg__\n"
141  "ldd r25,%C5\n"
142  "eor r25,r24\n"
143  "std %C5,r25\n"
144  "mov r24,%B1\n"
145  "and r24,__tmp_reg__\n"
146  "ldd r25,%B5\n"
147  "eor r25,r24\n"
148  "std %B5,r25\n"
149  "mov r24,%A1\n"
150  "and r24,__tmp_reg__\n"
151  "ldd r25,%A5\n"
152  "eor r25,r24\n"
153  "std %A5,r25\n"
154  "mov r24,%D2\n" // Z2 ^= (V2 & mask)
155  "and r24,__tmp_reg__\n"
156  "ldd r25,%D6\n"
157  "eor r25,r24\n"
158  "std %D6,r25\n"
159  "mov r24,%C2\n"
160  "and r24,__tmp_reg__\n"
161  "ldd r25,%C6\n"
162  "eor r25,r24\n"
163  "std %C6,r25\n"
164  "mov r24,%B2\n"
165  "and r24,__tmp_reg__\n"
166  "ldd r25,%B6\n"
167  "eor r25,r24\n"
168  "std %B6,r25\n"
169  "mov r24,%A2\n"
170  "and r24,__tmp_reg__\n"
171  "ldd r25,%A6\n"
172  "eor r25,r24\n"
173  "std %A6,r25\n"
174  "mov r24,%D3\n" // Z3 ^= (V3 & mask)
175  "and r24,__tmp_reg__\n"
176  "ldd r25,%D7\n"
177  "eor r25,r24\n"
178  "std %D7,r25\n"
179  "mov r24,%C3\n"
180  "and r24,__tmp_reg__\n"
181  "ldd r25,%C7\n"
182  "eor r25,r24\n"
183  "std %C7,r25\n"
184  "mov r24,%B3\n"
185  "and r24,__tmp_reg__\n"
186  "ldd r25,%B7\n"
187  "eor r25,r24\n"
188  "std %B7,r25\n"
189  "mov r24,%A3\n"
190  "and r24,__tmp_reg__\n"
191  "ldd r25,%A7\n"
192  "eor r25,r24\n"
193  "std %A7,r25\n"
194 
195  // Rotate V right by 1 bit.
196  "lsr %A0\n"
197  "ror %B0\n"
198  "ror %C0\n"
199  "ror %D0\n"
200  "ror %A1\n"
201  "ror %B1\n"
202  "ror %C1\n"
203  "ror %D1\n"
204  "ror %A2\n"
205  "ror %B2\n"
206  "ror %C2\n"
207  "ror %D2\n"
208  "ror %A3\n"
209  "ror %B3\n"
210  "ror %C3\n"
211  "ror %D3\n"
212  "mov r24,__zero_reg__\n"
213  "sbc r24,__zero_reg__\n"
214  "andi r24,0xE1\n"
215  "eor %A0,r24\n"
216  : "+r"(V0), "+r"(V1), "+r"(V2), "+r"(V3)
217  : "Q"(Z[0]), "Q"(Z[1]), "Q"(Z[2]), "Q"(Z[3]), "Q"(value)
218  : "r24", "r25"
219  );
220  }
221  }
222 
223  // We have finished the block so copy Z into Y and byte-swap.
224  __asm__ __volatile__ (
225  "ldd __tmp_reg__,%A0\n"
226  "st X+,__tmp_reg__\n"
227  "ldd __tmp_reg__,%B0\n"
228  "st X+,__tmp_reg__\n"
229  "ldd __tmp_reg__,%C0\n"
230  "st X+,__tmp_reg__\n"
231  "ldd __tmp_reg__,%D0\n"
232  "st X+,__tmp_reg__\n"
233  "ldd __tmp_reg__,%A1\n"
234  "st X+,__tmp_reg__\n"
235  "ldd __tmp_reg__,%B1\n"
236  "st X+,__tmp_reg__\n"
237  "ldd __tmp_reg__,%C1\n"
238  "st X+,__tmp_reg__\n"
239  "ldd __tmp_reg__,%D1\n"
240  "st X+,__tmp_reg__\n"
241  "ldd __tmp_reg__,%A2\n"
242  "st X+,__tmp_reg__\n"
243  "ldd __tmp_reg__,%B2\n"
244  "st X+,__tmp_reg__\n"
245  "ldd __tmp_reg__,%C2\n"
246  "st X+,__tmp_reg__\n"
247  "ldd __tmp_reg__,%D2\n"
248  "st X+,__tmp_reg__\n"
249  "ldd __tmp_reg__,%A3\n"
250  "st X+,__tmp_reg__\n"
251  "ldd __tmp_reg__,%B3\n"
252  "st X+,__tmp_reg__\n"
253  "ldd __tmp_reg__,%C3\n"
254  "st X+,__tmp_reg__\n"
255  "ldd __tmp_reg__,%D3\n"
256  "st X,__tmp_reg__\n"
257  : : "Q"(Z[0]), "Q"(Z[1]), "Q"(Z[2]), "Q"(Z[3]), "x"(Y)
258  );
259 #else // !__AVR__
260  uint32_t Z0 = 0; // Z = 0
261  uint32_t Z1 = 0;
262  uint32_t Z2 = 0;
263  uint32_t Z3 = 0;
264  uint32_t V0 = H[0]; // V = H
265  uint32_t V1 = H[1];
266  uint32_t V2 = H[2];
267  uint32_t V3 = H[3];
268 
269  // Multiply Z by V for the set bits in Y, starting at the top.
270  // This is a very simple bit by bit version that may not be very
271  // fast but it should be resistant to cache timing attacks.
272  for (uint8_t posn = 0; posn < 16; ++posn) {
273  uint8_t value = ((const uint8_t *)Y)[posn];
274  for (uint8_t bit = 0; bit < 8; ++bit, value <<= 1) {
275  // Extract the high bit of "value" and turn it into a mask.
276  uint32_t mask = (~((uint32_t)(value >> 7))) + 1;
277 
278  // XOR V with Z if the bit is 1.
279  Z0 ^= (V0 & mask);
280  Z1 ^= (V1 & mask);
281  Z2 ^= (V2 & mask);
282  Z3 ^= (V3 & mask);
283 
284  // Rotate V right by 1 bit.
285  mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
286  V3 = (V3 >> 1) | (V2 << 31);
287  V2 = (V2 >> 1) | (V1 << 31);
288  V1 = (V1 >> 1) | (V0 << 31);
289  V0 = (V0 >> 1) ^ mask;
290  }
291  }
292 
293  // We have finished the block so copy Z into Y and byte-swap.
294  Y[0] = htobe32(Z0);
295  Y[1] = htobe32(Z1);
296  Y[2] = htobe32(Z2);
297  Y[3] = htobe32(Z3);
298 #endif // !__AVR__
299 }
300 
314 void GF128::dbl(uint32_t V[4])
315 {
316 #if defined(__AVR__)
317  __asm__ __volatile__ (
318  "ld r16,Z\n"
319  "ldd r17,Z+1\n"
320  "ldd r18,Z+2\n"
321  "ldd r19,Z+3\n"
322  "lsr r16\n"
323  "ror r17\n"
324  "ror r18\n"
325  "ror r19\n"
326  "std Z+1,r17\n"
327  "std Z+2,r18\n"
328  "std Z+3,r19\n"
329  "ldd r17,Z+4\n"
330  "ldd r18,Z+5\n"
331  "ldd r19,Z+6\n"
332  "ldd r20,Z+7\n"
333  "ror r17\n"
334  "ror r18\n"
335  "ror r19\n"
336  "ror r20\n"
337  "std Z+4,r17\n"
338  "std Z+5,r18\n"
339  "std Z+6,r19\n"
340  "std Z+7,r20\n"
341  "ldd r17,Z+8\n"
342  "ldd r18,Z+9\n"
343  "ldd r19,Z+10\n"
344  "ldd r20,Z+11\n"
345  "ror r17\n"
346  "ror r18\n"
347  "ror r19\n"
348  "ror r20\n"
349  "std Z+8,r17\n"
350  "std Z+9,r18\n"
351  "std Z+10,r19\n"
352  "std Z+11,r20\n"
353  "ldd r17,Z+12\n"
354  "ldd r18,Z+13\n"
355  "ldd r19,Z+14\n"
356  "ldd r20,Z+15\n"
357  "ror r17\n"
358  "ror r18\n"
359  "ror r19\n"
360  "ror r20\n"
361  "std Z+12,r17\n"
362  "std Z+13,r18\n"
363  "std Z+14,r19\n"
364  "std Z+15,r20\n"
365  "mov r17,__zero_reg__\n"
366  "sbc r17,__zero_reg__\n"
367  "andi r17,0xE1\n"
368  "eor r16,r17\n"
369  "st Z,r16\n"
370  : : "z"(V)
371  : "r16", "r17", "r18", "r19", "r20"
372  );
373 #else
374  uint32_t V0 = be32toh(V[0]);
375  uint32_t V1 = be32toh(V[1]);
376  uint32_t V2 = be32toh(V[2]);
377  uint32_t V3 = be32toh(V[3]);
378  uint32_t mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
379  V3 = (V3 >> 1) | (V2 << 31);
380  V2 = (V2 >> 1) | (V1 << 31);
381  V1 = (V1 >> 1) | (V0 << 31);
382  V0 = (V0 >> 1) ^ mask;
383  V[0] = htobe32(V0);
384  V[1] = htobe32(V1);
385  V[2] = htobe32(V2);
386  V[3] = htobe32(V3);
387 #endif
388 }
389 
406 void GF128::dblEAX(uint32_t V[4])
407 {
408 #if defined(__AVR__)
409  __asm__ __volatile__ (
410  "ldd r16,Z+15\n"
411  "ldd r17,Z+14\n"
412  "ldd r18,Z+13\n"
413  "ldd r19,Z+12\n"
414  "lsl r16\n"
415  "rol r17\n"
416  "rol r18\n"
417  "rol r19\n"
418  "std Z+14,r17\n"
419  "std Z+13,r18\n"
420  "std Z+12,r19\n"
421  "ldd r17,Z+11\n"
422  "ldd r18,Z+10\n"
423  "ldd r19,Z+9\n"
424  "ldd r20,Z+8\n"
425  "rol r17\n"
426  "rol r18\n"
427  "rol r19\n"
428  "rol r20\n"
429  "std Z+11,r17\n"
430  "std Z+10,r18\n"
431  "std Z+9,r19\n"
432  "std Z+8,r20\n"
433  "ldd r17,Z+7\n"
434  "ldd r18,Z+6\n"
435  "ldd r19,Z+5\n"
436  "ldd r20,Z+4\n"
437  "rol r17\n"
438  "rol r18\n"
439  "rol r19\n"
440  "rol r20\n"
441  "std Z+7,r17\n"
442  "std Z+6,r18\n"
443  "std Z+5,r19\n"
444  "std Z+4,r20\n"
445  "ldd r17,Z+3\n"
446  "ldd r18,Z+2\n"
447  "ldd r19,Z+1\n"
448  "ld r20,Z\n"
449  "rol r17\n"
450  "rol r18\n"
451  "rol r19\n"
452  "rol r20\n"
453  "std Z+3,r17\n"
454  "std Z+2,r18\n"
455  "std Z+1,r19\n"
456  "st Z,r20\n"
457  "mov r17,__zero_reg__\n"
458  "sbc r17,__zero_reg__\n"
459  "andi r17,0x87\n"
460  "eor r16,r17\n"
461  "std Z+15,r16\n"
462  : : "z"(V)
463  : "r16", "r17", "r18", "r19", "r20"
464  );
465 #else
466  uint32_t V0 = be32toh(V[0]);
467  uint32_t V1 = be32toh(V[1]);
468  uint32_t V2 = be32toh(V[2]);
469  uint32_t V3 = be32toh(V[3]);
470  uint32_t mask = ((~(V0 >> 31)) + 1) & 0x00000087;
471  V0 = (V0 << 1) | (V1 >> 31);
472  V1 = (V1 << 1) | (V2 >> 31);
473  V2 = (V2 << 1) | (V3 >> 31);
474  V3 = (V3 << 1) ^ mask;
475  V[0] = htobe32(V0);
476  V[1] = htobe32(V1);
477  V[2] = htobe32(V2);
478  V[3] = htobe32(V3);
479 #endif
480 }
481 
497 void GF128::dblXTS(uint32_t V[4])
498 {
499 #if defined(__AVR__)
500  __asm__ __volatile__ (
501  "ld r16,Z\n"
502  "ldd r17,Z+1\n"
503  "ldd r18,Z+2\n"
504  "ldd r19,Z+3\n"
505  "lsl r16\n"
506  "rol r17\n"
507  "rol r18\n"
508  "rol r19\n"
509  "std Z+1,r17\n"
510  "std Z+2,r18\n"
511  "std Z+3,r19\n"
512  "ldd r17,Z+4\n"
513  "ldd r18,Z+5\n"
514  "ldd r19,Z+6\n"
515  "ldd r20,Z+7\n"
516  "rol r17\n"
517  "rol r18\n"
518  "rol r19\n"
519  "rol r20\n"
520  "std Z+4,r17\n"
521  "std Z+5,r18\n"
522  "std Z+6,r19\n"
523  "std Z+7,r20\n"
524  "ldd r17,Z+8\n"
525  "ldd r18,Z+9\n"
526  "ldd r19,Z+10\n"
527  "ldd r20,Z+11\n"
528  "rol r17\n"
529  "rol r18\n"
530  "rol r19\n"
531  "rol r20\n"
532  "std Z+8,r17\n"
533  "std Z+9,r18\n"
534  "std Z+10,r19\n"
535  "std Z+11,r20\n"
536  "ldd r17,Z+12\n"
537  "ldd r18,Z+13\n"
538  "ldd r19,Z+14\n"
539  "ldd r20,Z+15\n"
540  "rol r17\n"
541  "rol r18\n"
542  "rol r19\n"
543  "rol r20\n"
544  "std Z+12,r17\n"
545  "std Z+13,r18\n"
546  "std Z+14,r19\n"
547  "std Z+15,r20\n"
548  "mov r17,__zero_reg__\n"
549  "sbc r17,__zero_reg__\n"
550  "andi r17,0x87\n"
551  "eor r16,r17\n"
552  "st Z,r16\n"
553  : : "z"(V)
554  : "r16", "r17", "r18", "r19", "r20"
555  );
556 #else
557  uint32_t V0 = le32toh(V[0]);
558  uint32_t V1 = le32toh(V[1]);
559  uint32_t V2 = le32toh(V[2]);
560  uint32_t V3 = le32toh(V[3]);
561  uint32_t mask = ((~(V3 >> 31)) + 1) & 0x00000087;
562  V3 = (V3 << 1) | (V2 >> 31);
563  V2 = (V2 << 1) | (V1 >> 31);
564  V1 = (V1 << 1) | (V0 >> 31);
565  V0 = (V0 << 1) ^ mask;
566  V[0] = htole32(V0);
567  V[1] = htole32(V1);
568  V[2] = htole32(V2);
569  V[3] = htole32(V3);
570 #endif
571 }
static void dblXTS(uint32_t V[4])
Doubles a value in the GF(2^128) field using XTS conventions.
Definition: GF128.cpp:497
static void mulInit(uint32_t H[4], const void *key)
Initialize multiplication in the GF(2^128) field.
Definition: GF128.cpp:58
static void dblEAX(uint32_t V[4])
Doubles a value in the GF(2^128) field using EAX conventions.
Definition: GF128.cpp:406
static void mul(uint32_t Y[4], const uint32_t H[4])
Perform a multiplication in the GF(2^128) field.
Definition: GF128.cpp:90
static void dbl(uint32_t V[4])
Doubles a value in the GF(2^128) field.
Definition: GF128.cpp:314