Arduino Cryptography Library
Curve25519.cpp
1 /*
2  * Copyright (C) 2015 Southern Storm Software, Pty Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "Curve25519.h"
24 #include "Crypto.h"
25 #include "RNG.h"
26 #include "utility/LimbUtil.h"
27 #include <string.h>
28 
44 // Global switch to enable/disable AVR inline assembly optimizations.
45 #if defined(__AVR__)
46 // Disabled for now - there are issues with newer Arduino compilers. FIXME
47 //#define CURVE25519_ASM_AVR 1
48 #endif
49 
50 // The overhead of clean() calls in mul(), reduceQuick(), etc can
51 // add up to a lot of processing time during eval(). Only do such
52 // cleanups if strict mode has been enabled. Other implementations
53 // like curve25519-donna don't do any cleaning at all so the value
54 // of cleaning up the stack is dubious at best anyway.
55 #if defined(CURVE25519_STRICT_CLEAN)
56 #define strict_clean(x) clean(x)
57 #else
58 #define strict_clean(x) do { ; } while (0)
59 #endif
60 
80 bool Curve25519::eval(uint8_t result[32], const uint8_t s[32], const uint8_t x[32])
81 {
82  limb_t x_1[NUM_LIMBS_256BIT];
83  limb_t x_2[NUM_LIMBS_256BIT];
84  limb_t x_3[NUM_LIMBS_256BIT];
85  limb_t z_2[NUM_LIMBS_256BIT];
86  limb_t z_3[NUM_LIMBS_256BIT];
87  limb_t A[NUM_LIMBS_256BIT];
88  limb_t B[NUM_LIMBS_256BIT];
89  limb_t C[NUM_LIMBS_256BIT];
90  limb_t D[NUM_LIMBS_256BIT];
91  limb_t E[NUM_LIMBS_256BIT];
92  limb_t AA[NUM_LIMBS_256BIT];
93  limb_t BB[NUM_LIMBS_256BIT];
94  limb_t DA[NUM_LIMBS_256BIT];
95  limb_t CB[NUM_LIMBS_256BIT];
96  uint8_t mask;
97  uint8_t sposn;
98  uint8_t select;
99  uint8_t swap;
100  bool retval;
101 
102  // Unpack the "x" argument into the limb representation
103  // which also masks off the high bit. NULL means 9.
104  if (x) {
105  // x1 = x
106  BigNumberUtil::unpackLE(x_1, NUM_LIMBS_256BIT, x, 32);
107  x_1[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
108  } else {
109  memset(x_1, 0, sizeof(x_1)); // x_1 = 9
110  x_1[0] = 9;
111  }
112 
113  // Check that "x" is within the range of the modulo field.
114  // We can do this with a reduction - if there was no borrow
115  // then the value of "x" was out of range. Timing is sensitive
116  // here so that we don't reveal anything about the value of "x".
117  // If there was a reduction, then continue executing the rest
118  // of this function with the (now) in-range "x" value and
119  // report the failure at the end.
120  retval = (bool)(reduceQuick(x_1) & 0x01);
121 
122  // Initialize the other temporary variables.
123  memset(x_2, 0, sizeof(x_2)); // x_2 = 1
124  x_2[0] = 1;
125  memset(z_2, 0, sizeof(z_2)); // z_2 = 0
126  memcpy(x_3, x_1, sizeof(x_1)); // x_3 = x
127  memcpy(z_3, x_2, sizeof(x_2)); // z_3 = 1
128 
129  // Iterate over all 255 bits of "s" from the highest to the lowest.
130  // We ignore the high bit of the 256-bit representation of "s".
131  mask = 0x40;
132  sposn = 31;
133  swap = 0;
134  for (uint8_t t = 255; t > 0; --t) {
135  // Conditional swaps on entry to this bit but only if we
136  // didn't swap on the previous bit.
137  select = s[sposn] & mask;
138  swap ^= select;
139  cswap(swap, x_2, x_3);
140  cswap(swap, z_2, z_3);
141 
142  // Evaluate the curve.
143  add(A, x_2, z_2); // A = x_2 + z_2
144  square(AA, A); // AA = A^2
145  sub(B, x_2, z_2); // B = x_2 - z_2
146  square(BB, B); // BB = B^2
147  sub(E, AA, BB); // E = AA - BB
148  add(C, x_3, z_3); // C = x_3 + z_3
149  sub(D, x_3, z_3); // D = x_3 - z_3
150  mul(DA, D, A); // DA = D * A
151  mul(CB, C, B); // CB = C * B
152  add(x_3, DA, CB); // x_3 = (DA + CB)^2
153  square(x_3, x_3);
154  sub(z_3, DA, CB); // z_3 = x_1 * (DA - CB)^2
155  square(z_3, z_3);
156  mul(z_3, z_3, x_1);
157  mul(x_2, AA, BB); // x_2 = AA * BB
158  mulA24(z_2, E); // z_2 = E * (AA + a24 * E)
159  add(z_2, z_2, AA);
160  mul(z_2, z_2, E);
161 
162  // Move onto the next lower bit of "s".
163  mask >>= 1;
164  if (!mask) {
165  --sposn;
166  mask = 0x80;
167  swap = select << 7;
168  } else {
169  swap = select >> 1;
170  }
171  }
172 
173  // Final conditional swaps.
174  cswap(swap, x_2, x_3);
175  cswap(swap, z_2, z_3);
176 
177  // Compute x_2 * (z_2 ^ (p - 2)) where p = 2^255 - 19.
178  recip(z_3, z_2);
179  mul(x_2, x_2, z_3);
180 
181  // Pack the result into the return array.
182  BigNumberUtil::packLE(result, 32, x_2, NUM_LIMBS_256BIT);
183 
184  // Clean up and exit.
185  clean(x_1);
186  clean(x_2);
187  clean(x_3);
188  clean(z_2);
189  clean(z_3);
190  clean(A);
191  clean(B);
192  clean(C);
193  clean(D);
194  clean(E);
195  clean(AA);
196  clean(BB);
197  clean(DA);
198  clean(CB);
199  return retval;
200 }
201 
245 void Curve25519::dh1(uint8_t k[32], uint8_t f[32])
246 {
247  do {
248  // Generate a random "f" value and then adjust the value to make
249  // it valid as an "s" value for eval(). According to the specification
250  // we need to mask off the 3 right-most bits of f[0], mask off the
251  // left-most bit of f[31], and set the second to left-most bit of f[31].
252  RNG.rand(f, 32);
253  f[0] &= 0xF8;
254  f[31] = (f[31] & 0x7F) | 0x40;
255 
256  // Evaluate the curve function: k = Curve25519::eval(f, 9).
257  // We pass NULL to eval() to indicate the value 9. There is no
258  // need to check the return value from eval() because we know
259  // that 9 is a valid field element.
260  eval(k, f, 0);
261 
262  // If "k" is weak for contributory behaviour then reject it,
263  // generate another "f" value, and try again. This case is
264  // highly unlikely but we still perform the check just in case.
265  } while (isWeakPoint(k));
266 }
267 
283 bool Curve25519::dh2(uint8_t k[32], uint8_t f[32])
284 {
285  uint8_t weak;
286 
287  // Evaluate the curve function: k = Curve25519::eval(f, k).
288  // If "k" is weak for contributory behaviour before or after
289  // the curve evaluation, then fail the exchange. For safety
290  // we perform every phase of the weak checks even if we could
291  // bail out earlier so that the execution takes the same
292  // amount of time for weak and non-weak "k" values.
293  weak = isWeakPoint(k); // Is "k" weak before?
294  weak |= ((eval(k, f, k) ^ 0x01) & 0x01); // Is "k" weak during?
295  weak |= isWeakPoint(k); // Is "k" weak after?
296  clean(f, 32);
297  return (bool)((weak ^ 0x01) & 0x01);
298 }
299 
307 uint8_t Curve25519::isWeakPoint(const uint8_t k[32])
308 {
309  // List of weak points from http://cr.yp.to/ecdh.html
310  // That page lists some others but they are variants on these
311  // of the form "point + i * (2^255 - 19)" for i = 0, 1, 2.
312  // Here we mask off the high bit and eval() catches the rest.
313  static const uint8_t points[5][32] PROGMEM = {
314  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
315  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
316  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
317  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
318  {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
319  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
320  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
321  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
322  {0xE0, 0xEB, 0x7A, 0x7C, 0x3B, 0x41, 0xB8, 0xAE,
323  0x16, 0x56, 0xE3, 0xFA, 0xF1, 0x9F, 0xC4, 0x6A,
324  0xDA, 0x09, 0x8D, 0xEB, 0x9C, 0x32, 0xB1, 0xFD,
325  0x86, 0x62, 0x05, 0x16, 0x5F, 0x49, 0xB8, 0x00},
326  {0x5F, 0x9C, 0x95, 0xBC, 0xA3, 0x50, 0x8C, 0x24,
327  0xB1, 0xD0, 0xB1, 0x55, 0x9C, 0x83, 0xEF, 0x5B,
328  0x04, 0x44, 0x5C, 0xC4, 0x58, 0x1C, 0x8E, 0x86,
329  0xD8, 0x22, 0x4E, 0xDD, 0xD0, 0x9F, 0x11, 0x57},
330  {0xEC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
331  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
332  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
333  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}
334  };
335 
336  // Check each of the weak points in turn. We perform the
337  // comparisons carefully so as not to reveal the value of "k"
338  // in the instruction timing. If "k" is indeed weak then
339  // we still check everything so as not to reveal which
340  // weak point it is.
341  uint8_t result = 0;
342  for (uint8_t posn = 0; posn < 5; ++posn) {
343  const uint8_t *point = points[posn];
344  uint8_t check = (pgm_read_byte(&(point[31])) ^ k[31]) & 0x7F;
345  for (uint8_t index = 31; index > 0; --index)
346  check |= (pgm_read_byte(&(point[index - 1])) ^ k[index - 1]);
347  result |= (uint8_t)((((uint16_t)0x0100) - check) >> 8);
348  }
349 
350  // The "result" variable will be non-zero if there was a match.
351  return result;
352 }
353 
366 void Curve25519::reduce(limb_t *result, limb_t *x, uint8_t size)
367 {
368  /*
369  Note: This explaination is best viewed with a UTF-8 text viewer.
370 
371  To help explain what this function is doing, the following describes
372  how to efficiently compute reductions modulo a base of the form (2ⁿ - b)
373  where b is greater than zero and (b + 1)² <= 2ⁿ.
374 
375  Here we are interested in reducing the result of multiplying two
376  numbers that are less than or equal to (2ⁿ - b - 1). That is,
377  multiplying numbers that have already been reduced.
378 
379  Given some x less than or equal to (2ⁿ - b - 1)², we want to find a
380  y less than (2ⁿ - b) such that:
381 
382  y ≡ x mod (2ⁿ - b)
383 
384  We know that for all integer values of k >= 0:
385 
386  y ≡ x - k * (2ⁿ - b)
387  ≡ x - k * 2ⁿ + k * b
388 
389  In our case we choose k = ⌊x / 2ⁿ⌋ and then let:
390 
391  w = (x mod 2ⁿ) + ⌊x / 2ⁿ⌋ * b
392 
393  The value w will either be the answer y or y can be obtained by
394  repeatedly subtracting (2ⁿ - b) from w until it is less than (2ⁿ - b).
395  At most b subtractions will be required.
396 
397  In our case b is 19 which is more subtractions than we would like to do,
398  but we can handle that by performing the above reduction twice and then
399  performing a single trial subtraction:
400 
401  w = (x mod 2ⁿ) + ⌊x / 2ⁿ⌋ * b
402  y = (w mod 2ⁿ) + ⌊w / 2ⁿ⌋ * b
403  if y >= (2ⁿ - b)
404  y -= (2ⁿ - b)
405 
406  The value y is the answer we want for reducing x modulo (2ⁿ - b).
407  */
408 
409 #if !defined(CURVE25519_ASM_AVR)
410  dlimb_t carry;
411  uint8_t posn;
412 
413  // Calculate (x mod 2^255) + ((x / 2^255) * 19) which will
414  // either produce the answer we want or it will produce a
415  // value of the form "answer + j * (2^255 - 19)".
416  carry = ((dlimb_t)(x[NUM_LIMBS_256BIT - 1] >> (LIMB_BITS - 1))) * 19U;
417  x[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
418  for (posn = 0; posn < size; ++posn) {
419  carry += ((dlimb_t)(x[posn + NUM_LIMBS_256BIT])) * 38U;
420  carry += x[posn];
421  x[posn] = (limb_t)carry;
422  carry >>= LIMB_BITS;
423  }
424  if (size < NUM_LIMBS_256BIT) {
425  // The high order half of the number is short; e.g. for mulA24().
426  // Propagate the carry through the rest of the low order part.
427  for (posn = size; posn < NUM_LIMBS_256BIT; ++posn) {
428  carry += x[posn];
429  x[posn] = (limb_t)carry;
430  carry >>= LIMB_BITS;
431  }
432  }
433 
434  // The "j" value may still be too large due to the final carry-out.
435  // We must repeat the reduction. If we already have the answer,
436  // then this won't do any harm but we must still do the calculation
437  // to preserve the overall timing.
438  carry *= 38U;
439  carry += ((dlimb_t)(x[NUM_LIMBS_256BIT - 1] >> (LIMB_BITS - 1))) * 19U;
440  x[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
441  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
442  carry += x[posn];
443  x[posn] = (limb_t)carry;
444  carry >>= LIMB_BITS;
445  }
446 
447  // At this point "x" will either be the answer or it will be the
448  // answer plus (2^255 - 19). Perform a trial subtraction which
449  // is equivalent to adding 19 and subtracting 2^255. We put the
450  // trial answer into the top-most limbs of the original "x" array.
451  // We add 19 here; the subtraction of 2^255 occurs in the next step.
452  carry = 19U;
453  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
454  carry += x[posn];
455  x[posn + NUM_LIMBS_256BIT] = (limb_t)carry;
456  carry >>= LIMB_BITS;
457  }
458 
459  // If there was a borrow, then the bottom-most limbs of "x" are the
460  // correct answer. If there was no borrow, then the top-most limbs
461  // of "x" are the correct answer. Select the correct answer but do
462  // it in a way that instruction timing will not reveal which value
463  // was selected. Borrow will occur if the high bit of the previous
464  // result is 0: turn the high bit into a selection mask.
465  limb_t mask = (limb_t)(((slimb_t)(x[NUM_LIMBS_512BIT - 1])) >> (LIMB_BITS - 1));
466  limb_t nmask = ~mask;
467  x[NUM_LIMBS_512BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
468  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
469  result[posn] = (x[posn] & nmask) | (x[posn + NUM_LIMBS_256BIT] & mask);
470  }
471 #else
472  __asm__ __volatile__ (
473  // Calculate (x mod 2^255) + ((x / 2^255) * 19) which will
474  // either produce the answer we want or it will produce a
475  // value of the form "answer + j * (2^255 - 19)".
476  "ldd r24,Z+31\n" // Extract the high bit of x[31]
477  "mov r25,r24\n" // and mask it off
478  "andi r25,0x7F\n"
479  "std Z+31,r25\n"
480  "lsl r24\n" // carry = high bit * 19
481  "mov r24,__zero_reg__\n"
482  "sbc r24,__zero_reg__\n"
483  "andi r24,19\n"
484 
485  "mov r25,%1\n" // load "size" into r25
486  "ldi r23,38\n" // r23 = 38
487  "mov r22,__zero_reg__\n" // r22 = 0 (we're about to destroy r1)
488  "1:\n"
489  "ld r16,Z\n" // r16 = x[0]
490  "ldd r17,Z+32\n" // r17 = x[32]
491  "mul r17,r23\n" // r0:r1 = r17 * 38
492  "add r0,r24\n" // r0:r1 += carry
493  "adc r1,r22\n"
494  "add r0,r16\n" // r0:r1 += r16
495  "adc r1,r22\n"
496  "st Z+,r0\n" // *x++ = r0
497  "mov r24,r1\n" // carry = r1
498  "dec r25\n" // if (--r25 != 0) loop
499  "brne 1b\n"
500 
501  // If the size is short, then we need to continue propagating carries.
502  "ldi r25,32\n"
503  "cp %1,r25\n"
504  "breq 3f\n"
505  "sub r25,%1\n"
506  "ld __tmp_reg__,Z\n"
507  "add __tmp_reg__,r24\n"
508  "st Z+,__tmp_reg__\n"
509  "dec r25\n"
510  "2:\n"
511  "ld __tmp_reg__,Z\n" // *x++ += carry
512  "adc __tmp_reg__,r22\n"
513  "st Z+,__tmp_reg__\n"
514  "dec r25\n"
515  "brne 2b\n"
516  "mov r24,r22\n" // put the carry back into r24
517  "adc r24,r22\n"
518  "3:\n"
519  "sbiw r30,32\n" // Point Z back to the start of "x"
520 
521  // The "j" value may still be too large due to the final carry-out.
522  // We must repeat the reduction. If we already have the answer,
523  // then this won't do any harm but we must still do the calculation
524  // to preserve the overall timing.
525  "mul r24,r23\n" // carry *= 38
526  "ldd r24,Z+31\n" // Extract the high bit of x[31]
527  "mov r25,r24\n" // and mask it off
528  "andi r25,0x7F\n"
529  "std Z+31,r25\n"
530  "lsl r24\n" // carry += high bit * 19
531  "mov r24,r22\n"
532  "sbc r24,r22\n"
533  "andi r24,19\n"
534  "add r0,r24\n"
535  "adc r1,r22\n" // 9-bit carry is now in r0:r1
536 
537  // Propagate the carry through the rest of x.
538  "ld r24,Z\n" // x[0]
539  "add r0,r24\n"
540  "adc r1,r22\n"
541  "st Z+,r0\n"
542  "ld r24,Z\n" // x[1]
543  "add r1,r24\n"
544  "st Z+,r1\n"
545  "ldi r25,30\n" // x[2..31]
546  "4:\n"
547  "ld r24,Z\n"
548  "adc r24,r22\n"
549  "st Z+,r24\n"
550  "dec r25\n"
551  "brne 4b\n"
552  "sbiw r30,32\n" // Point Z back to the start of "x"
553 
554  // We destroyed __zero_reg__ (r1) above, so restore its zero value.
555  "mov __zero_reg__,r22\n"
556 
557  // At this point "x" will either be the answer or it will be the
558  // answer plus (2^255 - 19). Perform a trial subtraction which
559  // is equivalent to adding 19 and subtracting 2^255. We put the
560  // trial answer into the top-most limbs of the original "x" array.
561  // We add 19 here; the subtraction of 2^255 occurs in the next step.
562  "ldi r24,8\n" // Loop counter.
563  "ldi r25,19\n" // carry = 19
564  "5:\n"
565  "ld r16,Z+\n" // r16:r19:carry = *xx++ + carry
566  "ld r17,Z+\n"
567  "ld r18,Z+\n"
568  "ld r19,Z+\n"
569  "add r16,r25\n" // r16:r19:carry += carry
570  "adc r17,__zero_reg__\n"
571  "adc r18,__zero_reg__\n"
572  "adc r19,__zero_reg__\n"
573  "mov r25,__zero_reg__\n"
574  "adc r25,r25\n"
575  "std Z+28,r16\n" // *tt++ = r16:r19
576  "std Z+29,r17\n"
577  "std Z+30,r18\n"
578  "std Z+31,r19\n"
579  "dec r24\n"
580  "brne 5b\n"
581 
582  // Subtract 2^255 from x[32..63] which is equivalent to extracting
583  // the top bit and then masking it off. If the top bit is zero
584  // then a borrow has occurred and this isn't the answer we want.
585  "mov r25,r19\n"
586  "andi r19,0x7F\n"
587  "std Z+31,r19\n"
588  "lsl r25\n"
589  "mov r25,__zero_reg__\n"
590  "sbc r25,__zero_reg__\n"
591 
592  // At this point, r25 is 0 if the original x[0..31] is the answer
593  // we want, or 0xFF if x[32..63] is the answer we want. Essentially
594  // we need to do a conditional move of either x[0..31] or x[32..63]
595  // into "result".
596  "sbiw r30,32\n" // Point Z back to x[0].
597  "ldi r24,8\n"
598  "6:\n"
599  "ldd r16,Z+32\n"
600  "ldd r17,Z+33\n"
601  "ldd r18,Z+34\n"
602  "ldd r19,Z+35\n"
603  "ld r20,Z+\n"
604  "ld r21,Z+\n"
605  "ld r22,Z+\n"
606  "ld r23,Z+\n"
607  "eor r16,r20\n"
608  "eor r17,r21\n"
609  "eor r18,r22\n"
610  "eor r19,r23\n"
611  "and r16,r25\n"
612  "and r17,r25\n"
613  "and r18,r25\n"
614  "and r19,r25\n"
615  "eor r20,r16\n"
616  "eor r21,r17\n"
617  "eor r22,r18\n"
618  "eor r23,r19\n"
619  "st X+,r20\n"
620  "st X+,r21\n"
621  "st X+,r22\n"
622  "st X+,r23\n"
623  "dec r24\n"
624  "brne 6b\n"
625 
626  : : "z"(x), "r"((uint8_t)(size * sizeof(limb_t))), "x"(result)
627  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
628  "r24", "r25"
629  );
630 #endif
631 }
632 
646 limb_t Curve25519::reduceQuick(limb_t *x)
647 {
648 #if !defined(CURVE25519_ASM_AVR)
649  limb_t temp[NUM_LIMBS_256BIT];
650  dlimb_t carry;
651  uint8_t posn;
652  limb_t *xx;
653  limb_t *tt;
654 
655  // Perform a trial subtraction of (2^255 - 19) from "x" which is
656  // equivalent to adding 19 and subtracting 2^255. We add 19 here;
657  // the subtraction of 2^255 occurs in the next step.
658  carry = 19U;
659  xx = x;
660  tt = temp;
661  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
662  carry += *xx++;
663  *tt++ = (limb_t)carry;
664  carry >>= LIMB_BITS;
665  }
666 
667  // If there was a borrow, then the original "x" is the correct answer.
668  // If there was no borrow, then "temp" is the correct answer. Select the
669  // correct answer but do it in a way that instruction timing will not
670  // reveal which value was selected. Borrow will occur if the high bit
671  // of "temp" is 0: turn the high bit into a selection mask.
672  limb_t mask = (limb_t)(((slimb_t)(temp[NUM_LIMBS_256BIT - 1])) >> (LIMB_BITS - 1));
673  limb_t nmask = ~mask;
674  temp[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
675  xx = x;
676  tt = temp;
677  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
678  *xx = ((*xx) & nmask) | ((*tt++) & mask);
679  ++xx;
680  }
681 
682  // Clean up "temp".
683  strict_clean(temp);
684 
685  // Return a zero value if we actually subtracted (2^255 - 19) from "x".
686  return nmask;
687 #else // CURVE25519_ASM_AVR
688  limb_t temp[NUM_LIMBS_256BIT];
689  uint8_t result;
690  __asm__ __volatile__ (
691  // Subtract (2^255 - 19) from "x", which is the same as adding 19
692  // and then subtracting 2^255.
693  "ldi r24,8\n" // Loop counter.
694  "ldi r25,19\n" // carry = 19
695  "1:\n"
696  "ld r16,Z+\n" // r16:r19:carry = *xx++ + carry
697  "ld r17,Z+\n"
698  "ld r18,Z+\n"
699  "ld r19,Z+\n"
700  "add r16,r25\n" // r16:r19:carry += carry
701  "adc r17,__zero_reg__\n"
702  "adc r18,__zero_reg__\n"
703  "adc r19,__zero_reg__\n"
704  "mov r25,__zero_reg__\n"
705  "adc r25,r25\n"
706  "st X+,r16\n" // *tt++ = r16:r19
707  "st X+,r17\n"
708  "st X+,r18\n"
709  "st X+,r19\n"
710  "dec r24\n"
711  "brne 1b\n"
712 
713  // Subtract 2^255 from "temp" which is equivalent to extracting
714  // the top bit and then masking it off. If the top bit is zero
715  // then a borrow has occurred and this isn't the answer we want.
716  "mov r25,r19\n"
717  "andi r19,0x7F\n"
718  "st -X,r19\n"
719  "lsl r25\n"
720  "mov r25,__zero_reg__\n"
721  "sbc r25,__zero_reg__\n"
722 
723  // At this point, r25 is 0 if the original "x" is the answer
724  // we want, or 0xFF if "temp" is the answer we want. Essentially
725  // we need to do a conditional move of "temp" into "x".
726  "sbiw r26,31\n" // Point X back to the start of "temp".
727  "sbiw r30,32\n" // Point Z back to the start of "x".
728  "ldi r24,8\n"
729  "2:\n"
730  "ld r16,X+\n"
731  "ld r17,X+\n"
732  "ld r18,X+\n"
733  "ld r19,X+\n"
734  "ld r20,Z\n"
735  "ldd r21,Z+1\n"
736  "ldd r22,Z+2\n"
737  "ldd r23,Z+3\n"
738  "eor r16,r20\n"
739  "eor r17,r21\n"
740  "eor r18,r22\n"
741  "eor r19,r23\n"
742  "and r16,r25\n"
743  "and r17,r25\n"
744  "and r18,r25\n"
745  "and r19,r25\n"
746  "eor r20,r16\n"
747  "eor r21,r17\n"
748  "eor r22,r18\n"
749  "eor r23,r19\n"
750  "st Z+,r20\n"
751  "st Z+,r21\n"
752  "st Z+,r22\n"
753  "st Z+,r23\n"
754  "dec r24\n"
755  "brne 2b\n"
756  "mov %0,r25\n"
757  : "=r"(result)
758  : "x"(temp), "z"(x)
759  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
760  "r24", "r25"
761  );
762  strict_clean(temp);
763  return result;
764 #endif // CURVE25519_ASM_AVR
765 }
766 
779 void Curve25519::mulNoReduce(limb_t *result, const limb_t *x, const limb_t *y)
780 {
781 #if !defined(CURVE25519_ASM_AVR)
782  uint8_t i, j;
783  dlimb_t carry;
784  limb_t word;
785  const limb_t *yy;
786  limb_t *rr;
787 
788  // Multiply the lowest word of x by y.
789  carry = 0;
790  word = x[0];
791  yy = y;
792  rr = result;
793  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
794  carry += ((dlimb_t)(*yy++)) * word;
795  *rr++ = (limb_t)carry;
796  carry >>= LIMB_BITS;
797  }
798  *rr = (limb_t)carry;
799 
800  // Multiply and add the remaining words of x by y.
801  for (i = 1; i < NUM_LIMBS_256BIT; ++i) {
802  word = x[i];
803  carry = 0;
804  yy = y;
805  rr = result + i;
806  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
807  carry += ((dlimb_t)(*yy++)) * word;
808  carry += *rr;
809  *rr++ = (limb_t)carry;
810  carry >>= LIMB_BITS;
811  }
812  *rr = (limb_t)carry;
813  }
814 #else
815  __asm__ __volatile__ (
816  // Save Y and copy the "result" pointer into it.
817  "push r28\n"
818  "push r29\n"
819  "mov r28,%A2\n"
820  "mov r29,%B2\n"
821 
822  // Multiply the first byte of "x" by y[0..31].
823  "ldi r25,8\n" // loop 8 times: 4 bytes of y each time
824  "clr r24\n" // carry = 0
825  "clr r22\n" // r22 = 0 to replace __zero_reg__
826  "ld r23,X+\n" // r23 = *x++
827  "1:\n"
828  "ld r16,Z\n" // r16 = y[0]
829  "mul r16,r23\n" // r8:r9 = y[0] * r23
830  "movw r8,r0\n"
831  "ldd r16,Z+2\n" // r16 = y[2]
832  "mul r16,r23\n" // r10:r11 = y[2] * r23
833  "movw r10,r0\n"
834  "ldd r16,Z+1\n" // r16 = y[1]
835  "mul r16,r23\n" // r9:r10:r11 += y[1] * r23
836  "add r9,r0\n"
837  "adc r10,r1\n"
838  "adc r11,r22\n"
839  "ldd r16,Z+3\n" // r16 = y[3]
840  "mul r16,r23\n" // r11:r1 += y[3] * r23
841  "add r11,r0\n"
842  "adc r1,r22\n"
843  "add r8,r24\n" // r8:r9:r10:r11:r1 += carry
844  "adc r9,r22\n"
845  "adc r10,r22\n"
846  "adc r11,r22\n"
847  "adc r1,r22\n"
848  "mov r24,r1\n" // carry = r1
849  "st Y+,r8\n" // *rr++ = r8:r9:r10:r11
850  "st Y+,r9\n"
851  "st Y+,r10\n"
852  "st Y+,r11\n"
853  "adiw r30,4\n"
854  "dec r25\n"
855  "brne 1b\n"
856  "st Y+,r24\n" // *rr++ = carry
857  "sbiw r28,32\n" // rr -= 32
858  "sbiw r30,32\n" // Point Z back to the start of y
859 
860  // Multiply and add the remaining bytes of "x" by y[0..31].
861  "ldi r21,31\n" // 31 more bytes of x to go.
862  "2:\n"
863  "ldi r25,8\n" // loop 8 times: 4 bytes of y each time
864  "clr r24\n" // carry = 0
865  "ld r23,X+\n" // r23 = *x++
866  "3:\n"
867  "ld r16,Z\n" // r16 = y[0]
868  "mul r16,r23\n" // r8:r9 = y[0] * r23
869  "movw r8,r0\n"
870  "ldd r16,Z+2\n" // r16 = y[2]
871  "mul r16,r23\n" // r10:r11 = y[2] * r23
872  "movw r10,r0\n"
873  "ldd r16,Z+1\n" // r16 = y[1]
874  "mul r16,r23\n" // r9:r10:r11 += y[1] * r23
875  "add r9,r0\n"
876  "adc r10,r1\n"
877  "adc r11,r22\n"
878  "ldd r16,Z+3\n" // r16 = y[3]
879  "mul r16,r23\n" // r11:r1 += y[3] * r23
880  "add r11,r0\n"
881  "adc r1,r22\n"
882  "add r8,r24\n" // r8:r9:r10:r11:r1 += carry
883  "adc r9,r22\n"
884  "adc r10,r22\n"
885  "adc r11,r22\n"
886  "adc r1,r22\n"
887  "ld r16,Y\n" // r8:r9:r10:r11:r1 += rr[0..3]
888  "add r8,r16\n"
889  "ldd r16,Y+1\n"
890  "adc r9,r16\n"
891  "ldd r16,Y+2\n"
892  "adc r10,r16\n"
893  "ldd r16,Y+3\n"
894  "adc r11,r16\n"
895  "adc r1,r22\n"
896  "mov r24,r1\n" // carry = r1
897  "st Y+,r8\n" // *rr++ = r8:r9:r10:r11
898  "st Y+,r9\n"
899  "st Y+,r10\n"
900  "st Y+,r11\n"
901  "adiw r30,4\n"
902  "dec r25\n"
903  "brne 3b\n"
904  "st Y+,r24\n" // *r++ = carry
905  "sbiw r28,32\n" // rr -= 32
906  "sbiw r30,32\n" // Point Z back to the start of y
907  "dec r21\n"
908  "brne 2b\n"
909 
910  // Restore Y and __zero_reg__.
911  "pop r29\n"
912  "pop r28\n"
913  "clr __zero_reg__\n"
914  : : "x"(x), "z"(y), "r"(result)
915  : "r8", "r9", "r10", "r11", "r16", "r20", "r21", "r22",
916  "r23", "r24", "r25"
917  );
918 #endif
919 }
920 
931 void Curve25519::mul(limb_t *result, const limb_t *x, const limb_t *y)
932 {
933  limb_t temp[NUM_LIMBS_512BIT];
934  mulNoReduce(temp, x, y);
935  reduce(result, temp, NUM_LIMBS_256BIT);
936  strict_clean(temp);
937  crypto_feed_watchdog();
938 }
939 
959 void Curve25519::mulA24(limb_t *result, const limb_t *x)
960 {
961 #if !defined(CURVE25519_ASM_AVR)
962  // The constant a24 = 121665 (0x1DB41) as a limb array.
963 #if BIGNUMBER_LIMB_8BIT
964  static limb_t const a24[3] PROGMEM = {0x41, 0xDB, 0x01};
965 #elif BIGNUMBER_LIMB_16BIT
966  static limb_t const a24[2] PROGMEM = {0xDB41, 0x0001};
967 #elif BIGNUMBER_LIMB_32BIT || BIGNUMBER_LIMB_64BIT
968  static limb_t const a24[1] PROGMEM = {0x0001DB41};
969 #else
970  #error "limb_t must be 8, 16, 32, or 64 bits in size"
971 #endif
972  #define NUM_A24_LIMBS (sizeof(a24) / sizeof(limb_t))
973 
974  // Multiply the lowest limb of a24 by x and zero-extend into the result.
975  limb_t temp[NUM_LIMBS_512BIT];
976  uint8_t i, j;
977  dlimb_t carry = 0;
978  limb_t word = pgm_read_limb(&(a24[0]));
979  const limb_t *xx = x;
980  limb_t *tt = temp;
981  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
982  carry += ((dlimb_t)(*xx++)) * word;
983  *tt++ = (limb_t)carry;
984  carry >>= LIMB_BITS;
985  }
986  *tt = (limb_t)carry;
987 
988  // Multiply and add the remaining limbs of a24.
989  for (i = 1; i < NUM_A24_LIMBS; ++i) {
990  word = pgm_read_limb(&(a24[i]));
991  carry = 0;
992  xx = x;
993  tt = temp + i;
994  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
995  carry += ((dlimb_t)(*xx++)) * word;
996  carry += *tt;
997  *tt++ = (limb_t)carry;
998  carry >>= LIMB_BITS;
999  }
1000  *tt = (limb_t)carry;
1001  }
1002 #else
1003  limb_t temp[NUM_LIMBS_512BIT];
1004  #define NUM_A24_LIMBS ((3 + sizeof(limb_t) - 1) / sizeof(limb_t))
1005  __asm__ __volatile__ (
1006  // Load the two low bytes of a24 into r16 and r17.
1007  // The third byte is 0x01 which we can deal with implicitly.
1008  "ldi r16,0x41\n"
1009  "ldi r17,0xDB\n"
1010 
1011  // Iterate over the bytes of "x" and multiply each with a24.
1012  "ldi r25,32\n" // 32 bytes in "x"
1013  "clr r22\n" // r22 = 0
1014  "clr r18\n" // r18:r19:r11 = 0 (carry)
1015  "clr r19\n"
1016  "clr r11\n"
1017  "1:\n"
1018  "ld r21,X+\n" // r21 = *x++
1019  "mul r21,r16\n" // r8:r9 = r21 * a24[0]
1020  "movw r8,r0\n"
1021  "mul r21,r17\n" // r9:r1 += r21 * a24[1]
1022  "add r9,r0\n"
1023  "adc r1,r21\n" // r1:r10 += r21 * a24[2] (implicitly 1)
1024  "mov r10,r22\n"
1025  "adc r10,r22\n"
1026  "add r8,r18\n" // r8:r9:r1:r10 += carry
1027  "adc r9,r19\n"
1028  "adc r1,r11\n"
1029  "adc r10,r22\n"
1030  "st Z+,r8\n" // *tt++ = r8
1031  "mov r18,r9\n" // carry = r9:r1:r10
1032  "mov r19,r1\n"
1033  "mov r11,r10\n"
1034  "dec r25\n"
1035  "brne 1b\n"
1036  "st Z,r18\n" // *tt = carry
1037  "std Z+1,r19\n"
1038  "std Z+2,r11\n"
1039 #if BIGNUMBER_LIMB_16BIT || BIGNUMBER_LIMB_32BIT
1040  "std Z+3,r22\n" // Zero pad to a limb boundary
1041 #endif
1042 
1043  // Restore __zero_reg__
1044  "clr __zero_reg__\n"
1045 
1046  : : "x"(x), "z"(temp)
1047  : "r8", "r9", "r10", "r11", "r16", "r17", "r18", "r19",
1048  "r20", "r21", "r22", "r25"
1049  );
1050 #endif
1051 
1052  // Reduce the intermediate result modulo 2^255 - 19.
1053  reduce(result, temp, NUM_A24_LIMBS);
1054  strict_clean(temp);
1055 }
1056 
1068 void Curve25519::mul_P(limb_t *result, const limb_t *x, const limb_t *y)
1069 {
1070  limb_t temp[NUM_LIMBS_512BIT];
1071  uint8_t i, j;
1072  dlimb_t carry;
1073  limb_t word;
1074  const limb_t *xx;
1075  limb_t *tt;
1076 
1077  // Multiply the lowest word of y by x.
1078  carry = 0;
1079  word = pgm_read_limb(&(y[0]));
1080  xx = x;
1081  tt = temp;
1082  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
1083  carry += ((dlimb_t)(*xx++)) * word;
1084  *tt++ = (limb_t)carry;
1085  carry >>= LIMB_BITS;
1086  }
1087  *tt = (limb_t)carry;
1088 
1089  // Multiply and add the remaining words of y by x.
1090  for (i = 1; i < NUM_LIMBS_256BIT; ++i) {
1091  word = pgm_read_limb(&(y[i]));
1092  carry = 0;
1093  xx = x;
1094  tt = temp + i;
1095  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
1096  carry += ((dlimb_t)(*xx++)) * word;
1097  carry += *tt;
1098  *tt++ = (limb_t)carry;
1099  carry >>= LIMB_BITS;
1100  }
1101  *tt = (limb_t)carry;
1102  }
1103 
1104  // Reduce the intermediate result modulo 2^255 - 19.
1105  reduce(result, temp, NUM_LIMBS_256BIT);
1106  strict_clean(temp);
1107 }
1108 
1119 void Curve25519::add(limb_t *result, const limb_t *x, const limb_t *y)
1120 {
1121 #if !defined(CURVE25519_ASM_AVR)
1122  dlimb_t carry = 0;
1123  uint8_t posn;
1124  limb_t *rr = result;
1125 
1126  // Add the two arrays to obtain the intermediate result.
1127  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1128  carry += *x++;
1129  carry += *y++;
1130  *rr++ = (limb_t)carry;
1131  carry >>= LIMB_BITS;
1132  }
1133 #else // CURVE25519_ASM_AVR
1134  __asm__ __volatile__ (
1135  // Save Y and copy the "result" pointer into it.
1136  "push r28\n"
1137  "push r29\n"
1138  "mov r28,%A2\n"
1139  "mov r29,%B2\n"
1140 
1141  // Unroll the loop to operate on 4 bytes at a time (8 iterations).
1142  "ldi r24,8\n" // Loop counter.
1143  "clr r25\n" // carry = 0
1144  "1:\n"
1145  "ld r16,X+\n" // r16:r19 = *x++
1146  "ld r17,X+\n"
1147  "ld r18,X+\n"
1148  "ld r19,X+\n"
1149  "ld r20,Z+\n" // r20:r23 = *y++
1150  "ld r21,Z+\n"
1151  "ld r22,Z+\n"
1152  "ld r23,Z+\n"
1153  "add r16,r25\n" // r16:r19:carry += carry
1154  "adc r17,__zero_reg__\n"
1155  "adc r18,__zero_reg__\n"
1156  "adc r19,__zero_reg__\n"
1157  "mov r25,__zero_reg__\n"
1158  "adc r25,r25\n"
1159  "add r16,r20\n" // r16:r19:carry += r20:r23
1160  "adc r17,r21\n"
1161  "adc r18,r22\n"
1162  "adc r19,r23\n"
1163  "adc r25,__zero_reg__\n"
1164  "st Y+,r16\n" // *rr++ = r16:r23
1165  "st Y+,r17\n"
1166  "st Y+,r18\n"
1167  "st Y+,r19\n"
1168  "dec r24\n"
1169  "brne 1b\n"
1170 
1171  // Restore Y.
1172  "pop r29\n"
1173  "pop r28\n"
1174  : : "x"(x), "z"(y), "r"(result)
1175  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1176  "r24", "r25"
1177  );
1178 #endif // CURVE25519_ASM_AVR
1179 
1180  // Reduce the result using the quick trial subtraction method.
1181  reduceQuick(result);
1182 }
1183 
1194 void Curve25519::sub(limb_t *result, const limb_t *x, const limb_t *y)
1195 {
1196 #if !defined(CURVE25519_ASM_AVR)
1197  dlimb_t borrow;
1198  uint8_t posn;
1199  limb_t *rr = result;
1200 
1201  // Subtract y from x to generate the intermediate result.
1202  borrow = 0;
1203  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1204  borrow = ((dlimb_t)(*x++)) - (*y++) - ((borrow >> LIMB_BITS) & 0x01);
1205  *rr++ = (limb_t)borrow;
1206  }
1207 
1208  // If we had a borrow, then the result has gone negative and we
1209  // have to add 2^255 - 19 to the result to make it positive again.
1210  // The top bits of "borrow" will be all 1's if there is a borrow
1211  // or it will be all 0's if there was no borrow. Easiest is to
1212  // conditionally subtract 19 and then mask off the high bit.
1213  rr = result;
1214  borrow = (borrow >> LIMB_BITS) & 19U;
1215  borrow = ((dlimb_t)(*rr)) - borrow;
1216  *rr++ = (limb_t)borrow;
1217  for (posn = 1; posn < NUM_LIMBS_256BIT; ++posn) {
1218  borrow = ((dlimb_t)(*rr)) - ((borrow >> LIMB_BITS) & 0x01);
1219  *rr++ = (limb_t)borrow;
1220  }
1221  *(--rr) &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
1222 #else // CURVE25519_ASM_AVR
1223  __asm__ __volatile__ (
1224  // Save Y and copy the "result" pointer into it.
1225  "push r28\n"
1226  "push r29\n"
1227  "mov r28,%A2\n"
1228  "mov r29,%B2\n"
1229 
1230  // Unroll the sub loop to operate on 4 bytes at a time (8 iterations).
1231  "ldi r24,8\n" // Loop counter.
1232  "clr r25\n" // borrow = 0
1233  "1:\n"
1234  "ld r16,X+\n" // r16:r19 = *x++
1235  "ld r17,X+\n"
1236  "ld r18,X+\n"
1237  "ld r19,X+\n"
1238  "ld r20,Z+\n" // r20:r23 = *y++
1239  "ld r21,Z+\n"
1240  "ld r22,Z+\n"
1241  "ld r23,Z+\n"
1242  "sub r16,r25\n" // r16:r19:borrow -= borrow
1243  "sbc r17,__zero_reg__\n"
1244  "sbc r18,__zero_reg__\n"
1245  "sbc r19,__zero_reg__\n"
1246  "mov r25,__zero_reg__\n"
1247  "sbc r25,__zero_reg__\n"
1248  "sub r16,r20\n" // r16:r19:borrow -= r20:r23
1249  "sbc r17,r21\n"
1250  "sbc r18,r22\n"
1251  "sbc r19,r23\n"
1252  "sbc r25,__zero_reg__\n"
1253  "st Y+,r16\n" // *rr++ = r16:r23
1254  "st Y+,r17\n"
1255  "st Y+,r18\n"
1256  "st Y+,r19\n"
1257  "andi r25,1\n" // Only need the bottom bit of the borrow
1258  "dec r24\n"
1259  "brne 1b\n"
1260 
1261  // If there was a borrow, then we need to add 2^255 - 19 back.
1262  // We conditionally subtract 19 and then mask off the high bit.
1263  "neg r25\n" // borrow = mask(borrow) & 19
1264  "andi r25,19\n"
1265  "sbiw r28,32\n" // Point Y back to the start of "result"
1266  "ldi r24,8\n"
1267  "2:\n"
1268  "ld r16,Y\n" // r16:r19 = *rr
1269  "ldd r17,Y+1\n"
1270  "ldd r18,Y+2\n"
1271  "ldd r19,Y+3\n"
1272  "sub r16,r25\n"
1273  "sbc r17,__zero_reg__\n" // r16:r19:borrow -= borrow
1274  "sbc r18,__zero_reg__\n"
1275  "sbc r19,__zero_reg__\n"
1276  "mov r25,__zero_reg__\n"
1277  "sbc r25,__zero_reg__\n"
1278  "andi r25,1\n"
1279  "st Y+,r16\n" // *r++ = r16:r19
1280  "st Y+,r17\n"
1281  "st Y+,r18\n"
1282  "st Y+,r19\n"
1283  "dec r24\n"
1284  "brne 2b\n"
1285  "andi r19,0x7F\n" // Mask off the high bit in the last byte
1286  "sbiw r28,1\n"
1287  "st Y,r19\n"
1288 
1289  // Restore Y.
1290  "pop r29\n"
1291  "pop r28\n"
1292  : : "x"(x), "z"(y), "r"(result)
1293  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1294  "r24", "r25"
1295  );
1296 #endif // CURVE25519_ASM_AVR
1297 }
1298 
1311 void Curve25519::cswap(limb_t select, limb_t *x, limb_t *y)
1312 {
1313 #if !defined(CURVE25519_ASM_AVR)
1314  uint8_t posn;
1315  limb_t dummy;
1316  limb_t sel;
1317 
1318  // Turn "select" into an all-zeroes or all-ones mask. We don't care
1319  // which bit or bits is set in the original "select" value.
1320  sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
1321  --sel;
1322 
1323  // Swap the two values based on "select". Algorithm from:
1324  // http://tools.ietf.org/html/rfc7748
1325  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1326  dummy = sel & (x[posn] ^ y[posn]);
1327  x[posn] ^= dummy;
1328  y[posn] ^= dummy;
1329  }
1330 #else // CURVE25519_ASM_AVR
1331  __asm__ __volatile__ (
1332  // Combine all bytes from "select" into one and then turn
1333  // that byte into the "sel" mask in r24.
1334  "clr r24\n"
1335 #if BIGNUMBER_LIMB_8BIT
1336  "sub r24,%2\n"
1337 #elif BIGNUMBER_LIMB_16BIT
1338  "or %A2,%B2\n"
1339  "sub r24,%A2\n"
1340 #elif BIGNUMBER_LIMB_32BIT
1341  "or %A2,%B2\n"
1342  "or %A2,%C2\n"
1343  "or %A2,%D2\n"
1344  "sub r24,%A2\n"
1345 #endif
1346  "mov r24,__zero_reg__\n"
1347  "sbc r24,r24\n"
1348 
1349  // Perform the conditional swap 4 bytes at a time.
1350  "ldi r25,8\n"
1351  "1:\n"
1352  "ld r16,X+\n" // r16:r19 = *x
1353  "ld r17,X+\n"
1354  "ld r18,X+\n"
1355  "ld r19,X\n"
1356  "ld r20,Z\n" // r20:r23 = *y
1357  "ldd r21,Z+1\n"
1358  "ldd r22,Z+2\n"
1359  "ldd r23,Z+3\n"
1360  "mov r12,r16\n" // r12:r15 = (r16:r19 ^ r20:r23) & sel
1361  "mov r13,r17\n"
1362  "mov r14,r18\n"
1363  "mov r15,r19\n"
1364  "eor r12,r20\n"
1365  "eor r13,r21\n"
1366  "eor r14,r22\n"
1367  "eor r15,r23\n"
1368  "and r12,r24\n"
1369  "and r13,r24\n"
1370  "and r14,r24\n"
1371  "and r15,r24\n"
1372  "eor r16,r12\n" // r16:r19 ^= r12:r15
1373  "eor r17,r13\n"
1374  "eor r18,r14\n"
1375  "eor r19,r15\n"
1376  "eor r20,r12\n" // r20:r23 ^= r12:r15
1377  "eor r21,r13\n"
1378  "eor r22,r14\n"
1379  "eor r23,r15\n"
1380  "st X,r19\n" // *x++ = r16:r19
1381  "st -X,r18\n"
1382  "st -X,r17\n"
1383  "st -X,r16\n"
1384  "adiw r26,4\n"
1385  "st Z+,r20\n" // *y++ = r20:r23
1386  "st Z+,r21\n"
1387  "st Z+,r22\n"
1388  "st Z+,r23\n"
1389  "dec r25\n"
1390  "brne 1b\n"
1391 
1392  : : "x"(x), "z"(y), "r"(select)
1393  : "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19",
1394  "r20", "r21", "r22", "r23", "r24", "r25"
1395  );
1396 #endif // CURVE25519_ASM_AVR
1397 }
1398 
1411 void Curve25519::cmove(limb_t select, limb_t *x, const limb_t *y)
1412 {
1413 #if !defined(CURVE25519_ASM_AVR)
1414  uint8_t posn;
1415  limb_t dummy;
1416  limb_t sel;
1417 
1418  // Turn "select" into an all-zeroes or all-ones mask. We don't care
1419  // which bit or bits is set in the original "select" value.
1420  sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
1421  --sel;
1422 
1423  // Move y into x based on "select". Similar to conditional swap above.
1424  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1425  dummy = sel & (x[posn] ^ y[posn]);
1426  x[posn] ^= dummy;
1427  }
1428 #else // CURVE25519_ASM_AVR
1429  __asm__ __volatile__ (
1430  // Combine all bytes from "select" into one and then turn
1431  // that byte into the "sel" mask in r24.
1432  "clr r24\n"
1433 #if BIGNUMBER_LIMB_8BIT
1434  "sub r24,%2\n"
1435 #elif BIGNUMBER_LIMB_16BIT
1436  "or %A2,%B2\n"
1437  "sub r24,%A2\n"
1438 #elif BIGNUMBER_LIMB_32BIT
1439  "or %A2,%B2\n"
1440  "or %A2,%C2\n"
1441  "or %A2,%D2\n"
1442  "sub r24,%A2\n"
1443 #endif
1444  "mov r24,__zero_reg__\n"
1445  "sbc r24,r24\n"
1446 
1447  // Perform the conditional move 4 bytes at a time.
1448  "ldi r25,8\n"
1449  "1:\n"
1450  "ld r16,X+\n" // r16:r19 = *x
1451  "ld r17,X+\n"
1452  "ld r18,X+\n"
1453  "ld r19,X\n"
1454  "ld r20,Z+\n" // r20:r23 = *y++
1455  "ld r21,Z+\n"
1456  "ld r22,Z+\n"
1457  "ld r23,Z+\n"
1458  "eor r20,r16\n" // r20:r23 = (r16:r19 ^ r20:r23) & sel
1459  "eor r21,r17\n"
1460  "eor r22,r18\n"
1461  "eor r23,r19\n"
1462  "and r20,r24\n"
1463  "and r21,r24\n"
1464  "and r22,r24\n"
1465  "and r23,r24\n"
1466  "eor r16,r20\n" // r16:r19 ^= r20:r23
1467  "eor r17,r21\n"
1468  "eor r18,r22\n"
1469  "eor r19,r23\n"
1470  "st X,r19\n" // *x++ = r16:r19
1471  "st -X,r18\n"
1472  "st -X,r17\n"
1473  "st -X,r16\n"
1474  "adiw r26,4\n"
1475  "dec r25\n"
1476  "brne 1b\n"
1477 
1478  : : "x"(x), "z"(y), "r"(select)
1479  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1480  "r24", "r25"
1481  );
1482 #endif // CURVE25519_ASM_AVR
1483 }
1484 
1491 void Curve25519::pow250(limb_t *result, const limb_t *x)
1492 {
1493  limb_t t1[NUM_LIMBS_256BIT];
1494  uint8_t i, j;
1495 
1496  // The big-endian hexadecimal expansion of (2^250 - 1) is:
1497  // 03FFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
1498  //
1499  // The naive implementation needs to do 2 multiplications per 1 bit and
1500  // 1 multiplication per 0 bit. We can improve upon this by creating a
1501  // pattern 0000000001 ... 0000000001. If we square and multiply the
1502  // pattern by itself we can turn the pattern into the partial results
1503  // 0000000011 ... 0000000011, 0000000111 ... 0000000111, etc.
1504  // This averages out to about 1.1 multiplications per 1 bit instead of 2.
1505 
1506  // Build a pattern of 250 bits in length of repeated copies of 0000000001.
1507  #define RECIP_GROUP_SIZE 10
1508  #define RECIP_GROUP_BITS 250 // Must be a multiple of RECIP_GROUP_SIZE.
1509  square(t1, x);
1510  for (j = 0; j < (RECIP_GROUP_SIZE - 1); ++j)
1511  square(t1, t1);
1512  mul(result, t1, x);
1513  for (i = 0; i < ((RECIP_GROUP_BITS / RECIP_GROUP_SIZE) - 2); ++i) {
1514  for (j = 0; j < RECIP_GROUP_SIZE; ++j)
1515  square(t1, t1);
1516  mul(result, result, t1);
1517  }
1518 
1519  // Multiply bit-shifted versions of the 0000000001 pattern into
1520  // the result to "fill in" the gaps in the pattern.
1521  square(t1, result);
1522  mul(result, result, t1);
1523  for (j = 0; j < (RECIP_GROUP_SIZE - 2); ++j) {
1524  square(t1, t1);
1525  mul(result, result, t1);
1526  }
1527 
1528  // Clean up and exit.
1529  clean(t1);
1530 }
1531 
1539 void Curve25519::recip(limb_t *result, const limb_t *x)
1540 {
1541  // The reciprocal is the same as x ^ (p - 2) where p = 2^255 - 19.
1542  // The big-endian hexadecimal expansion of (p - 2) is:
1543  // 7FFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFEB
1544  // Start with the 250 upper bits of the expansion of (p - 2).
1545  pow250(result, x);
1546 
1547  // Deal with the 5 lowest bits of (p - 2), 01011, from highest to lowest.
1548  square(result, result);
1549  square(result, result);
1550  mul(result, result, x);
1551  square(result, result);
1552  square(result, result);
1553  mul(result, result, x);
1554  square(result, result);
1555  mul(result, result, x);
1556 }
1557 
1573 bool Curve25519::sqrt(limb_t *result, const limb_t *x)
1574 {
1575  // sqrt(-1) mod (2^255 - 19).
1576  static limb_t const numSqrtM1[NUM_LIMBS_256BIT] PROGMEM = {
1577  LIMB_PAIR(0x4A0EA0B0, 0xC4EE1B27), LIMB_PAIR(0xAD2FE478, 0x2F431806),
1578  LIMB_PAIR(0x3DFBD7A7, 0x2B4D0099), LIMB_PAIR(0x4FC1DF0B, 0x2B832480)
1579  };
1580  limb_t y[NUM_LIMBS_256BIT];
1581 
1582  // Algorithm from: http://tools.ietf.org/html/rfc7748
1583 
1584  // Compute a candidate root: result = x^((p + 3) / 8) mod p.
1585  // (p + 3) / 8 = (2^252 - 2) which is 251 one bits followed by a zero:
1586  // 0FFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE
1587  pow250(result, x);
1588  square(result, result);
1589  mul(result, result, x);
1590  square(result, result);
1591 
1592  // Did we get the square root immediately?
1593  square(y, result);
1594  if (memcmp(x, y, sizeof(y)) == 0) {
1595  clean(y);
1596  return true;
1597  }
1598 
1599  // Multiply the result by sqrt(-1) and check again.
1600  mul_P(result, result, numSqrtM1);
1601  square(y, result);
1602  if (memcmp(x, y, sizeof(y)) == 0) {
1603  clean(y);
1604  return true;
1605  }
1606 
1607  // The number does not have a square root.
1608  clean(y);
1609  return false;
1610 }
static void unpackLE(limb_t *limbs, size_t count, const uint8_t *bytes, size_t len)
Unpacks the little-endian byte representation of a big number into a limb array.
static void packLE(uint8_t *bytes, size_t len, const limb_t *limbs, size_t count)
Packs the little-endian byte representation of a big number into a byte array.
static bool dh2(uint8_t k[32], uint8_t f[32])
Performs phase 2 of a Diffie-Hellman key exchange using Curve25519.
Definition: Curve25519.cpp:283
static void dh1(uint8_t k[32], uint8_t f[32])
Performs phase 1 of a Diffie-Hellman key exchange using Curve25519.
Definition: Curve25519.cpp:245
static bool eval(uint8_t result[32], const uint8_t s[32], const uint8_t x[32])
Evaluates the raw Curve25519 function.
Definition: Curve25519.cpp:80
void rand(uint8_t *data, size_t len)
Generates random bytes into a caller-supplied buffer.
Definition: RNG.cpp:660