File Coverage

curve25519_i64.c

Criterion	Covered	Total	%
statement	456	457	99.7
branch	67	68	98.5
condition			n/a
subroutine			n/a
pod			n/a
total	523	525	99.6

line	stmt	bran	code
1			/* Generic 64-bit integer implementation of Curve25519 ECDH
2			* Written by Matthijs van Duin, 200608242056
3			* Public domain.
4			*
5			* Based on work by Daniel J Bernstein, http://cr.yp.to/ecdh.html
6			*/
7
8			#include
9			#include
10			#include "curve25519_i64.h"
11
12
13			typedef int32_t i25519[10];
14			typedef const int32_t *i25519ptr;
15			typedef const uint8_t *srcptr;
16			typedef uint8_t *dstptr;
17
18			typedef struct expstep expstep;
19
20			struct expstep {
21			unsigned nsqr;
22			unsigned muli;
23			};
24
25
26			/******************* constants *******************/
27
28			const k25519
29			zero25519 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
31			prime25519 = { 237, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
32			255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
33			255, 255, 255, 255, 255, 255, 255, 127 },
34			order25519 = { 237, 211, 245, 92, 26, 99, 18, 88, 214, 156, 247, 162, 222, 249,
35			222, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 };
36
37			/* smallest multiple of the order that's >= 2^255 */
38			static const k25519
39			order_times_8 = { 104, 159, 174, 231, 210, 24, 147, 192, 178, 230, 188, 23,
40			245, 206, 247, 166, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
41
42			/* constants 2Gy and 1/(2Gy) */
43			static const i25519
44			base_2y = { 39999547, 18689728, 59995525, 1648697, 57546132,
45			24010086, 19059592, 5425144, 63499247, 16420658 },
46			base_r2y = { 5744, 8160848, 4790893, 13779497, 35730846,
47			12541209, 49101323, 30047407, 40071253, 6226132 };
48
49
50			/******************* radix 2^8 math *******************/
51
52	14		static void cpy32(k25519 d, const k25519 s) {
53			int i;
54	462	100	for (i = 0; i < 32; i++)
55	448		d[i] = s[i];
56	14		}
57
58			/* p[m..n+m-1] = q[m..n+m-1] + z * x */
59			/* n is the size of x */
60			/* n+m is the size of p and q */
61			static inline
62	3408		int mula_small(dstptr p, srcptr q, unsigned m, srcptr x, unsigned n, int z) {
63	3408		int v = 0;
64			unsigned i;
65	77491	100	for (i = 0; i < n; i++) {
66	74083		p[i+m] = v += q[i+m] + z * x[i];
67	74083		v >>= 8;
68			}
69	3408		return v;
70			}
71
72			/* p += x * y * z where z is a small integer
73			* x is size 32, y is size t, p is size 32+t
74			* y is allowed to overlap with p+32 if you don't care about the upper half */
75	910		static int mula32(dstptr p, srcptr x, srcptr y, unsigned t, int z) {
76	910		const unsigned n = 31;
77	910		int w = 0;
78			unsigned i;
79	2037	100	for (i = 0; i < t; i++) {
80	1127		int zy = z * y[i];
81	1127		p[i+n] = w += mula_small(p,p, i, x, n, zy) + p[i+n] + zy * x[n];
82	1127		w >>= 8;
83			}
84	910		p[i+n] += w;
85	910		return w >> 8;
86			}
87
88			/* divide r (size n) by d (size t), returning quotient q and remainder r
89			* quotient is size n-t+1, remainder is size t
90			* requires t > 0 && d[t-1] != 0
91			* requires that r[-1] and d[-1] are valid memory locations
92			* q may overlap with r+t */
93	918		static void divmod(dstptr q, dstptr r, int n, srcptr d, int t) {
94	918		int rn = 0;
95	918		int dt = d[t-1] << 8 \| (d[t-2] & -(t > 1));
96
97	2054	100	while (n-- >= t) {
98	1136		int z = (rn << 16 \| r[n] << 8 \| (r[n-1] & -(n > 0))) / dt;
99	1136		rn += mula_small(r,r, n-t+1, d, t, -z);
100	1136		q[n-t+1] = z + rn; /* rn is 0 or -1 (underflow) */
101	1136		mula_small(r,r, n-t+1, d, t, -rn);
102	1136		rn = r[n];
103	1136		r[n] = 0;
104			}
105
106	918		r[t-1] = rn;
107	918		}
108
109	921		static inline unsigned numsize(srcptr x, unsigned n) {
110	1299	100	while (n-- && !x[n])
		100
111			;
112	921		return n+1;
113			}
114
115			/* Returns x if a contains the gcd, y if b. Also, the returned buffer contains
116			* the inverse of a mod b, as 32-byte signed.
117			* x and y must have 64 bytes space for temporary use.
118			* requires that a[-1] and b[-1] are valid memory locations */
119	6		static dstptr egcd32(dstptr x, dstptr y, dstptr a, dstptr b) {
120	6		unsigned an, bn = 32, qn, i;
121	198	100	for (i = 0; i < 32; i++)
122	192		x[i] = y[i] = 0;
123	6		x[0] = 1;
124	6	50	if (!(an = numsize(a, 32)))
125	0		return y; /* division by zero */
126			while (42) {
127	459		qn = bn - an + 1;
128	459		divmod(y+32, b, bn, a, an);
129	459	100	if (!(bn = numsize(b, bn)))
130	3		return x;
131	456		mula32(y, x, y+32, qn, -1);
132
133	456		qn = an - bn + 1;
134	456		divmod(x+32, a, an, b, bn);
135	456	100	if (!(an = numsize(a, an)))
136	3		return y;
137	453		mula32(x, y, x+32, qn, -1);
138	453		}
139			}
140
141
142			/******************* radix 2^25.5 GF(2^255-19) math *******************/
143
144			#define P25 33554431 /* (1 << 25) - 1 */
145			#define P26 67108863 /* (1 << 26) - 1 */
146
147
148			/* debugging code */
149
150			#if 0
151			#include
152			#include
153			static void check_range(const char *where, int32_t x, int32_t lb, int32_t ub) {
154			if (x < lb \|\| x > ub) {
155			fprintf(stderr, "%s check failed: %08x (%d)\n", where, x, x);
156			abort();
157			}
158			}
159			static void check_nonred(const char *where, const i25519 x) {
160			int i;
161			for (i = 0; i < 10; i++)
162			check_range(where, x[i], -185861411, 185861411);
163			}
164			static void check_reduced(const char *where, const i25519 x) {
165			int i;
166			for (i = 0; i < 9; i++)
167			check_range(where, x[i], 0, P26 >> (i & 1));
168			check_range(where, x[9], -675, P25+675);
169			}
170			#else
171			#define check_range(w, x, l, u)
172			#define check_nonred(w, x)
173			#define check_reduced(w, x)
174			#endif
175
176
177			/* convenience macros */
178
179			#define M(i) ((uint32_t) m[i])
180			#define X(i) ((int64_t) x[i])
181			#define m64(arg1,arg2) ((int64_t) (arg1) * (arg2))
182
183
184			/* Convert to internal format from little-endian byte format */
185	5		static void unpack25519(i25519 x, const k25519 m) {
186	5		x[0] = M( 0) \| M( 1)<<8 \| M( 2)<<16 \| (M( 3)& 3)<<24;
187	5		x[1] = (M( 3)&~ 3)>>2 \| M( 4)<<6 \| M( 5)<<14 \| (M( 6)& 7)<<22;
188	5		x[2] = (M( 6)&~ 7)>>3 \| M( 7)<<5 \| M( 8)<<13 \| (M( 9)&31)<<21;
189	5		x[3] = (M( 9)&~31)>>5 \| M(10)<<3 \| M(11)<<11 \| (M(12)&63)<<19;
190	5		x[4] = (M(12)&~63)>>6 \| M(13)<<2 \| M(14)<<10 \| M(15) <<18;
191	5		x[5] = M(16) \| M(17)<<8 \| M(18)<<16 \| (M(19)& 1)<<24;
192	5		x[6] = (M(19)&~ 1)>>1 \| M(20)<<7 \| M(21)<<15 \| (M(22)& 7)<<23;
193	5		x[7] = (M(22)&~ 7)>>3 \| M(23)<<5 \| M(24)<<13 \| (M(25)&15)<<21;
194	5		x[8] = (M(25)&~15)>>4 \| M(26)<<4 \| M(27)<<12 \| (M(28)&63)<<20;
195	5		x[9] = (M(28)&~63)>>6 \| M(29)<<2 \| M(30)<<10 \| M(31) <<18;
196			check_reduced("unpack output", x);
197	5		}
198
199
200			/* Check if reduced-form input >= 2^255-19 */
201	18		static inline int is_overflow(const i25519 x) {
202	18		return ((x[0] > P26-19) & ((x[1] & x[3] & x[5] & x[7] & x[9]) == P25) &
203	18		((x[2] & x[4] & x[6] & x[8]) == P26)
204	18		) \| (x[9] > P25);
205			}
206
207
208			/* Convert from internal format to little-endian byte format. The
209			* number must be in a reduced form which is output by the following ops:
210			* unpack, mul, sqr
211			* set -- if input in range 0 .. P25
212			* If you're unsure if the number is reduced, first multiply it by 1. */
213	11		static void pack25519(const i25519 x, k25519 m) {
214	11		int32_t ld = 0, ud = 0;
215			int64_t t;
216			check_reduced("pack input", x);
217	11		ld = is_overflow(x) - (x[9] < 0);
218	11		ud = ld * -(P25+1);
219	11		ld *= 19;
220	11		t = ld + X(0) + (X(1) << 26);
221	11		m[ 0] = t; m[ 1] = t >> 8; m[ 2] = t >> 16; m[ 3] = t >> 24;
222	11		t = (t >> 32) + (X(2) << 19);
223	11		m[ 4] = t; m[ 5] = t >> 8; m[ 6] = t >> 16; m[ 7] = t >> 24;
224	11		t = (t >> 32) + (X(3) << 13);
225	11		m[ 8] = t; m[ 9] = t >> 8; m[10] = t >> 16; m[11] = t >> 24;
226	11		t = (t >> 32) + (X(4) << 6);
227	11		m[12] = t; m[13] = t >> 8; m[14] = t >> 16; m[15] = t >> 24;
228	11		t = (t >> 32) + X(5) + (X(6) << 25);
229	11		m[16] = t; m[17] = t >> 8; m[18] = t >> 16; m[19] = t >> 24;
230	11		t = (t >> 32) + (X(7) << 19);
231	11		m[20] = t; m[21] = t >> 8; m[22] = t >> 16; m[23] = t >> 24;
232	11		t = (t >> 32) + (X(8) << 12);
233	11		m[24] = t; m[25] = t >> 8; m[26] = t >> 16; m[27] = t >> 24;
234	11		t = (t >> 32) + ((X(9) + ud) << 6);
235	11		m[28] = t; m[29] = t >> 8; m[30] = t >> 16; m[31] = t >> 24;
236	11		}
237
238			/* Copy a number */
239	13		static void cpy25519(i25519 out, const i25519 in) {
240			int i;
241	143	100	for (i = 0; i < 10; i++)
242	130		out[i] = in[i];
243	13		}
244
245			/* Set a number to value, which must be in range -185861411 .. 185861411 */
246	41		static void set25519(i25519 out, const int32_t in) {
247			int i;
248	41		out[0] = in;
249	410	100	for (i = 1; i < 10; i++)
250	369		out[i] = 0;
251	41		}
252
253			/* Add/subtract two numbers. The inputs must be in reduced form, and the
254			* output isn't, so to do another addition or subtraction on the output,
255			* first multiply it by one to reduce it. */
256	11791		static void add25519(i25519 xy, const i25519 x, const i25519 y) {
257	11791		xy[0] = x[0] + y[0]; xy[1] = x[1] + y[1];
258	11791		xy[2] = x[2] + y[2]; xy[3] = x[3] + y[3];
259	11791		xy[4] = x[4] + y[4]; xy[5] = x[5] + y[5];
260	11791		xy[6] = x[6] + y[6]; xy[7] = x[7] + y[7];
261	11791		xy[8] = x[8] + y[8]; xy[9] = x[9] + y[9];
262	11791		}
263	11785		static void sub25519(i25519 xy, const i25519 x, const i25519 y) {
264	11785		xy[0] = x[0] - y[0]; xy[1] = x[1] - y[1];
265	11785		xy[2] = x[2] - y[2]; xy[3] = x[3] - y[3];
266	11785		xy[4] = x[4] - y[4]; xy[5] = x[5] - y[5];
267	11785		xy[6] = x[6] - y[6]; xy[7] = x[7] - y[7];
268	11785		xy[8] = x[8] - y[8]; xy[9] = x[9] - y[9];
269	11785		}
270
271			/* Multiply a number by a small integer in range -185861411 .. 185861411.
272			* The output is in reduced form, the input x need not be. x and xy may point
273			* to the same buffer. */
274	2825		static i25519ptr mul25519small(i25519 xy, const i25519 x, const int32_t y) {
275			register int64_t t;
276			check_nonred("mul small x input", x);
277			check_range("mul small y input", y, -185861411, 185861411);
278	2825		t = m64(x[8],y);
279	2825		xy[8] = t & ((1 << 26) - 1);
280	2825		t = (t >> 26) + m64(x[9],y);
281	2825		xy[9] = t & ((1 << 25) - 1);
282	2825		t = 19 * (t >> 25) + m64(x[0],y);
283	2825		xy[0] = t & ((1 << 26) - 1);
284	2825		t = (t >> 26) + m64(x[1],y);
285	2825		xy[1] = t & ((1 << 25) - 1);
286	2825		t = (t >> 25) + m64(x[2],y);
287	2825		xy[2] = t & ((1 << 26) - 1);
288	2825		t = (t >> 26) + m64(x[3],y);
289	2825		xy[3] = t & ((1 << 25) - 1);
290	2825		t = (t >> 25) + m64(x[4],y);
291	2825		xy[4] = t & ((1 << 26) - 1);
292	2825		t = (t >> 26) + m64(x[5],y);
293	2825		xy[5] = t & ((1 << 25) - 1);
294	2825		t = (t >> 25) + m64(x[6],y);
295	2825		xy[6] = t & ((1 << 26) - 1);
296	2825		t = (t >> 26) + m64(x[7],y);
297	2825		xy[7] = t & ((1 << 25) - 1);
298	2825		t = (t >> 25) + xy[8];
299	2825		xy[8] = t & ((1 << 26) - 1);
300	2825		xy[9] += (int32_t)(t >> 26);
301			check_reduced("mul small output", xy);
302	2825		return xy;
303			}
304
305			/* Multiply two numbers. The output is in reduced form, the inputs need not
306			* be. */
307	15099		static i25519ptr mul25519(i25519 xy, const i25519 x, const i25519 y) {
308			register int64_t t;
309			check_nonred("mul input x", x);
310			check_nonred("mul input y", y);
311	30198		t = m64(x[0],y[8]) + m64(x[2],y[6]) + m64(x[4],y[4]) + m64(x[6],y[2]) +
312	30198		m64(x[8],y[0]) + 2 * (m64(x[1],y[7]) + m64(x[3],y[5]) +
313	30198		m64(x[5],y[3]) + m64(x[7],y[1])) + 38 *
314	15099		m64(x[9],y[9]);
315	15099		xy[8] = t & ((1 << 26) - 1);
316	45297		t = (t >> 26) + m64(x[0],y[9]) + m64(x[1],y[8]) + m64(x[2],y[7]) +
317	30198		m64(x[3],y[6]) + m64(x[4],y[5]) + m64(x[5],y[4]) +
318	30198		m64(x[6],y[3]) + m64(x[7],y[2]) + m64(x[8],y[1]) +
319	15099		m64(x[9],y[0]);
320	15099		xy[9] = t & ((1 << 25) - 1);
321	45297		t = m64(x[0],y[0]) + 19 * ((t >> 25) + m64(x[2],y[8]) + m64(x[4],y[6])
322	30198		+ m64(x[6],y[4]) + m64(x[8],y[2])) + 38 *
323	30198		(m64(x[1],y[9]) + m64(x[3],y[7]) + m64(x[5],y[5]) +
324	30198		m64(x[7],y[3]) + m64(x[9],y[1]));
325	15099		xy[0] = t & ((1 << 26) - 1);
326	30198		t = (t >> 26) + m64(x[0],y[1]) + m64(x[1],y[0]) + 19 * (m64(x[2],y[9])
327	15099		+ m64(x[3],y[8]) + m64(x[4],y[7]) + m64(x[5],y[6]) +
328	30198		m64(x[6],y[5]) + m64(x[7],y[4]) + m64(x[8],y[3]) +
329	15099		m64(x[9],y[2]));
330	15099		xy[1] = t & ((1 << 25) - 1);
331	60396		t = (t >> 25) + m64(x[0],y[2]) + m64(x[2],y[0]) + 19 * (m64(x[4],y[8])
332	45297		+ m64(x[6],y[6]) + m64(x[8],y[4])) + 2 * m64(x[1],y[1])
333	30198		+ 38 * (m64(x[3],y[9]) + m64(x[5],y[7]) +
334	15099		m64(x[7],y[5]) + m64(x[9],y[3]));
335	15099		xy[2] = t & ((1 << 26) - 1);
336	45297		t = (t >> 26) + m64(x[0],y[3]) + m64(x[1],y[2]) + m64(x[2],y[1]) +
337	45297		m64(x[3],y[0]) + 19 * (m64(x[4],y[9]) + m64(x[5],y[8]) +
338	15099		m64(x[6],y[7]) + m64(x[7],y[6]) +
339	15099		m64(x[8],y[5]) + m64(x[9],y[4]));
340	15099		xy[3] = t & ((1 << 25) - 1);
341	45297		t = (t >> 25) + m64(x[0],y[4]) + m64(x[2],y[2]) + m64(x[4],y[0]) + 19 *
342	45297		(m64(x[6],y[8]) + m64(x[8],y[6])) + 2 * (m64(x[1],y[3]) +
343	45297		m64(x[3],y[1])) + 38 *
344	15099		(m64(x[5],y[9]) + m64(x[7],y[7]) + m64(x[9],y[5]));
345	15099		xy[4] = t & ((1 << 26) - 1);
346	45297		t = (t >> 26) + m64(x[0],y[5]) + m64(x[1],y[4]) + m64(x[2],y[3]) +
347	45297		m64(x[3],y[2]) + m64(x[4],y[1]) + m64(x[5],y[0]) + 19 *
348	30198		(m64(x[6],y[9]) + m64(x[7],y[8]) + m64(x[8],y[7]) +
349	15099		m64(x[9],y[6]));
350	15099		xy[5] = t & ((1 << 25) - 1);
351	45297		t = (t >> 25) + m64(x[0],y[6]) + m64(x[2],y[4]) + m64(x[4],y[2]) +
352	60396		m64(x[6],y[0]) + 19 * m64(x[8],y[8]) + 2 * (m64(x[1],y[5]) +
353	45297		m64(x[3],y[3]) + m64(x[5],y[1])) + 38 *
354	15099		(m64(x[7],y[9]) + m64(x[9],y[7]));
355	15099		xy[6] = t & ((1 << 26) - 1);
356	45297		t = (t >> 26) + m64(x[0],y[7]) + m64(x[1],y[6]) + m64(x[2],y[5]) +
357	30198		m64(x[3],y[4]) + m64(x[4],y[3]) + m64(x[5],y[2]) +
358	30198		m64(x[6],y[1]) + m64(x[7],y[0]) + 19 * (m64(x[8],y[9]) +
359	15099		m64(x[9],y[8]));
360	15099		xy[7] = t & ((1 << 25) - 1);
361	15099		t = (t >> 25) + xy[8];
362	15099		xy[8] = t & ((1 << 26) - 1);
363	15099		xy[9] += (int32_t)(t >> 26);
364			check_reduced("mul output", xy);
365	15099		return xy;
366			}
367
368			/* Square a number. Optimization of mul25519(x2, x, x) */
369	16614		static i25519ptr sqr25519(i25519 x2, const i25519 x) {
370			register int64_t t;
371			check_nonred("sqr input", x);
372	49842		t = m64(x[4],x[4]) + 2 * (m64(x[0],x[8]) + m64(x[2],x[6])) + 38 *
373	33228		m64(x[9],x[9]) + 4 * (m64(x[1],x[7]) + m64(x[3],x[5]));
374	16614		x2[8] = t & ((1 << 26) - 1);
375	49842		t = (t >> 26) + 2 * (m64(x[0],x[9]) + m64(x[1],x[8]) + m64(x[2],x[7]) +
376	33228		m64(x[3],x[6]) + m64(x[4],x[5]));
377	16614		x2[9] = t & ((1 << 25) - 1);
378	83070		t = 19 * (t >> 25) + m64(x[0],x[0]) + 38 * (m64(x[2],x[8]) +
379	83070		m64(x[4],x[6]) + m64(x[5],x[5])) + 76 * (m64(x[1],x[9])
380	16614		+ m64(x[3],x[7]));
381	16614		x2[0] = t & ((1 << 26) - 1);
382	49842		t = (t >> 26) + 2 * m64(x[0],x[1]) + 38 * (m64(x[2],x[9]) +
383	33228		m64(x[3],x[8]) + m64(x[4],x[7]) + m64(x[5],x[6]));
384	16614		x2[1] = t & ((1 << 25) - 1);
385	49842		t = (t >> 25) + 19 * m64(x[6],x[6]) + 2 * (m64(x[0],x[2]) +
386	33228		m64(x[1],x[1])) + 38 * m64(x[4],x[8]) + 76 *
387	16614		(m64(x[3],x[9]) + m64(x[5],x[7]));
388	16614		x2[2] = t & ((1 << 26) - 1);
389	33228		t = (t >> 26) + 2 * (m64(x[0],x[3]) + m64(x[1],x[2])) + 38 *
390	16614		(m64(x[4],x[9]) + m64(x[5],x[8]) + m64(x[6],x[7]));
391	16614		x2[3] = t & ((1 << 25) - 1);
392	49842		t = (t >> 25) + m64(x[2],x[2]) + 2 * m64(x[0],x[4]) + 38 *
393	49842		(m64(x[6],x[8]) + m64(x[7],x[7])) + 4 * m64(x[1],x[3]) + 76 *
394	16614		m64(x[5],x[9]);
395	16614		x2[4] = t & ((1 << 26) - 1);
396	49842		t = (t >> 26) + 2 * (m64(x[0],x[5]) + m64(x[1],x[4]) + m64(x[2],x[3]))
397	33228		+ 38 * (m64(x[6],x[9]) + m64(x[7],x[8]));
398	16614		x2[5] = t & ((1 << 25) - 1);
399	66456		t = (t >> 25) + 19 * m64(x[8],x[8]) + 2 * (m64(x[0],x[6]) +
400	49842		m64(x[2],x[4]) + m64(x[3],x[3])) + 4 * m64(x[1],x[5]) +
401	16614		76 * m64(x[7],x[9]);
402	16614		x2[6] = t & ((1 << 26) - 1);
403	66456		t = (t >> 26) + 2 * (m64(x[0],x[7]) + m64(x[1],x[6]) + m64(x[2],x[5]) +
404	49842		m64(x[3],x[4])) + 38 * m64(x[8],x[9]);
405	16614		x2[7] = t & ((1 << 25) - 1);
406	16614		t = (t >> 25) + x2[8];
407	16614		x2[8] = t & ((1 << 26) - 1);
408	16614		x2[9] += (t >> 26);
409			check_reduced("sqr output", x2);
410	16614		return x2;
411			}
412
413			/* Calculates a reciprocal. The output is in reduced form, the inputs need not
414			* be. Simply calculates y = x^(p-2) so it's not too fast. */
415			/* When sqrtassist is true, it instead calculates y = x^((p-5)/8) */
416	19		static void recip25519(i25519 y, const i25519 x, int sqrtassist) {
417			i25519 t0, t1, t2, t3, t4;
418			int i;
419			/* the chain for x^(2^255-21) is straight from djb's implementation */
420	19		sqr25519(t1, x); /* 2 == 2 * 1 */
421	19		sqr25519(t2, t1); /* 4 == 2 * 2 */
422	19		sqr25519(t0, t2); /* 8 == 2 * 4 */
423	19		mul25519(t2, t0, x); /* 9 == 8 + 1 */
424	19		mul25519(t0, t2, t1); /* 11 == 9 + 2 */
425	19		sqr25519(t1, t0); /* 22 == 2 * 11 */
426	19		mul25519(t3, t1, t2); /* 31 == 22 + 9
427			== 2^5 - 2^0 */
428	19		sqr25519(t1, t3); /* 2^6 - 2^1 */
429	19		sqr25519(t2, t1); /* 2^7 - 2^2 */
430	19		sqr25519(t1, t2); /* 2^8 - 2^3 */
431	19		sqr25519(t2, t1); /* 2^9 - 2^4 */
432	19		sqr25519(t1, t2); /* 2^10 - 2^5 */
433	19		mul25519(t2, t1, t3); /* 2^10 - 2^0 */
434	19		sqr25519(t1, t2); /* 2^11 - 2^1 */
435	19		sqr25519(t3, t1); /* 2^12 - 2^2 */
436	95	100	for (i = 1; i < 5; i++) {
437	76		sqr25519(t1, t3);
438	76		sqr25519(t3, t1);
439			} /* t3 / / 2^20 - 2^10 */
440	19		mul25519(t1, t3, t2); /* 2^20 - 2^0 */
441	19		sqr25519(t3, t1); /* 2^21 - 2^1 */
442	19		sqr25519(t4, t3); /* 2^22 - 2^2 */
443	190	100	for (i = 1; i < 10; i++) {
444	171		sqr25519(t3, t4);
445	171		sqr25519(t4, t3);
446			} /* t4 / / 2^40 - 2^20 */
447	19		mul25519(t3, t4, t1); /* 2^40 - 2^0 */
448	114	100	for (i = 0; i < 5; i++) {
449	95		sqr25519(t1, t3);
450	95		sqr25519(t3, t1);
451			} /* t3 / / 2^50 - 2^10 */
452	19		mul25519(t1, t3, t2); /* 2^50 - 2^0 */
453	19		sqr25519(t2, t1); /* 2^51 - 2^1 */
454	19		sqr25519(t3, t2); /* 2^52 - 2^2 */
455	475	100	for (i = 1; i < 25; i++) {
456	456		sqr25519(t2, t3);
457	456		sqr25519(t3, t2);
458			} /* t3 / / 2^100 - 2^50 */
459	19		mul25519(t2, t3, t1); /* 2^100 - 2^0 */
460	19		sqr25519(t3, t2); /* 2^101 - 2^1 */
461	19		sqr25519(t4, t3); /* 2^102 - 2^2 */
462	950	100	for (i = 1; i < 50; i++) {
463	931		sqr25519(t3, t4);
464	931		sqr25519(t4, t3);
465			} /* t4 / / 2^200 - 2^100 */
466	19		mul25519(t3, t4, t2); /* 2^200 - 2^0 */
467	494	100	for (i = 0; i < 25; i++) {
468	475		sqr25519(t4, t3);
469	475		sqr25519(t3, t4);
470			} /* t3 / / 2^250 - 2^50 */
471	19		mul25519(t2, t3, t1); /* 2^250 - 2^0 */
472	19		sqr25519(t1, t2); /* 2^251 - 2^1 */
473	19		sqr25519(t2, t1); /* 2^252 - 2^2 */
474	19	100	if (sqrtassist) {
475	1		mul25519(y, x, t2); /* 2^252 - 3 */
476			} else {
477	18		sqr25519(t1, t2); /* 2^253 - 2^3 */
478	18		sqr25519(t2, t1); /* 2^254 - 2^4 */
479	18		sqr25519(t1, t2); /* 2^255 - 2^5 */
480	18		mul25519(y, t1, t0); /* 2^255 - 21 */
481			}
482	19		}
483
484			/* checks if x is "negative", requires reduced input */
485	7		static inline int is_negative(i25519 x) {
486	7		return (is_overflow(x) \| (x[9] < 0)) ^ (x[0] & 1);
487			}
488
489			/* a square root */
490	1		static void sqrt25519(i25519 x, const i25519 u) {
491			i25519 v, t1, t2;
492	1		add25519(t1, u, u); /* t1 = 2u */
493	1		recip25519(v, t1, 1); /* v = (2u)^((p-5)/8) */
494	1		sqr25519(x, v); /* x = v^2 */
495	1		mul25519(t2, t1, x); /* t2 = 2uv^2 */
496	1		t2[0]--; /* t2 = 2uv^2-1 */
497	1		mul25519(t1, v, t2); /* t1 = v(2uv^2-1) */
498	1		mul25519(x, u, t1); /* x = uv(2uv^2-1) */
499	1		}
500
501
502			/******************* Elliptic curve *******************/
503
504			/* y^2 = x^3 + 486662 x^2 + x over GF(2^255-19) */
505
506
507			/* t1 = ax + az
508			* t2 = ax - az */
509	5888		static inline void mont_prep(i25519 t1, i25519 t2, i25519 ax, i25519 az) {
510	5888		add25519(t1, ax, az);
511	5888		sub25519(t2, ax, az);
512	5888		}
513
514			/* A = P + Q where
515			* X(A) = ax/az
516			* X(P) = (t1+t2)/(t1-t2)
517			* X(Q) = (t3+t4)/(t3-t4)
518			* X(P-Q) = dx
519			* clobbers t1 and t2, preserves t3 and t4 */
520	3072		static inline void mont_add(i25519 t1, i25519 t2, i25519 t3, i25519 t4,
521			i25519 ax, i25519 az, const i25519 dx) {
522	3072		mul25519(ax, t2, t3);
523	3072		mul25519(az, t1, t4);
524	3072		add25519(t1, ax, az);
525	3072		sub25519(t2, ax, az);
526	3072		sqr25519(ax, t1);
527	3072		sqr25519(t1, t2);
528	3072		mul25519(az, t1, dx);
529	3072		}
530
531			/* B = 2 * Q where
532			* X(B) = bx/bz
533			* X(Q) = (t3+t4)/(t3-t4)
534			* clobbers t1 and t2, preserves t3 and t4 */
535	2816		static inline void mont_dbl(i25519 t1, i25519 t2, i25519 t3, i25519 t4,
536			i25519 bx, i25519 bz) {
537	2816		sqr25519(t1, t3);
538	2816		sqr25519(t2, t4);
539	2816		mul25519(bx, t1, t2);
540	2816		sub25519(t2, t1, t2);
541	2816		mul25519small(bz, t2, 121665);
542	2816		add25519(t1, t1, bz);
543	2816		mul25519(bz, t1, t2);
544	2816		}
545
546			/* Y^2 = X^3 + 486662 X^2 + X
547			* t is a temporary */
548	7		static inline void x_to_y2(i25519 t, i25519 y2, const i25519 x) {
549	7		sqr25519(t, x);
550	7		mul25519small(y2, x, 486662);
551	7		add25519(t, t, y2);
552	7		t[0]++;
553	7		mul25519(y2, t, x);
554	7		}
555
556			/* P = kG and s = sign(P)/k */
557	10		void core25519(k25519 Px, k25519 s, const k25519 k, const k25519 Gx) {
558			i25519 dx, x[2], z[2], t1, t2, t3, t4;
559			unsigned i, j;
560
561			/* unpack the base */
562	10	100	if (Gx)
563	4		unpack25519(dx, Gx);
564			else
565	6		set25519(dx, 9);
566
567			/* 0G = point-at-infinity */
568	10		set25519(x[0], 1);
569	10		set25519(z[0], 0);
570
571			/* 1G = G */
572	10		cpy25519(x[1], dx);
573	10		set25519(z[1], 1);
574
575	330	100	for (i = 32; i--; ) {
576	2880	100	for (j = 8; j--; ) {
577			/* swap arguments depending on bit */
578	2560		const int bit1 = k[i] >> j & 1;
579	2560		const int bit0 = ~k[i] >> j & 1;
580	2560		int32_t *const ax = x[bit0];
581	2560		int32_t *const az = z[bit0];
582	2560		int32_t *const bx = x[bit1];
583	2560		int32_t *const bz = z[bit1];
584
585			/* a' = a + b */
586			/* b' = 2 b */
587	2560		mont_prep(t1, t2, ax, az);
588	2560		mont_prep(t3, t4, bx, bz);
589	2560		mont_add(t1, t2, t3, t4, ax, az, dx);
590	2560		mont_dbl(t1, t2, t3, t4, bx, bz);
591			}
592			}
593
594	10		recip25519(t1, z[0], 0);
595	10		mul25519(dx, x[0], t1);
596	10		pack25519(dx, Px);
597
598			/* calculate s such that s abs(P) = G .. assumes G is std base point */
599	10	100	if (s) {
600	6		x_to_y2(t2, t1, dx); /* t1 = Py^2 */
601	6		recip25519(t3, z[1], 0); /* where Q=P+G ... */
602	6		mul25519(t2, x[1], t3); /* t2 = Qx */
603	6		add25519(t2, t2, dx); /* t2 = Qx + Px */
604	6		t2[0] += 9 + 486662; /* t2 = Qx + Px + Gx + 486662 */
605	6		dx[0] -= 9; /* dx = Px - Gx */
606	6		sqr25519(t3, dx); /* t3 = (Px - Gx)^2 */
607	6		mul25519(dx, t2, t3); /* dx = t2 (Px - Gx)^2 */
608	6		sub25519(dx, dx, t1); /* dx = t2 (Px - Gx)^2 - Py^2 */
609	6		dx[0] -= 39420360; /* dx = t2 (Px - Gx)^2 - Py^2 - Gy^2 */
610	6		mul25519(t1, dx, base_r2y); /* t1 = -Py */
611	6	100	if (is_negative(t1)) /* sign is 1, so just copy */
612	2		cpy32(s, k);
613			else /* sign is -1, so negate */
614	4		mula_small(s, order_times_8, 0, k, 32, -1);
615
616			/* reduce s mod q
617			* (is this needed? do it just in case, it's fast anyway) */
618			//divmod((dstptr) t1, s, 32, order25519, 32);
619
620			/* take reciprocal of s mod q */
621	6		cpy32((dstptr) t1, order25519);
622	6		cpy32(s, egcd32((dstptr) x, (dstptr) z, s, (dstptr) t1));
623	6	100	if ((int8_t) s[31] < 0)
624	3		mula_small(s, s, 0, order25519, 32, 1);
625			}
626	10		}
627
628			/* v = (x - h) s mod q */
629	1		int sign25519(k25519 v, const k25519 h, const priv25519 x, const spriv25519 s) {
630			uint8_t tmp[65];
631			unsigned w;
632			int i;
633
634			k25519 h1;
635			priv25519 x1;
636
637	33	100	for (i = 0; i < 32; i++) {
638	32		h1[i] = h[i];
639	32		x1[i] = x[i];
640			}
641
642	33	100	for (i = 0; i < 32; i++)
643	32		tmp[i] = 0;
644	1		divmod(tmp, h1, 32, order25519, 32);
645
646	33	100	for (i = 0; i < 32; i++)
647	32		tmp[i] = 0;
648	1		divmod(tmp, x1, 32, order25519, 32);
649
650	33	100	for (i = 0; i < 32; i++)
651	32		v[i] = 0;
652	1		i = mula_small(v, x1, 0, h1, 32, -1);
653	1		mula_small(v, v, 0, order25519, 32, (15-(int8_t) v[31])/16);
654	65	100	for (i = 0; i < 64; i++)
655	64		tmp[i] = 0;
656	1		mula32(tmp, v, s, 32, 1);
657	1		divmod(tmp+32, tmp, 64, order25519, 32);
658	33	100	for (w = 0, i = 0; i < 32; i++)
659	32		w \|= v[i] = tmp[i];
660	1		return w != 0;
661			}
662
663			/* Y = v abs(P) + h G */
664	1		void verify25519(pub25519 Y, const k25519 v, const k25519 h, const pub25519 P) {
665			k25519 d;
666			i25519 p[2], s[2], yx[3], yz[3], t1[3], t2[3];
667	1		unsigned vi = 0, hi = 0, di = 0, nvh, i, j, k;
668
669			/* set p[0] to G and p[1] to P */
670
671	1		set25519(p[0], 9);
672	1		unpack25519(p[1], P);
673
674			/* set s[0] to P+G and s[1] to P-G */
675
676			/* s[0] = (Py^2 + Gy^2 - 2 Py Gy)/(Px - Gx)^2 - Px - Gx - 486662 */
677			/* s[1] = (Py^2 + Gy^2 + 2 Py Gy)/(Px - Gx)^2 - Px - Gx - 486662 */
678
679	1		x_to_y2(t1[0], t2[0], p[1]); /* t2[0] = Py^2 */
680	1		sqrt25519(t1[0], t2[0]); /* t1[0] = Py or -Py */
681	1		j = is_negative(t1[0]); /* ... check which */
682	1		t2[0][0] += 39420360; /* t2[0] = Py^2 + Gy^2 */
683	1		mul25519(t2[1], base_2y, t1[0]);/* t2[1] = 2 Py Gy or -2 Py Gy */
684	1		sub25519(t1[j], t2[0], t2[1]); /* t1[0] = Py^2 + Gy^2 - 2 Py Gy */
685	1		add25519(t1[1-j], t2[0], t2[1]);/* t1[1] = Py^2 + Gy^2 + 2 Py Gy */
686	1		cpy25519(t2[0], p[1]); /* t2[0] = Px */
687	1		t2[0][0] -= 9; /* t2[0] = Px - Gx */
688	1		sqr25519(t2[1], t2[0]); /* t2[1] = (Px - Gx)^2 */
689	1		recip25519(t2[0], t2[1], 0); /* t2[0] = 1/(Px - Gx)^2 */
690	1		mul25519(s[0], t1[0], t2[0]); /* s[0] = t1[0]/(Px - Gx)^2 */
691	1		sub25519(s[0], s[0], p[1]); /* s[0] = t1[0]/(Px - Gx)^2 - Px */
692	1		s[0][0] -= 9 + 486662; /* s[0] = X(P+G) */
693	1		mul25519(s[1], t1[1], t2[0]); /* s[1] = t1[1]/(Px - Gx)^2 */
694	1		sub25519(s[1], s[1], p[1]); /* s[1] = t1[1]/(Px - Gx)^2 - Px */
695	1		s[1][0] -= 9 + 486662; /* s[1] = X(P-G) */
696	1		mul25519small(s[0], s[0], 1); /* reduce s[0] */
697	1		mul25519small(s[1], s[1], 1); /* reduce s[1] */
698
699
700			/* prepare the chain */
701	33	100	for (i = 0; i < 32; i++) {
702	32		vi = (vi >> 8) ^ v[i] ^ (v[i] << 1);
703	32		hi = (hi >> 8) ^ h[i] ^ (h[i] << 1);
704	32		nvh = ~(vi ^ hi);
705	32		di = (nvh & (di & 0x80) >> 7) ^ vi;
706	32		di ^= nvh & (di & 0x01) << 1;
707	32		di ^= nvh & (di & 0x02) << 1;
708	32		di ^= nvh & (di & 0x04) << 1;
709	32		di ^= nvh & (di & 0x08) << 1;
710	32		di ^= nvh & (di & 0x10) << 1;
711	32		di ^= nvh & (di & 0x20) << 1;
712	32		di ^= nvh & (di & 0x40) << 1;
713	32		d[i] = di;
714			}
715
716	1		di = ((nvh & (di & 0x80) << 1) ^ vi) >> 8;
717
718			/* initialize state */
719	1		set25519(yx[0], 1);
720	1		cpy25519(yx[1], p[di]);
721	1		cpy25519(yx[2], s[0]);
722	1		set25519(yz[0], 0);
723	1		set25519(yz[1], 1);
724	1		set25519(yz[2], 1);
725
726			/* y[0] is (even)P + (even)G
727			* y[1] is (even)P + (odd)G if current d-bit is 0
728			* y[1] is (odd)P + (even)G if current d-bit is 1
729			* y[2] is (odd)P + (odd)G
730			*/
731
732	1		vi = 0;
733	1		hi = 0;
734
735			/* and go for it! */
736	33	100	for (i = 32; i--; ) {
737	32		vi = (vi << 8) \| v[i];
738	32		hi = (hi << 8) \| h[i];
739	32		di = (di << 8) \| d[i];
740
741	288	100	for (j = 8; j--; ) {
742	256		mont_prep(t1[0], t2[0], yx[0], yz[0]);
743	256		mont_prep(t1[1], t2[1], yx[1], yz[1]);
744	256		mont_prep(t1[2], t2[2], yx[2], yz[2]);
745
746	512		k = ((vi ^ vi >> 1) >> j & 1)
747	256		+ ((hi ^ hi >> 1) >> j & 1);
748	256		mont_dbl(yx[2], yz[2], t1[k], t2[k], yx[0], yz[0]);
749
750	256		k = (di >> j & 2) ^ ((di >> j & 1) << 1);
751	256		mont_add(t1[1], t2[1], t1[k], t2[k], yx[1], yz[1],
752	256		p[di >> j & 1]);
753
754	256		mont_add(t1[2], t2[2], t1[0], t2[0], yx[2], yz[2],
755	256		s[((vi ^ hi) >> j & 2) >> 1]);
756			}
757			}
758
759	1		k = (vi & 1) + (hi & 1);
760	1		recip25519(t1[0], yz[k], 0);
761	1		mul25519(t1[1], yx[k], t1[0]);
762
763	1		pack25519(t1[1], Y);
764	1		}