File Coverage

factor.c

Criterion	Covered	Total	%
statement	719	982	73.2
branch	555	1046	53.0
condition			n/a
subroutine			n/a
pod			n/a
total	1274	2028	62.8

line	stmt	bran	code
1			#include
2			#include
3			#include
4			#include
5
6			#define FUNC_isqrt 1
7			#define FUNC_icbrt 1
8			#define FUNC_gcd_ui 1
9			#define FUNC_is_perfect_square 1
10			#define FUNC_clz 1
11			#include "ptypes.h"
12			#include "factor.h"
13			#include "sieve.h"
14			#include "util.h"
15			#include "mulmod.h"
16			#include "cache.h"
17			#include "primality.h"
18			#include "montmath.h"
19			static int holf32(uint32_t n, UV *factors, uint32_t rounds);
20
21			/*
22			* You need to remember to use UV for unsigned and IV for signed types that
23			* are large enough to hold our data.
24			* If you use int, that's 32-bit on LP64 and LLP64 machines. You lose.
25			* If you use long, that's 32-bit on LLP64 machines. You lose.
26			* If you use long long, you may be too large which isn't so bad, but some
27			* compilers may not understand the type at all.
28			* perl.h already figured all this out, and provided us with these types which
29			* match the native integer type used inside our Perl, so just use those.
30			*/
31
32			static const unsigned short primes_small[] =
33			{0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
34			101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
35			193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
36			293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
37			409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,509,
38			521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,617,619,631,
39			641,643,647,653,659,661,673,677,683,691,701,709,719,727,733,739,743,751,
40			757,761,769,773,787,797,809,811,821,823,827,829,839,853,857,859,863,877,
41			881,883,887,907,911,919,929,937,941,947,953,967,971,977,983,991,997,1009,
42			1013,1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,1087,1091,1093,
43			1097,1103,1109,1117,1123,1129,1151,1153,1163,1171,1181,1187,1193,1201,
44			1213,1217,1223,1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,1297,
45			1301,1303,1307,1319,1321,1327,1361,1367,1373,1381,1399,1409,1423,1427,
46			1429,1433,1439,1447,1451,1453,1459,1471,1481,1483,1487,1489,1493,1499,
47			1511,1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,1597,1601,1607,
48			1609,1613,1619,1621,1627,1637,1657,1663,1667,1669,1693,1697,1699,1709,
49			1721,1723,1733,1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,1823,
50			1831,1847,1861,1867,1871,1873,1877,1879,1889,1901,1907,1913,1931,1933,
51			1949,1951,1973,1979,1987,1993,1997,1999,2003,2011};
52			#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0]))
53
54	34004		static int _small_trial_factor(UV n, UV factors, UV newn, uint32_t *lastf)
55			{
56	34004		int nfactors = 0;
57	34004		uint32_t f = 7;
58
59	34004	100	if (n > 1) {
60	48737	100	while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n /= 2; }
61	44379	100	while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
62	39785	100	while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
63			}
64
65	34004	100	if (f*f <= n) {
66	26728		uint32_t const lastsp = 83;
67	26728		uint32_t sp = 4;
68			/* Trial division from 7 to 421. Use 32-bit if possible. */
69	26728	100	if (n <= 4294967295U) {
70	26561		uint32_t un = n;
71	238949	100	while (sp < lastsp) {
72	251168	100	while ( (un%f) == 0 ) {
73	12438		factors[nfactors++] = f;
74	12438		un /= f;
75			}
76	238730		f = primes_small[++sp];
77	238730	100	if (f*f > un) break;
78			}
79	26561		n = un;
80			} else {
81	12632	100	while (sp < lastsp) {
82	12697	100	while ( (n%f) == 0 ) {
83	220		factors[nfactors++] = f;
84	220		n /= f;
85			}
86	12477		f = primes_small[++sp];
87	12477	100	if (f*f > n) break;
88			}
89			}
90			/* If n is small and still composite, finish it here */
91	26728	100	if (n < 20112011 && ff <= n) { /* Trial division from 431 to 2003 */
		100
92	65		uint32_t un = n;
93	5934	50	while (sp < NPRIMES_SMALL) {
94	5955	100	while ( (un%f) == 0 ) {
95	21		factors[nfactors++] = f;
96	21		un /= f;
97			}
98	5934		f = primes_small[++sp];
99	5934	100	if (f*f > un) break;
100			}
101	65		n = un;
102			}
103			}
104	34004	100	if (f*f > n && n != 1) {
		100
105	30179		factors[nfactors++] = n;
106	30179		n = 1;
107			}
108	34004	50	if (newn) *newn = n;
109	34004	50	if (lastf) *lastf = f;
110	34004		return nfactors;
111			}
112
113	205		static int _power_factor(UV n, UV *factors)
114			{
115	205		int nfactors, i, j, k = powerof(n);
116	205	100	if (k > 1) {
117	5		UV p = rootof(n, k);
118	5		nfactors = factor(p, factors);
119	17	100	for (i = nfactors; i >= 0; i--)
120	38	100	for (j = 0; j < k; j++)
121	26		factors[k*i+j] = factors[i];
122	5		return k*nfactors;
123			}
124	200		factors[0] = n;
125	200		return 1;
126			}
127
128			/* Find one factor of an input n. */
129	190		int factor_one(UV n, UV *factors, int primality, int trial)
130			{
131			int nfactors;
132	190	50	if (n < 4) {
133	0		factors[0] = n;
134	0		return (n == 1) ? 0 : 1;
135			}
136			/* TODO: deal with small n */
137	190	50	if (trial) {
138			uint32_t sp, f;
139	0	0	if (!(n&1)) { factors[0] = 2; factors[1] = n >> 1; return 2; }
140	0	0	if (!(n%3)) { factors[0] = 3; factors[1] = n / 3; return 2; }
141	0	0	if (!(n%5)) { factors[0] = 5; factors[1] = n / 5; return 2; }
142	0	0	for (sp = 4; (f = primes_small[sp]) < 2011; sp++) {
143	0	0	if ( (n % f) == 0 )
144	0		{ factors[0] = f; factors[1] = n/f; return 2; }
145			}
146	0	0	if (n < f*f) { factors[0] = n; return 1; }
147			}
148	190	50	if (primality && is_prime(n)) {
		0
149	0		factors[0] = n;
150	0		return 1;
151			}
152			#if 0 /* Simple solution, just fine on x86_64 */
153			nfactors = (n < 1073741824UL) ? holf32(n, factors, 1000000)
154			: pbrent_factor(n, factors, 500000, 1);
155			if (nfactors < 2) croak("factor_one failed on %lu\n", n);
156			#endif
157			{
158			/* Adjust the number of rounds based on the number size and speed */
159	190	50	UV const nbits = BITS_PER_WORD - clz(n);
160			#if USE_MONTMATH
161	190	100	UV const br_rounds = 8000 + (9000 * ((nbits <= 45) ? 0 : (nbits-45)));
162	190		UV const sq_rounds = 200000;
163			#elif MULMODS_ARE_FAST
164			UV const br_rounds = 500 + ( 200 * ((nbits <= 45) ? 0 : (nbits-45)));
165			UV const sq_rounds = 100000;
166			#else
167			UV const br_rounds = (nbits >= 63) ? 120000 : (nbits >= 58) ? 500 : 0;
168			UV const sq_rounds = 200000;
169			#endif
170
171			#if BITS_PER_WORD == 64
172			/* For small semiprimes the fastest solution is HOLF under 32, then
173			* Lehman (no trial) under 38. However on random inputs, HOLF is
174			* best only under 28-30 bits, and adding Lehman is always slower. */
175	190	100	if (nbits <= 30) { /* This should always succeed */
176	35		nfactors = holf32(n, factors, 1000000);
177	35	50	if (nfactors > 1) return nfactors;
178			}
179			#endif
180			/* Almost all inputs are factored here */
181	155	50	if (br_rounds > 0) {
182	155		nfactors = pbrent_factor(n, factors, br_rounds, 1);
183	155	50	if (nfactors > 1) return nfactors;
184			}
185			#if USE_MONTMATH
186	0		nfactors = pbrent_factor(n, factors, 2*br_rounds, 3);
187	0	0	if (nfactors > 1) return nfactors;
188			#endif
189			/* Random 64-bit inputs at this point:
190			* About 3.1% are small enough that we did with HOLF.
191			* montmath: 96.89% pbrent, 0.01% pbrent2
192			* fast: 73.43% pbrent, 21.97% squfof, 1.09% p-1, 0.49% prho, long
193			* slow: 75.34% squfof, 19.47% pbrent, 0.20% p-1, 0.06% prho
194			*/
195			/* SQUFOF with these parameters gets 99.9% of everything left */
196	0	0	if (nbits <= 62) {
197	0		nfactors = squfof_factor(n, factors, sq_rounds);
198	0	0	if (nfactors > 1) return nfactors;
199			}
200			/* At this point we should only have 16+ digit semiprimes. */
201	0		nfactors = pminus1_factor(n, factors, 8000, 120000);
202	0	0	if (nfactors > 1) return nfactors;
203			/* Get the stragglers */
204	0		nfactors = prho_factor(n, factors, 120000);
205	0	0	if (nfactors > 1) return nfactors;
206	0		nfactors = pbrent_factor(n, factors, 500000, 5);
207	0	0	if (nfactors > 1) return nfactors;
208	0		nfactors = prho_factor(n, factors, 120000);
209	0	0	if (nfactors > 1) return nfactors;
210	0		croak("factor_one failed on %lu\n", n);
211			}
212			return nfactors;
213			}
214
215			/******************************************************************************/
216			/* Main factor loop */
217			/* */
218			/* Puts factors in factors[] and returns the number found. */
219			/******************************************************************************/
220	34004		int factor(UV n, UV *factors)
221			{
222			UV tofac_stack[MPU_MAX_FACTORS+1];
223	34004		int nsmallfactors, npowerfactors, nfactors, i, j, ntofac = 0;
224			uint32_t f;
225
226	34004		nfactors = _small_trial_factor(n, factors, &n, &f);
227	34004	100	if (n == 1) return nfactors;
228
229			#if BITS_PER_WORD == 64
230			/* For small values less than f^3, use simple factor to split semiprime */
231	309	100	if (n < 100000000 && n < fff) {
		100
232	104	100	if (MR32(n)) factors[nfactors++] = n;
233	52		else nfactors += holf32(n, factors+nfactors, 10000);
234	104		return nfactors;
235			}
236			#endif
237
238	205		nsmallfactors = nfactors;
239
240			/* Perfect powers. Factor root only once. */
241	205		npowerfactors = _power_factor(n, factors+nsmallfactors);
242	205	100	if (npowerfactors > 1) return nsmallfactors + npowerfactors;
243
244			/* loop over each remaining factor, until ntofac == 0 */
245			do {
246	530	100	while ( (n >= f*f) && (!is_def_prime(n)) ) {
		100
		100
		100
247	165		int split_success = factor_one(n, tofac_stack+ntofac, 0, 0) - 1;
248	165	50	if (split_success != 1 \|\| tofac_stack[ntofac] == 1 \|\| tofac_stack[ntofac] == n)
		50
		50
249	0		croak("internal: factor_one failed to factor %"UVuf"\n", n);
250	165		ntofac++; /* Leave one on the to-be-factored stack */
251	165		n = tofac_stack[ntofac]; /* Set n to the other one */
252			}
253			/* n is now prime (or 1), so add to already-factored stack */
254	365	50	if (n != 1) factors[nfactors++] = n;
255			/* Pop the next number off the to-factor stack */
256	365	100	if (ntofac > 0) n = tofac_stack[ntofac-1];
257	365	100	} while (ntofac-- > 0);
258
259			/* Sort the non-small factors */
260	365	100	for (i = nsmallfactors+1; i < nfactors; i++) {
261	165		UV fi = factors[i];
262	358	100	for (j = i; j > 0 && factors[j-1] > fi; j--)
		100
263	193		factors[j] = factors[j-1];
264	165		factors[j] = fi;
265			}
266	34004		return nfactors;
267			}
268
269
270	12675		int factor_exp(UV n, UV factors, UV exponents)
271			{
272	12675		int i = 1, j = 1, nfactors;
273
274	12675	100	if (n == 1) return 0;
275	12664		nfactors = factor(n, factors);
276
277	12664	100	if (exponents == 0) {
278	218	100	for (; i < nfactors; i++)
279	165	100	if (factors[i] != factors[i-1])
280	111		factors[j++] = factors[i];
281			} else {
282	12611		exponents[0] = 1;
283	29074	100	for (; i < nfactors; i++) {
284	16463	100	if (factors[i] != factors[i-1]) {
285	12924		exponents[j] = 1;
286	12924		factors[j++] = factors[i];
287			} else {
288	3539		exponents[j-1]++;
289			}
290			}
291			}
292	12664		return j;
293			}
294
295	1619		int trial_factor(UV n, UV *factors, UV f, UV last)
296			{
297	1619		int sp, nfactors = 0;
298
299	1619	50	if (f < 2) f = 2;
300	1619	100	if (last == 0 \|\| last*last > n) last = UV_MAX;
		100
301
302	1619	100	if (n < 4 \|\| last < f) {
		50
303	8		factors[0] = n;
304	8		return (n == 1) ? 0 : 1;
305			}
306
307			/* possibly do uint32_t specific code here */
308
309	1611	50	if (f < primes_small[NPRIMES_SMALL-1]) {
310	3232	100	while ( (n & 1) == 0 ) { factors[nfactors++] = 2; n >>= 1; }
311	2415	100	if (3<=last) while ( (n % 3) == 0 ) { factors[nfactors++] = 3; n /= 3; }
		100
312	1983	100	if (5<=last) while ( (n % 5) == 0 ) { factors[nfactors++] = 5; n /= 5; }
		100
313	5601	100	for (sp = 4; sp < (int)NPRIMES_SMALL; sp++) {
314	5598		f = primes_small[sp];
315	5598	100	if (f*f > n \|\| f > last) break;
		100
316	4374	100	while ( (n%f) == 0 ) {
317	384		factors[nfactors++] = f;
318	384		n /= f;
319			}
320			}
321			}
322			/* Trial division using a mod-30 wheel for larger values */
323	1611	100	if (f*f <= n && f <= last) {
		100
324	3		UV m, newlimit, limit = isqrt(n);
325	3	100	if (limit > last) limit = last;
326	3		m = f % 30;
327	7906	100	while (f <= limit) {
328	7903	100	if ( (n%f) == 0 ) {
329			do {
330	2		factors[nfactors++] = f;
331	2		n /= f;
332	2	50	} while ( (n%f) == 0 );
333	2		newlimit = isqrt(n);
334	2	50	if (newlimit < limit) limit = newlimit;
335			}
336	7903		f += wheeladvance30[m];
337	7903		m = nextwheel30[m];
338			}
339			}
340			/* All done! */
341	1611	100	if (n != 1)
342	1485		factors[nfactors++] = n;
343	1611		return nfactors;
344			}
345
346
347	3834		static void _divisors_from_factors(UV nfactors, UV* fp, UV* fe, UV* res) {
348	3834		UV s, count = 1;
349
350	3834		res[0] = 1;
351	12779	100	for (s = 0; s < nfactors; s++) {
352	8945		UV i, j, scount = count, p = fp[s], e = fe[s], mult = 1;
353	21065	100	for (j = 0; j < e; j++) {
354	12120		mult *= p;
355	40537	100	for (i = 0; i < scount; i++)
356	28417		res[count++] = res[i] * mult;
357			}
358			}
359	3834		}
360
361	3838		UV* _divisor_list(UV n, UV *num_divisors)
362			{
363			UV factors[MPU_MAX_FACTORS+1];
364			UV exponents[MPU_MAX_FACTORS+1];
365			UV* divs;
366			int i, nfactors, ndivisors;
367
368	3838	100	if (n <= 1) {
369	4		New(0, divs, 2, UV);
370	4	100	if (n == 0) { divs[0] = 0; divs[1] = 1; *num_divisors = 2; }
371	4	100	if (n == 1) { divs[0] = 1; *num_divisors = 1; }
372	4		return divs;
373			}
374			/* Factor and convert to factor/exponent pair */
375	3834		nfactors = factor_exp(n, factors, exponents);
376			/* Calculate number of divisors, allocate space, fill with divisors */
377	3834		ndivisors = exponents[0] + 1;
378	8945	100	for (i = 1; i < nfactors; i++)
379	5111		ndivisors *= (exponents[i] + 1);
380	3834	50	New(0, divs, ndivisors, UV);
381	3834		_divisors_from_factors(nfactors, factors, exponents, divs);
382			/* Sort divisors (numeric ascending) */
383	3834		qsort(divs, ndivisors, sizeof(UV), _numcmp);
384			/* Return number of divisors and list */
385	3834		*num_divisors = ndivisors;
386	3838		return divs;
387			}
388
389
390			/* The usual method, on OEIS for instance, is:
391			* (p^(k*(e+1))-1) / (p^k-1)
392			* but that overflows quicky. Instead we rearrange as:
393			* 1 + p^k + p^k^2 + ... p^k^e
394			* Return 0 if the result overflowed.
395			*/
396			static const UV sigma_overflow[11] =
397			#if BITS_PER_WORD == 64
398			{UVCONST(3000000000000000000),UVCONST(3000000000),2487240,64260,7026,
399			1622, 566, 256, 139, 85, 57};
400			#else
401			{UVCONST(845404560), 52560, 1548, 252, 84, 41, 24, 16, 12, 10, 8};
402			#endif
403	1505		UV divisor_sum(UV n, UV k)
404			{
405			UV factors[MPU_MAX_FACTORS+1];
406			int nfac, i, j;
407	1505		UV product = 1;
408
409	1505	50	if (k > 11 \|\| (k > 0 && n >= sigma_overflow[k-1])) return 0;
		100
		50
410	1505	100	if (n <= 1) /* n=0 divisors are [0,1] */
411	287	100	return (n == 1) ? 1 : (k == 0) ? 2 : 1; /* n=1 divisors are [1] */
		100
412	1218		nfac = factor(n,factors);
413	1218	100	if (k == 0) {
414	2339	100	for (i = 0; i < nfac; i++) {
415	1365		UV e = 1, f = factors[i];
416	1806	100	while (i+1 < nfac && f == factors[i+1]) { e++; i++; }
		100
417	1365		product *= (e+1);
418			}
419	244	100	} else if (k == 1) {
420	427	100	for (i = 0; i < nfac; i++) {
421	270		UV f = factors[i];
422	270		UV pke = f, fmult = 1 + f;
423	372	100	while (i+1 < nfac && f == factors[i+1]) {
		100
424	102		pke *= f;
425	102		fmult += pke;
426	102		i++;
427			}
428	270		product *= fmult;
429			}
430			} else {
431	222	100	for (i = 0; i < nfac; i++) {
432	135		UV f = factors[i];
433	135		UV fmult, pke, pk = f;
434	328	100	for (j = 1; j < (int)k; j++) pk *= f;
435	135		fmult = 1 + pk;
436	135		pke = pk;
437	188	100	while (i+1 < nfac && f == factors[i+1]) {
		100
438	53		pke *= pk;
439	53		fmult += pke;
440	53		i++;
441			}
442	135		product *= fmult;
443			}
444			}
445	1505		return product;
446			}
447
448
449
450
451	255		static int found_factor(UV n, UV f, UV* factors)
452			{
453	255		UV f2 = n/f;
454	255		int i = f > f2;
455	255	50	if (f == 1 \|\| f2 == 1) {
		50
456	0		factors[0] = n;
457	0		return 1;
458			}
459	255		factors[i] = f;
460	255		factors[1-i] = f2;
461	255	50	MPUassert( factors[0] * factors[1] == n , "incorrect factoring");
462	255		return 2;
463			}
464
465			/* Knuth volume 2, algorithm C.
466			* Can't compete with HOLF, SQUFOF, pbrent, etc.
467			*/
468	2		int fermat_factor(UV n, UV *factors, UV rounds)
469			{
470			IV sqn, x, y, r;
471	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in fermat_factor");
		50
472	2		sqn = isqrt(n);
473	2		x = 2 * sqn + 1;
474	2		y = 1;
475	2		r = (sqn*sqn) - n;
476
477	10	100	while (r != 0) {
478	8	50	if (rounds-- == 0) { factors[0] = n; return 1; }
479	8		r += x;
480	8		x += 2;
481			do {
482	26		r -= y;
483	26		y += 2;
484	26	100	} while (r > 0);
485			}
486	2		r = (x-y)/2;
487	2		return found_factor(n, r, factors);
488			}
489
490			/* Hart's One Line Factorization. */
491	2		int holf_factor(UV n, UV *factors, UV rounds)
492			{
493			UV i, s, m, f;
494
495	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in holf_factor");
		50
496
497			/* We skip the perfect-square test for s in the loop, so we
498			* will never succeed if n is a perfect square. Test that now. */
499	2	50	if (is_perfect_square(n))
500	0		return found_factor(n, isqrt(n), factors);
501
502	2	50	if (n <= (UV_MAX >> 6)) { /* Try with premultiplier first */
503	2	50	UV npre = n * ( (n <= (UV_MAX >> 13)) ? 720 :
		0
		0
		0
504			(n <= (UV_MAX >> 11)) ? 480 :
505			(n <= (UV_MAX >> 10)) ? 360 :
506			(n <= (UV_MAX >> 8)) ? 60 : 30 );
507	2		UV ni = npre;
508			#if 0 /* Straightforward */
509			while (rounds--) {
510			s = isqrt(ni) + 1;
511			m = (s*s) - ni;
512			if (is_perfect_square(m)) {
513			f = gcd_ui(n, s - isqrt(m));
514			if (f > 1 && f < n)
515			return found_factor(n, f, factors);
516			}
517			if (ni >= (ni+npre)) break;
518			ni += npre;
519			}
520			#else /* More optimized */
521	3	50	while (rounds--) {
522	3		s = 1 + (UV)sqrt((double)ni);
523	3		m = (s*s) - ni;
524	3		f = m & 127;
525	3	100	if (!((f0x8bc40d7d) & (f0xa1e2f5d1) & 0x14020a)) {
526	2		f = (UV)sqrt((double)m);
527	2	50	if (m == f*f) {
528	2		f = gcd_ui(n, s - f);
529	2	50	if (f > 1 && f < n)
		50
530	2		return found_factor(n, f, factors);
531			}
532			}
533	1	50	if (ni >= (ni+npre)) break;
534	1		ni += npre;
535			}
536			#endif
537			}
538
539	0	0	for (i = 1; i <= rounds; i++) {
540	0		s = (UV) sqrt( (double)n * (double)i );
541			/* Assume s^2 isn't a perfect square. We're rapidly losing precision
542			* so we won't be able to accurately detect it anyway. */
543	0		s++; /* s = ceil(sqrt(ni)) /
544	0		m = sqrmod(s, n);
545	0	0	if (is_perfect_square(m)) {
546	0		f = isqrt(m);
547	0	0	f = gcd_ui( (s>f) ? s-f : f-s, n);
548			/* This should always succeed, but with overflow concerns.... */
549	0		return found_factor(n, f, factors);
550			}
551			}
552	0		factors[0] = n;
553	0		return 1;
554			}
555	87		static int holf32(uint32_t n, UV *factors, uint32_t rounds) {
556			UV npre, ni; /* These should be 64-bit */
557			uint32_t s, m, f;
558
559	87	50	if (n < 3) { factors[0] = n; return 1; }
560	87	50	if (!(n&1)) { factors[0] = 2; factors[1] = n/2; return 2; }
561	87	100	if (is_perfect_square(n)) { factors[0] = factors[1] = isqrt(n); return 2; }
562
563	86		ni = npre = (UV) n * ((BITS_PER_WORD == 64) ? 5040 : 1);
564	2649	50	while (rounds--) {
565	2649		s = 1 + (uint32_t)sqrt((double)ni);
566	2649		m = ((UV)s*(UV)s) - ni;
567	2649		f = m & 127;
568	2649	100	if (!((f0x8bc40d7d) & (f0xa1e2f5d1) & 0x14020a)) {
569	2059		f = (uint32_t)sqrt((double)m);
570	2059	100	if (m == f*f) {
571	86		f = gcd_ui(n, s - f);
572	86	50	if (f > 1 && f < n)
		50
573	86		return found_factor(n, f, factors);
574			}
575			}
576	2563	50	if (ni >= (ni+npre)) break; /* We've overflowed */
577	2563		ni += npre;
578			}
579	0		factors[0] = n;
580	0		return 1;
581			}
582
583
584			#define ABSDIFF(x,y) (x>y) ? x-y : y-x
585			#if USE_MONTMATH
586			/* Pollard Rho with Brent's updates, using Montgomery reduction. */
587	157		int pbrent_factor(UV n, UV *factors, UV rounds, UV a)
588			{
589	157	50	UV const nbits = BITS_PER_WORD - clz(n);
590	157	100	const UV inner = (nbits <= 31) ? 32 : (nbits <= 35) ? 64 : (nbits <= 40) ? 160 : (nbits <= 52) ? 256 : 320;
		100
		100
		100
591			UV f, m, r, rleft, Xi, Xm, Xs;
592	157		int irounds, fails = 6;
593	157		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
594
595	157	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pbrent_factor");
		50
596	157		r = f = 1;
597	157		Xi = Xm = Xs = mont1;
598	157		a = mont_geta(a,n);
599
600	1219	50	while (rounds > 0) {
601	1219		rleft = (r > rounds) ? rounds : r;
602	1219		Xm = Xi;
603			/* Do rleft rounds, inner at a time */
604	3157	100	while (rleft > 0) {
605	2095		irounds = (rleft > (UV)inner) ? inner : rleft;
606	2095		rleft -= irounds;
607	2095		rounds -= irounds;
608	2095		Xs = Xi;
609	2095	100	if (n < (1ULL << 63)) {
610	1741		Xi = mont_mulmod63(Xi,Xi+a,n);
611	1741	100	m = ABSDIFF(Xi,Xm);
612	222990	100	while (--irounds > 0) {
613	221249		Xi = mont_mulmod63(Xi,Xi+a,n);
614	221249	100	f = ABSDIFF(Xi,Xm);
615	221249		m = mont_mulmod63(m, f, n);
616			}
617	354	50	} else if (a == mont1) {
618	354		Xi = mont_mulmod64(Xi,Xi+a,n);
619	354	100	m = ABSDIFF(Xi,Xm);
620	104189	100	while (--irounds > 0) {
621	103835		Xi = mont_mulmod64(Xi,Xi+a,n);
622	103835	100	f = ABSDIFF(Xi,Xm);
623	103835		m = mont_mulmod64(m, f, n);
624			}
625			} else {
626	0		Xi = addmod(mont_mulmod64(Xi,Xi,n), a, n);
627	0	0	m = ABSDIFF(Xi,Xm);
628	0	0	while (--irounds > 0) {
629	0		Xi = addmod(mont_mulmod64(Xi,Xi,n), a, n);
630	0	0	f = ABSDIFF(Xi,Xm);
631	0		m = mont_mulmod64(m, f, n);
632			}
633			}
634	2095		f = gcd_ui(m, n);
635	2095	100	if (f != 1)
636	157		break;
637			}
638			/* If f == 1, then we didn't find a factor. Move on. */
639	1219	100	if (f == 1) {
640	1062		r *= 2;
641	1062		continue;
642			}
643	157	50	if (f == n) { /* back up, with safety */
644	0		Xi = Xs;
645			do {
646	0	0	if (n < (1ULL << 63) \|\| a == mont1)
		0
647	0	0	Xi = mont_mulmod(Xi,Xi+a,n);
648			else
649	0	0	Xi = addmod(mont_mulmod(Xi,Xi,n),a,n);
650	0	0	m = ABSDIFF(Xi,Xm);
651	0		f = gcd_ui(m, n);
652	0	0	} while (f == 1 && r-- != 0);
		0
653			}
654	157	50	if (f == 0 \|\| f == n) {
		50
655	0	0	if (fails-- <= 0) break;
656	0		Xi = Xm = mont1;
657	0		a = addmod(a, mont_geta(11,n), n);
658	0		continue;
659			}
660	157		return found_factor(n, f, factors);
661			}
662	0		factors[0] = n;
663	0		return 1;
664			}
665			#else
666			/* Pollard Rho with Brent's updates. */
667			int pbrent_factor(UV n, UV *factors, UV rounds, UV a)
668			{
669			UV f, m, r, Xi, Xm;
670			const UV inner = (n <= 4000000000UL) ? 32 : 160;
671			int fails = 6;
672
673			MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pbrent_factor");
674
675			r = f = Xi = Xm = 1;
676			while (rounds > 0) {
677			UV rleft = (r > rounds) ? rounds : r;
678			UV saveXi = Xi;
679			/* Do rleft rounds, inner at a time */
680			while (rleft > 0) {
681			UV dorounds = (rleft > inner) ? inner : rleft;
682			saveXi = Xi;
683			rleft -= dorounds;
684			rounds -= dorounds;
685			Xi = sqraddmod(Xi, a, n); /* First iteration, no mulmod needed */
686			m = ABSDIFF(Xi,Xm);
687			while (--dorounds > 0) { /* Now do inner-1=63 more iterations */
688			Xi = sqraddmod(Xi, a, n);
689			f = ABSDIFF(Xi,Xm);
690			m = mulmod(m, f, n);
691			}
692			f = gcd_ui(m, n);
693			if (f != 1)
694			break;
695			}
696			/* If f == 1, then we didn't find a factor. Move on. */
697			if (f == 1) {
698			r *= 2;
699			Xm = Xi;
700			continue;
701			}
702			if (f == n) { /* back up, with safety */
703			Xi = saveXi;
704			do {
705			Xi = sqraddmod(Xi, a, n);
706			f = gcd_ui( ABSDIFF(Xi,Xm), n);
707			} while (f == 1 && r-- != 0);
708			}
709			if (f == 0 \|\| f == n) {
710			if (fails-- <= 0) break;
711			Xm = addmod(Xm, 11, n);
712			Xi = Xm;
713			a++;
714			continue;
715			}
716			return found_factor(n, f, factors);
717			}
718			factors[0] = n;
719			return 1;
720			}
721			#endif
722
723			/* Pollard's Rho. */
724	2		int prho_factor(UV n, UV *factors, UV rounds)
725			{
726			UV a, f, i, m, oldU, oldV;
727	2		const UV inner = 64;
728	2		UV U = 7;
729	2		UV V = 7;
730	2		int fails = 3;
731
732	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in prho_factor");
		50
733
734			/* We could just as well say a = 1 */
735	2		switch (n%8) {
736	0		case 1: a = 1; break;
737	2		case 3: a = 2; break;
738	0		case 5: a = 3; break;
739	0		case 7: a = 5; break;
740	0		default: a = 7; break;
741			}
742
743	2		rounds = (rounds + inner - 1) / inner;
744
745	2	50	while (rounds-- > 0) {
746	2		m = 1; oldU = U; oldV = V;
747	130	100	for (i = 0; i < inner; i++) {
748	128		U = sqraddmod(U, a, n);
749	128		V = sqraddmod(V, a, n);
750	128		V = sqraddmod(V, a, n);
751	128	100	f = (U > V) ? U-V : V-U;
752	128		m = mulmod(m, f, n);
753			}
754	2		f = gcd_ui(m, n);
755	2	50	if (f == 1)
756	0		continue;
757	2	50	if (f == n) { /* back up to find a factor*/
758	2		U = oldU; V = oldV;
759	2		i = inner;
760			do {
761	6		U = sqraddmod(U, a, n);
762	6		V = sqraddmod(V, a, n);
763	6		V = sqraddmod(V, a, n);
764	6	100	f = gcd_ui( (U > V) ? U-V : V-U, n);
765	6	100	} while (f == 1 && i-- != 0);
		50
766			}
767	2	50	if (f == 0 \|\| f == n) {
		50
768	0	0	if (fails-- <= 0) break;
769	0		U = addmod(U,2,n);
770	0		V = U;
771	0		a++;
772	0		continue;
773			}
774	2		return found_factor(n, f, factors);
775			}
776	0		factors[0] = n;
777	0		return 1;
778			}
779
780			/* Pollard's P-1 */
781	2		int pminus1_factor(UV n, UV *factors, UV B1, UV B2)
782			{
783			UV f, k, kmin;
784	2		UV a = 2, q = 2;
785	2		UV savea = 2, saveq = 2;
786	2		UV j = 1;
787	2		UV sqrtB1 = isqrt(B1);
788			#if USE_MONTMATH
789	2		const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
790	2		UV ma = mont_geta(a,n);
791			#define PMINUS1_APPLY_POWER ma = mont_powmod(ma, k, n)
792			#define PMINUS1_RECOVER_A a = mont_recover(ma,n)
793			#else
794			#define PMINUS1_APPLY_POWER a = powmod(a, k, n)
795			#define PMINUS1_RECOVER_A
796			#endif
797	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pminus1_factor");
		50
798
799	2	50	if (B1 <= primes_small[NPRIMES_SMALL-2]) {
800			UV i;
801	0	0	for (i = 1; primes_small[i] <= B1; i++) {
802	0		q = k = primes_small[i];
803	0	0	if (q <= sqrtB1) {
804	0		k = q*q; kmin = B1/q;
805	0	0	while (k <= kmin) k *= q;
806			}
807	0		PMINUS1_APPLY_POWER;
808	0	0	if ( (j++ % 32) == 0) {
809	0	0	PMINUS1_RECOVER_A;
810	0	0	if (a == 0 \|\| gcd_ui(a-1, n) != 1)
		0
811			break;
812	0		savea = a; saveq = q;
813			}
814			}
815	0	0	PMINUS1_RECOVER_A;
816			} else {
817	64	50	START_DO_FOR_EACH_PRIME(2, B1) {
		100
		100
		100
		100
		50
		50
		50
		50
		50
818	64		q = k = p;
819	64	50	if (q <= sqrtB1) {
820	64		k = q*q; kmin = B1/q;
821	200	100	while (k <= kmin) k *= q;
822			}
823	64		PMINUS1_APPLY_POWER;
824	64	100	if ( (j++ % 32) == 0) {
825	2	50	PMINUS1_RECOVER_A;
826	2	50	if (a == 0 \|\| gcd_ui(a-1, n) != 1)
		50
827			break;
828	0		savea = a; saveq = q;
829			}
830	62		} END_DO_FOR_EACH_PRIME
831	2	50	PMINUS1_RECOVER_A;
832			}
833	2	50	if (a == 0) { factors[0] = n; return 1; }
834	2		f = gcd_ui(a-1, n);
835
836			/* If we found more than one factor in stage 1, backup and single step */
837	2	50	if (f == n) {
838	2		a = savea;
839	4	50	START_DO_FOR_EACH_PRIME(saveq, B1) {
		50
		100
		50
		0
		0
		0
		0
		0
		50
840	4		k = p; kmin = B1/p;
841	62	100	while (k <= kmin) k *= p;
842	4		a = powmod(a, k, n);
843	4		f = gcd_ui(a-1, n);
844	4		q = p;
845	4	100	if (f != 1)
846	2		break;
847	4		} END_DO_FOR_EACH_PRIME
848			/* If f == n again, we could do:
849			* for (savea = 3; f == n && savea < 100; savea = next_prime(savea)) {
850			* a = savea;
851			* for (q = 2; q <= B1; q = next_prime(q)) {
852			* ...
853			* }
854			* }
855			* but this could be a huge time sink if B1 is large, so just fail.
856			*/
857			}
858
859			/* STAGE 2 */
860	2	50	if (f == 1 && B2 > B1) {
		0
861	0		UV bm = a;
862	0		UV b = 1;
863			UV bmdiff;
864	0		UV precomp_bm[111] = {0}; /* Enough for B2 = 189M */
865
866			/* calculate (a^q)^2, (a^q)^4, etc. */
867	0		bmdiff = sqrmod(bm, n);
868	0		precomp_bm[0] = bmdiff;
869	0	0	for (j = 1; j < 20; j++) {
870	0		bmdiff = mulmod(bmdiff,bm,n);
871	0		bmdiff = mulmod(bmdiff,bm,n);
872	0		precomp_bm[j] = bmdiff;
873			}
874
875	0		a = powmod(a, q, n);
876	0		j = 1;
877	0	0	START_DO_FOR_EACH_PRIME( q+1, B2 ) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
878	0		UV lastq = q;
879			UV qdiff;
880	0		q = p;
881			/* compute a^q = a^lastq * a^(q-lastq) */
882	0		qdiff = (q - lastq) / 2 - 1;
883	0	0	if (qdiff >= 111) {
884	0		bmdiff = powmod(bm, q-lastq, n); /* Big gap */
885			} else {
886	0		bmdiff = precomp_bm[qdiff];
887	0	0	if (bmdiff == 0) {
888	0	0	if (precomp_bm[qdiff-1] != 0)
889	0		bmdiff = mulmod(mulmod(precomp_bm[qdiff-1],bm,n),bm,n);
890			else
891	0		bmdiff = powmod(bm, q-lastq, n);
892	0		precomp_bm[qdiff] = bmdiff;
893			}
894			}
895	0		a = mulmod(a, bmdiff, n);
896	0	0	if (a == 0) break;
897	0		b = mulmod(b, a-1, n); /* if b == 0, we found multiple factors */
898	0	0	if ( (j++ % 64) == 0 ) {
899	0		f = gcd_ui(b, n);
900	0	0	if (f != 1)
901	0		break;
902			}
903	0		} END_DO_FOR_EACH_PRIME
904	0		f = gcd_ui(b, n);
905			}
906	2		return found_factor(n, f, factors);
907			}
908
909			/* Simple Williams p+1 */
910	6		static void pp1_pow(UV *cX, UV exp, UV n)
911			{
912	6		UV X0 = *cX;
913	6		UV X = *cX;
914	6		UV Y = mulsubmod(X, X, 2, n);
915	6	50	UV bit = UVCONST(1) << (clz(exp)-1);
916	345	100	while (bit) {
917	339		UV T = mulsubmod(X, Y, X0, n);
918	339	100	if ( exp & bit ) {
919	15		X = T;
920	15		Y = mulsubmod(Y, Y, 2, n);
921			} else {
922	324		Y = T;
923	324		X = mulsubmod(X, X, 2, n);
924			}
925	339		bit >>= 1;
926			}
927	6		*cX = X;
928	6		}
929	2		int pplus1_factor(UV n, UV *factors, UV B1)
930			{
931			UV X1, X2, f;
932	2		UV sqrtB1 = isqrt(B1);
933	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pplus1_factor");
		50
934
935	2		X1 = 7 % n;
936	2		X2 = 11 % n;
937	2		f = 1;
938	4	50	START_DO_FOR_EACH_PRIME(2, B1) {
		50
		100
		100
		0
		0
		0
		0
		0
		50
939	4		UV k = p;
940	4	50	if (p < sqrtB1) {
941	4		UV kmin = B1/p;
942	21	100	while (k <= kmin)
943	17		k *= p;
944			}
945	4		pp1_pow(&X1, k, n);
946	4	50	if (X1 != 2) {
947	4		f = gcd_ui( submod(X1, 2, n) , n);
948	4	100	if (f != 1 && f != n) break;
		50
949			}
950	2		pp1_pow(&X2, k, n);
951	2	50	if (X2 != 2) {
952	0		f = gcd_ui( submod(X2, 2, n) , n);
953	0	0	if (f != 1 && f != n) break;
		0
954			}
955	2		} END_DO_FOR_EACH_PRIME
956
957	2		return found_factor(n, f, factors);
958			}
959
960
961			/* SQUFOF, based on Ben Buhrow's racing version. */
962			#if 1
963			/* limit to 62-bit inputs, use 32-bit types, faster */
964			#define SQUFOF_TYPE uint32_t
965			#define SQUFOF_MAX (UV_MAX >> 2)
966			#else
967			/* All 64-bit inputs possible, though we severely limit multipliers */
968			#define SQUFOF_TYPE UV
969			#define SQUFOF_MAX UV_MAX
970			#endif
971			typedef struct
972			{
973			int valid;
974			SQUFOF_TYPE P;
975			SQUFOF_TYPE bn;
976			SQUFOF_TYPE Qn;
977			SQUFOF_TYPE Q0;
978			SQUFOF_TYPE b0;
979			SQUFOF_TYPE it;
980			SQUFOF_TYPE imax;
981			SQUFOF_TYPE mult;
982			} mult_t;
983
984			/* N < 2^63 (or 2^31). Returns 0 or a factor */
985	3		static UV squfof_unit(UV n, mult_t* mult_save)
986			{
987			SQUFOF_TYPE imax,i,Q0,Qn,bn,b0,P,bbn,Ro,S,So,t1,t2;
988
989	3		P = mult_save->P;
990	3		bn = mult_save->bn;
991	3		Qn = mult_save->Qn;
992	3		Q0 = mult_save->Q0;
993	3		b0 = mult_save->b0;
994	3		i = mult_save->it;
995	3		imax = i + mult_save->imax;
996
997			#define SQUARE_SEARCH_ITERATION \
998			t1 = P; \
999			P = bn*Qn - P; \
1000			t2 = Qn; \
1001			Qn = Q0 + bn*(t1-P); \
1002			Q0 = t2; \
1003			bn = (b0 + P) / Qn; \
1004			i++;
1005
1006			while (1) {
1007	3		int j = 0;
1008	3	50	if (i & 0x1) {
1009	0		SQUARE_SEARCH_ITERATION;
1010			}
1011			/* i is now even */
1012			while (1) {
1013			/* We need to know P, bn, Qn, Q0, iteration count, i from prev */
1014	4	50	if (i >= imax) {
1015			/* save state and try another multiplier. */
1016	0		mult_save->P = P;
1017	0		mult_save->bn = bn;
1018	0		mult_save->Qn = Qn;
1019	0		mult_save->Q0 = Q0;
1020	0		mult_save->it = i;
1021	0		return 0;
1022			}
1023
1024	4		SQUARE_SEARCH_ITERATION;
1025
1026			/* Even iteration. Check for square: Qn = SS /
1027	4		t2 = Qn & 127;
1028	4	100	if (!((t20x8bc40d7d) & (t20xa1e2f5d1) & 0x14020a)) {
1029	3		t1 = (uint32_t) sqrt(Qn);
1030	3	50	if (Qn == t1*t1)
1031	3		break;
1032			}
1033
1034			/* Odd iteration. */
1035	1		SQUARE_SEARCH_ITERATION;
1036	1		}
1037	3		S = t1; /* isqrt(Qn); */
1038	3		mult_save->it = i;
1039
1040			/* Reduce to G0 */
1041	3		Ro = P + S*((b0 - P)/S);
1042	3		So = (n - (UV)Ro*(UV)Ro)/(UV)S;
1043	3		bbn = (b0+Ro)/So;
1044
1045			/* Search for symmetry point */
1046			#define SYMMETRY_POINT_ITERATION \
1047			t1 = Ro; \
1048			Ro = bbn*So - Ro; \
1049			t2 = So; \
1050			So = S + bbn*(t1-Ro); \
1051			S = t2; \
1052			bbn = (b0+Ro)/So; \
1053			if (Ro == t1) break;
1054
1055	3		j = 0;
1056			while (1) {
1057	3	100	SYMMETRY_POINT_ITERATION;
1058	1	50	SYMMETRY_POINT_ITERATION;
1059	0	0	SYMMETRY_POINT_ITERATION;
1060	0	0	SYMMETRY_POINT_ITERATION;
1061	0	0	if (j++ > 2000000) {
1062	0		mult_save->valid = 0;
1063	0		return 0;
1064			}
1065	0		}
1066
1067	3		t1 = gcd_ui(Ro, n);
1068	3	50	if (t1 > 1)
1069	3		return t1;
1070	0		}
1071			}
1072
1073			/* Gower and Wagstaff 2008:
1074			* http://www.ams.org/journals/mcom/2008-77-261/S0025-5718-07-02010-8/
1075			* Section 5.3. I've added some with 13,17,19. Sorted by F(). */
1076			static const UV squfof_multipliers[] =
1077			/* { 35711, 357, 3511, 35, 3711, 37, 5711, 57,
1078			311, 3, 511, 5, 711, 7, 11, 1 }; /
1079			{ 35711, 357, 3571113, 35713, 3571117, 3511,
1080			35717, 35, 3571119, 351113,35719, 3571317,
1081			3513, 3711, 37, 5711, 3713, 57,
1082			3517, 5713, 3519, 311, 37*17, 3,
1083			31113, 511, 3719, 313, 5, 51113,
1084			5719, 513, 711, 7, 317, 713,
1085			11, 1 };
1086			#define NSQUFOF_MULT (sizeof(squfof_multipliers)/sizeof(squfof_multipliers[0]))
1087
1088	2		int squfof_factor(UV n, UV *factors, UV rounds)
1089			{
1090			mult_t mult_save[NSQUFOF_MULT];
1091	2		UV i, nn64, sqrtnn64, mult, f64,rounds_done = 0;
1092	2		int mults_racing = NSQUFOF_MULT;
1093
1094			/* Caller should have handled these trivial cases */
1095	2	50	MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in squfof_factor");
		50
1096
1097			/* Too big */
1098	2	50	if (n > SQUFOF_MAX) {
1099	0		factors[0] = n; return 1;
1100			}
1101
1102	78	100	for (i = 0; i < NSQUFOF_MULT; i++) {
1103	76		mult_save[i].valid = -1;
1104	76		mult_save[i].it = 0;
1105			}
1106
1107			/* Race each multiplier for a bit (20-20k rounds) */
1108	2	50	while (mults_racing > 0 && rounds_done < rounds) {
		50
1109	3	50	for (i = 0; i < NSQUFOF_MULT && rounds_done < rounds; i++) {
		50
1110	3	50	if (mult_save[i].valid == 0) continue;
1111	3		mult = squfof_multipliers[i];
1112	3		nn64 = n * mult;
1113	3	50	if (mult_save[i].valid == -1) {
1114	3	50	if ((SQUFOF_MAX / mult) < n) {
1115	0		mult_save[i].valid = 0; /* This multiplier would overflow 64-bit */
1116	0		mults_racing--;
1117	0		continue;
1118			}
1119	3		sqrtnn64 = isqrt(nn64);
1120	3		mult_save[i].valid = 1;
1121	3		mult_save[i].Q0 = 1;
1122	3		mult_save[i].b0 = sqrtnn64;
1123	3		mult_save[i].P = sqrtnn64;
1124	3		mult_save[i].Qn = (SQUFOF_TYPE)(nn64 - sqrtnn64 * sqrtnn64);
1125	3	50	if (mult_save[i].Qn == 0)
1126	0		return found_factor(n, sqrtnn64, factors);
1127	3		mult_save[i].bn = (2 * sqrtnn64) / (UV)mult_save[i].Qn;
1128	3		mult_save[i].it = 0;
1129	3		mult_save[i].mult = mult;
1130	3		mult_save[i].imax = (UV) (sqrt(sqrtnn64) / 16);
1131	3	50	if (mult_save[i].imax < 20) mult_save[i].imax = 20;
1132	3	50	if (mult_save[i].imax > rounds) mult_save[i].imax = rounds;
1133			}
1134	3	50	if (mults_racing == 1) /* Do all rounds if only one multiplier left */
1135	0		mult_save[i].imax = (rounds - rounds_done);
1136	3		f64 = squfof_unit(nn64, &mult_save[i]);
1137	3	50	if (f64 > 1) {
1138	3		UV f64red = f64 / gcd_ui(f64, mult);
1139	3	100	if (f64red > 1) {
1140			/* unsigned long totiter = 0;
1141			{int K; for (K = 0; K < NSQUFOF_MULT; K++) totiter += mult_save[K].it; }
1142			printf(" n %lu mult %lu it %lu (%lu)\n",n,mult,totiter,(UV)mult_save[i].it); */
1143	2		return found_factor(n, f64red, factors);
1144			}
1145			/* Found trivial factor. Quit working with this multiplier. */
1146	1		mult_save[i].valid = 0;
1147			}
1148	1	50	if (mult_save[i].valid == 0)
1149	1		mults_racing--;
1150	1		rounds_done += mult_save[i].imax; /* Assume we did all rounds */
1151			}
1152			}
1153
1154			/* No factors found */
1155	0		factors[0] = n;
1156	2		return 1;
1157			}
1158
1159			#define SQR_TAB_SIZE 512
1160			static int sqr_tab_init = 0;
1161			static double sqr_tab[SQR_TAB_SIZE];
1162	0		static void make_sqr_tab(void) {
1163			int i;
1164	0	0	for (i = 0; i < SQR_TAB_SIZE; i++)
1165	0		sqr_tab[i] = sqrt((double)i);
1166	0		sqr_tab_init = 1;
1167	0		}
1168
1169			/* Lehman written and tuned by Warren D. Smith.
1170			* Revised by Ben Buhrow and Dana Jacobsen. */
1171	0		int lehman_factor(UV n, UV *factors, int do_trial) {
1172	0	0	const double Tune = ((n >> 31) >> 5) ? 3.5 : 5.0;
1173			double x, sqrtn;
1174			UV a,c,kN,kN4,B2;
1175	0		uint32_t b,p,k,r,B,U,Bred,inc,ip=2;
1176
1177	0	0	if (!(n&1)) return found_factor(n, 2, factors);
1178
1179	0		B = Tune * (1+rootof(n,3));
1180
1181	0	0	if (do_trial) {
1182	0		uint32_t FirstCut = 0.1 * B;
1183	0	0	if (FirstCut < 84) FirstCut = 84;
1184	0	0	if (FirstCut > 65535) FirstCut = 65535;
1185	0	0	for (ip = 2; ip < NPRIMES_SMALL; ip++) {
1186	0		p = primes_small[ip];
1187	0	0	if (p >= FirstCut)
1188	0		break;
1189	0	0	if (n % p == 0)
1190	0		return found_factor(n, p, factors);
1191			}
1192			}
1193
1194	0	0	if (n >= UVCONST(8796393022207)) { factors[0] = n; return 1; }
1195	0		Bred = B / (Tune * Tune * Tune);
1196	0		B2 = B*B;
1197	0		kN = 0;
1198
1199	0	0	if (!sqr_tab_init) make_sqr_tab();
1200	0		sqrtn = sqrt(n);
1201
1202	0	0	for (k = 1; k <= Bred; k++) {
1203	0	0	if (k&1) { inc = 4; r = (k+n) % 4; }
1204	0		else { inc = 2; r = 1; }
1205	0		kN += n;
1206	0	0	if (kN >= UVCONST(1152921504606846976)) { factors[0] = n; return 1; }
1207	0		kN4 = kN*4;
1208
1209	0	0	x = (k < SQR_TAB_SIZE) ? sqrtn * sqr_tab[k] : sqrt((double)kN);
1210	0		a = x;
1211	0	0	if ((UV)a * (UV)a == kN)
1212	0		return found_factor(n, gcd_ui(a,n), factors);
1213	0		x *= 2;
1214	0		a = x + 0.9999999665; /* Magic constant */
1215	0		b = a % inc;
1216	0		b = a + (inc+r-b) % inc;
1217	0		c = (UV)b*(UV)b - kN4;
1218	0		U = x + B2/(2*x);
1219	0	0	for (a = b; a <= U; c += inc*(a+a+inc), a += inc) {
1220			/* Check for perfect square */
1221	0		b = c & 127;
1222	0	0	if (!((b0x8bc40d7d) & (b0xa1e2f5d1) & 0x14020a)) {
1223	0		b = (uint32_t) sqrt(c);
1224	0	0	if (c == b*b) {
1225	0		B2 = gcd_ui(a+b, n);
1226	0		return found_factor(n, B2, factors);
1227			}
1228			}
1229			}
1230			}
1231	0	0	if (do_trial) {
1232	0	0	if (B > 65535) B = 65535;
1233			/* trial divide from primes[ip] to B. We could:
1234			* 1) use table of 6542 shorts for the primes.
1235			* 2) use a wheel
1236			* 3) let trial_factor handle it
1237			*/
1238	0	0	if (ip >= NPRIMES_SMALL) ip = NPRIMES_SMALL-1;
1239	0		return trial_factor(n, factors, primes_small[ip], B);
1240			}
1241	0		factors[0] = n;
1242	0		return 1;
1243			}
1244
1245			static const uint32_t _fr_chunk = 8192;
1246			static const uint32_t _fr_sieve_crossover = 10000000; /* About 10^14 */
1247
1248	1		static void _vec_factor(UV lo, UV hi, UV nfactors, UV farray, UV noffset, int square_free)
1249			{
1250			UV *N, j, n, sqrthi, sievelim;
1251	1		sqrthi = isqrt(hi);
1252	1		n = hi-lo+1;
1253	1	50	New(0, N, hi-lo+1, UV);
1254	999	100	for (j = 0; j < n; j++) {
1255	998		N[j] = 1;
1256	998		nfactors[j] = 0;
1257			}
1258	1	50	sievelim = (sqrthi < _fr_sieve_crossover) ? sqrthi : icbrt(hi);
1259	12	50	START_DO_FOR_EACH_PRIME(2, sievelim) {
		100
		100
		100
		100
		50
		50
		50
		50
		100
1260			UV q, t, A;
1261	11	50	if (square_free == 0) {
1262	0		UV kmin = hi / p;
1263	0	0	for (q = p; q <= kmin; q *= p) {
1264	0		t = lo / q, A = t * q;
1265	0	0	if (A < lo) A += q;
1266	0	0	for (j = A-lo; j < n; j += q) {
1267	0		farray[ j*noffset + nfactors[j]++ ] = p;
1268	0		N[j] *= p;
1269			}
1270			}
1271			} else {
1272	11		q = pp, t = lo / q, A = t q;
1273	11	50	if (A < lo) A += q;
1274	451	100	for (j = A-lo; j < n; j += q) {
1275	440		N[j] = 0;
1276	440		nfactors[j] = 0;
1277			}
1278	11		q = p, t = lo / q, A = t * q;
1279	11	100	if (A < lo) A += q;
1280	1569	100	for (j = A-lo; j < n; j += q) {
1281	1558	100	if (N[j] > 0) {
1282	824		farray[ j*noffset + nfactors[j]++ ] = p;
1283	824		N[j] *= p;
1284			}
1285			}
1286			}
1287	11		} END_DO_FOR_EACH_PRIME
1288
1289	1	50	if (sievelim == sqrthi) {
1290			/* Handle the unsieved results, which are prime */
1291	999	100	for (j = 0; j < n; j++) {
1292	998	100	if (N[j] == 1)
1293	157		farray[ j*noffset + nfactors[j]++ ] = j+lo;
1294	841	100	else if (N[j] > 0 && N[j] != j+lo)
		100
1295	298		farray[ j*noffset + nfactors[j]++ ] = (j+lo) / N[j];
1296			}
1297			} else {
1298			/* Handle the unsieved results, which are prime or semi-prime */
1299	0	0	for (j = 0; j < n; j++) {
1300	0		UV rem = j+lo;
1301	0	0	if (N[j] > 0 && N[j] != rem) {
		0
1302	0	0	if (N[j] != 1)
1303	0		rem /= N[j];
1304	0	0	if (square_free && is_perfect_square(rem)) {
		0
1305	0		nfactors[j] = 0;
1306			} else {
1307	0		UV* f = farray + j*noffset + nfactors[j];
1308	0		nfactors[j] += factor_one(rem, f, 1, 0);
1309			}
1310			}
1311			}
1312			}
1313	1		Safefree(N);
1314	1		}
1315
1316	5		factor_range_context_t factor_range_init(UV lo, UV hi, int square_free) {
1317			factor_range_context_t ctx;
1318	5		ctx.lo = lo;
1319	5		ctx.hi = hi;
1320	5		ctx.n = lo-1;
1321	5		ctx.is_square_free = square_free ? 1 : 0;
1322	5	100	if (hi-lo+1 > 100) { /* Sieve in chunks */
1323	1	50	if (square_free) ctx._noffset = (hi <= 42949672965UL) ? 10 : 15;
		50
1324	0	0	else ctx._noffset = BITS_PER_WORD - clz(hi);
1325	1		ctx._coffset = _fr_chunk;
1326	1	50	New(0, ctx._nfactors, _fr_chunk, UV);
1327	1	50	New(0, ctx._farray, _fr_chunk * ctx._noffset, UV);
1328			{ /* Prealloc all the sieving primes now. */
1329	1		UV t = isqrt(hi);
1330	1	50	if (t >= _fr_sieve_crossover) t = icbrt(hi);
1331	1		get_prime_cache(t, 0);
1332			}
1333			} else { /* factor each number */
1334	4	100	New(0, ctx.factors, square_free ? 15 : 63, UV);
1335	4		ctx._nfactors = 0;
1336	4		ctx._farray = ctx.factors;
1337	4		ctx._noffset = 0;
1338			}
1339	5		return ctx;
1340			}
1341
1342	1207		int factor_range_next(factor_range_context_t *ctx) {
1343			int j, nfactors;
1344			UV n;
1345	1207	50	if (ctx->n >= ctx->hi)
1346	0		return -1;
1347	1207		n = ++(ctx->n);
1348	1207	100	if (ctx->_nfactors) {
1349	998	100	if (ctx->_coffset >= _fr_chunk) {
1350	1		UV clo = n;
1351	1		UV chi = n + _fr_chunk - 1;
1352	1	50	if (chi > ctx->hi) chi = ctx->hi;
1353	1		_vec_factor(clo, chi, ctx->_nfactors, ctx->_farray, ctx->_noffset, ctx->is_square_free);
1354	1		ctx->_coffset = 0;
1355			}
1356	998		nfactors = ctx->_nfactors[ctx->_coffset];
1357	998		ctx->factors = ctx->_farray + ctx->_coffset * ctx->_noffset;
1358	998		ctx->_coffset++;
1359			} else {
1360	209	100	if (ctx->is_square_free && n >= 49 && (!(n% 4) \|\| !(n% 9) \|\| !(n%25) \|\| !(n%49)))
		100
		100
		100
		100
		100
1361	22		return 0;
1362	187		nfactors = factor(n, ctx->factors);
1363	187	100	if (ctx->is_square_free) {
1364	118	100	for (j = 1; j < nfactors; j++)
1365	58	100	if (ctx->factors[j] == ctx->factors[j-1])
1366	17		break;
1367	77	100	if (j < nfactors) return 0;
1368			}
1369			}
1370	1168		return nfactors;
1371			}
1372
1373	0		void factor_range_destroy(factor_range_context_t *ctx) {
1374	0	0	if (ctx->_farray != 0) Safefree(ctx->_farray);
1375	0	0	if (ctx->_nfactors != 0) Safefree(ctx->_nfactors);
1376	0		ctx->_farray = ctx->_nfactors = ctx->factors = 0;
1377	0		}
1378
1379
1380			/******************************************************************************/
1381			/* DLP */
1382			/******************************************************************************/
1383
1384	23		static UV dlp_trial(UV a, UV g, UV p, UV maxrounds) {
1385			UV k, t;
1386	23	50	if (maxrounds > p) maxrounds = p;
1387
1388			#if USE_MONTMATH
1389	23	100	if (p&1) {
1390	18		const uint64_t npi = mont_inverse(p), mont1 = mont_get1(p);
1391	18		g = mont_geta(g, p);
1392	18		a = mont_geta(a, p);
1393	13930	50	for (t = g, k = 1; k < maxrounds; k++) {
1394	13930	100	if (t == a)
1395	18		return k;
1396	13912	50	t = mont_mulmod(t, g, p);
1397	13912	50	if (t == g) break; /* Stop at cycle */
1398			}
1399			} else
1400			#endif
1401			{
1402	9	50	for (t = g, k = 1; k < maxrounds; k++) {
1403	9	100	if (t == a)
1404	4		return k;
1405	5		t = mulmod(t, g, p);
1406	5	100	if (t == g) break; /* Stop at cycle */
1407			}
1408			}
1409	1		return 0;
1410			}
1411
1412			/******************************************************************************/
1413			/* DLP - Pollard Rho */
1414			/******************************************************************************/
1415
1416			/* Compare with Pomerance paper (dartmouth dtalk4):
1417			* Type I/II/III = our case 1, 0, 2.
1418			* x_i = u, a_i = v, b_i = w
1419			*
1420			* Also see Bai/Brent 2008 for many ideas to speed this up.
1421			* https://maths-people.anu.edu.au/~brent/pd/rpb231.pdf
1422			* E.g. Teske adding-walk, Brent's cycle algo, Teske modified cycle
1423			*/
1424			#define pollard_rho_cycle(u,v,w,p,n,a,g) \
1425			switch (u % 3) { \
1426			case 0: u = mulmod(u,u,p); v = mulmod(v,2,n); w = mulmod(w,2,n); break;\
1427			case 1: u = mulmod(u,a,p); v = addmod(v,1,n); break;\
1428			case 2: u = mulmod(u,g,p); w = addmod(w,1,n); break;\
1429			}
1430
1431			typedef struct prho_state_t {
1432			UV u;
1433			UV v;
1434			UV w;
1435			UV U;
1436			UV V;
1437			UV W;
1438			UV round;
1439			int failed;
1440			int verbose;
1441			} prho_state_t;
1442
1443	4		static UV dlp_prho_uvw(UV a, UV g, UV p, UV n, UV rounds, prho_state_t *s) {
1444	4		UV i, k = 0;
1445	4		UV u=s->u, v=s->v, w=s->w;
1446	4		UV U=s->U, V=s->V, W=s->W;
1447	4		int const verbose = s->verbose;
1448
1449	4	50	if (s->failed) return 0;
1450	4	50	if (s->round + rounds > n) rounds = n - s->round;
1451
1452	26787	100	for (i = 1; i <= rounds; i++) {
1453	26785		pollard_rho_cycle(u,v,w,p,n,a,g); /* xi, ai, bi */
1454	26785		pollard_rho_cycle(U,V,W,p,n,a,g);
1455	26785		pollard_rho_cycle(U,V,W,p,n,a,g); /* x2i, a2i, b2i */
1456	26785	50	if (verbose > 3) printf( "%3"UVuf" %4"UVuf" %3"UVuf" %3"UVuf" %4"UVuf" %3"UVuf" %3"UVuf"\n", i, u, v, w, U, V, W );
1457	26785	100	if (u == U) {
1458			UV r1, r2, G, G2;
1459	2		r1 = submod(v, V, n);
1460	2	50	if (r1 == 0) {
1461	0	0	if (verbose) printf("DLP Rho failure, r=0\n");
1462	0		s->failed = 1;
1463	0		k = 0;
1464	0		break;
1465			}
1466	2		r2 = submod(W, w, n);
1467
1468	2		G = gcd_ui(r1,n);
1469	2		G2 = gcd_ui(G,r2);
1470	2		k = divmod(r2/G2, r1/G2, n/G2);
1471	2	50	if (G > 1) {
1472	0	0	if (powmod(g,k,p) == a) {
1473	0	0	if (verbose > 2) printf(" common GCD %"UVuf"\n", G2);
1474			} else {
1475	0		UV m, l = divmod(r2, r1, n/G);
1476	0	0	for (m = 0; m < G; m++) {
1477	0		k = addmod(l, mulmod(m,(n/G),n), n);
1478	0	0	if (powmod(g,k,p) == a) break;
1479			}
1480	0	0	if (m 2) printf(" GCD %"UVuf", found with m=%"UVuf"\n", G, m);
		0
1481			}
1482			}
1483
1484	2	50	if (powmod(g,k,p) != a) {
1485	0	0	if (verbose > 2) printf("r1 = %"UVuf" r2 = %"UVuf" k = %"UVuf"\n", r1, r2, k);
1486	0	0	if (verbose) printf("Incorrect DLP Rho solution: %"UVuf"\n", k);
1487	0		s->failed = 1;
1488	0		k = 0;
1489			}
1490	2		break;
1491			}
1492			}
1493	4		s->round += i-1;
1494	4	50	if (verbose && k) printf("DLP Rho solution found after %"UVuf" steps\n", s->round + 1);
		0
1495	4		s->u = u; s->v = v; s->w = w; s->U = U; s->V = V; s->W = W;
1496	4		return k;
1497			}
1498
1499			#if 0
1500			static UV dlp_prho(UV a, UV g, UV p, UV n, UV maxrounds) {
1501			#ifdef DEBUG
1502			int const verbose = _XS_get_verbose()
1503			#else
1504			int const verbose = 0;
1505			#endif
1506			prho_state_t s = {1, 0, 0, 1, 0, 0, 0, 0, verbose};
1507			return dlp_prho_uvw(a, g, p, n, maxrounds, &s);
1508			}
1509			#endif
1510
1511
1512			/******************************************************************************/
1513			/* DLP - BSGS */
1514			/******************************************************************************/
1515
1516			typedef struct bsgs_hash_t {
1517			UV M; /* The baby step index */
1518			UV V; /* The powmod value */
1519			struct bsgs_hash_t* next;
1520			} bsgs_hash_t;
1521
1522			/****************************************/
1523			/* Simple and limited pool allocation */
1524			#define BSGS_ENTRIES_PER_PAGE 8000
1525			typedef struct bsgs_page_top_t {
1526			struct bsgs_page_t* first;
1527			bsgs_hash_t** table;
1528			UV size;
1529			int nused;
1530			int npages;
1531			} bsgs_page_top_t;
1532
1533			typedef struct bsgs_page_t {
1534			bsgs_hash_t entries[BSGS_ENTRIES_PER_PAGE];
1535			struct bsgs_page_t* next;
1536			} bsgs_page_t;
1537
1538	4139		static bsgs_hash_t* get_entry(bsgs_page_top_t* top) {
1539	4139	100	if (top->nused == 0 \|\| top->nused >= BSGS_ENTRIES_PER_PAGE) {
		50
1540			bsgs_page_t* newpage;
1541	2		Newz(0, newpage, 1, bsgs_page_t);
1542	2		newpage->next = top->first;
1543	2		top->first = newpage;
1544	2		top->nused = 0;
1545	2		top->npages++;
1546			}
1547	4139		return top->first->entries + top->nused++;
1548			}
1549	2		static void destroy_pages(bsgs_page_top_t* top) {
1550	2		bsgs_page_t* head = top->first;
1551	4	100	while (head != 0) {
1552	2		bsgs_page_t* next = head->next;
1553	2		Safefree(head);
1554	2		head = next;
1555			}
1556	2		top->first = 0;
1557	2		}
1558			/****************************************/
1559
1560	2		static void bsgs_hash_put(bsgs_page_top_t* pagetop, UV v, UV i) {
1561	2		UV idx = v % pagetop->size;
1562	2		bsgs_hash_t** table = pagetop->table;
1563	2		bsgs_hash_t* entry = table[idx];
1564
1565	2	50	while (entry && entry->V != v)
		0
1566	0		entry = entry->next;
1567
1568	2	50	if (!entry) {
1569	2		entry = get_entry(pagetop);
1570	2		entry->M = i;
1571	2		entry->V = v;
1572	2		entry->next = table[idx];
1573	2		table[idx] = entry;
1574			}
1575	2		}
1576
1577	0		static UV bsgs_hash_get(bsgs_page_top_t* pagetop, UV v) {
1578	0		bsgs_hash_t* entry = pagetop->table[v % pagetop->size];
1579	0	0	while (entry && entry->V != v)
		0
1580	0		entry = entry->next;
1581	0	0	return (entry) ? entry->M : 0;
1582			}
1583
1584	4137		static UV bsgs_hash_put_get(bsgs_page_top_t* pagetop, UV v, UV i) {
1585	4137		UV idx = v % pagetop->size;
1586	4137		bsgs_hash_t** table = pagetop->table;
1587	4137		bsgs_hash_t* entry = table[idx];
1588
1589	4260	100	while (entry && entry->V != v)
		50
1590	123		entry = entry->next;
1591
1592	4137	50	if (entry)
1593	0		return entry->M;
1594
1595	4137		entry = get_entry(pagetop);
1596	4137		entry->M = i;
1597	4137		entry->V = v;
1598	4137		entry->next = table[idx];
1599	4137		table[idx] = entry;
1600	4137		return 0;
1601			}
1602
1603	3		static UV dlp_bsgs(UV a, UV g, UV p, UV n, UV maxent, int race_rho) {
1604			bsgs_page_top_t PAGES;
1605			UV i, m, maxm, hashmap_count;
1606			UV aa, S, gm, T, gs_i, bs_i;
1607	3		UV result = 0;
1608			#ifdef DEBUG
1609			int const verbose = _XS_get_verbose();
1610			#else
1611	3		int const verbose = 0;
1612			#endif
1613	3		prho_state_t rho_state = {1, 0, 0, 1, 0, 0, 0, 0, verbose};
1614
1615	3	50	if (n <= 2) return 0; /* Shouldn't be here with gorder this low */
1616
1617	3	50	if (race_rho) {
1618	3		result = dlp_prho_uvw(a, g, p, n, 10000, &rho_state);
1619	3	100	if (result) {
1620	1	50	if (verbose) printf("rho found solution in BSGS step 0\n");
1621	1		return result;
1622			}
1623			}
1624
1625	2	50	if (a == 0) return 0; /* We don't handle this case */
1626
1627	2		maxm = isqrt(n);
1628	2		m = (maxent > maxm) ? maxm : maxent;
1629
1630	2	50	hashmap_count = (m < 65537) ? 65537 :
1631	0	0	(m > 40000000) ? 40000003 :
1632			next_prime(m); /* Ave depth around 2 */
1633
1634			/* Create table. Size: 8hashmap_count bytes. /
1635	2		PAGES.size = hashmap_count;
1636	2		PAGES.first = 0;
1637	2		PAGES.nused = 0;
1638	2		PAGES.npages = 0;
1639	2	50	Newz(0, PAGES.table, hashmap_count, bsgs_hash_t*);
1640
1641	2		aa = mulmod(a,a,p);
1642	2		S = a;
1643	2		gm = powmod(g, m, p);
1644	2		T = gm;
1645	2		gs_i = 0;
1646	2		bs_i = 0;
1647
1648	2		bsgs_hash_put(&PAGES, S, 0); /* First baby step */
1649	2		S = mulmod(S, g, p);
1650			/* Interleaved Baby Step Giant Step */
1651	2069	50	for (i = 1; i <= m; i++) {
1652	2069		gs_i = bsgs_hash_put_get(&PAGES, S, i);
1653	2069	50	if (gs_i) { bs_i = i; break; }
1654	2069		S = mulmod(S, g, p);
1655	2069	100	if (S == aa) { /* We discovered the solution! */
1656	1	50	if (verbose) printf(" dlp bsgs: solution at BS step %"UVuf"\n", i+1);
1657	1		result = i+1;
1658	1		break;
1659			}
1660	2068		bs_i = bsgs_hash_put_get(&PAGES, T, i);
1661	2068	50	if (bs_i) { gs_i = i; break; }
1662	2068		T = mulmod(T, gm, p);
1663	2068	50	if (race_rho && (i % 2048) == 0) {
		100
1664	1		result = dlp_prho_uvw(a, g, p, n, 100000, &rho_state);
1665	1	50	if (result) {
1666	1	50	if (verbose) printf("rho found solution in BSGS step %"UVuf"\n", i);
1667	1		break;
1668			}
1669			}
1670			}
1671
1672	2	50	if (!result) {
1673			/* Extend Giant Step search */
1674	0	0	if (!(gs_i \|\| bs_i)) {
		0
1675	0		UV b = (p+m-1)/m;
1676	0	0	if (m < maxm && b > 8m) b = 8m;
		0
1677	0	0	for (i = m+1; i < b; i++) {
1678	0		bs_i = bsgs_hash_get(&PAGES, T);
1679	0	0	if (bs_i) { gs_i = i; break; }
1680	0		T = mulmod(T, gm, p);
1681	0	0	if (race_rho && (i % 2048) == 0) {
		0
1682	0		result = dlp_prho_uvw(a, g, p, n, 100000, &rho_state);
1683	0	0	if (result) {
1684	0	0	if (verbose) printf("rho found solution in BSGS step %"UVuf"\n", i);
1685	0		break;
1686			}
1687			}
1688			}
1689			}
1690
1691	0	0	if (gs_i \|\| bs_i) {
		0
1692	0		result = submod(mulmod(gs_i, m, p), bs_i, p);
1693			}
1694			}
1695	2	50	if (verbose) printf(" dlp bsgs using %d pages (%.1fMB+%.1fMB) for hash\n", PAGES.npages, ((double)PAGES.npages * sizeof(bsgs_page_t)) / (10241024), ((double)hashmap_count sizeof(bsgs_hash_t)) / (10241024));
1696
1697	2		destroy_pages(&PAGES);
1698	2		Safefree(PAGES.table);
1699	2	50	if (result != 0 && powmod(g,result,p) != a) {
		50
1700	0	0	if (verbose) printf("Incorrect DLP BSGS solution: %"UVuf"\n", result);
1701	0		result = 0;
1702			}
1703	2	50	if (race_rho && result == 0) {
		50
1704	0		result = dlp_prho_uvw(a, g, p, n, 2000000000U, &rho_state);
1705			}
1706	3		return result;
1707			}
1708
1709			/* Find smallest k where a = g^k mod p */
1710			#define DLP_TRIAL_NUM 10000
1711	16		static UV znlog_solve(UV a, UV g, UV p, UV n) {
1712			UV k, sqrtn;
1713	16		const int verbose = _XS_get_verbose();
1714
1715	16	50	if (a >= p) a %= p;
1716	16	50	if (g >= p) g %= p;
1717
1718	16	100	if (a == 1 \|\| g == 0 \|\| p <= 2)
		50
		50
1719	1		return 0;
1720
1721	15	50	if (verbose > 1 && n != p-1) printf(" g=%"UVuf" p=%"UVuf", order %"UVuf"\n", g, p, n);
		0
1722
1723			/* printf(" solving znlog(%"UVuf",%"UVuf",%"UVuf") n=%"UVuf"\n", a, g, p, n); */
1724
1725	15	50	if (n == 0 \|\| n <= DLP_TRIAL_NUM) {
		100
1726	12		k = dlp_trial(a, g, p, DLP_TRIAL_NUM);
1727	12	50	if (verbose) printf(" dlp trial 10k %s\n", (k!=0 \|\| p <= DLP_TRIAL_NUM) ? "success" : "failure");
		0
		0
1728	12	50	if (k != 0 \|\| (n > 0 && n <= DLP_TRIAL_NUM)) return k;
		0
		0
1729			}
1730
1731			{ /* Existence checks */
1732	3		UV aorder, gorder = n;
1733	3	50	if (gorder != 0 && powmod(a, gorder, p) != 1) return 0;
		50
1734	3		aorder = znorder(a,p);
1735	3	50	if (aorder == 0 && gorder != 0) return 0;
		0
1736	3	50	if (aorder != 0 && gorder % aorder != 0) return 0;
		50
1737			}
1738
1739	3	50	sqrtn = (n == 0) ? 0 : isqrt(n);
1740	3	50	if (n == 0) n = p-1;
1741
1742			{
1743	3	50	UV maxent = (sqrtn > 0) ? sqrtn+1 : 100000;
1744	3		k = dlp_bsgs(a, g, p, n, maxent/2, /* race rho */ 1);
1745	3	50	if (verbose) printf(" dlp bsgs %"UVuf"k %s\n", maxent/1000, k!=0 ? "success" : "failure");
		0
1746	3	50	if (k != 0) return k;
1747	0	0	if (sqrtn > 0 && sqrtn < maxent) return 0;
		0
1748			}
1749
1750	0	0	if (verbose) printf(" dlp doing exhaustive trial\n");
1751	0		k = dlp_trial(a, g, p, p);
1752	0		return k;
1753			}
1754
1755			/* Silver-Pohlig-Hellman */
1756	5		static UV znlog_ph(UV a, UV g, UV p, UV p1) {
1757			UV fac[MPU_MAX_FACTORS+1];
1758			UV exp[MPU_MAX_FACTORS+1];
1759			int i, nfactors;
1760			UV x, j;
1761
1762	5	50	if (p1 == 0) return 0; /* TODO: Should we plow on with p1=p-1? */
1763	5		nfactors = factor_exp(p1, fac, exp);
1764	5	50	if (nfactors == 1)
1765	0		return znlog_solve(a, g, p, p1);
1766	21	100	for (i = 0; i < nfactors; i++) {
1767			UV pi, delta, gamma;
1768	17	100	pi = fac[i]; for (j = 1; j < exp[i]; j++) pi *= fac[i];
1769	16		delta = powmod(a,p1/pi,p);
1770	16		gamma = powmod(g,p1/pi,p);
1771			/* printf(" solving znlog(%"UVuf",%"UVuf",%"UVuf")\n", delta, gamma, p); */
1772	16		fac[i] = znlog_solve( delta, gamma, p, znorder(gamma,p) );
1773	16		exp[i] = pi;
1774			}
1775	5		x = chinese(fac, exp, nfactors, &i);
1776	5	50	if (i == 1 && powmod(g, x, p) == a)
		50
1777	5		return x;
1778	5		return 0;
1779			}
1780
1781			/* Find smallest k where a = g^k mod p */
1782	20		UV znlog(UV a, UV g, UV p) {
1783			UV k, gorder, aorder;
1784	20		const int verbose = _XS_get_verbose();
1785
1786	20	50	if (a >= p) a %= p;
1787	20	50	if (g >= p) g %= p;
1788
1789	20	100	if (a == 1 \|\| g == 0 \|\| p <= 2)
		50
		50
1790	2		return 0;
1791
1792			/* TODO: We call znorder with the same p many times. We should have a
1793			* method for znorder given {phi,nfactors,fac,exp} */
1794
1795	18		gorder = znorder(g,p);
1796	18	100	if (gorder != 0 && powmod(a, gorder, p) != 1) return 0;
		100
1797	16		aorder = znorder(a,p);
1798	16	100	if (aorder == 0 && gorder != 0) return 0;
		50
1799	16	100	if (aorder != 0 && gorder % aorder != 0) return 0;
		50
1800
1801			/* TODO: Come up with a better solution for a=0 */
1802	16	100	if (a == 0 \|\| p < DLP_TRIAL_NUM \|\| (gorder > 0 && gorder < DLP_TRIAL_NUM)) {
		100
		50
		50
1803	11	50	if (verbose > 1) printf(" dlp trial znlog(%"UVuf",%"UVuf",%"UVuf")\n",a,g,p);
1804	11		k = dlp_trial(a, g, p, p);
1805	11		return k;
1806			}
1807
1808	5	50	if (!is_prob_prime(gorder)) {
1809	5		k = znlog_ph(a, g, p, gorder);
1810	5	50	if (verbose) printf(" dlp PH %s\n", k!=0 ? "success" : "failure");
		0
1811	5	50	if (k != 0) return k;
1812			}
1813
1814	0		return znlog_solve(a, g, p, gorder);
1815			}
1816
1817
1818			/* Compile with:
1819			* gcc -O3 -fomit-frame-pointer -march=native -Wall -DSTANDALONE -DFACTOR_STANDALONE factor.c util.c primality.c cache.c sieve.c chacha.c csprng.c prime_nth_count.c lmo.c -lm
1820			*/
1821			#ifdef FACTOR_STANDALONE
1822			#include
1823			int main(int argc, char *argv[])
1824			{
1825			UV n;
1826			UV factors[MPU_MAX_FACTORS+1];
1827			int nfactors, i, a;
1828
1829			if (argc <= 1) {
1830			char line[1024];
1831			while (1) {
1832			if (!fgets(line,sizeof(line),stdin)) break;
1833			n = strtoull(line, 0, 10);
1834			nfactors = factor(n, factors);
1835			if (nfactors == 1) {
1836			printf("%"UVuf": %"UVuf"\n",n,n);
1837			} else if (nfactors == 2) {
1838			printf("%"UVuf": %"UVuf" %"UVuf"\n",n,factors[0],factors[1]);
1839			} else if (nfactors == 3) {
1840			printf("%"UVuf": %"UVuf" %"UVuf" %"UVuf"\n",n,factors[0],factors[1],factors[2]);
1841			} else {
1842			printf("%"UVuf": %"UVuf" %"UVuf" %"UVuf" %"UVuf"",n,factors[0],factors[1],factors[2],factors[3]);
1843			for (i = 4; i < nfactors; i++) printf(" %"UVuf"", factors[i]);
1844			printf("\n");
1845			}
1846			}
1847			exit(0);
1848			}
1849
1850			for (a = 1; a < argc; a++) {
1851			n = strtoul(argv[a], 0, 10);
1852			if (n == ULONG_MAX && errno == ERANGE) { printf("Argument larger than ULONG_MAX\n"); return(-1); }
1853			nfactors = factor(n, factors);
1854			printf("%"UVuf":", n);
1855			for (i = 0; i < nfactors; i++)
1856			printf(" %"UVuf"", factors[i]);
1857			printf("\n");
1858			}
1859
1860			return(0);
1861			}
1862			#endif