File Coverage

lehmer.c

Criterion	Covered	Total	%
statement	0	4	0.0
branch	0	8	0.0
condition			n/a
subroutine			n/a
pod			n/a
total	0	12	0.0

line	stmt	bran	code
1			#if defined(LEHMER) \|\| defined(PRIMESIEVE_STANDALONE)
2
3			#include
4			#include
5			#include
6			#include
7
8			/*****************************************************************************
9			*
10			* Lehmer prime counting utility. Calculates pi(x), count of primes <= x.
11			*
12			* Copyright (c) 2012-2013 Dana Jacobsen (dana@acm.org).
13			* This is free software; you can redistribute it and/or modify it under
14			* the same terms as the Perl 5 programming language system itself.
15			*
16			* This file is part of the Math::Prime::Util Perl module, but also can be
17			* compiled as a standalone UNIX program using primesieve 5.x.
18			*
19			* g++ -O3 -DPRIMESIEVE_STANDALONE lehmer.c -o prime_count -lprimesieve
20			*
21			* The phi(x,a) calculation is unique, to the best of my knowledge. It uses
22			* two lists of all x values + signed counts for the given 'a' value, and walks
23			* 'a' down until it is small enough to calculate directly using a table.
24			* This is relatively fast and low memory compared to many other solutions.
25			* As with all Lehmer-Meissel-Legendre algorithms, memory use will be a
26			* constraint with large values of x.
27			*
28			* Math::Prime::Util now includes an extended LMO implementation, which will
29			* be quite a bit faster and much less memory than this code. It is the
30			* default method for large counts. Timing comparisons are in that file.
31			*
32			* Times and memory use for prime_count(10^15) on a Haswell 4770K, asterisk
33			* indicates parallel operation. The standalone versions of my code use
34			* Kim Walisch's excellent primesieve, which is faster than my sieve.
35			* His Lehmer/Meissel/Legendre seem a bit slower in serial, but
36			* parallelize much better.
37			*
38			* 4.74s 1.3MB LMO
39			* 24.53s* 137.9MB Lehmer Walisch primecount v0.9, 8 threads
40			* 38.74s* 150.3MB LMOS Walisch primecount v0.9, 8 threads
41			* 42.52s* 159.4MB Lehmer standalone, 8 threads
42			* 42.82s* 137.9MB Meissel Walisch primecount v0.9, 8 threads
43			* 51.88s 153.9MB LMOS standalone, 1 thread
44			* 52.01s* 145.5MB Legendre Walisch primecount v0.9, 8 threads
45			* 64.96s 160.3MB Lehmer standalone, 1 thread
46			* 67.16s 67.0MB LMOS
47			* 80.42s 286.6MB Meissel
48			* 99.70s 159.6MB Lehmer
49			* 107.43s 28.5MB Lehmer Walisch primecount v0.9, 1 thread
50			* 174.51s 83.5MB Legendre
51			* 185.11s 25.6MB LMOS Walisch primecount v0.9, 1 thread
52			* 191.19s 24.8MB Meissel Walisch primecount v0.9, 1 thread
53			* 868.96s 1668.1MB Lehmer pix4 by T.R. Nicely
54			*
55			* Reference: Hans Riesel, "Prime Numbers and Computer Methods for
56			* Factorization", 2nd edition, 1994.
57			*/
58
59			/* Below this size, just sieve (with table speedup). */
60			#define SIEVE_LIMIT 60000000
61			#define MAX_PHI_MEM (89610241024)
62
63			static int const verbose = 0;
64			#define STAGE_TIMING 0
65
66			#if STAGE_TIMING
67			#include
68			#define DECLARE_TIMING_VARIABLES struct timeval t0, t1;
69			#define TIMING_START gettimeofday(&t0, 0);
70			#define TIMING_END_PRINT(text) \
71			{ unsigned long long t; \
72			gettimeofday(&t1, 0); \
73			t = (t1.tv_sec-t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec); \
74			printf("%s: %10.5f\n", text, ((double)t) / 1000000); }
75			#else
76			#define DECLARE_TIMING_VARIABLES
77			#define TIMING_START
78			#define TIMING_END_PRINT(text)
79			#endif
80
81
82			#ifdef PRIMESIEVE_STANDALONE
83
84			/* countPrimes can be pretty slow for small ranges, so sieve more small primes
85			* and count using binary search. Uses a lot of memory though. For big
86			* ranges, countPrimes is really fast. If you use primesieve 4.2, the
87			* crossover point is lower (better). */
88			#define SIEVE_MULT 10
89
90			/* Translations from Perl + Math::Prime::Util to C/C++ + primesieve */
91			typedef unsigned long UV;
92			typedef signed long IV;
93			#define UV_MAX ULONG_MAX
94			#define UVCONST(x) ((unsigned long)x##UL)
95			#define New(id, mem, size, type) mem = (type) malloc((size)sizeof(type))
96			#define Newz(id, mem, size, type) mem = (type*) calloc(size, sizeof(type))
97			#define Renew(mem, size, type) mem = (type) realloc(mem,(size)sizeof(type))
98			#define Safefree(mem) free((void*)mem)
99			#define croak(fmt,...) { printf(fmt,##__VA_ARGS__); exit(1); }
100			#define prime_precalc(n) /* */
101			#define BITS_PER_WORD ((ULONG_MAX <= 4294967295UL) ? 32 : 64)
102
103			static UV isqrt(UV n) {
104			UV root;
105			if (sizeof(UV)==8 && n >= UVCONST(18446744065119617025)) return 4294967295UL;
106			if (sizeof(UV)==4 && n >= 4294836225UL) return 65535UL;
107			root = (UV) sqrt((double)n);
108			while (root*root > n) root--;
109			while ((root+1)*(root+1) <= n) root++;
110			return root;
111			}
112			static UV icbrt(UV n) {
113			UV b, root = 0;
114			int s;
115			if (sizeof(UV) == 8) {
116			s = 63; if (n >= UVCONST(18446724184312856125)) return 2642245UL;
117			} else {
118			s = 30; if (n >= 4291015625UL) return 1625UL;
119			}
120			for ( ; s >= 0; s -= 3) {
121			root += root;
122			b = 3root(root+1)+1;
123			if ((n >> s) >= b) {
124			n -= b << s;
125			root++;
126			}
127			}
128			return root;
129			}
130
131			/* Use version 5.x of PrimeSieve */
132			#include
133			#include
134			#include
135			#include
136			#ifdef _OPENMP
137			#include
138			#endif
139
140			#define segment_prime_count(a, b) primesieve::parallel_count_primes(a, b)
141
142			/* Generate an array of n small primes, where the kth prime is element p[k].
143			* Remember to free when done. */
144			#define TINY_PRIME_SIZE 20000
145			static uint32_t* tiny_primes = 0;
146			static uint32_t* generate_small_primes(UV n)
147			{
148			uint32_t* primes;
149			New(0, primes, n+1, uint32_t);
150			if (n < TINY_PRIME_SIZE) {
151			if (tiny_primes == 0)
152			tiny_primes = generate_small_primes(TINY_PRIME_SIZE+1);
153			memcpy(primes, tiny_primes, (n+1) * sizeof(uint32_t));
154			return primes;
155			}
156			primes[0] = 0;
157			{
158			std::vector v;
159			primesieve::generate_n_primes(n, &v);
160			memcpy(primes+1, &v[0], n * sizeof(uint32_t));
161			}
162			return primes;
163			}
164
165			#else
166
167			/* We will use pre-sieving to speed up counting for small ranges */
168			#define SIEVE_MULT 1
169
170			#define FUNC_isqrt 1
171			#define FUNC_icbrt 1
172			#include "lehmer.h"
173			#include "util.h"
174			#include "cache.h"
175			#include "sieve.h"
176
177			/* Generate an array of n small primes, where the kth prime is element p[k].
178			* Remember to free when done. */
179			static uint32_t* generate_small_primes(UV n)
180			{
181			uint32_t* primes;
182			UV i = 0;
183			double fn = (double)n;
184			double flogn = log(fn);
185			double flog2n = log(flogn);
186			UV nth_prime = /* Dusart 2010 for > 179k, custom for 18-179k */
187			(n >= 688383) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-2.00)/flogn))) :
188			(n >= 178974) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-1.95)/flogn))) :
189			(n >= 18) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n+0.30)/flogn)))
190			: 59;
191
192			if (n > 203280221)
193			croak("generate small primes with argument too large: %lu\n", (unsigned long)n);
194			New(0, primes, n+1, uint32_t);
195			primes[0] = 0;
196			START_DO_FOR_EACH_PRIME(2, nth_prime) {
197			if (i >= n) break;
198			primes[++i] = p;
199			} END_DO_FOR_EACH_PRIME
200			if (i < n)
201			croak("Did not generate enough small primes.\n");
202			if (verbose > 1) printf("generated %lu small primes, from 2 to %lu\n", i, (unsigned long)primes[i]);
203			return primes;
204			}
205			#endif
206
207
208			/* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime.
209			* This is actually quite fast, and definitely faster than sieving. By using
210			* this we can avoid caching prime counts and also skip most calls to the
211			* segment siever.
212			*/
213			static UV bs_prime_count(uint32_t n, uint32_t const* const primes, uint32_t lastidx)
214			{
215			UV i, j;
216			if (n <= 2) return (n == 2);
217			/* If n is out of range, we could:
218			* 1. return segment_prime_count(2, n);
219			* 2. if (n == primes[lastidx]) return lastidx else croak("bspc range");
220			* 3. if (n >= primes[lastidx]) return lastidx;
221			*/
222			if (n >= primes[lastidx]) return lastidx;
223			j = lastidx;
224			if (n < 8480) {
225			i = 1 + (n>>4);
226			if (j > 1060) j = 1060;
227			} else if (n < 25875000) {
228			i = 793 + (n>>5);
229			if (j > (n>>3)) j = n>>3;
230			} else {
231			i = 1617183;
232			if (j > (n>>4)) j = n>>4;
233			}
234			while (i < j) {
235			UV mid = i + (j-i)/2;
236			if (primes[mid] <= n) i = mid+1;
237			else j = mid;
238			}
239			/* if (i-1 != segment_prime_count(2, n)) croak("wrong count for %lu: %lu vs. %lu\n", n, i-1, segment_prime_count(2, n)); */
240			return i-1;
241			}
242
243			#define FAST_DIV(x,y) \
244			( ((x) <= 4294967295U) ? (uint32_t)(x)/(uint32_t)(y) : (x)/(y) )
245
246			/* static uint32_t sprime[] = {0,2, 3, 5, 7, 11, 13, 17, 19, 23}; */
247			/* static uint32_t sprimorial[] = {1,2,6,30,210,2310,30030,510510}; */
248			/* static uint32_t stotient[] = {1,1,2, 8, 48, 480, 5760, 92160}; */
249			static const uint16_t _s0[ 1] = {0};
250			static const uint16_t _s1[ 2] = {0,1};
251			static const uint16_t _s2[ 6] = {0,1,1,1,1,2};
252			static const uint16_t _s3[30] = {0,1,1,1,1,1,1,2,2,2,2,3,3,4,4,4,4,5,5,6,6,6,6,7,7,7,7,7,7,8};
253			static uint16_t _s4[210];
254			static uint16_t _s5[2310];
255			static uint16_t _s6[30030];
256			static const uint16_t* sphicache[7] = { _s0,_s1,_s2,_s3,_s4,_s5,_s6 };
257			static int sphi_init = 0;
258
259			#define PHIC 7
260
261			static UV tablephi(UV x, uint32_t a) {
262			switch (a) {
263			case 0: return x;
264			case 1: return x-x/2;
265			case 2: return x-x/2-x/3+x/6;
266			case 3: return (x/ 30U) * 8U + sphicache[3][x % 30U];
267			case 4: return (x/ 210U) * 48U + sphicache[4][x % 210U];
268			case 5: return (x/ 2310U) * 480U + sphicache[5][x % 2310U];
269			case 6: return (x/ 30030U) * 5760U + sphicache[6][x % 30030U];
270			#if PHIC >= 7
271			case 7: {
272			UV xp = x / 17U;
273			return ((x /30030U) * 5760U + sphicache[6][x % 30030U]) -
274			((xp/30030U) * 5760U + sphicache[6][xp % 30030U]);
275			}
276			#endif
277			#if PHIC >= 8
278			case 8: {
279			UV xp = x / 17U;
280			UV x2 = x / 19U;
281			UV x2p = x2 / 17U;
282			return ((x /30030U) * 5760U + sphicache[6][x % 30030U]) -
283			((xp /30030U) * 5760U + sphicache[6][xp % 30030U]) -
284			((x2 /30030U) * 5760U + sphicache[6][x2 % 30030U]) +
285			((x2p/30030U) * 5760U + sphicache[6][x2p% 30030U]);
286			}
287			#endif
288			default: croak("a %u too large for tablephi\n", a);
289			}
290			}
291			static void phitableinit(void) {
292			if (sphi_init == 0) {
293			int x;
294			for (x = 0; x < 210; x++)
295			_s4[x] = ((x/ 30)* 8+_s3[x% 30])-(((x/ 7)/ 30)* 8+_s3[(x/ 7)% 30]);
296			for (x = 0; x < 2310; x++)
297			_s5[x] = ((x/ 210)* 48+_s4[x% 210])-(((x/11)/ 210)* 48+_s4[(x/11)% 210]);
298			for (x = 0; x < 30030; x++)
299			_s6[x] = ((x/2310)480+_s5[x%2310])-(((x/13)/2310)480+_s5[(x/13)%2310]);
300			sphi_init = 1;
301			}
302			}
303
304
305			/* Max memory = 2XA bytes, e.g. 265536256 = 32 MB */
306			#define PHICACHEA 512
307			#define PHICACHEX 65536
308			typedef struct
309			{
310			uint32_t max[PHICACHEA];
311			int16_t* val[PHICACHEA];
312			} cache_t;
313			static void phicache_init(cache_t* cache) {
314			int a;
315			for (a = 0; a < PHICACHEA; a++) {
316			cache->val[a] = 0;
317			cache->max[a] = 0;
318			}
319			phitableinit();
320			}
321			static void phicache_free(cache_t* cache) {
322			int a;
323			for (a = 0; a < PHICACHEA; a++) {
324			if (cache->val[a] != 0)
325			Safefree(cache->val[a]);
326			cache->val[a] = 0;
327			cache->max[a] = 0;
328			}
329			}
330
331			#define PHI_CACHE_POPULATED(x, a) \
332			((a) < PHICACHEA && (UV) cache->max[a] > (x) && cache->val[a][x] != 0)
333
334			static void phi_cache_insert(uint32_t x, uint32_t a, IV sum, cache_t* cache) {
335			uint32_t cap = ( (x+32) >> 5) << 5;
336			/* If sum is too large for the cache, just ignore it. */
337			if (sum < SHRT_MIN \|\| sum > SHRT_MAX) return;
338			if (cache->val[a] == 0) {
339			Newz(0, cache->val[a], cap, int16_t);
340			cache->max[a] = cap;
341			} else if (cache->max[a] < cap) {
342			uint32_t i;
343			Renew(cache->val[a], cap, int16_t);
344			for (i = cache->max[a]; i < cap; i++)
345			cache->val[a][i] = 0;
346			cache->max[a] = cap;
347			}
348			cache->val[a][x] = (int16_t) sum;
349			}
350
351			static IV _phi3(UV x, UV a, int sign, const uint32_t* const primes, const uint32_t lastidx, cache_t* cache)
352			{
353			IV sum;
354			if (a <= 1)
355			return sign * ((a == 0) ? x : x-x/2);
356			else if (PHI_CACHE_POPULATED(x, a))
357			return sign * cache->val[a][x];
358			else if (a <= PHIC)
359			sum = sign * tablephi(x,a);
360			else if (x < primes[a+1])
361			sum = sign;
362			else if (x <= primes[lastidx] && x < primes[a+1]*primes[a+1])
363			sum = sign * (bs_prime_count(x, primes, lastidx) - a + 1);
364			else {
365			UV a2, iters = (a*a > x) ? bs_prime_count( isqrt(x), primes, a) : a;
366			UV c = (iters > PHIC) ? PHIC : iters;
367			IV phixc = PHI_CACHE_POPULATED(x, c) ? cache->val[c][x] : (IV)tablephi(x,c);
368			sum = sign * (iters - a + phixc);
369			for (a2 = c+1; a2 <= iters; a2++)
370			sum += _phi3(FAST_DIV(x,primes[a2]), a2-1, -sign, primes, lastidx, cache);
371			}
372			if (a < PHICACHEA && x < PHICACHEX)
373			phi_cache_insert(x, a, sign * sum, cache);
374			return sum;
375			}
376			#define phi_small(x, a, primes, lastidx, cache) _phi3(x, a, 1, primes, lastidx, cache)
377
378			/******************************************************************************/
379			/* In-order lists for manipulating our UV value / IV count pairs */
380			/******************************************************************************/
381
382			typedef struct {
383			UV v;
384			IV c;
385			} vc_t;
386
387			typedef struct {
388			vc_t* a;
389			UV size;
390			UV n;
391			} vcarray_t;
392
393			static vcarray_t vcarray_create(void)
394			{
395			vcarray_t l;
396			l.a = 0;
397			l.size = 0;
398			l.n = 0;
399			return l;
400			}
401			static void vcarray_destroy(vcarray_t* l)
402			{
403			if (l->a != 0) {
404			if (verbose > 2) printf("FREE list %p\n", l->a);
405			Safefree(l->a);
406			}
407			l->size = 0;
408			l->n = 0;
409			}
410			/* Insert a value/count pair. We do this indirection because about 80% of
411			* the calls result in a merge with the previous entry. */
412			static void vcarray_insert(vcarray_t* l, UV val, IV count)
413			{
414			UV n = l->n;
415			if (n > 0 && l->a[n-1].v < val)
416			croak("Previous value was %lu, inserting %lu out of order\n", l->a[n-1].v, val);
417			if (n >= l->size) {
418			UV new_size;
419			if (l->size == 0) {
420			new_size = 20000;
421			if (verbose>2) printf("ALLOCing list, size %lu (%luk)\n", new_size, new_size*sizeof(vc_t)/1024);
422			New(0, l->a, new_size, vc_t);
423			} else {
424			new_size = (UV) (1.5 * l->size);
425			if (verbose>2) printf("REALLOCing list %p, new size %lu (%luk)\n",l->a,new_size, new_size*sizeof(vc_t)/1024);
426			Renew( l->a, new_size, vc_t );
427			}
428			l->size = new_size;
429			}
430			/* printf(" inserting %lu %ld\n", val, count); */
431			l->a[n].v = val;
432			l->a[n].c = count;
433			l->n++;
434			}
435
436			/* Merge the two sorted lists A and B into A. Each list has no duplicates,
437			* but they may have duplications between the two. We're quite interested
438			* in saving memory, so first remove all the duplicates, then do an in-place
439			* merge. */
440			static void vcarray_merge(vcarray_t* a, vcarray_t* b)
441			{
442			long ai, bi, bj, k, kn;
443			long an = a->n;
444			long bn = b->n;
445			vc_t* aa = a->a;
446			vc_t* ba = b->a;
447
448			/* Merge anything in B that appears in A. */
449			for (ai = 0, bi = 0, bj = 0; bi < bn; bi++) {
450			UV bval = ba[bi].v;
451			/* Skip forward in A until empty or aa[ai].v <= ba[bi].v */
452			while (ai+8 < an && aa[ai+8].v > bval) ai += 8;
453			while (ai < an && aa[ai ].v > bval) ai++;
454			/* if A empty then copy the remaining elements */
455			if (ai >= an) {
456			if (bi == bj)
457			bj = bn;
458			else
459			while (bi < bn)
460			ba[bj++] = ba[bi++];
461			break;
462			}
463			if (aa[ai].v == bval)
464			aa[ai].c += ba[bi].c;
465			else
466			ba[bj++] = ba[bi];
467			}
468			if (verbose>3) printf(" removed %lu duplicates from b\n", bn - bj);
469			bn = bj;
470
471			if (bn == 0) { /* In case they were all duplicates */
472			b->n = 0;
473			return;
474			}
475
476			/* kn = the final merged size. All duplicates are gone, so this is exact. */
477			kn = an+bn;
478			if ((long)a->size < kn) { /* Make A big enough to hold kn elements */
479			UV new_size = (UV) (1.2 * kn);
480			if (verbose>2) printf("REALLOCing list %p, new size %lu (%luk)\n", a->a, new_size, new_size*sizeof(vc_t)/1024);
481			Renew( a->a, new_size, vc_t );
482			aa = a->a; /* this could have been changed by the realloc */
483			a->size = new_size;
484			}
485
486			/* merge A and B. Very simple using reverse merge. */
487			ai = an-1;
488			bi = bn-1;
489			for (k = kn-1; k >= 0 && bi >= 0; k--) {
490			UV bval = ba[bi].v;
491			long startai = ai;
492			while (ai >= 15 && aa[ai-15].v < bval) ai -= 16;
493			while (ai >= 3 && aa[ai- 3].v < bval) ai -= 4;
494			while (ai >= 0 && aa[ai ].v < bval) ai--;
495			if (startai > ai) {
496			k = k - (startai - ai) + 1;
497			memmove(aa+k, aa+ai+1, (startai-ai) * sizeof(vc_t));
498			} else {
499			if (ai >= 0 && aa[ai].v == bval) croak("deduplication error");
500			aa[k] = ba[bi--];
501			}
502			}
503			a->n = kn; /* A now has this many items */
504			b->n = 0; /* B is marked empty */
505			}
506
507			static void vcarray_remove_zeros(vcarray_t* a)
508			{
509			long ai = 0;
510			long aj = 0;
511			long an = a->n;
512			vc_t* aa = a->a;
513
514			while (aj < an) {
515			if (aa[aj].c != 0) {
516			if (ai != aj)
517			aa[ai] = aa[aj];
518			ai++;
519			}
520			aj++;
521			}
522			a->n = ai;
523			}
524
525
526			/*
527			* The main phi(x,a) algorithm. In this implementation, it takes under 10%
528			* of the total time for the Lehmer algorithm, but is a big memory consumer.
529			*/
530			#define NTHRESH (MAX_PHI_MEM/16)
531
532			static UV phi(UV x, UV a)
533			{
534			UV i, val, sval, lastidx, lastprime;
535			UV sum = 0;
536			IV count;
537			const uint32_t* primes;
538			vcarray_t a1, a2;
539			vc_t* arr;
540			cache_t pcache; /* Cache for recursive phi */
541
542			phitableinit();
543			if (a == 1) return ((x+1)/2);
544			if (a <= PHIC) return tablephi(x, a);
545
546			lastidx = a+1;
547			primes = generate_small_primes(lastidx);
548			lastprime = primes[lastidx];
549			if (x < lastprime) { Safefree(primes); return (x > 0) ? 1 : 0; }
550			phicache_init(&pcache);
551
552			a1 = vcarray_create();
553			a2 = vcarray_create();
554			vcarray_insert(&a1, x, 1);
555
556			while (a > PHIC) {
557			UV primea = primes[a];
558			UV sval_last = 0;
559			IV sval_count = 0;
560			arr = a1.a;
561			for (i = 0; i < a1.n; i++) {
562			count = arr[i].c;
563			val = arr[i].v;
564			sval = FAST_DIV(val, primea);
565			if (sval < primea) break; /* stop inserting into a2 if small */
566			if (sval != sval_last) { /* non-merged value. Insert into a2 */
567			if (sval_last != 0) {
568			if (sval_last <= lastprime && sval_last < primes[a-1]*primes[a-1])
569			sum += sval_count*(bs_prime_count(sval_last,primes,lastidx)-a+2);
570			else
571			vcarray_insert(&a2, sval_last, sval_count);
572			}
573			sval_last = sval;
574			sval_count = 0;
575			}
576			sval_count -= count; /* Accumulate count for this sval */
577			}
578			if (sval_last != 0) { /* Insert the last sval */
579			if (sval_last <= lastprime && sval_last < primes[a-1]*primes[a-1])
580			sum += sval_count*(bs_prime_count(sval_last,primes,lastidx)-a+2);
581			else
582			vcarray_insert(&a2, sval_last, sval_count);
583			}
584			/* For each small sval, add up the counts */
585			for ( ; i < a1.n; i++)
586			sum -= arr[i].c;
587			/* Merge a1 and a2 into a1. a2 will be emptied. */
588			vcarray_merge(&a1, &a2);
589			/* If we've grown too large, use recursive phi to clip. */
590			if ( a1.n > NTHRESH ) {
591			arr = a1.a;
592			if (verbose > 0) printf("clipping small values at a=%lu a1.n=%lu \n", a, a1.n);
593			#ifdef _OPENMP
594			/* #pragma omp parallel for reduction(+: sum) firstprivate(pcache) schedule(dynamic, 16) */
595			#endif
596			for (i = 0; i < a1.n-NTHRESH+NTHRESH/50; i++) {
597			UV j = a1.n - 1 - i;
598			IV count = arr[j].c;
599			if (count != 0) {
600			sum += count * phi_small( arr[j].v, a-1, primes, lastidx, &pcache );
601			arr[j].c = 0;
602			}
603			}
604			}
605			vcarray_remove_zeros(&a1);
606			a--;
607			}
608			phicache_free(&pcache);
609			vcarray_destroy(&a2);
610			arr = a1.a;
611			#ifdef _OPENMP
612			#pragma omp parallel for reduction(+: sum) schedule(dynamic, 16)
613			#endif
614			for (i = 0; i < a1.n; i++)
615			sum += arr[i].c * tablephi( arr[i].v, PHIC );
616			vcarray_destroy(&a1);
617			Safefree(primes);
618			return (UV) sum;
619			}
620
621
622			extern UV meissel_prime_count(UV n);
623			/* b = prime_count(isqrt(n)) */
624			static UV Pk_2_p(UV n, UV a, UV b, const uint32_t* primes, uint32_t lastidx)
625			{
626			UV lastw, lastwpc, i, P2;
627			UV lastpc = primes[lastidx];
628
629			/* Ensure we have a large enough base sieve */
630			prime_precalc(isqrt(n / primes[a+1]));
631
632			P2 = lastw = lastwpc = 0;
633			for (i = b; i > a; i--) {
634			UV w = n / primes[i];
635			lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastidx)
636			: lastwpc + segment_prime_count(lastw+1, w);
637			lastw = w;
638			P2 += lastwpc;
639			}
640			P2 -= ((b+a-2) * (b-a+1) / 2) - a + 1;
641			return P2;
642			}
643			static UV Pk_2(UV n, UV a, UV b)
644			{
645			UV lastprime = ((b3+1) > 203280221) ? 203280221 : b3+1;
646			const uint32_t* primes = generate_small_primes(lastprime);
647			UV P2 = Pk_2_p(n, a, b, primes, lastprime);
648			Safefree(primes);
649			return P2;
650			}
651
652
653			/* Legendre's method. Interesting and a good test for phi(x,a), but Lehmer's
654			* method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
655			UV legendre_prime_count(UV n)
656			{
657			UV a, phina;
658			if (n < SIEVE_LIMIT)
659			return segment_prime_count(2, n);
660
661			a = legendre_prime_count(isqrt(n));
662			/* phina = phi(n, a); */
663			{ /* The small phi routine is faster for large a */
664			cache_t pcache;
665			const uint32_t* primes = 0;
666			primes = generate_small_primes(a+1);
667			phicache_init(&pcache);
668			phina = phi_small(n, a, primes, a+1, &pcache);
669			phicache_free(&pcache);
670			Safefree(primes);
671			}
672			return phina + a - 1;
673			}
674
675
676			/* Meissel's method. */
677			UV meissel_prime_count(UV n)
678			{
679			UV a, b, sum;
680			if (n < SIEVE_LIMIT)
681			return segment_prime_count(2, n);
682
683			a = meissel_prime_count(icbrt(n)); /* a = Pi(floor(n^1/3)) [max 192725] */
684			b = meissel_prime_count(isqrt(n)); /* b = Pi(floor(n^1/2)) [max 203280221] */
685
686			sum = phi(n, a) + a - 1 - Pk_2(n, a, b);
687			return sum;
688			}
689
690			/* Lehmer's method. This is basically Riesel's Lehmer function (page 22),
691			* with some additional code to help optimize it. */
692			UV lehmer_prime_count(UV n)
693			{
694			UV z, a, b, c, sum, i, j, lastprime, lastpc, lastw, lastwpc;
695			const uint32_t* primes = 0; /* small prime cache, first b=pi(z)=pi(sqrt(n)) */
696			DECLARE_TIMING_VARIABLES;
697
698			if (n < SIEVE_LIMIT)
699			return segment_prime_count(2, n);
700
701			/* Protect against overflow. 2^32-1 and 2^64-1 are both divisible by 3. */
702			if (n == UV_MAX) {
703			if ( (n%3) == 0 \|\| (n%5) == 0 \|\| (n%7) == 0 \|\| (n%31) == 0 )
704			n--;
705			else
706			return segment_prime_count(2,n);
707			}
708
709			if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n);
710			TIMING_START;
711			z = isqrt(n);
712			a = lehmer_prime_count(isqrt(z)); /* a = Pi(floor(n^1/4)) [max 6542] */
713			b = lehmer_prime_count(z); /* b = Pi(floor(n^1/2)) [max 203280221] */
714			c = lehmer_prime_count(icbrt(n)); /* c = Pi(floor(n^1/3)) [max 192725] */
715			TIMING_END_PRINT("stage 1")
716
717			if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
718			TIMING_START;
719			sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
720			TIMING_END_PRINT("phi(x,a)")
721
722			/* We get an array of the first b primes. This is used in stage 4. If we
723			* get more than necessary, we can use them to speed up some.
724			*/
725			lastprime = b*SIEVE_MULT+1;
726			if (lastprime > 203280221) lastprime = 203280221;
727			if (verbose > 0) printf("lehmer %lu stage 3: %lu small primes\n", n, lastprime);
728			TIMING_START;
729			primes = generate_small_primes(lastprime);
730			lastpc = primes[lastprime];
731			TIMING_END_PRINT("small primes")
732
733			TIMING_START;
734			/* Speed up all the prime counts by doing a big base sieve */
735			prime_precalc( (UV) pow(n, 3.0/5.0) );
736			/* Ensure we have the base sieve for big prime_count ( n/primes[i] ). */
737			/* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */
738			prime_precalc(isqrt(n / primes[a+1]));
739			TIMING_END_PRINT("sieve precalc")
740
741			if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
742			TIMING_START;
743			/* Reverse the i loop so w increases. Count w in segments. */
744			lastw = 0;
745			lastwpc = 0;
746			for (i = b; i >= a+1; i--) {
747			UV w = n / primes[i];
748			lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
749			: lastwpc + segment_prime_count(lastw+1, w);
750			lastw = w;
751			sum = sum - lastwpc;
752			if (i <= c) {
753			UV bi = bs_prime_count( isqrt(w), primes, lastprime );
754			for (j = i; j <= bi; j++) {
755			sum = sum - bs_prime_count(w / primes[j], primes, lastprime) + j - 1;
756			}
757			/* We could wrap the +j-1 in: sum += ((bi+1-i)(bi+i))/2 - (bi-i+1); /
758			}
759			}
760			TIMING_END_PRINT("stage 4")
761			Safefree(primes);
762			return sum;
763			}
764
765
766			/* The Lagarias-Miller-Odlyzko method.
767			* Naive implementation without optimizations.
768			* About the same speed as Lehmer, a bit less memory.
769			* A better implementation can be 10-50x faster and much less memory.
770			*/
771			UV LMOS_prime_count(UV n)
772			{
773			UV n13, a, b, sum, i, j, k, lastprime, P2, S1, S2;
774			const uint32_t* primes = 0; /* small prime cache */
775			signed char* mu = 0; /* moebius to n^1/3 */
776			uint32_t* lpf = 0; /* least prime factor to n^1/3 */
777			cache_t pcache; /* Cache for recursive phi */
778			DECLARE_TIMING_VARIABLES;
779
780			if (n < SIEVE_LIMIT)
781			return segment_prime_count(2, n);
782
783			n13 = icbrt(n); /* n13 = floor(n^1/3) [max 2642245] */
784			a = lehmer_prime_count(n13); /* a = Pi(floor(n^1/3)) [max 192725] */
785			b = lehmerprime_count(isqrt(n)); /* b = Pi(floor(n^1/2)) [max 203280221] */
786
787			lastprime = b*SIEVE_MULT+1;
788			if (lastprime > 203280221) lastprime = 203280221;
789			if (lastprime < n13) lastprime = n13;
790			primes = generate_small_primes(lastprime);
791
792			New(0, mu, n13+1, signed char);
793			memset(mu, 1, sizeof(signed char) * (n13+1));
794			Newz(0, lpf, n13+1, uint32_t);
795			mu[0] = 0;
796			for (i = 1; i <= n13; i++) {
797			UV primei = primes[i];
798			for (j = primei; j <= n13; j += primei) {
799			mu[j] = -mu[j];
800			if (lpf[j] == 0) lpf[j] = primei;
801			}
802			k = primei * primei;
803			for (j = k; j <= n13; j += k)
804			mu[j] = 0;
805			}
806			lpf[1] = UVCONST(4294967295); /* Set lpf[1] to max */
807
808			/* Remove mu[i] == 0 using lpf */
809			for (i = 1; i <= n13; i++)
810			if (mu[i] == 0)
811			lpf[i] = 0;
812
813			/* Thanks to Kim Walisch for help with the S1+S2 calculations. */
814			k = (a < 7) ? a : 7;
815			S1 = 0;
816			S2 = 0;
817			phicache_init(&pcache);
818			TIMING_START;
819			for (i = 1; i <= n13; i++)
820			if (lpf[i] > primes[k])
821			/* S1 += mu[i] * phi_small(n/i, k, primes, lastprime, &pcache); */
822			S1 += mu[i] * phi(n/i, k);
823			TIMING_END_PRINT("S1")
824
825			TIMING_START;
826			for (i = k; i+1 < a; i++) {
827			uint32_t p = primes[i+1];
828			/* TODO: #pragma omp parallel for reduction(+: S2) firstprivate(pcache) schedule(dynamic, 16) */
829			for (j = (n13/p)+1; j <= n13; j++)
830			if (lpf[j] > p)
831			S2 += -mu[j] * phi_small(n / (j*p), i, primes, lastprime, &pcache);
832			}
833			TIMING_END_PRINT("S2")
834			phicache_free(&pcache);
835			Safefree(lpf);
836			Safefree(mu);
837
838			TIMING_START;
839			prime_precalc( (UV) pow(n, 2.9/5.0) );
840			P2 = Pk_2_p(n, a, b, primes, lastprime);
841			TIMING_END_PRINT("P2")
842			Safefree(primes);
843
844			/* printf("S1 = %lu\nS2 = %lu\na = %lu\nP2 = %lu\n", S1, S2, a, P2); */
845			sum = (S1 + S2) + a - 1 - P2;
846			return sum;
847			}
848
849			#ifdef PRIMESIEVE_STANDALONE
850			int main(int argc, char *argv[])
851			{
852			UV n, pi;
853			double t;
854			const char* method;
855			struct timeval t0, t1;
856
857			if (argc <= 1) { printf("usage: %s []\n", argv[0]); return(1); }
858			n = strtoul(argv[1], 0, 10);
859			if (n < 2) { printf("Pi(%lu) = 0\n", n); return(0); }
860
861			if (argc > 2)
862			method = argv[2];
863			else
864			method = "lehmer";
865
866			gettimeofday(&t0, 0);
867
868			if (!strcasecmp(method, "lehmer")) { pi = lehmer_prime_count(n); }
869			else if (!strcasecmp(method, "meissel")) { pi = meissel_prime_count(n); }
870			else if (!strcasecmp(method, "legendre")) { pi = legendre_prime_count(n); }
871			else if (!strcasecmp(method, "lmo")) { pi = LMOS_prime_count(n); }
872			else if (!strcasecmp(method, "sieve")) { pi = segment_prime_count(2, n); }
873			else {
874			printf("method must be one of: lehmer, meissel, legendre, lmo, or sieve\n");
875			return(2);
876			}
877			gettimeofday(&t1, 0);
878			t = (t1.tv_sec-t0.tv_sec); t *= 1000000.0; t += (t1.tv_usec - t0.tv_usec);
879			printf("%8s Pi(%lu) = %lu in %10.5fs\n", method, n, pi, t / 1000000.0);
880			return(0);
881			}
882			#endif
883
884			#else
885
886			#include "lehmer.h"
887	0	0	UV LMOS_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;}
888	0	0	UV lehmer_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;}
889	0	0	UV meissel_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;}
890	0	0	UV legendre_prime_count(UV n) { if (n!=0) croak("Not compiled with Lehmer support"); return 0;}
891
892			#endif