File Coverage

blib/lib/Pod/POM/Web/Indexer.pm

Criterion	Covered	Total	%
statement	24	26	92.3
branch			n/a
condition			n/a
subroutine	9	9	100.0
pod			n/a
total	33	35	94.2

line	stmt	sub	time	code
1				package Pod::POM::Web::Indexer;
2
3	1	1	10	use strict;
	1		5
	1		42
4	1	1	6	use warnings;
	1		2
	1		36
5	1	1	29	use 5.008;
	1		4
6	1	1	7	no warnings 'uninitialized';
	1		3
	1		43
7
8	1	1	6	use Pod::POM;
	1		2
	1		84
9	1	1	7	use List::Util qw/min max/;
	1		4
	1		78
10	1	1	6	use List::MoreUtils qw/part/;
	1		13
	1		13
11	1	1	1486	use Time::HiRes qw/time/;
	1		1674
	1		4
12	1	1	880	use Search::Indexer 0.75;
	0
	0
13				use BerkeleyDB;
14
15				use parent 'Pod::POM::Web';
16				our $VERSION = 1.23;
17
18				#----------------------------------------------------------------------
19				# Initializations
20				#----------------------------------------------------------------------
21
22				my $defaut_max_size_for_indexing = 300 << 10; # 300K
23
24				my $ignore_dirs = qr[
25				auto \| unicore \| DateTime/TimeZone \| DateTime/Locale ]x;
26
27				my $ignore_headings = qr[
28				SYNOPSIS \| DESCRIPTION \| METHODS \| FUNCTIONS \|
29				BUGS \| AUTHOR \| SEE\ ALSO \| COPYRIGHT \| LICENSE ]x;
30
31				(my $index_dir = __FILE__) =~ s[Indexer\.pm$][index];
32
33				my $id_regex = qr/(?![0-9]) # don't start with a digit
34				\w\w+ # start with 2 or more word chars ..
35				(?:::\w+)* # .. and possibly ::some::more::components
36				/x;
37
38				my $wregex = qr/(?: # either a Perl variable:
39				(?:\$\#?\|\@\|\%) # initial sigil
40				(?: # followed by
41				$id_regex # an id
42				\| # or
43				\^\w # builtin var with '^' prefix
44				\| # or
45				(?:[\#\$](?!\w))# just '$$' or '$#'
46				\| # or
47				[^{\w\s\$] # builtin vars with 1 special char
48				)
49				\| # or
50				$id_regex # a plain word or module name
51				)/x;
52
53
54				my @stopwords = (
55				'a' .. 'z', '_', '0' .. '9',
56				qw/__data__ __end__ $class $self
57				above after all also always an and any are as at
58				be because been before being both but by
59				can cannot could
60				die do don done
61				defined do does doesn
62				each else elsif eq
63				for from
64				ge gt
65				has have how
66				if in into is isn it item its
67				keys
68				last le lt
69				many may me method might must my
70				ne new next no nor not
71				of on only or other our
72				package perl pl pm pod push
73				qq qr qw
74				ref return
75				see set shift should since so some something sub such
76				text than that the their them then these they this those to tr
77				undef unless until up us use used uses using
78				values
79				was we what when which while will with would
80				you your/
81				);
82
83
84				#----------------------------------------------------------------------
85				# RETRIEVING
86				#----------------------------------------------------------------------
87
88
89				sub full_text {
90				my ($self, $search_string) = @_;
91
92				my $indexer = eval {
93				new Search::Indexer(dir => $index_dir,
94				wregex => $wregex,
95				preMatch => '[[',
96				postMatch => ']]');
97				} or die <<__EOHTML__;
98				No full-text index found ($@).
99				<p>
100				Please ask your system administrator to run the
101				command
102				</p>
103				<pre>
104				perl -MPod::POM::Web::Indexer -e "Pod::POM::Web::Indexer->new->index"
105				</pre>
106
107				Indexing may take about half an hour and will use about
108				10 MB on your hard disk.
109				__EOHTML__
110
111
112
113				my $lib = "$self->{root_url}/lib";
114				my $html = <<__EOHTML__;
115				<html>
116				<head>
117				<link href="$lib/GvaScript.css" rel="stylesheet" type="text/css">
118				<link href="$lib/PodPomWeb.css" rel="stylesheet" type="text/css">
119				<style>
120				.src {font-size:70%; float: right}
121				.sep {font-size:110%; font-weight: bolder; color: magenta;
122				padding-left: 8px; padding-right: 8px}
123				.hl {background-color: lightpink}
124				</style>
125				</head>
126				<body>
127				__EOHTML__
128
129
130				# force Some::Module::Name into "Some::Module::Name" to prevent
131				# interpretation of ':' as a field name by Query::Parser
132				$search_string =~ s/(^\|\s)([\w]+(?:::\w+)+)(\s\|$)/$1"$2"$3/g;
133
134				my $result = $indexer->search($search_string, 'implicit_plus');
135
136				my $killedWords = join ", ", @{$result->{killedWords}};
137				$killedWords &&= " (ignoring words : $killedWords)";
138				my $regex = $result->{regex};
139
140				my $scores = $result->{scores};
141				my @doc_ids = sort {$scores->{$b} <=> $scores->{$a}} keys %$scores;
142
143				my $nav_links = $self->paginate_results(\@doc_ids);
144
145				$html .= "<b>Full-text search</b> for '$search_string'$killedWords<br>"
146				. "$nav_links<hr>\n";
147
148				$self->_tie_docs(DB_RDONLY);
149
150				foreach my $id (@doc_ids) {
151				my ($mtime, $path, $description) = split "\t", $self->{_docs}{$id};
152				my $score = $scores->{$id};
153				my @filenames = $self->find_source($path);
154				my $buf = join "\n", map {$self->slurp_file($_)} @filenames;
155
156				my $excerpts = $indexer->excerpts($buf, $regex);
157				foreach (@$excerpts) {
158				s/&/&/g, s/</</g, s/>/>/g; # replace entities
159				s/\[\[/<span class='hl'>/g, s/\]\]/<\/span>/g; # highlight
160				}
161				$excerpts = join "<span class='sep'>/</span>", @$excerpts;
162				$html .= <<__EOHTML__;
163				<p>
164				<a href="$self->{root_url}/source/$path" class="src">source</a>
165				<a href="$self->{root_url}/$path">$path</a>
166				(<small>$score</small>) <em>$description</em>
167				<br>
168				<small>$excerpts</small>
169				</p>
170				__EOHTML__
171				}
172
173				$html .= "<hr>$nav_links\n";
174				return $self->send_html($html);
175				}
176
177
178
179				sub paginate_results {
180				my ($self, $doc_ids_ref) = @_;
181
182				my $n_docs = @$doc_ids_ref;
183				my $count = $self->{params}{count} \|\| 50;
184				my $start_record = $self->{params}{start} \|\| 0;
185				my $end_record = min($start_record + $count - 1, $n_docs - 1);
186				@$doc_ids_ref = @$doc_ids_ref[$start_record ... $end_record];
187				my $prev_idx = max($start_record - $count, 0);
188				my $next_idx = $start_record + $count;
189				my $base_url = "?source=full_text&search=$self->{params}{search}";
190				my $prev_link
191				= $start_record > 0 ? uri_escape("$base_url&start=$prev_idx") : "";
192				my $next_link
193				= $next_idx < $n_docs ? uri_escape("$base_url&start=$next_idx") : "";
194				$_ += 1 for $start_record, $end_record;
195				my $nav_links = "";
196				$nav_links .= "<a href='$prev_link'>[Previous <<]</a> " if $prev_link;
197				$nav_links .= "Results <b>$start_record</b> to <b>$end_record</b> "
198				. "from <b>$n_docs</b>";
199				$nav_links .= " <a href='$next_link'>[>> Next]</a> " if $next_link;
200				return $nav_links;
201				}
202
203
204
205
206
207				sub modlist { # called by Ajax
208				my ($self, $search_string) = @_;
209
210				$self->_tie_docs(DB_RDONLY);
211
212				length($search_string) >= 2 or die "module_list: arg too short";
213				my $regex = qr/^\d+\t(\Q$search_string\E[^\t]*)/i;
214
215				my @modules;
216				foreach my $val (values %{$self->{_docs}}) {
217				$val =~ $regex or next;
218				(my $module = $1) =~ s[/][::]g;
219				push @modules, $module;
220				}
221
222				my $json_names = "[" . join(",", map {qq{"$_"}} sort @modules) . "]";
223				return $self->send_content({content => $json_names,
224				mime_type => 'application/x-json'});
225				}
226
227
228				sub get_abstract { # override from Web.pm
229				my ($self, $path) = @_;
230				if (!$self->{_path_to_descr}) {
231				eval {$self->_tie_docs(DB_RDONLY); 1}
232				or return; # database not found
233				$self->{_path_to_descr} = {
234				map {(split /\t/, $_)[1,2]} values %{$self->{_docs}}
235				};
236				}
237				my $description = $self->{_path_to_descr}->{$path} or return;
238				(my $abstract = $description) =~ s/^.?-\s//;
239				return $abstract;
240				}
241
242
243				#----------------------------------------------------------------------
244				# INDEXING
245				#----------------------------------------------------------------------
246
247				sub import { # export the "index" function if called from command-line
248				my $class = shift;
249				my ($package, $filename) = caller;
250
251				no strict 'refs';
252				*{'main::index'} = sub {$class->new->index(@_)}
253				if $package eq 'main' and $filename eq '-e';
254				}
255
256
257				sub index {
258				my ($self, %options) = @_;
259
260				# check invalid options
261				die "invalid option : $_"
262				if grep {!/^-(from_scratch\|max_size\|positions)$/} keys %options;
263
264				# make sure index dir exists
265				-d $index_dir or mkdir $index_dir or die "mkdir $index_dir: $!";
266
267				# if -from_scratch, throw away old index
268				if ($options{-from_scratch}) {
269				unlink $_ or die "unlink $_ : $!" foreach glob("$index_dir/*.bdb");
270				}
271
272				# store global info for indexing methods
273				$self->{_seen_path} = {};
274				$self->{_last_doc_id} = 0;
275				$self->{_max_size_for_indexing} = $options{-max_size}
276				\|\| $defaut_max_size_for_indexing;
277
278				# tie to docs.bdb, storing {$doc_id => "$mtime\t$pathname\t$description"}
279				$self->_tie_docs(DB_CREATE);
280
281				# build in-memory reverse index of info contained in %{$self->{_docs}}
282				$self->{_max_doc_id} = 0;
283				$self->{_previous_index} = {};
284				while (my ($id, $doc_descr) = each %{$self->{_docs}}) {
285				$self->{_max_doc_id} = max($id, $self->{_max_doc_id});
286				my ($mtime, $path, $description) = split /\t/, $doc_descr;
287				$self->{_previous_index}{$path}
288				= {id => $id, mtime => $mtime, description => $description};
289				}
290
291				# open the index
292				$self->{_indexer} = new Search::Indexer(dir => $index_dir,
293				writeMode => 1,
294				positions => $options{-positions},
295				wregex => $wregex,
296				stopwords => \@stopwords);
297
298				# main indexing loop
299				$self->index_dir($_) foreach @Pod::POM::Web::search_dirs;
300
301				$self->{_indexer} = $self->{_docs} = undef;
302				}
303
304
305				sub index_dir {
306				my ($self, $rootdir, $path) = @_;
307				return if $path =~ /$ignore_dirs/;
308
309				my $dir = $rootdir;
310				if ($path) {
311				$dir .= "/$path";
312				return print STDERR "SKIP DIR $dir (already in \@INC)\n"
313				if grep {m[^\Q$dir\E]} @Pod::POM::Web::search_dirs;
314				}
315
316				chdir $dir or return print STDERR "SKIP DIR $dir (chdir $dir: $!)\n";
317
318				print STDERR "DIR $dir\n";
319				opendir my $dh, "." or die $^E;
320				my ($dirs, $files) = part { -d $_ ? 0 : 1} grep {!/^\./} readdir $dh;
321				$dirs \|\|= [], $files \|\|= [];
322				closedir $dh;
323
324				my %extensions;
325				foreach my $file (sort @$files) {
326				next unless $file =~ s/\.(pm\|pod)$//;
327				$extensions{$file}{$1} = 1;
328				}
329
330				foreach my $base (keys %extensions) {
331				$self->index_file($path, $base, $extensions{$base});
332				}
333
334				my @subpaths = map {$path ? "$path/$_" : $_} @$dirs;
335				$self->index_dir($rootdir, $_) foreach @subpaths;
336				}
337
338
339				sub index_file {
340				my ($self, $path, $file, $has_ext) = @_;
341
342				my $fullpath = $path ? "$path/$file" : $file;
343				return print STDERR "SKIP $fullpath (shadowing)\n"
344				if $self->{_seen_path}{$fullpath};
345
346				$self->{_seen_path}{$fullpath} = 1;
347				my $max_mtime = 0;
348				my ($size, $mtime, @filenames);
349				EXT:
350				foreach my $ext (qw/pm pod/) {
351				next EXT unless $has_ext->{$ext};
352				my $filename = "$file.$ext";
353				($size, $mtime) = (stat $filename)[7, 9] or die "stat $filename: $!";
354				$size < $self->{_max_size_for_indexing} or
355				print STDERR "$filename too big ($size bytes), skipped " and next EXT;
356				$mtime = max($max_mtime, $mtime);
357				push @filenames, $filename;
358				}
359
360				if ($mtime <= $self->{_previous_index}{$fullpath}{mtime}) {
361				return print STDERR "SKIP $fullpath (index up to date)\n";
362				}
363
364				if (@filenames) {
365				my $old_doc_id = $self->{_previous_index}{$fullpath}{id};
366				my $doc_id = $old_doc_id \|\| ++$self->{_max_doc_id};
367
368				print STDERR "INDEXING $fullpath (id $doc_id) ... ";
369
370				my $t0 = time;
371				my $buf = join "\n", map {$self->slurp_file($_)} @filenames;
372				my ($description) = ($buf =~ /^=head1\sNAME\s(.*)$/m);
373				$description \|\|= '';
374				$description =~ s/\t/ /g;
375				$buf =~ s/^=head1\s+($ignore_headings).*$//m; # remove full line of those
376				$buf =~ s/^=(head\d\|item)//mg; # just remove command of =head* or =item
377				$buf =~ s/^=\w.*//mg; # remove full line of all other commands
378
379				if ($old_doc_id) {
380				# Here we should remove the old document from the index. But
381				# we no longer have the document source! So we cheat with the current
382				# doc buffer, hoping that most words are similar. This step sounds
383				# ridiculous but is necessary to avoid having twice the same
384				# doc listed twice in inverted lists.
385				$self->{_indexer}->remove($old_doc_id, $buf);
386				}
387
388				$self->{_indexer}->add($doc_id, $buf);
389				my $interval = time - $t0;
390				printf STDERR "%0.3f s.", $interval;
391
392				$self->{_docs}{$doc_id} = "$mtime\t$fullpath\t$description";
393				}
394
395				print STDERR "\n";
396
397				}
398
399
400				#----------------------------------------------------------------------
401				# UTILITIES
402				#----------------------------------------------------------------------
403
404				sub _tie_docs {
405				my ($self, $mode) = @_;
406
407				# tie to docs.bdb, storing {$doc_id => "$mtime\t$pathname\t$description"}
408				tie %{$self->{_docs}}, 'BerkeleyDB::Hash',
409				-Filename => "$index_dir/docs.bdb",
410				-Flags => $mode
411				or die "open $index_dir/docs.bdb : $^E $BerkeleyDB::Error";
412				}
413
414
415
416				sub uri_escape {
417				my $uri = shift;
418				$uri =~ s{([^;\/?:@&=\$,A-Za-z0-9\-_.!~*'()])}
419				{sprintf("%%%02X", ord($1)) }ge;
420				return $uri;
421				}
422
423
424				1;
425
426				__END__
427
428				=head1 NAME
429
430				Pod::POM::Web::Indexer - full-text search for Pod::POM::Web
431
432				=head1 SYNOPSIS
433
434				perl -MPod::POM::Web::Indexer -e index
435
436				=head1 DESCRIPTION
437
438				Adds full-text search capabilities to the
439				L<Pod::POM::Web\|Pod::POM::Web> application.
440				This requires L<Search::Indexer\|Search::Indexer> to be installed.
441
442				Queries may include plain terms, "exact phrases",
443				'+' or '-' prefixes, Boolean operators and parentheses.
444				See L<Search::QueryParser\|Search::QueryParser> for details.
445
446
447				=head1 METHODS
448
449				=head2 index
450
451				Pod::POM::Web::Indexer->new->index(%options)
452
453				Walks through directories in C<@INC> and indexes
454				all C<.pm> and C<.pod> files, skipping shadowed files
455				(files for which a similar loading path was already
456				found in previous C<@INC> directories), and skipping
457				files that are too big.
458
459				Default indexing is incremental : files whose modification
460				time has not changed since the last indexing operation will
461				not be indexed again.
462
463				Options can be
464
465				=over
466
467				=item -max_size
468
469				Size limit (in bytes) above which files will not be indexed.
470				The default value is 300K.
471				Files of size above this limit are usually not worth
472				indexing because they only contain big configuration tables
473				(like for example C<Module::CoreList> or C<Unicode::Charname>).
474
475				=item -from_scratch
476
477				If true, the previous index is deleted, so all files will be freshly
478				indexed. If false (the default), indexation is incremental, i.e. files
479				whose modification time has not changed will not be re-indexed.
480
481				=item -positions
482
483				If true, the indexer will also store word positions in documents, so
484				that it can later answer to "exact phrase" queries.
485
486				So if C<-positions> are on, a search for C<"more than one way"> will
487				only return documents which contain that exact sequence of contiguous
488				words; whereas if C<-positions> are off, the query is equivalent to
489				C<more AND than AND one AND way>, i.e. it returns all documents which
490				contain these words anywhere and in any order.
491
492				The option is off by default, because it requires much more disk
493				space, and does not seem to be very relevant for searching
494				Perl documentation.
495
496				=back
497
498				The C<index> function is exported into the C<main::> namespace if perl
499				is called with the C<-e> flag, so that you can write
500
501				perl -MPod::POM::Web::Indexer -e index
502
503
504				=head1 PERFORMANCES
505
506				On my machine, indexing a module takes an average of 0.2 seconds,
507				except for some long and complex sources (this is why sources
508				above 300K are ignored by default, see options above).
509				Here are the worst figures (in seconds) :
510
511				Date/Manip 39.655
512				DBI 30.73
513				Pod/perlfunc 29.502
514				Module/CoreList 27.287
515				CGI 16.922
516				Config 13.445
517				CPAN 12.598
518				Pod/perlapi 10.906
519				CGI/FormBuilder 8.592
520				Win32/TieRegistry 7.338
521				Spreadsheet/WriteExcel 7.132
522				Pod/perldiag 5.771
523				Parse/RecDescent 5.405
524				Bit/Vector 4.768
525
526				The index will be stored in an F<index> subdirectory
527				under the module installation directory.
528				The total index size should be around 10MB if C<-positions> are off,
529				and between 30MB and 50MB if C<-positions> are on, depending on
530				how many modules are installed.
531
532
533				=head1 TODO
534
535				- highlights in shown documents
536				- paging
537
538				=cut
539
540