line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Pod::POM::Web::Indexer; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
10
|
use strict; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
42
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
5
|
1
|
|
|
1
|
|
29
|
use 5.008; |
|
1
|
|
|
|
|
4
|
|
6
|
1
|
|
|
1
|
|
7
|
no warnings 'uninitialized'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
43
|
|
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
6
|
use Pod::POM; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
84
|
|
9
|
1
|
|
|
1
|
|
7
|
use List::Util qw/min max/; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
78
|
|
10
|
1
|
|
|
1
|
|
6
|
use List::MoreUtils qw/part/; |
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
13
|
|
11
|
1
|
|
|
1
|
|
1486
|
use Time::HiRes qw/time/; |
|
1
|
|
|
|
|
1674
|
|
|
1
|
|
|
|
|
4
|
|
12
|
1
|
|
|
1
|
|
880
|
use Search::Indexer 0.75; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use BerkeleyDB; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use parent 'Pod::POM::Web'; |
16
|
|
|
|
|
|
|
our $VERSION = 1.23; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my $defaut_max_size_for_indexing = 300 << 10; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $ignore_dirs = qr[ |
25
|
|
|
|
|
|
|
auto | unicore | DateTime/TimeZone | DateTime/Locale ]x; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $ignore_headings = qr[ |
28
|
|
|
|
|
|
|
SYNOPSIS | DESCRIPTION | METHODS | FUNCTIONS | |
29
|
|
|
|
|
|
|
BUGS | AUTHOR | SEE\ ALSO | COPYRIGHT | LICENSE ]x; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
(my $index_dir = __FILE__) =~ s[Indexer\.pm$][index]; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $id_regex = qr/(?![0-9]) # don't start with a digit |
34
|
|
|
|
|
|
|
\w\w+ # start with 2 or more word chars .. |
35
|
|
|
|
|
|
|
(?:::\w+)* # .. and possibly ::some::more::components |
36
|
|
|
|
|
|
|
/x; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
my $wregex = qr/(?: # either a Perl variable: |
39
|
|
|
|
|
|
|
(?:\$\#?|\@|\%) # initial sigil |
40
|
|
|
|
|
|
|
(?: # followed by |
41
|
|
|
|
|
|
|
$id_regex # an id |
42
|
|
|
|
|
|
|
| # or |
43
|
|
|
|
|
|
|
\^\w # builtin var with '^' prefix |
44
|
|
|
|
|
|
|
| # or |
45
|
|
|
|
|
|
|
(?:[\#\$](?!\w))# just '$$' or '$#' |
46
|
|
|
|
|
|
|
| # or |
47
|
|
|
|
|
|
|
[^{\w\s\$] # builtin vars with 1 special char |
48
|
|
|
|
|
|
|
) |
49
|
|
|
|
|
|
|
| # or |
50
|
|
|
|
|
|
|
$id_regex # a plain word or module name |
51
|
|
|
|
|
|
|
)/x; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
my @stopwords = ( |
55
|
|
|
|
|
|
|
'a' .. 'z', '_', '0' .. '9', |
56
|
|
|
|
|
|
|
qw/__data__ __end__ $class $self |
57
|
|
|
|
|
|
|
above after all also always an and any are as at |
58
|
|
|
|
|
|
|
be because been before being both but by |
59
|
|
|
|
|
|
|
can cannot could |
60
|
|
|
|
|
|
|
die do don done |
61
|
|
|
|
|
|
|
defined do does doesn |
62
|
|
|
|
|
|
|
each else elsif eq |
63
|
|
|
|
|
|
|
for from |
64
|
|
|
|
|
|
|
ge gt |
65
|
|
|
|
|
|
|
has have how |
66
|
|
|
|
|
|
|
if in into is isn it item its |
67
|
|
|
|
|
|
|
keys |
68
|
|
|
|
|
|
|
last le lt |
69
|
|
|
|
|
|
|
many may me method might must my |
70
|
|
|
|
|
|
|
ne new next no nor not |
71
|
|
|
|
|
|
|
of on only or other our |
72
|
|
|
|
|
|
|
package perl pl pm pod push |
73
|
|
|
|
|
|
|
qq qr qw |
74
|
|
|
|
|
|
|
ref return |
75
|
|
|
|
|
|
|
see set shift should since so some something sub such |
76
|
|
|
|
|
|
|
text than that the their them then these they this those to tr |
77
|
|
|
|
|
|
|
undef unless until up us use used uses using |
78
|
|
|
|
|
|
|
values |
79
|
|
|
|
|
|
|
was we what when which while will with would |
80
|
|
|
|
|
|
|
you your/ |
81
|
|
|
|
|
|
|
); |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub full_text { |
90
|
|
|
|
|
|
|
my ($self, $search_string) = @_; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
my $indexer = eval { |
93
|
|
|
|
|
|
|
new Search::Indexer(dir => $index_dir, |
94
|
|
|
|
|
|
|
wregex => $wregex, |
95
|
|
|
|
|
|
|
preMatch => '[[', |
96
|
|
|
|
|
|
|
postMatch => ']]'); |
97
|
|
|
|
|
|
|
} or die <<__EOHTML__; |
98
|
|
|
|
|
|
|
No full-text index found ($@). |
99
|
|
|
|
|
|
|
<p> |
100
|
|
|
|
|
|
|
Please ask your system administrator to run the |
101
|
|
|
|
|
|
|
command |
102
|
|
|
|
|
|
|
</p> |
103
|
|
|
|
|
|
|
<pre> |
104
|
|
|
|
|
|
|
perl -MPod::POM::Web::Indexer -e "Pod::POM::Web::Indexer->new->index" |
105
|
|
|
|
|
|
|
</pre> |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Indexing may take about half an hour and will use about |
108
|
|
|
|
|
|
|
10 MB on your hard disk. |
109
|
|
|
|
|
|
|
__EOHTML__ |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
my $lib = "$self->{root_url}/lib"; |
114
|
|
|
|
|
|
|
my $html = <<__EOHTML__; |
115
|
|
|
|
|
|
|
<html> |
116
|
|
|
|
|
|
|
<head> |
117
|
|
|
|
|
|
|
<link href="$lib/GvaScript.css" rel="stylesheet" type="text/css"> |
118
|
|
|
|
|
|
|
<link href="$lib/PodPomWeb.css" rel="stylesheet" type="text/css"> |
119
|
|
|
|
|
|
|
<style> |
120
|
|
|
|
|
|
|
.src {font-size:70%; float: right} |
121
|
|
|
|
|
|
|
.sep {font-size:110%; font-weight: bolder; color: magenta; |
122
|
|
|
|
|
|
|
padding-left: 8px; padding-right: 8px} |
123
|
|
|
|
|
|
|
.hl {background-color: lightpink} |
124
|
|
|
|
|
|
|
</style> |
125
|
|
|
|
|
|
|
</head> |
126
|
|
|
|
|
|
|
<body> |
127
|
|
|
|
|
|
|
__EOHTML__ |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
$search_string =~ s/(^|\s)([\w]+(?:::\w+)+)(\s|$)/$1"$2"$3/g; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
my $result = $indexer->search($search_string, 'implicit_plus'); |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
my $killedWords = join ", ", @{$result->{killedWords}}; |
137
|
|
|
|
|
|
|
$killedWords &&= " (ignoring words : $killedWords)"; |
138
|
|
|
|
|
|
|
my $regex = $result->{regex}; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my $scores = $result->{scores}; |
141
|
|
|
|
|
|
|
my @doc_ids = sort {$scores->{$b} <=> $scores->{$a}} keys %$scores; |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my $nav_links = $self->paginate_results(\@doc_ids); |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
$html .= "<b>Full-text search</b> for '$search_string'$killedWords<br>" |
146
|
|
|
|
|
|
|
. "$nav_links<hr>\n"; |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
$self->_tie_docs(DB_RDONLY); |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
foreach my $id (@doc_ids) { |
151
|
|
|
|
|
|
|
my ($mtime, $path, $description) = split "\t", $self->{_docs}{$id}; |
152
|
|
|
|
|
|
|
my $score = $scores->{$id}; |
153
|
|
|
|
|
|
|
my @filenames = $self->find_source($path); |
154
|
|
|
|
|
|
|
my $buf = join "\n", map {$self->slurp_file($_)} @filenames; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
my $excerpts = $indexer->excerpts($buf, $regex); |
157
|
|
|
|
|
|
|
foreach (@$excerpts) { |
158
|
|
|
|
|
|
|
s/&/&/g, s/</</g, s/>/>/g; |
159
|
|
|
|
|
|
|
s/\[\[/<span class='hl'>/g, s/\]\]/<\/span>/g; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
$excerpts = join "<span class='sep'>/</span>", @$excerpts; |
162
|
|
|
|
|
|
|
$html .= <<__EOHTML__; |
163
|
|
|
|
|
|
|
<p> |
164
|
|
|
|
|
|
|
<a href="$self->{root_url}/source/$path" class="src">source</a> |
165
|
|
|
|
|
|
|
<a href="$self->{root_url}/$path">$path</a> |
166
|
|
|
|
|
|
|
(<small>$score</small>) <em>$description</em> |
167
|
|
|
|
|
|
|
<br> |
168
|
|
|
|
|
|
|
<small>$excerpts</small> |
169
|
|
|
|
|
|
|
</p> |
170
|
|
|
|
|
|
|
__EOHTML__ |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
$html .= "<hr>$nav_links\n"; |
174
|
|
|
|
|
|
|
return $self->send_html($html); |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub paginate_results { |
180
|
|
|
|
|
|
|
my ($self, $doc_ids_ref) = @_; |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
my $n_docs = @$doc_ids_ref; |
183
|
|
|
|
|
|
|
my $count = $self->{params}{count} || 50; |
184
|
|
|
|
|
|
|
my $start_record = $self->{params}{start} || 0; |
185
|
|
|
|
|
|
|
my $end_record = min($start_record + $count - 1, $n_docs - 1); |
186
|
|
|
|
|
|
|
@$doc_ids_ref = @$doc_ids_ref[$start_record ... $end_record]; |
187
|
|
|
|
|
|
|
my $prev_idx = max($start_record - $count, 0); |
188
|
|
|
|
|
|
|
my $next_idx = $start_record + $count; |
189
|
|
|
|
|
|
|
my $base_url = "?source=full_text&search=$self->{params}{search}"; |
190
|
|
|
|
|
|
|
my $prev_link |
191
|
|
|
|
|
|
|
= $start_record > 0 ? uri_escape("$base_url&start=$prev_idx") : ""; |
192
|
|
|
|
|
|
|
my $next_link |
193
|
|
|
|
|
|
|
= $next_idx < $n_docs ? uri_escape("$base_url&start=$next_idx") : ""; |
194
|
|
|
|
|
|
|
$_ += 1 for $start_record, $end_record; |
195
|
|
|
|
|
|
|
my $nav_links = ""; |
196
|
|
|
|
|
|
|
$nav_links .= "<a href='$prev_link'>[Previous <<]</a> " if $prev_link; |
197
|
|
|
|
|
|
|
$nav_links .= "Results <b>$start_record</b> to <b>$end_record</b> " |
198
|
|
|
|
|
|
|
. "from <b>$n_docs</b>"; |
199
|
|
|
|
|
|
|
$nav_links .= " <a href='$next_link'>[>> Next]</a> " if $next_link; |
200
|
|
|
|
|
|
|
return $nav_links; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub modlist { |
208
|
|
|
|
|
|
|
my ($self, $search_string) = @_; |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
$self->_tie_docs(DB_RDONLY); |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
length($search_string) >= 2 or die "module_list: arg too short"; |
213
|
|
|
|
|
|
|
my $regex = qr/^\d+\t(\Q$search_string\E[^\t]*)/i; |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
my @modules; |
216
|
|
|
|
|
|
|
foreach my $val (values %{$self->{_docs}}) { |
217
|
|
|
|
|
|
|
$val =~ $regex or next; |
218
|
|
|
|
|
|
|
(my $module = $1) =~ s[/][::]g; |
219
|
|
|
|
|
|
|
push @modules, $module; |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
my $json_names = "[" . join(",", map {qq{"$_"}} sort @modules) . "]"; |
223
|
|
|
|
|
|
|
return $self->send_content({content => $json_names, |
224
|
|
|
|
|
|
|
mime_type => 'application/x-json'}); |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub get_abstract { |
229
|
|
|
|
|
|
|
my ($self, $path) = @_; |
230
|
|
|
|
|
|
|
if (!$self->{_path_to_descr}) { |
231
|
|
|
|
|
|
|
eval {$self->_tie_docs(DB_RDONLY); 1} |
232
|
|
|
|
|
|
|
or return; |
233
|
|
|
|
|
|
|
$self->{_path_to_descr} = { |
234
|
|
|
|
|
|
|
map {(split /\t/, $_)[1,2]} values %{$self->{_docs}} |
235
|
|
|
|
|
|
|
}; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
my $description = $self->{_path_to_descr}->{$path} or return; |
238
|
|
|
|
|
|
|
(my $abstract = $description) =~ s/^.*?-\s*//; |
239
|
|
|
|
|
|
|
return $abstract; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
sub import { |
248
|
|
|
|
|
|
|
my $class = shift; |
249
|
|
|
|
|
|
|
my ($package, $filename) = caller; |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
no strict 'refs'; |
252
|
|
|
|
|
|
|
*{'main::index'} = sub {$class->new->index(@_)} |
253
|
|
|
|
|
|
|
if $package eq 'main' and $filename eq '-e'; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
sub index { |
258
|
|
|
|
|
|
|
my ($self, %options) = @_; |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
die "invalid option : $_" |
262
|
|
|
|
|
|
|
if grep {!/^-(from_scratch|max_size|positions)$/} keys %options; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
-d $index_dir or mkdir $index_dir or die "mkdir $index_dir: $!"; |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
if ($options{-from_scratch}) { |
269
|
|
|
|
|
|
|
unlink $_ or die "unlink $_ : $!" foreach glob("$index_dir/*.bdb"); |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
$self->{_seen_path} = {}; |
274
|
|
|
|
|
|
|
$self->{_last_doc_id} = 0; |
275
|
|
|
|
|
|
|
$self->{_max_size_for_indexing} = $options{-max_size} |
276
|
|
|
|
|
|
|
|| $defaut_max_size_for_indexing; |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
$self->_tie_docs(DB_CREATE); |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
$self->{_max_doc_id} = 0; |
283
|
|
|
|
|
|
|
$self->{_previous_index} = {}; |
284
|
|
|
|
|
|
|
while (my ($id, $doc_descr) = each %{$self->{_docs}}) { |
285
|
|
|
|
|
|
|
$self->{_max_doc_id} = max($id, $self->{_max_doc_id}); |
286
|
|
|
|
|
|
|
my ($mtime, $path, $description) = split /\t/, $doc_descr; |
287
|
|
|
|
|
|
|
$self->{_previous_index}{$path} |
288
|
|
|
|
|
|
|
= {id => $id, mtime => $mtime, description => $description}; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
$self->{_indexer} = new Search::Indexer(dir => $index_dir, |
293
|
|
|
|
|
|
|
writeMode => 1, |
294
|
|
|
|
|
|
|
positions => $options{-positions}, |
295
|
|
|
|
|
|
|
wregex => $wregex, |
296
|
|
|
|
|
|
|
stopwords => \@stopwords); |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
$self->index_dir($_) foreach @Pod::POM::Web::search_dirs; |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$self->{_indexer} = $self->{_docs} = undef; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub index_dir { |
306
|
|
|
|
|
|
|
my ($self, $rootdir, $path) = @_; |
307
|
|
|
|
|
|
|
return if $path =~ /$ignore_dirs/; |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
my $dir = $rootdir; |
310
|
|
|
|
|
|
|
if ($path) { |
311
|
|
|
|
|
|
|
$dir .= "/$path"; |
312
|
|
|
|
|
|
|
return print STDERR "SKIP DIR $dir (already in \@INC)\n" |
313
|
|
|
|
|
|
|
if grep {m[^\Q$dir\E]} @Pod::POM::Web::search_dirs; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
chdir $dir or return print STDERR "SKIP DIR $dir (chdir $dir: $!)\n"; |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
print STDERR "DIR $dir\n"; |
319
|
|
|
|
|
|
|
opendir my $dh, "." or die $^E; |
320
|
|
|
|
|
|
|
my ($dirs, $files) = part { -d $_ ? 0 : 1} grep {!/^\./} readdir $dh; |
321
|
|
|
|
|
|
|
$dirs ||= [], $files ||= []; |
322
|
|
|
|
|
|
|
closedir $dh; |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
my %extensions; |
325
|
|
|
|
|
|
|
foreach my $file (sort @$files) { |
326
|
|
|
|
|
|
|
next unless $file =~ s/\.(pm|pod)$//; |
327
|
|
|
|
|
|
|
$extensions{$file}{$1} = 1; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
foreach my $base (keys %extensions) { |
331
|
|
|
|
|
|
|
$self->index_file($path, $base, $extensions{$base}); |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
my @subpaths = map {$path ? "$path/$_" : $_} @$dirs; |
335
|
|
|
|
|
|
|
$self->index_dir($rootdir, $_) foreach @subpaths; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub index_file { |
340
|
|
|
|
|
|
|
my ($self, $path, $file, $has_ext) = @_; |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
my $fullpath = $path ? "$path/$file" : $file; |
343
|
|
|
|
|
|
|
return print STDERR "SKIP $fullpath (shadowing)\n" |
344
|
|
|
|
|
|
|
if $self->{_seen_path}{$fullpath}; |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
$self->{_seen_path}{$fullpath} = 1; |
347
|
|
|
|
|
|
|
my $max_mtime = 0; |
348
|
|
|
|
|
|
|
my ($size, $mtime, @filenames); |
349
|
|
|
|
|
|
|
EXT: |
350
|
|
|
|
|
|
|
foreach my $ext (qw/pm pod/) { |
351
|
|
|
|
|
|
|
next EXT unless $has_ext->{$ext}; |
352
|
|
|
|
|
|
|
my $filename = "$file.$ext"; |
353
|
|
|
|
|
|
|
($size, $mtime) = (stat $filename)[7, 9] or die "stat $filename: $!"; |
354
|
|
|
|
|
|
|
$size < $self->{_max_size_for_indexing} or |
355
|
|
|
|
|
|
|
print STDERR "$filename too big ($size bytes), skipped " and next EXT; |
356
|
|
|
|
|
|
|
$mtime = max($max_mtime, $mtime); |
357
|
|
|
|
|
|
|
push @filenames, $filename; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
if ($mtime <= $self->{_previous_index}{$fullpath}{mtime}) { |
361
|
|
|
|
|
|
|
return print STDERR "SKIP $fullpath (index up to date)\n"; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
if (@filenames) { |
365
|
|
|
|
|
|
|
my $old_doc_id = $self->{_previous_index}{$fullpath}{id}; |
366
|
|
|
|
|
|
|
my $doc_id = $old_doc_id || ++$self->{_max_doc_id}; |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
print STDERR "INDEXING $fullpath (id $doc_id) ... "; |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
my $t0 = time; |
371
|
|
|
|
|
|
|
my $buf = join "\n", map {$self->slurp_file($_)} @filenames; |
372
|
|
|
|
|
|
|
my ($description) = ($buf =~ /^=head1\s*NAME\s*(.*)$/m); |
373
|
|
|
|
|
|
|
$description ||= ''; |
374
|
|
|
|
|
|
|
$description =~ s/\t/ /g; |
375
|
|
|
|
|
|
|
$buf =~ s/^=head1\s+($ignore_headings).*$//m; |
376
|
|
|
|
|
|
|
$buf =~ s/^=(head\d|item)//mg; |
377
|
|
|
|
|
|
|
$buf =~ s/^=\w.*//mg; |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
if ($old_doc_id) { |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$self->{_indexer}->remove($old_doc_id, $buf); |
386
|
|
|
|
|
|
|
} |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
$self->{_indexer}->add($doc_id, $buf); |
389
|
|
|
|
|
|
|
my $interval = time - $t0; |
390
|
|
|
|
|
|
|
printf STDERR "%0.3f s.", $interval; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
$self->{_docs}{$doc_id} = "$mtime\t$fullpath\t$description"; |
393
|
|
|
|
|
|
|
} |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
print STDERR "\n"; |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
sub _tie_docs { |
405
|
|
|
|
|
|
|
my ($self, $mode) = @_; |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
tie %{$self->{_docs}}, 'BerkeleyDB::Hash', |
409
|
|
|
|
|
|
|
-Filename => "$index_dir/docs.bdb", |
410
|
|
|
|
|
|
|
-Flags => $mode |
411
|
|
|
|
|
|
|
or die "open $index_dir/docs.bdb : $^E $BerkeleyDB::Error"; |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub uri_escape { |
417
|
|
|
|
|
|
|
my $uri = shift; |
418
|
|
|
|
|
|
|
$uri =~ s{([^;\/?:@&=\$,A-Za-z0-9\-_.!~*'()])} |
419
|
|
|
|
|
|
|
{sprintf("%%%02X", ord($1)) }ge; |
420
|
|
|
|
|
|
|
return $uri; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
1; |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
__END__ |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=head1 NAME |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
Pod::POM::Web::Indexer - full-text search for Pod::POM::Web |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=head1 SYNOPSIS |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
perl -MPod::POM::Web::Indexer -e index |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=head1 DESCRIPTION |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
Adds full-text search capabilities to the |
439
|
|
|
|
|
|
|
L<Pod::POM::Web|Pod::POM::Web> application. |
440
|
|
|
|
|
|
|
This requires L<Search::Indexer|Search::Indexer> to be installed. |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Queries may include plain terms, "exact phrases", |
443
|
|
|
|
|
|
|
'+' or '-' prefixes, Boolean operators and parentheses. |
444
|
|
|
|
|
|
|
See L<Search::QueryParser|Search::QueryParser> for details. |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=head1 METHODS |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head2 index |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
Pod::POM::Web::Indexer->new->index(%options) |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
Walks through directories in C<@INC> and indexes |
454
|
|
|
|
|
|
|
all C<*.pm> and C<*.pod> files, skipping shadowed files |
455
|
|
|
|
|
|
|
(files for which a similar loading path was already |
456
|
|
|
|
|
|
|
found in previous C<@INC> directories), and skipping |
457
|
|
|
|
|
|
|
files that are too big. |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
Default indexing is incremental : files whose modification |
460
|
|
|
|
|
|
|
time has not changed since the last indexing operation will |
461
|
|
|
|
|
|
|
not be indexed again. |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Options can be |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=over |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=item -max_size |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Size limit (in bytes) above which files will not be indexed. |
470
|
|
|
|
|
|
|
The default value is 300K. |
471
|
|
|
|
|
|
|
Files of size above this limit are usually not worth |
472
|
|
|
|
|
|
|
indexing because they only contain big configuration tables |
473
|
|
|
|
|
|
|
(like for example C<Module::CoreList> or C<Unicode::Charname>). |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=item -from_scratch |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
If true, the previous index is deleted, so all files will be freshly |
478
|
|
|
|
|
|
|
indexed. If false (the default), indexation is incremental, i.e. files |
479
|
|
|
|
|
|
|
whose modification time has not changed will not be re-indexed. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=item -positions |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
If true, the indexer will also store word positions in documents, so |
484
|
|
|
|
|
|
|
that it can later answer to "exact phrase" queries. |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
So if C<-positions> are on, a search for C<"more than one way"> will |
487
|
|
|
|
|
|
|
only return documents which contain that exact sequence of contiguous |
488
|
|
|
|
|
|
|
words; whereas if C<-positions> are off, the query is equivalent to |
489
|
|
|
|
|
|
|
C<more AND than AND one AND way>, i.e. it returns all documents which |
490
|
|
|
|
|
|
|
contain these words anywhere and in any order. |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
The option is off by default, because it requires much more disk |
493
|
|
|
|
|
|
|
space, and does not seem to be very relevant for searching |
494
|
|
|
|
|
|
|
Perl documentation. |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=back |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
The C<index> function is exported into the C<main::> namespace if perl |
499
|
|
|
|
|
|
|
is called with the C<-e> flag, so that you can write |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
perl -MPod::POM::Web::Indexer -e index |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head1 PERFORMANCES |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
On my machine, indexing a module takes an average of 0.2 seconds, |
507
|
|
|
|
|
|
|
except for some long and complex sources (this is why sources |
508
|
|
|
|
|
|
|
above 300K are ignored by default, see options above). |
509
|
|
|
|
|
|
|
Here are the worst figures (in seconds) : |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Date/Manip 39.655 |
512
|
|
|
|
|
|
|
DBI 30.73 |
513
|
|
|
|
|
|
|
Pod/perlfunc 29.502 |
514
|
|
|
|
|
|
|
Module/CoreList 27.287 |
515
|
|
|
|
|
|
|
CGI 16.922 |
516
|
|
|
|
|
|
|
Config 13.445 |
517
|
|
|
|
|
|
|
CPAN 12.598 |
518
|
|
|
|
|
|
|
Pod/perlapi 10.906 |
519
|
|
|
|
|
|
|
CGI/FormBuilder 8.592 |
520
|
|
|
|
|
|
|
Win32/TieRegistry 7.338 |
521
|
|
|
|
|
|
|
Spreadsheet/WriteExcel 7.132 |
522
|
|
|
|
|
|
|
Pod/perldiag 5.771 |
523
|
|
|
|
|
|
|
Parse/RecDescent 5.405 |
524
|
|
|
|
|
|
|
Bit/Vector 4.768 |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
The index will be stored in an F<index> subdirectory |
527
|
|
|
|
|
|
|
under the module installation directory. |
528
|
|
|
|
|
|
|
The total index size should be around 10MB if C<-positions> are off, |
529
|
|
|
|
|
|
|
and between 30MB and 50MB if C<-positions> are on, depending on |
530
|
|
|
|
|
|
|
how many modules are installed. |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head1 TODO |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
- highlights in shown documents |
536
|
|
|
|
|
|
|
- paging |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=cut |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
|