| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Pod::POM::Web::Indexer; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
10
|
use strict; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
42
|
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
5
|
1
|
|
|
1
|
|
29
|
use 5.008; |
|
|
1
|
|
|
|
|
4
|
|
|
6
|
1
|
|
|
1
|
|
7
|
no warnings 'uninitialized'; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
43
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
6
|
use Pod::POM; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
84
|
|
|
9
|
1
|
|
|
1
|
|
7
|
use List::Util qw/min max/; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
78
|
|
|
10
|
1
|
|
|
1
|
|
6
|
use List::MoreUtils qw/part/; |
|
|
1
|
|
|
|
|
13
|
|
|
|
1
|
|
|
|
|
13
|
|
|
11
|
1
|
|
|
1
|
|
1486
|
use Time::HiRes qw/time/; |
|
|
1
|
|
|
|
|
1674
|
|
|
|
1
|
|
|
|
|
4
|
|
|
12
|
1
|
|
|
1
|
|
880
|
use Search::Indexer 0.75; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use BerkeleyDB; |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use parent 'Pod::POM::Web'; |
|
16
|
|
|
|
|
|
|
our $VERSION = 1.23; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my $defaut_max_size_for_indexing = 300 << 10; |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $ignore_dirs = qr[ |
|
25
|
|
|
|
|
|
|
auto | unicore | DateTime/TimeZone | DateTime/Locale ]x; |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
my $ignore_headings = qr[ |
|
28
|
|
|
|
|
|
|
SYNOPSIS | DESCRIPTION | METHODS | FUNCTIONS | |
|
29
|
|
|
|
|
|
|
BUGS | AUTHOR | SEE\ ALSO | COPYRIGHT | LICENSE ]x; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
(my $index_dir = __FILE__) =~ s[Indexer\.pm$][index]; |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $id_regex = qr/(?![0-9]) # don't start with a digit |
|
34
|
|
|
|
|
|
|
\w\w+ # start with 2 or more word chars .. |
|
35
|
|
|
|
|
|
|
(?:::\w+)* # .. and possibly ::some::more::components |
|
36
|
|
|
|
|
|
|
/x; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
my $wregex = qr/(?: # either a Perl variable: |
|
39
|
|
|
|
|
|
|
(?:\$\#?|\@|\%) # initial sigil |
|
40
|
|
|
|
|
|
|
(?: # followed by |
|
41
|
|
|
|
|
|
|
$id_regex # an id |
|
42
|
|
|
|
|
|
|
| # or |
|
43
|
|
|
|
|
|
|
\^\w # builtin var with '^' prefix |
|
44
|
|
|
|
|
|
|
| # or |
|
45
|
|
|
|
|
|
|
(?:[\#\$](?!\w))# just '$$' or '$#' |
|
46
|
|
|
|
|
|
|
| # or |
|
47
|
|
|
|
|
|
|
[^{\w\s\$] # builtin vars with 1 special char |
|
48
|
|
|
|
|
|
|
) |
|
49
|
|
|
|
|
|
|
| # or |
|
50
|
|
|
|
|
|
|
$id_regex # a plain word or module name |
|
51
|
|
|
|
|
|
|
)/x; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
my @stopwords = ( |
|
55
|
|
|
|
|
|
|
'a' .. 'z', '_', '0' .. '9', |
|
56
|
|
|
|
|
|
|
qw/__data__ __end__ $class $self |
|
57
|
|
|
|
|
|
|
above after all also always an and any are as at |
|
58
|
|
|
|
|
|
|
be because been before being both but by |
|
59
|
|
|
|
|
|
|
can cannot could |
|
60
|
|
|
|
|
|
|
die do don done |
|
61
|
|
|
|
|
|
|
defined do does doesn |
|
62
|
|
|
|
|
|
|
each else elsif eq |
|
63
|
|
|
|
|
|
|
for from |
|
64
|
|
|
|
|
|
|
ge gt |
|
65
|
|
|
|
|
|
|
has have how |
|
66
|
|
|
|
|
|
|
if in into is isn it item its |
|
67
|
|
|
|
|
|
|
keys |
|
68
|
|
|
|
|
|
|
last le lt |
|
69
|
|
|
|
|
|
|
many may me method might must my |
|
70
|
|
|
|
|
|
|
ne new next no nor not |
|
71
|
|
|
|
|
|
|
of on only or other our |
|
72
|
|
|
|
|
|
|
package perl pl pm pod push |
|
73
|
|
|
|
|
|
|
qq qr qw |
|
74
|
|
|
|
|
|
|
ref return |
|
75
|
|
|
|
|
|
|
see set shift should since so some something sub such |
|
76
|
|
|
|
|
|
|
text than that the their them then these they this those to tr |
|
77
|
|
|
|
|
|
|
undef unless until up us use used uses using |
|
78
|
|
|
|
|
|
|
values |
|
79
|
|
|
|
|
|
|
was we what when which while will with would |
|
80
|
|
|
|
|
|
|
you your/ |
|
81
|
|
|
|
|
|
|
); |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub full_text { |
|
90
|
|
|
|
|
|
|
my ($self, $search_string) = @_; |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
my $indexer = eval { |
|
93
|
|
|
|
|
|
|
new Search::Indexer(dir => $index_dir, |
|
94
|
|
|
|
|
|
|
wregex => $wregex, |
|
95
|
|
|
|
|
|
|
preMatch => '[[', |
|
96
|
|
|
|
|
|
|
postMatch => ']]'); |
|
97
|
|
|
|
|
|
|
} or die <<__EOHTML__; |
|
98
|
|
|
|
|
|
|
No full-text index found ($@). |
|
99
|
|
|
|
|
|
|
<p> |
|
100
|
|
|
|
|
|
|
Please ask your system administrator to run the |
|
101
|
|
|
|
|
|
|
command |
|
102
|
|
|
|
|
|
|
</p> |
|
103
|
|
|
|
|
|
|
<pre> |
|
104
|
|
|
|
|
|
|
perl -MPod::POM::Web::Indexer -e "Pod::POM::Web::Indexer->new->index" |
|
105
|
|
|
|
|
|
|
</pre> |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Indexing may take about half an hour and will use about |
|
108
|
|
|
|
|
|
|
10 MB on your hard disk. |
|
109
|
|
|
|
|
|
|
__EOHTML__ |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
my $lib = "$self->{root_url}/lib"; |
|
114
|
|
|
|
|
|
|
my $html = <<__EOHTML__; |
|
115
|
|
|
|
|
|
|
<html> |
|
116
|
|
|
|
|
|
|
<head> |
|
117
|
|
|
|
|
|
|
<link href="$lib/GvaScript.css" rel="stylesheet" type="text/css"> |
|
118
|
|
|
|
|
|
|
<link href="$lib/PodPomWeb.css" rel="stylesheet" type="text/css"> |
|
119
|
|
|
|
|
|
|
<style> |
|
120
|
|
|
|
|
|
|
.src {font-size:70%; float: right} |
|
121
|
|
|
|
|
|
|
.sep {font-size:110%; font-weight: bolder; color: magenta; |
|
122
|
|
|
|
|
|
|
padding-left: 8px; padding-right: 8px} |
|
123
|
|
|
|
|
|
|
.hl {background-color: lightpink} |
|
124
|
|
|
|
|
|
|
</style> |
|
125
|
|
|
|
|
|
|
</head> |
|
126
|
|
|
|
|
|
|
<body> |
|
127
|
|
|
|
|
|
|
__EOHTML__ |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
$search_string =~ s/(^|\s)([\w]+(?:::\w+)+)(\s|$)/$1"$2"$3/g; |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
my $result = $indexer->search($search_string, 'implicit_plus'); |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
my $killedWords = join ", ", @{$result->{killedWords}}; |
|
137
|
|
|
|
|
|
|
$killedWords &&= " (ignoring words : $killedWords)"; |
|
138
|
|
|
|
|
|
|
my $regex = $result->{regex}; |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
my $scores = $result->{scores}; |
|
141
|
|
|
|
|
|
|
my @doc_ids = sort {$scores->{$b} <=> $scores->{$a}} keys %$scores; |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my $nav_links = $self->paginate_results(\@doc_ids); |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
$html .= "<b>Full-text search</b> for '$search_string'$killedWords<br>" |
|
146
|
|
|
|
|
|
|
. "$nav_links<hr>\n"; |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
$self->_tie_docs(DB_RDONLY); |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
foreach my $id (@doc_ids) { |
|
151
|
|
|
|
|
|
|
my ($mtime, $path, $description) = split "\t", $self->{_docs}{$id}; |
|
152
|
|
|
|
|
|
|
my $score = $scores->{$id}; |
|
153
|
|
|
|
|
|
|
my @filenames = $self->find_source($path); |
|
154
|
|
|
|
|
|
|
my $buf = join "\n", map {$self->slurp_file($_)} @filenames; |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
my $excerpts = $indexer->excerpts($buf, $regex); |
|
157
|
|
|
|
|
|
|
foreach (@$excerpts) { |
|
158
|
|
|
|
|
|
|
s/&/&/g, s/</</g, s/>/>/g; |
|
159
|
|
|
|
|
|
|
s/\[\[/<span class='hl'>/g, s/\]\]/<\/span>/g; |
|
160
|
|
|
|
|
|
|
} |
|
161
|
|
|
|
|
|
|
$excerpts = join "<span class='sep'>/</span>", @$excerpts; |
|
162
|
|
|
|
|
|
|
$html .= <<__EOHTML__; |
|
163
|
|
|
|
|
|
|
<p> |
|
164
|
|
|
|
|
|
|
<a href="$self->{root_url}/source/$path" class="src">source</a> |
|
165
|
|
|
|
|
|
|
<a href="$self->{root_url}/$path">$path</a> |
|
166
|
|
|
|
|
|
|
(<small>$score</small>) <em>$description</em> |
|
167
|
|
|
|
|
|
|
<br> |
|
168
|
|
|
|
|
|
|
<small>$excerpts</small> |
|
169
|
|
|
|
|
|
|
</p> |
|
170
|
|
|
|
|
|
|
__EOHTML__ |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
$html .= "<hr>$nav_links\n"; |
|
174
|
|
|
|
|
|
|
return $self->send_html($html); |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub paginate_results { |
|
180
|
|
|
|
|
|
|
my ($self, $doc_ids_ref) = @_; |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
my $n_docs = @$doc_ids_ref; |
|
183
|
|
|
|
|
|
|
my $count = $self->{params}{count} || 50; |
|
184
|
|
|
|
|
|
|
my $start_record = $self->{params}{start} || 0; |
|
185
|
|
|
|
|
|
|
my $end_record = min($start_record + $count - 1, $n_docs - 1); |
|
186
|
|
|
|
|
|
|
@$doc_ids_ref = @$doc_ids_ref[$start_record ... $end_record]; |
|
187
|
|
|
|
|
|
|
my $prev_idx = max($start_record - $count, 0); |
|
188
|
|
|
|
|
|
|
my $next_idx = $start_record + $count; |
|
189
|
|
|
|
|
|
|
my $base_url = "?source=full_text&search=$self->{params}{search}"; |
|
190
|
|
|
|
|
|
|
my $prev_link |
|
191
|
|
|
|
|
|
|
= $start_record > 0 ? uri_escape("$base_url&start=$prev_idx") : ""; |
|
192
|
|
|
|
|
|
|
my $next_link |
|
193
|
|
|
|
|
|
|
= $next_idx < $n_docs ? uri_escape("$base_url&start=$next_idx") : ""; |
|
194
|
|
|
|
|
|
|
$_ += 1 for $start_record, $end_record; |
|
195
|
|
|
|
|
|
|
my $nav_links = ""; |
|
196
|
|
|
|
|
|
|
$nav_links .= "<a href='$prev_link'>[Previous <<]</a> " if $prev_link; |
|
197
|
|
|
|
|
|
|
$nav_links .= "Results <b>$start_record</b> to <b>$end_record</b> " |
|
198
|
|
|
|
|
|
|
. "from <b>$n_docs</b>"; |
|
199
|
|
|
|
|
|
|
$nav_links .= " <a href='$next_link'>[>> Next]</a> " if $next_link; |
|
200
|
|
|
|
|
|
|
return $nav_links; |
|
201
|
|
|
|
|
|
|
} |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub modlist { |
|
208
|
|
|
|
|
|
|
my ($self, $search_string) = @_; |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
$self->_tie_docs(DB_RDONLY); |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
length($search_string) >= 2 or die "module_list: arg too short"; |
|
213
|
|
|
|
|
|
|
my $regex = qr/^\d+\t(\Q$search_string\E[^\t]*)/i; |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
my @modules; |
|
216
|
|
|
|
|
|
|
foreach my $val (values %{$self->{_docs}}) { |
|
217
|
|
|
|
|
|
|
$val =~ $regex or next; |
|
218
|
|
|
|
|
|
|
(my $module = $1) =~ s[/][::]g; |
|
219
|
|
|
|
|
|
|
push @modules, $module; |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
my $json_names = "[" . join(",", map {qq{"$_"}} sort @modules) . "]"; |
|
223
|
|
|
|
|
|
|
return $self->send_content({content => $json_names, |
|
224
|
|
|
|
|
|
|
mime_type => 'application/x-json'}); |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub get_abstract { |
|
229
|
|
|
|
|
|
|
my ($self, $path) = @_; |
|
230
|
|
|
|
|
|
|
if (!$self->{_path_to_descr}) { |
|
231
|
|
|
|
|
|
|
eval {$self->_tie_docs(DB_RDONLY); 1} |
|
232
|
|
|
|
|
|
|
or return; |
|
233
|
|
|
|
|
|
|
$self->{_path_to_descr} = { |
|
234
|
|
|
|
|
|
|
map {(split /\t/, $_)[1,2]} values %{$self->{_docs}} |
|
235
|
|
|
|
|
|
|
}; |
|
236
|
|
|
|
|
|
|
} |
|
237
|
|
|
|
|
|
|
my $description = $self->{_path_to_descr}->{$path} or return; |
|
238
|
|
|
|
|
|
|
(my $abstract = $description) =~ s/^.*?-\s*//; |
|
239
|
|
|
|
|
|
|
return $abstract; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
sub import { |
|
248
|
|
|
|
|
|
|
my $class = shift; |
|
249
|
|
|
|
|
|
|
my ($package, $filename) = caller; |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
no strict 'refs'; |
|
252
|
|
|
|
|
|
|
*{'main::index'} = sub {$class->new->index(@_)} |
|
253
|
|
|
|
|
|
|
if $package eq 'main' and $filename eq '-e'; |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
sub index { |
|
258
|
|
|
|
|
|
|
my ($self, %options) = @_; |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
die "invalid option : $_" |
|
262
|
|
|
|
|
|
|
if grep {!/^-(from_scratch|max_size|positions)$/} keys %options; |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
-d $index_dir or mkdir $index_dir or die "mkdir $index_dir: $!"; |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
if ($options{-from_scratch}) { |
|
269
|
|
|
|
|
|
|
unlink $_ or die "unlink $_ : $!" foreach glob("$index_dir/*.bdb"); |
|
270
|
|
|
|
|
|
|
} |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
$self->{_seen_path} = {}; |
|
274
|
|
|
|
|
|
|
$self->{_last_doc_id} = 0; |
|
275
|
|
|
|
|
|
|
$self->{_max_size_for_indexing} = $options{-max_size} |
|
276
|
|
|
|
|
|
|
|| $defaut_max_size_for_indexing; |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
$self->_tie_docs(DB_CREATE); |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
$self->{_max_doc_id} = 0; |
|
283
|
|
|
|
|
|
|
$self->{_previous_index} = {}; |
|
284
|
|
|
|
|
|
|
while (my ($id, $doc_descr) = each %{$self->{_docs}}) { |
|
285
|
|
|
|
|
|
|
$self->{_max_doc_id} = max($id, $self->{_max_doc_id}); |
|
286
|
|
|
|
|
|
|
my ($mtime, $path, $description) = split /\t/, $doc_descr; |
|
287
|
|
|
|
|
|
|
$self->{_previous_index}{$path} |
|
288
|
|
|
|
|
|
|
= {id => $id, mtime => $mtime, description => $description}; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
$self->{_indexer} = new Search::Indexer(dir => $index_dir, |
|
293
|
|
|
|
|
|
|
writeMode => 1, |
|
294
|
|
|
|
|
|
|
positions => $options{-positions}, |
|
295
|
|
|
|
|
|
|
wregex => $wregex, |
|
296
|
|
|
|
|
|
|
stopwords => \@stopwords); |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
$self->index_dir($_) foreach @Pod::POM::Web::search_dirs; |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$self->{_indexer} = $self->{_docs} = undef; |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub index_dir { |
|
306
|
|
|
|
|
|
|
my ($self, $rootdir, $path) = @_; |
|
307
|
|
|
|
|
|
|
return if $path =~ /$ignore_dirs/; |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
my $dir = $rootdir; |
|
310
|
|
|
|
|
|
|
if ($path) { |
|
311
|
|
|
|
|
|
|
$dir .= "/$path"; |
|
312
|
|
|
|
|
|
|
return print STDERR "SKIP DIR $dir (already in \@INC)\n" |
|
313
|
|
|
|
|
|
|
if grep {m[^\Q$dir\E]} @Pod::POM::Web::search_dirs; |
|
314
|
|
|
|
|
|
|
} |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
chdir $dir or return print STDERR "SKIP DIR $dir (chdir $dir: $!)\n"; |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
print STDERR "DIR $dir\n"; |
|
319
|
|
|
|
|
|
|
opendir my $dh, "." or die $^E; |
|
320
|
|
|
|
|
|
|
my ($dirs, $files) = part { -d $_ ? 0 : 1} grep {!/^\./} readdir $dh; |
|
321
|
|
|
|
|
|
|
$dirs ||= [], $files ||= []; |
|
322
|
|
|
|
|
|
|
closedir $dh; |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
my %extensions; |
|
325
|
|
|
|
|
|
|
foreach my $file (sort @$files) { |
|
326
|
|
|
|
|
|
|
next unless $file =~ s/\.(pm|pod)$//; |
|
327
|
|
|
|
|
|
|
$extensions{$file}{$1} = 1; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
foreach my $base (keys %extensions) { |
|
331
|
|
|
|
|
|
|
$self->index_file($path, $base, $extensions{$base}); |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
my @subpaths = map {$path ? "$path/$_" : $_} @$dirs; |
|
335
|
|
|
|
|
|
|
$self->index_dir($rootdir, $_) foreach @subpaths; |
|
336
|
|
|
|
|
|
|
} |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub index_file { |
|
340
|
|
|
|
|
|
|
my ($self, $path, $file, $has_ext) = @_; |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
my $fullpath = $path ? "$path/$file" : $file; |
|
343
|
|
|
|
|
|
|
return print STDERR "SKIP $fullpath (shadowing)\n" |
|
344
|
|
|
|
|
|
|
if $self->{_seen_path}{$fullpath}; |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
$self->{_seen_path}{$fullpath} = 1; |
|
347
|
|
|
|
|
|
|
my $max_mtime = 0; |
|
348
|
|
|
|
|
|
|
my ($size, $mtime, @filenames); |
|
349
|
|
|
|
|
|
|
EXT: |
|
350
|
|
|
|
|
|
|
foreach my $ext (qw/pm pod/) { |
|
351
|
|
|
|
|
|
|
next EXT unless $has_ext->{$ext}; |
|
352
|
|
|
|
|
|
|
my $filename = "$file.$ext"; |
|
353
|
|
|
|
|
|
|
($size, $mtime) = (stat $filename)[7, 9] or die "stat $filename: $!"; |
|
354
|
|
|
|
|
|
|
$size < $self->{_max_size_for_indexing} or |
|
355
|
|
|
|
|
|
|
print STDERR "$filename too big ($size bytes), skipped " and next EXT; |
|
356
|
|
|
|
|
|
|
$mtime = max($max_mtime, $mtime); |
|
357
|
|
|
|
|
|
|
push @filenames, $filename; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
if ($mtime <= $self->{_previous_index}{$fullpath}{mtime}) { |
|
361
|
|
|
|
|
|
|
return print STDERR "SKIP $fullpath (index up to date)\n"; |
|
362
|
|
|
|
|
|
|
} |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
if (@filenames) { |
|
365
|
|
|
|
|
|
|
my $old_doc_id = $self->{_previous_index}{$fullpath}{id}; |
|
366
|
|
|
|
|
|
|
my $doc_id = $old_doc_id || ++$self->{_max_doc_id}; |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
print STDERR "INDEXING $fullpath (id $doc_id) ... "; |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
my $t0 = time; |
|
371
|
|
|
|
|
|
|
my $buf = join "\n", map {$self->slurp_file($_)} @filenames; |
|
372
|
|
|
|
|
|
|
my ($description) = ($buf =~ /^=head1\s*NAME\s*(.*)$/m); |
|
373
|
|
|
|
|
|
|
$description ||= ''; |
|
374
|
|
|
|
|
|
|
$description =~ s/\t/ /g; |
|
375
|
|
|
|
|
|
|
$buf =~ s/^=head1\s+($ignore_headings).*$//m; |
|
376
|
|
|
|
|
|
|
$buf =~ s/^=(head\d|item)//mg; |
|
377
|
|
|
|
|
|
|
$buf =~ s/^=\w.*//mg; |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
if ($old_doc_id) { |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$self->{_indexer}->remove($old_doc_id, $buf); |
|
386
|
|
|
|
|
|
|
} |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
$self->{_indexer}->add($doc_id, $buf); |
|
389
|
|
|
|
|
|
|
my $interval = time - $t0; |
|
390
|
|
|
|
|
|
|
printf STDERR "%0.3f s.", $interval; |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
$self->{_docs}{$doc_id} = "$mtime\t$fullpath\t$description"; |
|
393
|
|
|
|
|
|
|
} |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
print STDERR "\n"; |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
sub _tie_docs { |
|
405
|
|
|
|
|
|
|
my ($self, $mode) = @_; |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
tie %{$self->{_docs}}, 'BerkeleyDB::Hash', |
|
409
|
|
|
|
|
|
|
-Filename => "$index_dir/docs.bdb", |
|
410
|
|
|
|
|
|
|
-Flags => $mode |
|
411
|
|
|
|
|
|
|
or die "open $index_dir/docs.bdb : $^E $BerkeleyDB::Error"; |
|
412
|
|
|
|
|
|
|
} |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub uri_escape { |
|
417
|
|
|
|
|
|
|
my $uri = shift; |
|
418
|
|
|
|
|
|
|
$uri =~ s{([^;\/?:@&=\$,A-Za-z0-9\-_.!~*'()])} |
|
419
|
|
|
|
|
|
|
{sprintf("%%%02X", ord($1)) }ge; |
|
420
|
|
|
|
|
|
|
return $uri; |
|
421
|
|
|
|
|
|
|
} |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
1; |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
__END__ |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=head1 NAME |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
Pod::POM::Web::Indexer - full-text search for Pod::POM::Web |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
perl -MPod::POM::Web::Indexer -e index |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
Adds full-text search capabilities to the |
|
439
|
|
|
|
|
|
|
L<Pod::POM::Web|Pod::POM::Web> application. |
|
440
|
|
|
|
|
|
|
This requires L<Search::Indexer|Search::Indexer> to be installed. |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Queries may include plain terms, "exact phrases", |
|
443
|
|
|
|
|
|
|
'+' or '-' prefixes, Boolean operators and parentheses. |
|
444
|
|
|
|
|
|
|
See L<Search::QueryParser|Search::QueryParser> for details. |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=head1 METHODS |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head2 index |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
Pod::POM::Web::Indexer->new->index(%options) |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
Walks through directories in C<@INC> and indexes |
|
454
|
|
|
|
|
|
|
all C<*.pm> and C<*.pod> files, skipping shadowed files |
|
455
|
|
|
|
|
|
|
(files for which a similar loading path was already |
|
456
|
|
|
|
|
|
|
found in previous C<@INC> directories), and skipping |
|
457
|
|
|
|
|
|
|
files that are too big. |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
Default indexing is incremental : files whose modification |
|
460
|
|
|
|
|
|
|
time has not changed since the last indexing operation will |
|
461
|
|
|
|
|
|
|
not be indexed again. |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Options can be |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=over |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=item -max_size |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Size limit (in bytes) above which files will not be indexed. |
|
470
|
|
|
|
|
|
|
The default value is 300K. |
|
471
|
|
|
|
|
|
|
Files of size above this limit are usually not worth |
|
472
|
|
|
|
|
|
|
indexing because they only contain big configuration tables |
|
473
|
|
|
|
|
|
|
(like for example C<Module::CoreList> or C<Unicode::Charname>). |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=item -from_scratch |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
If true, the previous index is deleted, so all files will be freshly |
|
478
|
|
|
|
|
|
|
indexed. If false (the default), indexation is incremental, i.e. files |
|
479
|
|
|
|
|
|
|
whose modification time has not changed will not be re-indexed. |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=item -positions |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
If true, the indexer will also store word positions in documents, so |
|
484
|
|
|
|
|
|
|
that it can later answer to "exact phrase" queries. |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
So if C<-positions> are on, a search for C<"more than one way"> will |
|
487
|
|
|
|
|
|
|
only return documents which contain that exact sequence of contiguous |
|
488
|
|
|
|
|
|
|
words; whereas if C<-positions> are off, the query is equivalent to |
|
489
|
|
|
|
|
|
|
C<more AND than AND one AND way>, i.e. it returns all documents which |
|
490
|
|
|
|
|
|
|
contain these words anywhere and in any order. |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
The option is off by default, because it requires much more disk |
|
493
|
|
|
|
|
|
|
space, and does not seem to be very relevant for searching |
|
494
|
|
|
|
|
|
|
Perl documentation. |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=back |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
The C<index> function is exported into the C<main::> namespace if perl |
|
499
|
|
|
|
|
|
|
is called with the C<-e> flag, so that you can write |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
perl -MPod::POM::Web::Indexer -e index |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head1 PERFORMANCES |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
On my machine, indexing a module takes an average of 0.2 seconds, |
|
507
|
|
|
|
|
|
|
except for some long and complex sources (this is why sources |
|
508
|
|
|
|
|
|
|
above 300K are ignored by default, see options above). |
|
509
|
|
|
|
|
|
|
Here are the worst figures (in seconds) : |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Date/Manip 39.655 |
|
512
|
|
|
|
|
|
|
DBI 30.73 |
|
513
|
|
|
|
|
|
|
Pod/perlfunc 29.502 |
|
514
|
|
|
|
|
|
|
Module/CoreList 27.287 |
|
515
|
|
|
|
|
|
|
CGI 16.922 |
|
516
|
|
|
|
|
|
|
Config 13.445 |
|
517
|
|
|
|
|
|
|
CPAN 12.598 |
|
518
|
|
|
|
|
|
|
Pod/perlapi 10.906 |
|
519
|
|
|
|
|
|
|
CGI/FormBuilder 8.592 |
|
520
|
|
|
|
|
|
|
Win32/TieRegistry 7.338 |
|
521
|
|
|
|
|
|
|
Spreadsheet/WriteExcel 7.132 |
|
522
|
|
|
|
|
|
|
Pod/perldiag 5.771 |
|
523
|
|
|
|
|
|
|
Parse/RecDescent 5.405 |
|
524
|
|
|
|
|
|
|
Bit/Vector 4.768 |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
The index will be stored in an F<index> subdirectory |
|
527
|
|
|
|
|
|
|
under the module installation directory. |
|
528
|
|
|
|
|
|
|
The total index size should be around 10MB if C<-positions> are off, |
|
529
|
|
|
|
|
|
|
and between 30MB and 50MB if C<-positions> are on, depending on |
|
530
|
|
|
|
|
|
|
how many modules are installed. |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head1 TODO |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
- highlights in shown documents |
|
536
|
|
|
|
|
|
|
- paging |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=cut |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
|