| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package MMM::Text::Search; |
|
3
|
1
|
|
|
1
|
|
1429
|
use File::Copy; |
|
|
1
|
|
|
|
|
9474
|
|
|
|
1
|
|
|
|
|
67
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
#$Id: Search.pm,v 1.50 2004/12/13 18:45:15 maxim Exp $ |
|
6
|
1
|
|
|
1
|
|
7
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
7
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $verbose_flag ); |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
126
|
|
|
8
|
|
|
|
|
|
|
require Exporter; |
|
9
|
|
|
|
|
|
|
require AutoLoader; |
|
10
|
|
|
|
|
|
|
@ISA = qw(Exporter AutoLoader); |
|
11
|
|
|
|
|
|
|
@EXPORT = qw( |
|
12
|
|
|
|
|
|
|
); |
|
13
|
|
|
|
|
|
|
$VERSION = '0.07'; |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# |
|
16
|
|
|
|
|
|
|
# Perl module for indexing and searching text files and web pages. |
|
17
|
|
|
|
|
|
|
# (Max Muzi, Apr-Sep 1999) |
|
18
|
|
|
|
|
|
|
# |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# Note on implementation: |
|
21
|
|
|
|
|
|
|
# The technique used for indexing is substantially derived from that |
|
22
|
|
|
|
|
|
|
# exposed by Tim Kientzle on Dr. Dobbs magazine. (Actually IndexWords() |
|
23
|
|
|
|
|
|
|
# has been cut'n'pasted from his scripts.) |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# |
|
26
|
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
1551
|
use DB_File; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Fcntl; |
|
29
|
|
|
|
|
|
|
require 5.005; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
$verbose_flag = 0; |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $debug_flag = 0; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $errstr = undef; |
|
36
|
|
|
|
|
|
|
my $syntax_error = undef; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub errstr { $errstr }; |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub new { # constructor! (see the docs for usage [sorry, there're no docs ]) |
|
41
|
|
|
|
|
|
|
my $pkg = shift; |
|
42
|
|
|
|
|
|
|
my $arg = shift; |
|
43
|
|
|
|
|
|
|
my $opt = undef; |
|
44
|
|
|
|
|
|
|
if (ref($arg) ne "HASH") { |
|
45
|
|
|
|
|
|
|
if (-f $arg) { |
|
46
|
|
|
|
|
|
|
$opt->{IndexDB} = $arg; |
|
47
|
|
|
|
|
|
|
$opt->{Verbose} = shift; |
|
48
|
|
|
|
|
|
|
} |
|
49
|
|
|
|
|
|
|
else { |
|
50
|
|
|
|
|
|
|
die "usage: \$obj = new MMM::Text::Search ( '/index/path' or \$hashref)\n" |
|
51
|
|
|
|
|
|
|
} |
|
52
|
|
|
|
|
|
|
} else { |
|
53
|
|
|
|
|
|
|
$opt = $arg; |
|
54
|
|
|
|
|
|
|
}; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
$verbose_flag = $opt->{Debug} || $opt->{Verbose} ; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my $indexdbpath = $opt->{IndexDB} || $opt->{IndexPath} ; |
|
59
|
|
|
|
|
|
|
my $filemask = $opt->{FileMask} ; |
|
60
|
|
|
|
|
|
|
my $dirs = ( ref($opt->{Dirs}) eq "ARRAY" ) ? $opt->{Dirs} : [ ]; |
|
61
|
|
|
|
|
|
|
my $followsymlinks = defined $opt->{FollowSymLinks}; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
my $opturls = $opt->{Urls} || $opt->{URLs}; |
|
64
|
|
|
|
|
|
|
my $urls = ( ref($opturls) eq "ARRAY" ) ? $opturls : [ ]; |
|
65
|
|
|
|
|
|
|
my $level = int $opt->{Level}; |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
my $locationsdbpath = $indexdbpath; |
|
68
|
|
|
|
|
|
|
$locationsdbpath =~ s/(\.db)*$/\-locations.db/; |
|
69
|
|
|
|
|
|
|
my $titlesdbpath = $indexdbpath; |
|
70
|
|
|
|
|
|
|
$titlesdbpath =~ s/(\.db)*$/\-titles.db/; |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
my $minwordsize = $opt->{MinWordSize} || 1; |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
my $self = { |
|
76
|
|
|
|
|
|
|
indexdbpath => $indexdbpath, |
|
77
|
|
|
|
|
|
|
locationsdbpath => $locationsdbpath, |
|
78
|
|
|
|
|
|
|
titlesdbpath => $titlesdbpath, |
|
79
|
|
|
|
|
|
|
filemask => length($filemask) ? qr/$filemask/ : undef, |
|
80
|
|
|
|
|
|
|
dirs => $dirs, |
|
81
|
|
|
|
|
|
|
followsymlinks => $followsymlinks, |
|
82
|
|
|
|
|
|
|
minwordsize => $minwordsize, |
|
83
|
|
|
|
|
|
|
ignorelimit => $opt->{IgnoreLimit} || (2/3), |
|
84
|
|
|
|
|
|
|
urls => $urls, |
|
85
|
|
|
|
|
|
|
level => $level, |
|
86
|
|
|
|
|
|
|
url_exclude => $opt->{UrlExludeMask} || "(?i).*\.(zip|exe|gz|arj|bin|hqx)", |
|
87
|
|
|
|
|
|
|
file_reader => $opt->{FileReader}, |
|
88
|
|
|
|
|
|
|
use_inode => $opt->{UseInodeAsKey}, |
|
89
|
|
|
|
|
|
|
no_reset => $opt->{UseInodeAsKey} && $opt->{NoReset} |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
}; |
|
92
|
|
|
|
|
|
|
DEBUG("filemask=$filemask, indexfile=$indexdbpath, ignorelimit=$self->{ignorelimit}\n"); |
|
93
|
|
|
|
|
|
|
DEBUG("dirs = [", join(",", @$dirs),"], "); |
|
94
|
|
|
|
|
|
|
DEBUG("urls = [", join(",", @$urls),"] \n"); |
|
95
|
|
|
|
|
|
|
bless($self, $pkg); |
|
96
|
|
|
|
|
|
|
return $self; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub _add_keys_to_match_hash { |
|
100
|
|
|
|
|
|
|
# extract file-codes from $keys and update corresponding $hash elements (score) |
|
101
|
|
|
|
|
|
|
my ($keys, $hash) = @_; |
|
102
|
|
|
|
|
|
|
my $key; |
|
103
|
|
|
|
|
|
|
foreach $key ( unpack("N*",$keys) ) { |
|
104
|
|
|
|
|
|
|
# DEBUG($key, " "); |
|
105
|
|
|
|
|
|
|
# ignored words (stop-words) only include file-id 0 (see FlushCache() below) |
|
106
|
|
|
|
|
|
|
return 0 if $key == 0 ; |
|
107
|
|
|
|
|
|
|
$hash->{$key}++ |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
return 1; |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub _push_words_from_hash { |
|
113
|
|
|
|
|
|
|
my ($hash,$array, $regexp) = @_; |
|
114
|
|
|
|
|
|
|
my $w; |
|
115
|
|
|
|
|
|
|
for $w(keys %$hash) { |
|
116
|
|
|
|
|
|
|
push @$array,$w if $w =~ $regexp; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
#notes on advanced_query(); |
|
123
|
|
|
|
|
|
|
# - queries containing stop-words may yields bizzare results.. |
|
124
|
|
|
|
|
|
|
# - score is not always correct |
|
125
|
|
|
|
|
|
|
# - error handling should be improved... :-) |
|
126
|
|
|
|
|
|
|
sub advanced_query { |
|
127
|
|
|
|
|
|
|
# perform queries such as "( a and ( b or c ) ) and ( d and e) " |
|
128
|
|
|
|
|
|
|
my $self = shift; |
|
129
|
|
|
|
|
|
|
my $expr = shift; |
|
130
|
|
|
|
|
|
|
my $indexdbpath= $self->{indexdbpath}; |
|
131
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
|
132
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
|
133
|
|
|
|
|
|
|
my %indexdb; |
|
134
|
|
|
|
|
|
|
my %locationsdb; |
|
135
|
|
|
|
|
|
|
my %titlesdb; |
|
136
|
|
|
|
|
|
|
return undef unless (-f $indexdbpath && -r _); |
|
137
|
|
|
|
|
|
|
return undef unless (-f $locationsdbpath && -r _); |
|
138
|
|
|
|
|
|
|
return undef unless (-f $titlesdbpath && -r _); |
|
139
|
|
|
|
|
|
|
return undef unless |
|
140
|
|
|
|
|
|
|
tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && |
|
141
|
|
|
|
|
|
|
tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && |
|
142
|
|
|
|
|
|
|
tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); |
|
143
|
|
|
|
|
|
|
my @ignored = (); |
|
144
|
|
|
|
|
|
|
my @words = (); |
|
145
|
|
|
|
|
|
|
my $verbose_flag_tmp = $verbose_flag; |
|
146
|
|
|
|
|
|
|
$verbose_flag = shift; # undocumented debug switch |
|
147
|
|
|
|
|
|
|
chomp $expr; |
|
148
|
|
|
|
|
|
|
undef $syntax_error; #reset error |
|
149
|
|
|
|
|
|
|
DEBUG("********** _match_expression() debug **********\n"); |
|
150
|
|
|
|
|
|
|
my $match = _match_expression($expr, \%indexdb, \@ignored); |
|
151
|
|
|
|
|
|
|
DEBUG("********** end debug **********\n"); |
|
152
|
|
|
|
|
|
|
if ($syntax_error) { |
|
153
|
|
|
|
|
|
|
$errstr = $syntax_error; |
|
154
|
|
|
|
|
|
|
$verbose_flag = $verbose_flag_tmp; |
|
155
|
|
|
|
|
|
|
return undef; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
my $result = _make_result_hash($match,\%locationsdb, \%titlesdb, \@words, \@ignored); |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
untie(%indexdb); |
|
160
|
|
|
|
|
|
|
untie(%locationsdb); |
|
161
|
|
|
|
|
|
|
untie(%titlesdb); |
|
162
|
|
|
|
|
|
|
$verbose_flag = $verbose_flag_tmp; |
|
163
|
|
|
|
|
|
|
return $result; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub _match_expression { # recursively apply a keyword-search expression to indexdb |
|
167
|
|
|
|
|
|
|
# $expr may be either a string or a ref to an array of tokens |
|
168
|
|
|
|
|
|
|
# a ref to a "score" hash is returned (or undef sometimes) |
|
169
|
|
|
|
|
|
|
my ($expr, $index, $ignored) = @_; |
|
170
|
|
|
|
|
|
|
my $parsed = _parse_expression($expr); |
|
171
|
|
|
|
|
|
|
# _parse_expression() returns a reference to an array of three elements: |
|
172
|
|
|
|
|
|
|
# [ operator, left_expr, right_expr] |
|
173
|
|
|
|
|
|
|
# if right_expr is not defined then expr was atomic and left_expr is a string, |
|
174
|
|
|
|
|
|
|
# otherwise both right_expr and left_expr are references to arrays of tokens |
|
175
|
|
|
|
|
|
|
if ( not $parsed) { |
|
176
|
|
|
|
|
|
|
DEBUG("Syntax error :-( \n"); |
|
177
|
|
|
|
|
|
|
return undef; |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
my ( $op, $left,$right) = @$parsed; |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
if ($left && not $right) { |
|
182
|
|
|
|
|
|
|
$left =~ s/^\s*\(?\s*|\s*\)?\s*$//g; |
|
183
|
|
|
|
|
|
|
DEBUG("Looking up >$left<\n"); |
|
184
|
|
|
|
|
|
|
my %matches = (); |
|
185
|
|
|
|
|
|
|
my $word = $left; |
|
186
|
|
|
|
|
|
|
my $rc = 0; |
|
187
|
|
|
|
|
|
|
my $keys = $index->{lc $word}; # get file-id's from indexdb |
|
188
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%matches); |
|
189
|
|
|
|
|
|
|
# if $rc is false then $word is a stop-word, see _add_keys_to_match_hash() for more info |
|
190
|
|
|
|
|
|
|
if (not $rc) { |
|
191
|
|
|
|
|
|
|
DEBUG("$word ignored\n"); |
|
192
|
|
|
|
|
|
|
push @$ignored, $word; |
|
193
|
|
|
|
|
|
|
return undef; |
|
194
|
|
|
|
|
|
|
# what should we do now? gotta think it over... |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
return \%matches; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
DEBUG("Evaluating >$left< --$op-- >$right<\n"); |
|
200
|
|
|
|
|
|
|
my $left_match = _match_expression($left, $index, $ignored); |
|
201
|
|
|
|
|
|
|
my $right_match = _match_expression($right, $index, $ignored); |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
return undef if ($syntax_error); |
|
204
|
|
|
|
|
|
|
my %matches = (); |
|
205
|
|
|
|
|
|
|
my $file = undef; |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
if ($op eq 'AND' ) { |
|
208
|
|
|
|
|
|
|
%matches = ( %$left_match ); |
|
209
|
|
|
|
|
|
|
for $file( keys %matches) { |
|
210
|
|
|
|
|
|
|
delete $matches{$file} unless $right_match->{$file} |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
return \%matches; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
if ($op eq 'AND NOT') { |
|
215
|
|
|
|
|
|
|
%matches = ( %$left_match ); |
|
216
|
|
|
|
|
|
|
for $file( keys %matches) { |
|
217
|
|
|
|
|
|
|
delete $matches{$file} if $right_match->{$file} |
|
218
|
|
|
|
|
|
|
} |
|
219
|
|
|
|
|
|
|
return \%matches; |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
if ($op eq 'OR') { |
|
222
|
|
|
|
|
|
|
%matches = ( %$left_match ); |
|
223
|
|
|
|
|
|
|
for $file( keys %$right_match) { |
|
224
|
|
|
|
|
|
|
if ($matches{$file}) { |
|
225
|
|
|
|
|
|
|
$matches{$file} +=$right_match->{$file}; |
|
226
|
|
|
|
|
|
|
} else { |
|
227
|
|
|
|
|
|
|
$matches{$file} =$right_match->{$file}; |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
return \%matches; |
|
231
|
|
|
|
|
|
|
} |
|
232
|
|
|
|
|
|
|
return undef; |
|
233
|
|
|
|
|
|
|
} |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub _parse_expression { |
|
236
|
|
|
|
|
|
|
my $arg = shift; |
|
237
|
|
|
|
|
|
|
my $tokens = undef; # this is an arry ref |
|
238
|
|
|
|
|
|
|
if (ref($arg) ne 'ARRAY') { |
|
239
|
|
|
|
|
|
|
$tokens = [ |
|
240
|
|
|
|
|
|
|
$arg =~ m/( \( | \)| \bAND\s+NOT\b | \bAND\b | \bOR\b | \"[^\"]+\" | \b\w+\b) /xig |
|
241
|
|
|
|
|
|
|
]; |
|
242
|
|
|
|
|
|
|
} |
|
243
|
|
|
|
|
|
|
# important!! "AND NOT" is treated as a single logical operator... |
|
244
|
|
|
|
|
|
|
# this means that things like "not a and b" aren't well-formed, |
|
245
|
|
|
|
|
|
|
# while "b and not a" is |
|
246
|
|
|
|
|
|
|
else { $tokens = $arg; |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
my $left = undef; # array ref (oppure stringa se è un espressione atomica) |
|
249
|
|
|
|
|
|
|
my $right = undef; # array ref ! |
|
250
|
|
|
|
|
|
|
my $op = 'OR'; |
|
251
|
|
|
|
|
|
|
my $depth = 0; |
|
252
|
|
|
|
|
|
|
my $pos = 0; |
|
253
|
|
|
|
|
|
|
my $tok; |
|
254
|
|
|
|
|
|
|
my $len = int @$tokens; |
|
255
|
|
|
|
|
|
|
DEBUG("expr = ", join(" + ", @$tokens),"\n"); |
|
256
|
|
|
|
|
|
|
while (1) { |
|
257
|
|
|
|
|
|
|
if ($len == 1) { |
|
258
|
|
|
|
|
|
|
return [ undef, $tokens->[0], undef ]; |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
DEBUG("$tok : depth=$depth pos=$pos len=$len\n"); |
|
261
|
|
|
|
|
|
|
if ($depth == 0 && ($pos == $len) ) { |
|
262
|
|
|
|
|
|
|
if ($tokens->[0] eq '(' && $tokens->[$len-1] eq ')') { |
|
263
|
|
|
|
|
|
|
# take off outer parentheses... |
|
264
|
|
|
|
|
|
|
shift @$tokens; |
|
265
|
|
|
|
|
|
|
pop @$tokens; |
|
266
|
|
|
|
|
|
|
$len -= 2; |
|
267
|
|
|
|
|
|
|
$pos = 0; |
|
268
|
|
|
|
|
|
|
$depth = 0; |
|
269
|
|
|
|
|
|
|
DEBUG("expr = ", join(" + ", @$tokens),"\n"); |
|
270
|
|
|
|
|
|
|
} else { # ahhhh... this expression won't be parsed... |
|
271
|
|
|
|
|
|
|
$syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; |
|
272
|
|
|
|
|
|
|
DEBUG("atom not atomic\n"); |
|
273
|
|
|
|
|
|
|
return undef; |
|
274
|
|
|
|
|
|
|
} |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
} elsif ( $pos == $len ) { |
|
277
|
|
|
|
|
|
|
$syntax_error = "Non-matching parentheses (\"".join(' ', @$tokens)."\")"; |
|
278
|
|
|
|
|
|
|
DEBUG("non matching parentheses\n"); |
|
279
|
|
|
|
|
|
|
return undef; |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
$tok = $tokens->[$pos++]; |
|
282
|
|
|
|
|
|
|
if ($tok eq '(') { $depth++; next; } |
|
283
|
|
|
|
|
|
|
if ($tok eq ')') { $depth--; next; } |
|
284
|
|
|
|
|
|
|
next if $depth; |
|
285
|
|
|
|
|
|
|
if ($tok =~ /\b(AND\s+NOT|AND|OR)\b/i) { |
|
286
|
|
|
|
|
|
|
if ($pos == 1 || $pos == $len) { |
|
287
|
|
|
|
|
|
|
$syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; |
|
288
|
|
|
|
|
|
|
return undef |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
$op = uc $1; $op =~ s/\s+/ /g; |
|
291
|
|
|
|
|
|
|
$left = [ @$tokens[0..$pos-2] ]; |
|
292
|
|
|
|
|
|
|
$right = [ @$tokens[$pos..$len-1] ]; |
|
293
|
|
|
|
|
|
|
DEBUG("right = ", join(" + ", @$right),"\n"); |
|
294
|
|
|
|
|
|
|
DEBUG("left = ", join(" + ", @$left),"\n"); |
|
295
|
|
|
|
|
|
|
return [ $op, $left, $right ]; |
|
296
|
|
|
|
|
|
|
} |
|
297
|
|
|
|
|
|
|
} |
|
298
|
|
|
|
|
|
|
} |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
sub query { # simple query.... altavista +/- prefixes are recognized... |
|
303
|
|
|
|
|
|
|
# */? globbing also works but |
|
304
|
|
|
|
|
|
|
# slows query down significantly |
|
305
|
|
|
|
|
|
|
# globbing implicitly discards +/- prefix (it's a BUG!!!) |
|
306
|
|
|
|
|
|
|
my $self = shift; |
|
307
|
|
|
|
|
|
|
my $indexdbpath= $self->{indexdbpath}; |
|
308
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
|
309
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
|
310
|
|
|
|
|
|
|
my %indexdb; |
|
311
|
|
|
|
|
|
|
my %locationsdb; |
|
312
|
|
|
|
|
|
|
my %titlesdb; |
|
313
|
|
|
|
|
|
|
return undef unless (-f $indexdbpath && -r _); |
|
314
|
|
|
|
|
|
|
return undef unless (-f $locationsdbpath && -r _); |
|
315
|
|
|
|
|
|
|
return undef unless (-f $titlesdbpath && -r _); |
|
316
|
|
|
|
|
|
|
return undef unless |
|
317
|
|
|
|
|
|
|
tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && |
|
318
|
|
|
|
|
|
|
tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && |
|
319
|
|
|
|
|
|
|
tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); |
|
320
|
|
|
|
|
|
|
my %matches; |
|
321
|
|
|
|
|
|
|
my %limit; |
|
322
|
|
|
|
|
|
|
my %exclude; |
|
323
|
|
|
|
|
|
|
my @ignored; |
|
324
|
|
|
|
|
|
|
my $key; |
|
325
|
|
|
|
|
|
|
my $word; |
|
326
|
|
|
|
|
|
|
my $mustbe_words = 0; |
|
327
|
|
|
|
|
|
|
my @words = (); |
|
328
|
|
|
|
|
|
|
my $glob_regexp = undef; |
|
329
|
|
|
|
|
|
|
for (@_) { # globbing feature... e.g. uni* passw? |
|
330
|
|
|
|
|
|
|
if ( /\*|\?/) { |
|
331
|
|
|
|
|
|
|
s/\*/\.\*/g; |
|
332
|
|
|
|
|
|
|
s/\?/\./g; |
|
333
|
|
|
|
|
|
|
$glob_regexp = $glob_regexp ? $glob_regexp."|^$_\$" : "^$_\$" ; |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
else { |
|
336
|
|
|
|
|
|
|
push @words, $_; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
if ($glob_regexp) { |
|
340
|
|
|
|
|
|
|
my $regexp = qr/$glob_regexp/; |
|
341
|
|
|
|
|
|
|
# collect all words in db matching $glob_regexp and append them to the query |
|
342
|
|
|
|
|
|
|
_push_words_from_hash(\%indexdb, \@words, $regexp); |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
DEBUG("looking up ", join(", ", @words ), "\n"); |
|
346
|
|
|
|
|
|
|
foreach $word (@words) { |
|
347
|
|
|
|
|
|
|
my $rc = 0; |
|
348
|
|
|
|
|
|
|
# DEBUG($word); |
|
349
|
|
|
|
|
|
|
if ($word =~ /^-(.*)/) { |
|
350
|
|
|
|
|
|
|
my $keys = $indexdb{lc $1}; |
|
351
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%exclude); |
|
352
|
|
|
|
|
|
|
} elsif ($word =~ /^\+(.*)/) { |
|
353
|
|
|
|
|
|
|
$mustbe_words++; |
|
354
|
|
|
|
|
|
|
my $keys = $indexdb{lc $1}; |
|
355
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%limit); |
|
356
|
|
|
|
|
|
|
} else { |
|
357
|
|
|
|
|
|
|
my $keys = $indexdb{lc $word}; |
|
358
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%matches); |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
# DEBUG("\n"); |
|
361
|
|
|
|
|
|
|
if (not $rc) { push @ignored, $word } |
|
362
|
|
|
|
|
|
|
} |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
if ($mustbe_words) { |
|
365
|
|
|
|
|
|
|
for $key(keys %limit) { |
|
366
|
|
|
|
|
|
|
next unless $limit{$key} >= $mustbe_words; |
|
367
|
|
|
|
|
|
|
$matches{$key} += $limit{$key} ; |
|
368
|
|
|
|
|
|
|
} |
|
369
|
|
|
|
|
|
|
for $key(keys %matches) { |
|
370
|
|
|
|
|
|
|
delete $matches{$key} unless $limit{$key}; |
|
371
|
|
|
|
|
|
|
} |
|
372
|
|
|
|
|
|
|
} |
|
373
|
|
|
|
|
|
|
for $key(keys %exclude) { |
|
374
|
|
|
|
|
|
|
delete $matches{$key}; |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
my $result = _make_result_hash(\%matches,\%locationsdb, \%titlesdb, \@words, \@ignored); |
|
377
|
|
|
|
|
|
|
untie(%indexdb); |
|
378
|
|
|
|
|
|
|
untie(%locationsdb); |
|
379
|
|
|
|
|
|
|
untie(%titlesdb); |
|
380
|
|
|
|
|
|
|
return $result; |
|
381
|
|
|
|
|
|
|
} |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
sub _make_result_hash { |
|
385
|
|
|
|
|
|
|
# hash-ref hash-ref hash-ref array-ref array-ref |
|
386
|
|
|
|
|
|
|
my ( $match, $locationsdb, $titlesdb, $words, $ignored ) = @_; |
|
387
|
|
|
|
|
|
|
my $result = { |
|
388
|
|
|
|
|
|
|
searched => $words, |
|
389
|
|
|
|
|
|
|
ignored => $ignored, |
|
390
|
|
|
|
|
|
|
entries => [] |
|
391
|
|
|
|
|
|
|
}; |
|
392
|
|
|
|
|
|
|
my $key; |
|
393
|
|
|
|
|
|
|
foreach $key (keys %$match) { |
|
394
|
|
|
|
|
|
|
my $ckey = pack("xN",$key); |
|
395
|
|
|
|
|
|
|
my $name = $locationsdb->{$ckey}; |
|
396
|
|
|
|
|
|
|
my $title = $titlesdb->{$ckey}; |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
push @{ $result->{entries} }, { |
|
399
|
|
|
|
|
|
|
location => $name, |
|
400
|
|
|
|
|
|
|
score => $match->{$key}, |
|
401
|
|
|
|
|
|
|
title => $title |
|
402
|
|
|
|
|
|
|
}; |
|
403
|
|
|
|
|
|
|
DEBUG("$name: $match->{$key}\n"); |
|
404
|
|
|
|
|
|
|
} |
|
405
|
|
|
|
|
|
|
return $result; |
|
406
|
|
|
|
|
|
|
} |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub DEBUG (@) { $verbose_flag && print STDERR @_ }; |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub tie_hash { |
|
417
|
|
|
|
|
|
|
my ($hashref, $file ,$perm) = @_; |
|
418
|
|
|
|
|
|
|
$perm = (O_RDWR|O_CREAT) unless defined $perm; |
|
419
|
|
|
|
|
|
|
my $rc = tied(%$hashref); |
|
420
|
|
|
|
|
|
|
return $rc if $rc; |
|
421
|
|
|
|
|
|
|
$rc = tie(%$hashref,'DB_File',$file, $perm, 0644, $DB_File::DB_BTREE) ; |
|
422
|
|
|
|
|
|
|
if ($debug_flag) { |
|
423
|
|
|
|
|
|
|
my $count = int keys %$hashref; |
|
424
|
|
|
|
|
|
|
DEBUG("tie $hashref ($rc) ($count keys)\n"); |
|
425
|
|
|
|
|
|
|
} elsif ($verbose_flag) { |
|
426
|
|
|
|
|
|
|
DEBUG("tie $hashref ($rc)\n"); |
|
427
|
|
|
|
|
|
|
} |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
return $rc; |
|
431
|
|
|
|
|
|
|
} |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
sub untie_hash { |
|
434
|
|
|
|
|
|
|
my ($hashref, $file ) = @_; |
|
435
|
|
|
|
|
|
|
if ($debug_flag) { |
|
436
|
|
|
|
|
|
|
my $count = int keys %$hashref; |
|
437
|
|
|
|
|
|
|
DEBUG("untie $hashref ($count keys)\n") |
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
untie(%$hashref); |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
1; |
|
444
|
|
|
|
|
|
|
#__END__ |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 NAME |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
MMM::Text::Search - Perl module for indexing and searching text files and web objects |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
use MMM::Text::Search; |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
my $srch = new MMM::Text::Search { #for indexing... |
|
455
|
|
|
|
|
|
|
#index main file location... |
|
456
|
|
|
|
|
|
|
IndexPath => "/tmp/myindex.db", |
|
457
|
|
|
|
|
|
|
#local files... (optional) |
|
458
|
|
|
|
|
|
|
FileMask => '(?i)(\.txt|\.htm.?)$', |
|
459
|
|
|
|
|
|
|
Dirs => [ "/usr/doc", "/tmp" ] , |
|
460
|
|
|
|
|
|
|
FollowSymLinks => 0|1, (default = 0) |
|
461
|
|
|
|
|
|
|
#web objects... (optional) |
|
462
|
|
|
|
|
|
|
URLs => [ "http://localhost/", ... ], |
|
463
|
|
|
|
|
|
|
Level => recursion-level (0=unlimited) |
|
464
|
|
|
|
|
|
|
#common options... |
|
465
|
|
|
|
|
|
|
IgnoreLimit => 0.3, (default = 2/3) |
|
466
|
|
|
|
|
|
|
Verbose => 0|1 |
|
467
|
|
|
|
|
|
|
}; |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
$srch->start_indexing_session(); |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
$srch->commit_indexing_session(); |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
$srch->index_default_locations(); |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
$srch->index_content( { title => '...', |
|
476
|
|
|
|
|
|
|
content=> '...', |
|
477
|
|
|
|
|
|
|
id => '...' } ); |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
$srch->makeindex; |
|
480
|
|
|
|
|
|
|
(Obsolete.) |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
my $srch = new MMM::Text::Search ( #for searching.... |
|
487
|
|
|
|
|
|
|
"/tmp/myindex.db", verbose_flag ); |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
my $hashref = $srch->query("pizza","ciao", "-pasta" ); |
|
490
|
|
|
|
|
|
|
my $hashref = $srch->advanced_query("(pizza OR ciao) AND NOT pasta"); |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
$srch->errstr() # returns last error |
|
493
|
|
|
|
|
|
|
# (only query syntax-errors for the moment being) |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
$srch->dump_word_stats(\*FH) |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item * |
|
502
|
|
|
|
|
|
|
Indexing |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
When a session is closed the following files will have been created |
|
505
|
|
|
|
|
|
|
(assuming IndexPath = /path/myindex.db, see constructor): |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
/path/myindex.db word index database |
|
509
|
|
|
|
|
|
|
/path/myindex-locations.db filename/URL database |
|
510
|
|
|
|
|
|
|
/path/myindex-titles.db html title database |
|
511
|
|
|
|
|
|
|
/path/myindex.stopwords stop-words list |
|
512
|
|
|
|
|
|
|
/path/myindex.filelist readable list of indexed files/URLs |
|
513
|
|
|
|
|
|
|
/path/myindex.deadlinks broken http links |
|
514
|
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
[... lots of important things missing ... ] |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
start_indexing_session() starts session. |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
commit_indexing_session() commits and closes current session. |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
index_default_locations() indexes all files and URLs specified on construction. |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
index_content() pushes content into indexing engine. |
|
524
|
|
|
|
|
|
|
Argument must have the following structure |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
{ title => '...', content=> '...', id => '...' } |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
makeindex() is obsolete. |
|
530
|
|
|
|
|
|
|
Equivalent to: |
|
531
|
|
|
|
|
|
|
$srch->start_indexing_session(); |
|
532
|
|
|
|
|
|
|
$srch->index_default_locations(); |
|
533
|
|
|
|
|
|
|
$srch->commit_indexing_session(); |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
dump_word_stats(\*FH) dumps all words sorted by occurence frequency using |
|
539
|
|
|
|
|
|
|
FH file handle (or STDOUT if no parameter is specified). Stop-words get a |
|
540
|
|
|
|
|
|
|
frequency value of 1. |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=item * |
|
543
|
|
|
|
|
|
|
Searching |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
Both query() and advanced_query() return a reference to a hash with |
|
546
|
|
|
|
|
|
|
the following structure: |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
( |
|
549
|
|
|
|
|
|
|
ignored => [ string, string, ... ], # ignored words |
|
550
|
|
|
|
|
|
|
searched => [ string, string, ... ], # words searched for |
|
551
|
|
|
|
|
|
|
entries => [ hashref, hashref, ... ] # list of records |
|
552
|
|
|
|
|
|
|
# found |
|
553
|
|
|
|
|
|
|
) |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
The 'entries' element is a reference to an array of hashes, each having |
|
556
|
|
|
|
|
|
|
the following structure: |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
( |
|
559
|
|
|
|
|
|
|
location => string, # file path or URL or anything |
|
560
|
|
|
|
|
|
|
score => number, # score |
|
561
|
|
|
|
|
|
|
title => string # HTML title |
|
562
|
|
|
|
|
|
|
) |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 NOTES |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
Note on implementation: |
|
567
|
|
|
|
|
|
|
The technique used for indexing is substantially derived from that |
|
568
|
|
|
|
|
|
|
exposed by Tim Kientzle on Dr. Dobbs magazine. |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=head1 BUGS |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
Many, I guess. |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=head1 AUTHOR |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
Max Muzi |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
perl(1). |
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=cut |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
# |
|
587
|
|
|
|
|
|
|
#-------------------- the following code is only used when indexing ---------------- |
|
588
|
|
|
|
|
|
|
# |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
sub dump_word_stats { |
|
591
|
|
|
|
|
|
|
my $self = shift; |
|
592
|
|
|
|
|
|
|
my $fh = shift || \*STDOUT; |
|
593
|
|
|
|
|
|
|
my $indexdbpath= $self->{indexdbpath}; |
|
594
|
|
|
|
|
|
|
my %indexdb; |
|
595
|
|
|
|
|
|
|
die unless (-f $indexdbpath && -r _); |
|
596
|
|
|
|
|
|
|
tie_hash(\%indexdb,$indexdbpath, O_RDONLY ); |
|
597
|
|
|
|
|
|
|
my %index = ( %indexdb ); |
|
598
|
|
|
|
|
|
|
my $w; |
|
599
|
|
|
|
|
|
|
for $w( sort { length($index{$b}) <=> length($index{$a}) } |
|
600
|
|
|
|
|
|
|
keys %index ) { |
|
601
|
|
|
|
|
|
|
print $fh $w, "\t", length($index{$w}) / 2, "\n"; |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
untie_hash(\%indexdb); |
|
604
|
|
|
|
|
|
|
} |
|
605
|
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
sub start_indexing_session |
|
608
|
|
|
|
|
|
|
{ |
|
609
|
|
|
|
|
|
|
my $self = shift; |
|
610
|
|
|
|
|
|
|
$self->rollback_indexing_session; |
|
611
|
|
|
|
|
|
|
my $key = 0; |
|
612
|
|
|
|
|
|
|
my $indexdbpath = $self->{indexdbpath}; |
|
613
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
|
614
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
my $filemask = $self->{filemask}; |
|
617
|
|
|
|
|
|
|
my $keyref = \$key; |
|
618
|
|
|
|
|
|
|
my $filelistfile = $indexdbpath; |
|
619
|
|
|
|
|
|
|
$filelistfile =~ s/(\.db)?$/\.filelist/; |
|
620
|
|
|
|
|
|
|
open FILELIST, ">".$filelistfile; |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
my $session = { |
|
623
|
|
|
|
|
|
|
indexdbpath => $indexdbpath, |
|
624
|
|
|
|
|
|
|
locationsdbpath => $locationsdbpath, |
|
625
|
|
|
|
|
|
|
titlesdbpath => $titlesdbpath, |
|
626
|
|
|
|
|
|
|
indexdb => { }, |
|
627
|
|
|
|
|
|
|
locationsdb => { }, |
|
628
|
|
|
|
|
|
|
titlesdb => { }, |
|
629
|
|
|
|
|
|
|
cachedb => { }, |
|
630
|
|
|
|
|
|
|
filemask => $filemask, |
|
631
|
|
|
|
|
|
|
current_key => 16, # first 16 values are reserved (0 = word is ignored) |
|
632
|
|
|
|
|
|
|
bytes => 0, |
|
633
|
|
|
|
|
|
|
count => 0, |
|
634
|
|
|
|
|
|
|
filecount => 0, |
|
635
|
|
|
|
|
|
|
listfh => \*FILELIST, |
|
636
|
|
|
|
|
|
|
status_THE => 0, |
|
637
|
|
|
|
|
|
|
followsymlinks => $self->{followsymlinks}, |
|
638
|
|
|
|
|
|
|
minwordsize => $self->{minwordsize}, |
|
639
|
|
|
|
|
|
|
ignoreword => {}, |
|
640
|
|
|
|
|
|
|
autoignore => 1, |
|
641
|
|
|
|
|
|
|
ignorelimit => $self->{ignorelimit} || (2/3), |
|
642
|
|
|
|
|
|
|
level => $self->{level}, |
|
643
|
|
|
|
|
|
|
url_exclude => $self->{url_exclude}, |
|
644
|
|
|
|
|
|
|
file_reader => $self->{file_reader}, |
|
645
|
|
|
|
|
|
|
use_inode => $self->{use_inode}, |
|
646
|
|
|
|
|
|
|
no_reset => $self->{no_reset}, |
|
647
|
|
|
|
|
|
|
}; |
|
648
|
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
unlink $indexdbpath."~"; |
|
650
|
|
|
|
|
|
|
unlink $locationsdbpath."~"; |
|
651
|
|
|
|
|
|
|
unlink $titlesdbpath."~"; |
|
652
|
|
|
|
|
|
|
if( $self->{no_reset} ) |
|
653
|
|
|
|
|
|
|
{ |
|
654
|
|
|
|
|
|
|
copy( $indexdbpath, $indexdbpath."~" ); |
|
655
|
|
|
|
|
|
|
copy( $locationsdbpath, $locationsdbpath."~" ); |
|
656
|
|
|
|
|
|
|
copy( $titlesdbpath, $titlesdbpath."~" ); |
|
657
|
|
|
|
|
|
|
} |
|
658
|
|
|
|
|
|
|
tie_hash($session->{indexdb}, $indexdbpath."~" ) or die "$indexdbpath: $!\n"; |
|
659
|
|
|
|
|
|
|
tie_hash($session->{locationsdb}, $locationsdbpath."~" ) or die $!; |
|
660
|
|
|
|
|
|
|
tie_hash($session->{titlesdb},$titlesdbpath."~" ) or die $!; |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
my $ignorefile = $indexdbpath; |
|
663
|
|
|
|
|
|
|
$ignorefile =~ s/(\.db)?$/\.stopwords/; |
|
664
|
|
|
|
|
|
|
if (-r $ignorefile) { # read *-stopwords.dat file |
|
665
|
|
|
|
|
|
|
open F, $ignorefile; |
|
666
|
|
|
|
|
|
|
while () { |
|
667
|
|
|
|
|
|
|
chomp; |
|
668
|
|
|
|
|
|
|
s/^\s+|\s+$//g; |
|
669
|
|
|
|
|
|
|
$session->{ignoreword}->{$_} = 1; |
|
670
|
|
|
|
|
|
|
} |
|
671
|
|
|
|
|
|
|
close F; |
|
672
|
|
|
|
|
|
|
my $count = int keys %{ $session->{ignoreword} }; |
|
673
|
|
|
|
|
|
|
DEBUG("using stop-words from $ignorefile ($count words)\n"); |
|
674
|
|
|
|
|
|
|
$session->{autoignore} = 0; |
|
675
|
|
|
|
|
|
|
} |
|
676
|
|
|
|
|
|
|
$session->{ignorefile} = $ignorefile; |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
my $time = time(); |
|
679
|
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
$session->{start_time} = $time; |
|
681
|
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
$self->{session} = $session; |
|
683
|
|
|
|
|
|
|
} |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
sub index_default_locations |
|
686
|
|
|
|
|
|
|
{ |
|
687
|
|
|
|
|
|
|
my $self = shift; |
|
688
|
|
|
|
|
|
|
my $session = $self->{session}; |
|
689
|
|
|
|
|
|
|
return unless $session; |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
my $dirs = $self->{dirs}; |
|
692
|
|
|
|
|
|
|
my $urls = $self->{urls}; |
|
693
|
|
|
|
|
|
|
my $filecount = 0; |
|
694
|
|
|
|
|
|
|
DEBUG("Counting files...\n") if int @$dirs; |
|
695
|
|
|
|
|
|
|
my $dir; |
|
696
|
|
|
|
|
|
|
for $dir( sort @$dirs) { $filecount += IndexDir($session, $dir, 1); } |
|
697
|
|
|
|
|
|
|
$session->{filecount} = $filecount; |
|
698
|
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
for $dir( sort @$dirs) { IndexDir($session, $dir); } |
|
700
|
|
|
|
|
|
|
for my $url( sort @$urls) { IndexWeb($session, $url); } |
|
701
|
|
|
|
|
|
|
} |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
sub index_content |
|
704
|
|
|
|
|
|
|
{ |
|
705
|
|
|
|
|
|
|
my $self = shift; |
|
706
|
|
|
|
|
|
|
my $session = $self->{session}; |
|
707
|
|
|
|
|
|
|
return unless $session; |
|
708
|
|
|
|
|
|
|
my $info = shift; |
|
709
|
|
|
|
|
|
|
if( ref($info) ne 'HASH' ) |
|
710
|
|
|
|
|
|
|
{ warn("usage: \$src->index_content( { content=>'...', id=>'...', title=>'...' } )\n"); |
|
711
|
|
|
|
|
|
|
return undef; |
|
712
|
|
|
|
|
|
|
} |
|
713
|
|
|
|
|
|
|
IndexFile( $session, $info->{id}, $info->{content}, $info->{title} ); |
|
714
|
|
|
|
|
|
|
return 1; |
|
715
|
|
|
|
|
|
|
} |
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
sub rollback_indexing_session |
|
718
|
|
|
|
|
|
|
{ |
|
719
|
|
|
|
|
|
|
my $self = shift; |
|
720
|
|
|
|
|
|
|
my $session = $self->{session}; |
|
721
|
|
|
|
|
|
|
return unless $session; |
|
722
|
|
|
|
|
|
|
untie_hash($session->{indexdb}); |
|
723
|
|
|
|
|
|
|
untie_hash($session->{locationsdb}); |
|
724
|
|
|
|
|
|
|
untie_hash($session->{titlesdb}); |
|
725
|
|
|
|
|
|
|
my $indexdbpath = $self->{indexdbpath}; |
|
726
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
|
727
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
unlink $indexdbpath."~"; |
|
730
|
|
|
|
|
|
|
unlink $locationsdbpath."~"; |
|
731
|
|
|
|
|
|
|
unlink $titlesdbpath."~"; |
|
732
|
|
|
|
|
|
|
$self->{session} = undef; |
|
733
|
|
|
|
|
|
|
} |
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
sub DESTROY |
|
736
|
|
|
|
|
|
|
{ |
|
737
|
|
|
|
|
|
|
my $self = shift; |
|
738
|
|
|
|
|
|
|
$self->rollback_indexing_session; |
|
739
|
|
|
|
|
|
|
} |
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
sub commit_indexing_session |
|
742
|
|
|
|
|
|
|
{ |
|
743
|
|
|
|
|
|
|
my $self = shift; |
|
744
|
|
|
|
|
|
|
my $session = $self->{session}; |
|
745
|
|
|
|
|
|
|
return unless $session; |
|
746
|
|
|
|
|
|
|
FlushCache($session->{cachedb}, $session->{indexdb}, $session); |
|
747
|
|
|
|
|
|
|
my $time = time()-$session->{start_time}; |
|
748
|
|
|
|
|
|
|
DEBUG("$session->{bytes} bytes read, $session->{count} files processed in $time seconds\n"); |
|
749
|
|
|
|
|
|
|
untie_hash($session->{indexdb}); |
|
750
|
|
|
|
|
|
|
untie_hash($session->{locationsdb}); |
|
751
|
|
|
|
|
|
|
untie_hash($session->{titlesdb}); |
|
752
|
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
my $indexdbpath = $self->{indexdbpath}; |
|
754
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
|
755
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
|
756
|
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
rename $indexdbpath."~", $indexdbpath; |
|
758
|
|
|
|
|
|
|
rename $locationsdbpath."~", $locationsdbpath ; |
|
759
|
|
|
|
|
|
|
rename $titlesdbpath."~", $titlesdbpath; |
|
760
|
|
|
|
|
|
|
close $session->{listfh}; |
|
761
|
|
|
|
|
|
|
if ( $session->{autoignore} ) { |
|
762
|
|
|
|
|
|
|
my $ignorefile = $session->{ignorefile}; |
|
763
|
|
|
|
|
|
|
open F, ">".$ignorefile; #write *-stopwords.dat file |
|
764
|
|
|
|
|
|
|
print F join( "\n", sort keys %{ $session->{ignoreword} } ); |
|
765
|
|
|
|
|
|
|
close F; |
|
766
|
|
|
|
|
|
|
} |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
$self->{session} = undef; |
|
769
|
|
|
|
|
|
|
} |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
sub makeindex |
|
774
|
|
|
|
|
|
|
{ |
|
775
|
|
|
|
|
|
|
my $self = shift; |
|
776
|
|
|
|
|
|
|
$self->start_indexing_session(); |
|
777
|
|
|
|
|
|
|
$self->index_default_locations(); |
|
778
|
|
|
|
|
|
|
$self->commit_indexing_session(); |
|
779
|
|
|
|
|
|
|
} |
|
780
|
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
sub IndexDir { |
|
783
|
|
|
|
|
|
|
my ($session, $dir, $only_recurse) = @_; |
|
784
|
|
|
|
|
|
|
my $followsymlinks = $session->{followsymlinks}; |
|
785
|
|
|
|
|
|
|
my $file_reader = $session->{file_reader}; |
|
786
|
|
|
|
|
|
|
opendir D, $dir; |
|
787
|
|
|
|
|
|
|
# DEBUG "D $dir\n"; |
|
788
|
|
|
|
|
|
|
my @files = readdir D; |
|
789
|
|
|
|
|
|
|
close D; |
|
790
|
|
|
|
|
|
|
my $e; |
|
791
|
|
|
|
|
|
|
my $count = 0; |
|
792
|
|
|
|
|
|
|
my $text; |
|
793
|
|
|
|
|
|
|
for $e(@files) { |
|
794
|
|
|
|
|
|
|
next if $e =~ /^\.\.?/; |
|
795
|
|
|
|
|
|
|
my $path = $dir."/".$e; |
|
796
|
|
|
|
|
|
|
if (-d $path) { |
|
797
|
|
|
|
|
|
|
unless ($followsymlinks) { |
|
798
|
|
|
|
|
|
|
next if -l $path ; |
|
799
|
|
|
|
|
|
|
} |
|
800
|
|
|
|
|
|
|
$count += IndexDir($session,$path, $only_recurse); |
|
801
|
|
|
|
|
|
|
} |
|
802
|
|
|
|
|
|
|
elsif (-f _ ) { |
|
803
|
|
|
|
|
|
|
my $filemask = $session->{filemask}; |
|
804
|
|
|
|
|
|
|
if ($filemask) { |
|
805
|
|
|
|
|
|
|
next unless $e =~ $filemask; |
|
806
|
|
|
|
|
|
|
} |
|
807
|
|
|
|
|
|
|
unless ($only_recurse) |
|
808
|
|
|
|
|
|
|
{ |
|
809
|
|
|
|
|
|
|
if( $file_reader ) |
|
810
|
|
|
|
|
|
|
{ |
|
811
|
|
|
|
|
|
|
$text = $file_reader->read( $path ); |
|
812
|
|
|
|
|
|
|
IndexFile($session,$path,$text); |
|
813
|
|
|
|
|
|
|
} else |
|
814
|
|
|
|
|
|
|
{ |
|
815
|
|
|
|
|
|
|
IndexFile($session,$path); |
|
816
|
|
|
|
|
|
|
} |
|
817
|
|
|
|
|
|
|
} |
|
818
|
|
|
|
|
|
|
$count ++; |
|
819
|
|
|
|
|
|
|
} |
|
820
|
|
|
|
|
|
|
} |
|
821
|
|
|
|
|
|
|
return $count; |
|
822
|
|
|
|
|
|
|
} |
|
823
|
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
sub IndexFile { |
|
827
|
|
|
|
|
|
|
my ($session, $file, $text, $title ) = @_; |
|
828
|
|
|
|
|
|
|
my $cachedb = $session->{cachedb}; |
|
829
|
|
|
|
|
|
|
my $locationsdb = $session->{locationsdb}; |
|
830
|
|
|
|
|
|
|
my $key = $session->{current_key}; |
|
831
|
|
|
|
|
|
|
if( $session->{use_inode} ) |
|
832
|
|
|
|
|
|
|
{ |
|
833
|
|
|
|
|
|
|
$key = (stat($file))[1]; |
|
834
|
|
|
|
|
|
|
} |
|
835
|
|
|
|
|
|
|
my $no_of_files = $session->{filecount}; |
|
836
|
|
|
|
|
|
|
if( $session->{no_reset} ) |
|
837
|
|
|
|
|
|
|
{ |
|
838
|
|
|
|
|
|
|
if( exists $locationsdb->{pack"xN",$key} ) |
|
839
|
|
|
|
|
|
|
{ |
|
840
|
|
|
|
|
|
|
warn("key $key already in locationsdb. Skipping\n"); |
|
841
|
|
|
|
|
|
|
return; |
|
842
|
|
|
|
|
|
|
} |
|
843
|
|
|
|
|
|
|
} |
|
844
|
|
|
|
|
|
|
DEBUG $session->{count}+1, "/$no_of_files $file (id=$key)\n"; |
|
845
|
|
|
|
|
|
|
my $fh = $session->{listfh}; |
|
846
|
|
|
|
|
|
|
print $fh "$key\t$file\n"; |
|
847
|
|
|
|
|
|
|
local $/; |
|
848
|
|
|
|
|
|
|
unless (defined $text) { |
|
849
|
|
|
|
|
|
|
undef $/; |
|
850
|
|
|
|
|
|
|
open(FILE, $file); |
|
851
|
|
|
|
|
|
|
($text) = ; # Read entire file |
|
852
|
|
|
|
|
|
|
close FILE; |
|
853
|
|
|
|
|
|
|
} |
|
854
|
|
|
|
|
|
|
my $filesize = length($text); |
|
855
|
|
|
|
|
|
|
if ($file =~ /\.s?htm.?/i ) { |
|
856
|
|
|
|
|
|
|
$text =~ /]*>([^<]+)<\/title/i ; |
|
857
|
|
|
|
|
|
|
$title = $1; |
|
858
|
|
|
|
|
|
|
$title =~ s/\s+/ /g; |
|
859
|
|
|
|
|
|
|
$text =~ s/<[^>]*>//g; # strip all HTML tags |
|
860
|
|
|
|
|
|
|
} |
|
861
|
|
|
|
|
|
|
if( defined $title ) |
|
862
|
|
|
|
|
|
|
{ |
|
863
|
|
|
|
|
|
|
$session->{titlesdb}->{pack"xN",$key} = $title; # put title in db |
|
864
|
|
|
|
|
|
|
DEBUG("* \"$title\"\n"); |
|
865
|
|
|
|
|
|
|
} |
|
866
|
|
|
|
|
|
|
# index all the words under the current file-id |
|
867
|
|
|
|
|
|
|
my($wordsIndexed) = &IndexWords($cachedb, $text,$key, $session); |
|
868
|
|
|
|
|
|
|
$session->{current_key}++; |
|
869
|
|
|
|
|
|
|
DEBUG "* $wordsIndexed words\n"; |
|
870
|
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# map file-id (key) to this filename |
|
872
|
|
|
|
|
|
|
$locationsdb->{pack"xN",$key} = $file; # leading null is here for |
|
873
|
|
|
|
|
|
|
# historical reasons :-) |
|
874
|
|
|
|
|
|
|
$session->{bytes} += $filesize; |
|
875
|
|
|
|
|
|
|
$session->{count}++; |
|
876
|
|
|
|
|
|
|
$session->{_temp_size} += $filesize; |
|
877
|
|
|
|
|
|
|
if ($session->{_temp_size} > 2000000 ) { |
|
878
|
|
|
|
|
|
|
my $rc = 0; |
|
879
|
|
|
|
|
|
|
$rc = FlushCache($cachedb, $session->{indexdb}, $session); |
|
880
|
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
if (! $rc ) { |
|
882
|
|
|
|
|
|
|
tie_hash($session->{indexdb}, $session->{indexdbpath}) or die $!; |
|
883
|
|
|
|
|
|
|
untie_hash($session->{indexdb}); |
|
884
|
|
|
|
|
|
|
$rc = FlushCache($cachedb, $session->{indexdb}, $session); |
|
885
|
|
|
|
|
|
|
die $! if not $rc; |
|
886
|
|
|
|
|
|
|
} |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
$session->{_temp_size} = 0; |
|
889
|
|
|
|
|
|
|
$session->{cachedb} = {}; |
|
890
|
|
|
|
|
|
|
} |
|
891
|
|
|
|
|
|
|
} |
|
892
|
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
sub IndexWords { |
|
894
|
|
|
|
|
|
|
my ($db, $words, $fileKey, $session) = @_; |
|
895
|
|
|
|
|
|
|
# hash content file-id options |
|
896
|
|
|
|
|
|
|
my (%worduniq); # for unique-ifying word list |
|
897
|
|
|
|
|
|
|
my $minwordsize = $session->{minwordsize}; |
|
898
|
|
|
|
|
|
|
my (@words) = split( /[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words); # split into an array of words |
|
899
|
|
|
|
|
|
|
@words = grep { $worduniq{$_}++ == 0 } # remove duplicates |
|
900
|
|
|
|
|
|
|
grep { length > $minwordsize } # must be longer than one character |
|
901
|
|
|
|
|
|
|
grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # strip leading punct |
|
902
|
|
|
|
|
|
|
grep { /[a-zA-Z0-9\xc0-\xff]/ } # must have an alphanumeric |
|
903
|
|
|
|
|
|
|
@words; |
|
904
|
|
|
|
|
|
|
# " foreach (sort @words) { " |
|
905
|
|
|
|
|
|
|
for (@words) { # no need to sort here, |
|
906
|
|
|
|
|
|
|
my $a = $db->{$_}; # we will sort when cache is flushed |
|
907
|
|
|
|
|
|
|
$a .= pack "N",$fileKey; # appending packed file-id's |
|
908
|
|
|
|
|
|
|
$db->{$_} = $a; |
|
909
|
|
|
|
|
|
|
} |
|
910
|
|
|
|
|
|
|
return int @words; |
|
911
|
|
|
|
|
|
|
} |
|
912
|
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
sub FlushCache { |
|
916
|
|
|
|
|
|
|
my ($source, $dest, $session) = @_; |
|
917
|
|
|
|
|
|
|
# flush source hashe into dest.... |
|
918
|
|
|
|
|
|
|
# %$dest is supposed to be tied, otherwise the whole |
|
919
|
|
|
|
|
|
|
# thing doens't make much sense... :-) |
|
920
|
|
|
|
|
|
|
my $scount = int keys %$source ; |
|
921
|
|
|
|
|
|
|
my $ucount = 0; |
|
922
|
|
|
|
|
|
|
my $acount = 0; |
|
923
|
|
|
|
|
|
|
if ($scount == 0) { |
|
924
|
|
|
|
|
|
|
die "error: 0 words in cache\n"; |
|
925
|
|
|
|
|
|
|
} |
|
926
|
|
|
|
|
|
|
# my $wordcount = int keys %$dest; |
|
927
|
|
|
|
|
|
|
# if ($wordcount < $session->{wordcount}) { |
|
928
|
|
|
|
|
|
|
# warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; |
|
929
|
|
|
|
|
|
|
# return undef; |
|
930
|
|
|
|
|
|
|
# } |
|
931
|
|
|
|
|
|
|
# $session->{wordcount} = $wordcount; |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
# DEBUG("$wordcount words in database\n"); |
|
934
|
|
|
|
|
|
|
my $objref = tied %$dest ; |
|
935
|
|
|
|
|
|
|
DEBUG("flushing $scount words into $dest ($objref)\n"); |
|
936
|
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
my $filecount = $session->{count}; |
|
938
|
|
|
|
|
|
|
my $autoignore = $session->{autoignore}; |
|
939
|
|
|
|
|
|
|
my $ignorethreshold = int ( $filecount * $session->{ignorelimit} ); |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
my $w; |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
WORD: |
|
944
|
|
|
|
|
|
|
for $w(sort keys %$source) { |
|
945
|
|
|
|
|
|
|
my $data = $source->{$w}; |
|
946
|
|
|
|
|
|
|
if ($session->{ignoreword}->{$w} ) { |
|
947
|
|
|
|
|
|
|
DEBUG("ignoring '$w' \n"); |
|
948
|
|
|
|
|
|
|
$data = pack("N*", ( 0 ) ); # id = 0 means $w is a stop-word |
|
949
|
|
|
|
|
|
|
} |
|
950
|
|
|
|
|
|
|
elsif (defined $dest->{$w}) { |
|
951
|
|
|
|
|
|
|
my %uniq = (); |
|
952
|
|
|
|
|
|
|
my $keys = $dest->{$w} . $data ; |
|
953
|
|
|
|
|
|
|
my $keycount = length($keys)/2; # dividing by 2 |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
$ucount++; |
|
956
|
|
|
|
|
|
|
## my @keys = unpack("n*", $keys) ; |
|
957
|
|
|
|
|
|
|
## my $keycount = @keys; |
|
958
|
|
|
|
|
|
|
## |
|
959
|
|
|
|
|
|
|
## if ($keys[0] == 0 ) { # skip ignored word |
|
960
|
|
|
|
|
|
|
## DEBUG("skipping '$w' \n"); |
|
961
|
|
|
|
|
|
|
## next WORD; |
|
962
|
|
|
|
|
|
|
## } els |
|
963
|
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
if ($autoignore && ($filecount > 100) |
|
965
|
|
|
|
|
|
|
&& ($keycount > $ignorethreshold ) ) { |
|
966
|
|
|
|
|
|
|
DEBUG("word '$w' will be ignored (found in $keycount of $filecount files)\n"); |
|
967
|
|
|
|
|
|
|
# ignored words are associated to file-id 0 |
|
968
|
|
|
|
|
|
|
## @keys = ( 0 ); |
|
969
|
|
|
|
|
|
|
$keys = pack("N*", 0); |
|
970
|
|
|
|
|
|
|
$session->{ignoreword}->{$w} = 1; |
|
971
|
|
|
|
|
|
|
} |
|
972
|
|
|
|
|
|
|
## @keys = grep { $uniq{$_}++ == 0} @keys; |
|
973
|
|
|
|
|
|
|
## $data = pack("n*", @keys); |
|
974
|
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
$data = $keys; |
|
976
|
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
## if ($verbose_flag && ( $w eq "the" ) ) { |
|
978
|
|
|
|
|
|
|
## my $len = int(@keys); |
|
979
|
|
|
|
|
|
|
## if ($len < $session->{status_THE} ) { |
|
980
|
|
|
|
|
|
|
## die "panic: problem with word 'the'"; |
|
981
|
|
|
|
|
|
|
## } |
|
982
|
|
|
|
|
|
|
## $session->{status_THE} = $len; |
|
983
|
|
|
|
|
|
|
## DEBUG("word 'the' found in $len files \n"); |
|
984
|
|
|
|
|
|
|
## } |
|
985
|
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
} else { |
|
987
|
|
|
|
|
|
|
$acount++; |
|
988
|
|
|
|
|
|
|
} |
|
989
|
|
|
|
|
|
|
$dest->{$w} = $data; |
|
990
|
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
# if ($dest->{$w} ne $data) { |
|
992
|
|
|
|
|
|
|
# warn "unexpected error: \$w=$w\n"; |
|
993
|
|
|
|
|
|
|
# return undef; |
|
994
|
|
|
|
|
|
|
# } |
|
995
|
|
|
|
|
|
|
} |
|
996
|
|
|
|
|
|
|
DEBUG("$ucount words updated, $acount new words added\n"); |
|
997
|
|
|
|
|
|
|
if ($debug_flag) { |
|
998
|
|
|
|
|
|
|
my $wordcount = int keys %$dest; |
|
999
|
|
|
|
|
|
|
if ($wordcount < $session->{wordcount}) { |
|
1000
|
|
|
|
|
|
|
warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; |
|
1001
|
|
|
|
|
|
|
return undef; |
|
1002
|
|
|
|
|
|
|
} |
|
1003
|
|
|
|
|
|
|
$session->{wordcount} = $wordcount; |
|
1004
|
|
|
|
|
|
|
DEBUG("$wordcount words in database\n"); |
|
1005
|
|
|
|
|
|
|
} |
|
1006
|
|
|
|
|
|
|
return 1; |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
sub IndexWeb { |
|
1014
|
|
|
|
|
|
|
my ($session, $url) = @_; |
|
1015
|
|
|
|
|
|
|
require MMM::Text::Search::Inet; |
|
1016
|
|
|
|
|
|
|
my $req = new HTTPRequest { AutoRedirect => 1 }; |
|
1017
|
|
|
|
|
|
|
my %fetched = (); |
|
1018
|
|
|
|
|
|
|
$req->set_url($url); |
|
1019
|
|
|
|
|
|
|
my $host = $req->host(); |
|
1020
|
|
|
|
|
|
|
$session->{req} = $req; |
|
1021
|
|
|
|
|
|
|
$session->{fetched} = \%fetched; |
|
1022
|
|
|
|
|
|
|
$session->{host} = $host; |
|
1023
|
|
|
|
|
|
|
my $deadlinksfile = $session->{indexdbpath}; |
|
1024
|
|
|
|
|
|
|
$deadlinksfile =~ s/(\.db)?$/\.deadlinks/; |
|
1025
|
|
|
|
|
|
|
open DL, ">".$deadlinksfile; |
|
1026
|
|
|
|
|
|
|
$session->{deadlinksfh} = \*DL; |
|
1027
|
|
|
|
|
|
|
recursive_fetch($session, $url, "", 0); |
|
1028
|
|
|
|
|
|
|
} |
|
1029
|
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
sub recursive_fetch { |
|
1033
|
|
|
|
|
|
|
my ($session, $URL, $parent, $level) = @_; |
|
1034
|
|
|
|
|
|
|
my $req = $session->{req}; |
|
1035
|
|
|
|
|
|
|
$req->reset(); |
|
1036
|
|
|
|
|
|
|
$req->set_url($URL); |
|
1037
|
|
|
|
|
|
|
my $url = $req->url(); |
|
1038
|
|
|
|
|
|
|
return unless $req->host() eq $session->{host}; |
|
1039
|
|
|
|
|
|
|
return if $session->{fetched}->{$url}; |
|
1040
|
|
|
|
|
|
|
$session->{fetched}->{$url} = 1; |
|
1041
|
|
|
|
|
|
|
return unless $req->get_page(); |
|
1042
|
|
|
|
|
|
|
my $status = $req->status(); |
|
1043
|
|
|
|
|
|
|
DEBUG( ">>> $url ($status)\n"); |
|
1044
|
|
|
|
|
|
|
if ( $status != 200 ) { |
|
1045
|
|
|
|
|
|
|
my $fh = $session->{deadlinksfh}; |
|
1046
|
|
|
|
|
|
|
my $url = $req->url(); |
|
1047
|
|
|
|
|
|
|
print $fh $status, "\t", |
|
1048
|
|
|
|
|
|
|
$url, "(", $req->{_URL},")", |
|
1049
|
|
|
|
|
|
|
"\t", $parent, "\n"; |
|
1050
|
|
|
|
|
|
|
return; |
|
1051
|
|
|
|
|
|
|
}; |
|
1052
|
|
|
|
|
|
|
my $base = $req->base_url(); |
|
1053
|
|
|
|
|
|
|
my $content_ref = $req->content_ref(); |
|
1054
|
|
|
|
|
|
|
my $header = $req->header(); |
|
1055
|
|
|
|
|
|
|
IndexFile($session, $url, $$content_ref); |
|
1056
|
|
|
|
|
|
|
return if ($session->{level} && $level >= $session->{level}); |
|
1057
|
|
|
|
|
|
|
$$content_ref =~ s///gs; #remove comments |
|
1058
|
|
|
|
|
|
|
my @links = $$content_ref =~/href=([^>\s]+)/ig; #extract hyperlinks |
|
1059
|
|
|
|
|
|
|
my $count = 0; |
|
1060
|
|
|
|
|
|
|
my $exclude_re = $session->{url_exclude}; |
|
1061
|
|
|
|
|
|
|
for(@links) { |
|
1062
|
|
|
|
|
|
|
s/\"|\'//g; |
|
1063
|
|
|
|
|
|
|
next if m/^(ftp|mailto|gopher|news):/; |
|
1064
|
|
|
|
|
|
|
next if m/^$exclude_re$/o; |
|
1065
|
|
|
|
|
|
|
my $link = /^http/ ? $_ : join("/",$base,$_); |
|
1066
|
|
|
|
|
|
|
$link =~ s/#.*//; |
|
1067
|
|
|
|
|
|
|
$count++; |
|
1068
|
|
|
|
|
|
|
recursive_fetch($session,$link, $url, $level + 1); |
|
1069
|
|
|
|
|
|
|
} |
|
1070
|
|
|
|
|
|
|
} |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
1; |
|
1074
|
|
|
|
|
|
|
__END__ |