line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::SearchEngine::Query; |
2
|
1
|
|
|
1
|
|
68079
|
use 5.006; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
45
|
|
3
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
42
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
37
|
|
5
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
87
|
|
6
|
1
|
|
|
1
|
|
8
|
use UNIVERSAL::require; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
7
|
1
|
|
|
1
|
|
960
|
use Lucene::QueryParser; |
|
1
|
|
|
|
|
15393
|
|
|
1
|
|
|
|
|
77
|
|
8
|
1
|
|
|
1
|
|
1005
|
use Plucene::Search::IndexSearcher; |
|
1
|
|
|
|
|
8990
|
|
|
1
|
|
|
|
|
20
|
|
9
|
1
|
|
|
1
|
|
39
|
use Plucene::Search::HitCollector; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
9
|
|
10
|
1
|
|
|
1
|
|
988
|
use Plucene::QueryParser; |
|
1
|
|
|
|
|
2872
|
|
|
1
|
|
|
|
|
11
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Plucene::SearchEngine::Query - A higher level abstraction for Plucene |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Plucene::SearchEngine::Query; |
21
|
|
|
|
|
|
|
my $query = Plucene::SearchEngine::Query->new( |
22
|
|
|
|
|
|
|
dir => "/var/plucene/foo" |
23
|
|
|
|
|
|
|
); |
24
|
|
|
|
|
|
|
my @docs = $query->search("some stuff"); |
25
|
|
|
|
|
|
|
for my $id (@docs) { |
26
|
|
|
|
|
|
|
my $snippeter = $query->snippeter( retrieve_text_for_doc($id) ); |
27
|
|
|
|
|
|
|
print "Doc $id \n"; |
28
|
|
|
|
|
|
|
print "" . $snippeter->as_html . " "; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 DESCRIPTION |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Plucene is an extremely powerful library for building search engines, but |
34
|
|
|
|
|
|
|
each time I build a search engine with it, I always find myself doing the |
35
|
|
|
|
|
|
|
same things. This module provides an abstraction layer around Plucene - |
36
|
|
|
|
|
|
|
not quite as abstracted as L<Plucene::Simple>, but more abstracted than |
37
|
|
|
|
|
|
|
Plucene itself. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 METHODS |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=cut |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head2 new |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Plucene::SearchEngine::Query->new( |
46
|
|
|
|
|
|
|
dir => "/var/plucene/foo", |
47
|
|
|
|
|
|
|
analyzer => "Plucene::Analysis::SimpleAnalyzer", |
48
|
|
|
|
|
|
|
default => "text", |
49
|
|
|
|
|
|
|
expand_docs => sub { shift; @_ }, |
50
|
|
|
|
|
|
|
snippeter => "Text::Context", |
51
|
|
|
|
|
|
|
) |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
This prepares for searching the index. The only mandatory argument is |
54
|
|
|
|
|
|
|
C<dir>, which tells Plucene where the index is to be found. The |
55
|
|
|
|
|
|
|
C<expand_docs> and C<snippeter> arguments are explained below; |
56
|
|
|
|
|
|
|
C<analyzer> specifies which Plucene analysis class to use when tokenising |
57
|
|
|
|
|
|
|
the search terms, and the C<default> argument denotes the default field |
58
|
|
|
|
|
|
|
for unqualified query terms. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Build a query object for the Plucene index stored in $args{dir}.
#
# Arguments (key/value pairs following the class name):
#   dir         - index directory; mandatory, and must already exist
#   analyzer    - analyzer class name
#                 (default "Plucene::Analysis::SimpleAnalyzer")
#   default     - default field for unqualified terms (default "text")
#   expand_docs - code ref applied to the raw hits by search()
#                 (default \&expand_docs)
#   snippeter   - snippet class name (default "Text::Context")
#
# Caller-supplied values override the defaults. Any extra keys are kept
# on the object as-is.
#
# Returns the blessed object. Croaks if dir is missing or is not a
# directory, or if the analyzer or snippeter class cannot be loaded; in
# the latter case the message carries the loader's own error text.
sub new {
    my ($class, %args) = @_;

    croak("No directory given!") unless $args{dir};
    croak("$args{dir} isn't a directory") unless -d $args{dir};

    my $self = bless {
        analyzer    => "Plucene::Analysis::SimpleAnalyzer",
        default     => "text",
        expand_docs => \&expand_docs,
        snippeter   => "Text::Context",
        %args,    # must come last so caller values win over the defaults
    }, $class;

    # UNIVERSAL::require returns false and leaves the reason in $@ when
    # the load fails; report it from the caller's perspective, like the
    # argument checks above, and include the reason so the failure can
    # actually be diagnosed.
    $self->{analyzer}->require
        or croak("Couldn't require analyzer: $self->{analyzer}: $@");
    $self->{snippeter}->require
        or croak("Couldn't require snippet class: $self->{snippeter}: $@");

    return $self;
}
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Lazily create the index searcher and the query parser, reusing any
# that are already stored on the object. Returns the parser.
sub prepare_search {
    my $self = shift;

    # Searcher over the directory given to the constructor.
    unless ($self->{searcher}) {
        $self->{searcher} =
            Plucene::Search::IndexSearcher->new( $self->{dir} );
    }

    # Parser built from a fresh analyzer instance and the default field.
    unless ($self->{parser}) {
        my %parser_args = (
            analyzer => $self->{analyzer}->new,
            default  => $self->{default},
        );
        $self->{parser} = Plucene::QueryParser->new( \%parser_args );
    }

    return $self->{parser};
}
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head2 search |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
@docs = $queryer->search("foo bar"); |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Returns a set of documents matching the search query. The default |
94
|
|
|
|
|
|
|
way of "expanding" these search results is to sort them by score, |
95
|
|
|
|
|
|
|
and then return the value of the C<id> field from the Plucene index. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Those more familiar with Plucene can have alternative data structures |
98
|
|
|
|
|
|
|
returned by providing a different C<expand_docs> parameter to the |
99
|
|
|
|
|
|
|
constructor. For instance, the default doesn't actually B<return> the |
100
|
|
|
|
|
|
|
score, so if you want to get at it, you can say: |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
expand_docs => sub { my ($self, @docs) = @_; return @docs } |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
This will return a list of array references; the first element in each |
105
|
|
|
|
|
|
|
array ref will be the C<Plucene::Document> object, and the second will |
106
|
|
|
|
|
|
|
be the score. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Or, if you're dealing with C<Class::DBI>-derived classes, you might |
109
|
|
|
|
|
|
|
like to try: |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
expand_docs => sub { my ($self, @docs) = @_; |
112
|
|
|
|
|
|
|
sort { $b->date <=> $a->date } # Sort by date descending |
113
|
|
|
|
|
|
|
map { My::Class->retrieve($_->[0]->get("id")->string) } |
114
|
|
|
|
|
|
|
@docs; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
The choice is yours. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Parse $query, run it against the index, and hand the collected
# [document, score] pairs to the expand_docs callback, whose result is
# returned to the caller.
sub search {
    my ($self, $query) = @_;

    # Keep the raw query text; snippeter() parses it again later.
    $self->{orig_query} = $query;
    $self->prepare_search;
    $self->{query} = $self->{parser}->parse($query);

    my $searcher = $self->{searcher};
    my @hits;

    # Called once per hit: fetch the stored document and remember it
    # together with its score. Lookup errors are re-thrown.
    my $on_hit = sub {
        my ($collector, $doc_num, $score) = @_;
        my $document = eval { $searcher->doc($doc_num) };
        die $@ if $@;
        push @hits, [ $document, $score ] if $document;
    };

    my $hc = Plucene::Search::HitCollector->new( collect => $on_hit );
    $searcher->search_hc( $self->{query}, $hc );

    return $self->{expand_docs}->( $self, @hits );
}
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Default result expander: order the [document, score] pairs by score,
# highest first, and return the string value of each document's "id"
# field.
sub expand_docs {
    my ($self, @docs) = @_;

    my @by_score = sort { $b->[1] <=> $a->[1] } @docs;

    return map { $_->[0]->get("id")->string } @by_score;
}
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Flatten a parsed query tree ($ast, an array ref of node hashes) into
# a plain list of terms. Nodes of type "PROHIBITED" are dropped, as are
# nodes qualified with a field other than the object's default field.
# Sub-queries are flattened recursively, and phrases are split on
# whitespace into individual words.
sub _unlucene {
    my ($self, $ast) = @_;

    my @terms;
    for my $node (@{$ast}) {
        next if $node->{type} eq "PROHIBITED";
        next if exists($node->{field})
            and $node->{field} ne $self->{default};

        if ($node->{query} eq "SUBQUERY") {
            push @terms, $self->_unlucene($node->{subquery});
        }
        elsif ($node->{query} ne "PHRASE") {
            push @terms, $node->{term};
        }
        else {
            push @terms, split /\s+/, $node->{term};
        }
    }

    return @terms;
}
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 snippeter |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
$self->snippeter($doc_text) |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Given the searchable text of a document, returns a snippeter class |
162
|
|
|
|
|
|
|
(C<Text::Context>, C<Text::Context::Porter>, etc.) object primed with |
163
|
|
|
|
|
|
|
the positive parts of the query. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
When you call the rendering method (say, C<as_html>) on this object, |
166
|
|
|
|
|
|
|
you'll get the text snippet highlighting where the search terms appear |
167
|
|
|
|
|
|
|
in the document. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Create a snippet object for $body, primed with the positive terms of
# the most recent search. Croaks if no search has been run on this
# object yet.
sub snippeter {
    my ($self, $body) = @_;

    if (!$self->{orig_query}) {
        croak "It doesn't look like you've actually done a search yet";
    }

    # The original query text is parsed again here, rather than reusing
    # the search parser's output, because that parser may have tokenized
    # the terms differently (Porter stemming, etc.).
    my ($ast) = parse_query($self->{orig_query});
    my @terms = $self->_unlucene($ast);

    return $self->{snippeter}->new($body, @terms);
}
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
1; |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 AUTHOR |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Simon Cozens, C |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head1 SEE ALSO |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
L, L, L. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=cut |