File Coverage

blib/lib/Plucene/SearchEngine/Query.pm
Criterion Covered Total %
statement 54 63 85.7
branch 6 20 30.0
condition 2 9 22.2
subroutine 14 16 87.5
pod 3 5 60.0
total 79 113 69.9


line stmt bran cond sub pod time code
1             package Plucene::SearchEngine::Query;
2 1     1   68079 use 5.006;
  1         5  
  1         45  
3 1     1   7 use strict;
  1         3  
  1         42  
4 1     1   6 use warnings;
  1         3  
  1         37  
5 1     1   5 use Carp;
  1         3  
  1         87  
6 1     1   8 use UNIVERSAL::require;
  1         2  
  1         33  
7 1     1   960 use Lucene::QueryParser;
  1         15393  
  1         77  
8 1     1   1005 use Plucene::Search::IndexSearcher;
  1         8990  
  1         20  
9 1     1   39 use Plucene::Search::HitCollector;
  1         2  
  1         9  
10 1     1   988 use Plucene::QueryParser;
  1         2872  
  1         11  
11              
12             our $VERSION = '0.01';
13              
14             =head1 NAME
15              
16             Plucene::SearchEngine::Query - A higher level abstraction for Plucene
17              
18             =head1 SYNOPSIS
19              
20             use Plucene::SearchEngine::Query;
21             my $query = Plucene::SearchEngine::Query->new(
22             dir => "/var/plucene/foo"
23             );
24             my @docs = $query->search("some stuff");
25             for my $id (@docs) {
26             $snippeter = $query->snippeter( retrieve_text_for_doc($id) );
27             print "<h1>Doc $id</h1>\n";
28             print "<blockquote>" . $snippeter->as_html . "</blockquote>";
29             }
30              
31             =head1 DESCRIPTION
32              
33             Plucene is an extremely powerful library for building search engines, but
34             each time I build a search engine with it, I always find myself doing the
35             same things. This module provides an abstraction layer around Plucene -
36             not quite as abstracted as L<Plucene::Simple>, but more abstracted than
37             Plucene itself.
38              
39             =head1 METHODS
40              
41             =cut
42              
43             =head2 new
44              
45             Plucene::SearchEngine::Query->new(
46             dir => "/var/plucene/foo",
47             analyzer => "Plucene::Analysis::SimpleAnalyzer",
48             default => "text",
49             expand_docs => sub { shift; @_ },
50             snippeter => "Text::Context"
51             )
52              
53             This prepares for searching the index. The only mandatory argument is
54             C<dir>, which tells Plucene where the index is to be found. The
55             C<expand_docs> and C<snippeter> arguments are explained below;
56             C<analyzer> specifies which Plucene analysis class to use when tokenising
57             the search terms, and the C<default> argument denotes the default field
58             for unqualified query terms.
59              
60             =cut
61              
62             sub new {
63 1     1 1 885 my ($class, %args) = @_;
64 1 50       9 croak("No directory given!") unless $args{dir};
65 1 50       30 croak("$args{dir} isn't a directory") unless -d $args{dir};
66 1         10 my $self = bless {
67             analyzer => "Plucene::Analysis::SimpleAnalyzer",
68             default => "text",
69             expand_docs => \&expand_docs,
70             snippeter => "Text::Context",
71             %args
72             }, $class;
73 1 50       25 $self->{analyzer}->require
74             or die "Couldn't require analyzer: $self->{analyzer}";
75 1 50       49 $self->{snippeter}->require
76             or die "Couldn't require snippet class: $self->{snippeter}";
77 1         1386 return $self;
78             }
79              
80             sub prepare_search {
81 1     1 0 2 my $self = shift;
82 1   33     20 $self->{searcher} ||= Plucene::Search::IndexSearcher->new( $self->{dir} );
83 1   33     2834 $self->{parser} ||= Plucene::QueryParser->new({
84             analyzer => $self->{analyzer}->new,
85             default => $self->{default}
86             });
87             }
88              
89             =head2 search
90              
91             @docs = $queryer->search("foo bar");
92              
93             Returns a set of documents matching the search query. The default
94             way of "expanding" these search results is to sort them by score,
95             and then return the value of the C<id> field from the Plucene index.
96              
97             Those more familiar with Plucene can have alternative data structures
98             returned by providing a different C<expand_docs> parameter to the
99             constructor. For instance, the default doesn't actually B<return> the
100             score, so if you want to get at it, you can say:
101              
102             expand_docs => sub { my ($self, @docs) = @_; return @docs }
103              
104             This will return a list of array references; the first element in each
105             array ref will be the C<Plucene::Document> object, and the second will
106             be the score.
107              
108             Or, if you're dealing with C<Class::DBI>-derived classes, you might
109             like to try:
110              
111             expand_docs => sub { my ($self, @docs) = @_;
112             sort { $b->date <=> $a->date } # Sort by date descending
113             map { My::Class->retrieve($_->[0]->get("id")->string) }
114             @docs;
115             }
116              
117             The choice is yours.
118              
119             =cut
120              
121             sub search {
122 1     1 1 847 my ($self, $query) = @_;
123 1         3 $self->{orig_query} = $query;
124 1         5 $self->prepare_search;
125 1         54 $self->{query} = $self->{parser}->parse($query);
126              
127 1         5772 my @docs;
128 1         5 my $searcher = $self->{searcher};
129             my $hc = Plucene::Search::HitCollector->new(
130             collect => sub {
131 2     2   3043 my ($self, $doc, $score) = @_;
132 2         3 my $res = eval { $searcher->doc($doc) };
  2         8  
133 2 50       1342 die $@ if $@;
134 2 50       14 push @docs, [$res, $score] if $res;
135 1         13 });
136 1         21 $self->{searcher}->search_hc($self->{query}, $hc);
137 1         78 return $self->{expand_docs}->($self, @docs);
138             }
139              
140             sub expand_docs {
141 1     1 0 4 my ($self, @docs) = @_;
142 1         8 map $_->[0]->get("id")->string, sort { $b->[1] <=> $a->[1] } @docs;
  1         9  
143             }
144              
145             sub _unlucene {
146 0     0     my ($self, $ast) = @_;
147 0 0         return map {
    0          
148 0 0 0       $_->{query} eq "SUBQUERY" ? $self->_unlucene($_->{subquery}) :
149             $_->{query} ne "PHRASE" ? $_->{term} :
150             (split /\s+/, $_->{term})
151             } grep {
152 0           $_->{type} ne "PROHIBITED" and
153             (!exists($_->{field}) or $_->{field} eq $self->{default})
154 0           } @{$ast};
155             }
156              
157             =head2 snippeter
158              
159             $self->snippeter($doc_text)
160              
161             Given the searchable text of a document, returns a snippeter class
162             (C<Text::Context>, C<Text::Context::EitherSide>, etc.) object primed with
163             the positive parts of the query.
164              
165             When you call the rendering method (say, C<as_html>) on this object,
166             you'll get the text snippet highlighting where the search terms appear
167             in the document.
168              
169             =cut
170              
171             sub snippeter {
172 0     0 1   my ($self, $body) = @_;
173 0 0         croak "It doesn't look like you've actually done a search yet"
174             unless $self->{orig_query};
175             # We can't actually use the original parser, because it may have
176             # tokenized us funny. (Porter stemming, etc.)
177 0           my @terms = $self->_unlucene(parse_query($self->{orig_query}));
178 0           $self->{snippeter}->new($body, @terms);
179             }
180              
181             1;
182              
183             =head1 AUTHOR
184              
185             Simon Cozens, C
186              
187             =head1 SEE ALSO
188              
189             L, L, L.
190              
191             =cut