File Coverage

blib/lib/LucyX/Suggester.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				package LucyX::Suggester;
2	2	2	50391	use warnings;
	2		6
	2		63
3	2	2	10	use strict;
	2		3
	2		70
4	2	2	10	use Carp;
	2		8
	2		132
5	2	2	1813	use Data::Dump qw( dump );
	2		18001
	2		192
6	2	2	2101	use Search::Tools;
	2		3415
	2		74
7	2	2	2247	use Lucy;
	0
	0
8
9				our $VERSION = '0.005';
10
11				=head1 NAME
12
13				LucyX::Suggester - suggest terms for Apache Lucy search
14
15				=head1 SYNOPSIS
16
17				use LucyX::Suggester;
18				my $suggester = LucyX::Suggester->new(
19				fields => [qw( foo bar )],
20				indexes => $list_of_indexes,
21				spellcheck => $search_tools_spellcheck,
22				limit => 10,
23				use_regex => 0,
24				max_length => 64,
25				);
26				my $suggestions = $suggester->suggest('quiK brwn fox');
27
28				=head1 DESCRIPTION
29
30				Inspired by the Solr Suggester feature, LucyX::Suggester
31				will return a list of suggested terms based on
32				actual terms in the specified index(es). Spellchecking
33				on query terms is performed with Search::Tools::SpellCheck,
34				which uses Text::Aspell.
35
36				=head1 METHODS
37
38				=head2 new( I )
39
40				Returns a new Suggester object. Supported I include:
41
42				=over
43
44				=item fields I
45
46				List of fields to limit Lexicon scans to.
47
48				=item indexes I
49
50				List of indexes to search within.
51
52				=item spellcheck I
53
54				An instance of L. Set
55				this to indicate custom values for language, dictionary,
56				and other params to L.
57
58				=item limit I
59
60				Maximum number of suggestions to return. Defaults to 10.
61
62				=item use_regex 1\|0
63
64				Use a simple regex when comparing terms. Defaults to false (0),
65				preferring index(). If your analyzer results in terms containing
66				multiple words (e.g. phrases) then B is probably what you want.
67
68				=item max_length I
69
70				Set a max term length beyond which suggestions are trimmed
71				with substr(). Default is 64 characters. Set to 0 to disable.
72
73				=back
74
75				=cut
76
77				sub new {
78				my $class = shift;
79				my %args = @_;
80				my $fields = delete $args{fields} \|\| [];
81				my $indexes = delete $args{indexes} or croak "indexes required";
82				my $spellcheck = delete $args{spellcheck};
83				my $limit = delete $args{limit} \|\| 10;
84				my $debug = delete $args{debug} \|\| $ENV{LUCYX_DEBUG} \|\| 0;
85				my $use_regex = delete $args{use_regex} \|\| 0;
86				my $max_length = delete $args{max_length};
87				$max_length = 64 unless defined $max_length;
88
89				if (%args) {
90				croak "Too many arguments to new(): " . dump( \%args );
91				}
92				if ( ref $indexes ne 'ARRAY' ) {
93				croak "indexes must be an ARRAY ref";
94				}
95				return bless(
96				{ fields => $fields,
97				indexes => $indexes,
98				spellcheck => $spellcheck,
99				limit => $limit,
100				debug => $debug,
101				use_regex => $use_regex,
102				max_length => $max_length,
103				},
104				$class
105				);
106				}
107
108				=head2 suggest( I )
109
110				Returns arrayref of terms that match I.
111
112				=cut
113
114				#
115				# I tried re-writing this to use a PolySearcher
116				# instead, in order to preserve phrases, etc
117				# but it was up to 3x slower.
118				#
119
120				sub suggest {
121				my $self = shift;
122				my $query = shift;
123
124				croak "query required" unless defined $query;
125
126				my $debug = $self->{debug};
127
128				my $spellchecker = $self->{spellcheck};
129				if ( !$spellchecker ) {
130				my $qparser = Search::Tools->parser( debug => $debug, );
131				$spellchecker = Search::Tools->spellcheck(
132				debug => $debug,
133				query_parser => $qparser,
134				);
135				}
136
137				my $suggestions = $spellchecker->suggest($query);
138				my %terms;
139				for my $s (@$suggestions) {
140
141				if ( !$s->{suggestions} ) {
142				$terms{ $s->{word} }++;
143				}
144				else {
145				for my $suggest ( @{ $s->{suggestions} } ) {
146				$terms{$suggest}++;
147				}
148				}
149				}
150
151				# must analyze each query term and suggestion
152				# per-field, which we cache for performance
153				my %analyzed;
154
155				# suggestions
156				my %matches;
157
158				my @my_fields = @{ $self->{fields} };
159				my $use_regex = $self->{use_regex};
160				my $maxl = $self->{max_length};
161
162				INDEX: for my $invindex ( @{ $self->{indexes} } ) {
163				my $reader = Lucy::Index::IndexReader->open( index => $invindex );
164				my $schema = $reader->get_schema();
165				my $fields;
166				if (@my_fields) {
167				$fields = \@my_fields;
168				}
169				else {
170				$fields = $schema->all_fields();
171				}
172				my $seg_readers = $reader->seg_readers;
173				SEG: for my $seg_reader (@$seg_readers) {
174				my $lex_reader
175				= $seg_reader->obtain('Lucy::Index::LexiconReader');
176				FIELD: for my $field (@$fields) {
177				$self->_analyze_terms( $schema, $field, \%analyzed, \%terms );
178
179				# sort in order to seek() below.
180				my @to_check = sort keys %{ $analyzed{$field} };
181
182				$debug and warn "$field=" . dump \@to_check;
183
184				my $lexicon = $lex_reader->lexicon( field => $field );
185				next FIELD unless $lexicon;
186
187				CHECK: for my $check_term (@to_check) {
188
189				my $check_initial = substr( $check_term, 0, 1 );
190
191				$debug and warn " seek($check_term)";
192				$lexicon->seek($check_term);
193
194				TERM: while ( defined( my $term = $lexicon->get_term ) ) {
195
196				$debug and warn " $check_term -> $term";
197
198				my $initial = substr( $term, 0, 1 );
199				if ( $initial and $initial gt $check_initial ) {
200				$debug
201				and warn
202				" reset: initial=$initial > check_initial=$check_initial";
203				next CHECK;
204				}
205
206				# TODO better weighting than simple freq?
207
208				if ( !$use_regex
209				&& index( $term, $check_term, 0 ) == 0 )
210				{
211				my $freq = $lex_reader->doc_freq(
212				field => $field,
213				term => $term,
214				);
215				$debug and warn "ok term=$term [$freq]";
216				$matches{
217				$maxl
218				? substr( $term, 0, $maxl )
219				: $term
220				} += $freq;
221
222				# abort everything if we've hit our limit
223				if ( scalar( keys %matches ) >= $self->{limit} ) {
224				last INDEX;
225				}
226
227				}
228				elsif ( $use_regex && $term =~ m/\b\Q$check_term/ ) {
229				my $freq = $lex_reader->doc_freq(
230				field => $field,
231				term => $term,
232				);
233				$debug and warn "ok term=$term [$freq]";
234				$matches{
235				$maxl
236				? substr(
237				$term, index( $term, $check_term, 0 ),
238				$maxl
239				)
240				: $term
241				} += $freq;
242
243				# abort everything if we've hit our limit
244				if ( scalar( keys %matches ) >= $self->{limit} ) {
245				last INDEX;
246				}
247				}
248				else {
249				$debug
250				and warn
251				" No match - skipping to next CHECK term";
252				next CHECK;
253				}
254
255				last TERM unless $lexicon->next;
256
257				}
258				}
259				}
260				}
261				}
262
263				# boost phrase matches
264				for my $m ( keys %matches ) {
265				next unless $m =~ m/ /;
266				for my $m2 ( keys %matches ) {
267				if ( $m =~ m/\b\Q$m2\E\b/ ) {
268				$matches{$m} += $matches{$m2};
269				}
270				}
271				}
272
273				$debug and warn "matches=" . dump( \%matches );
274
275				return [
276				sort { $matches{$b} <=> $matches{$a} \|\| $a cmp $b }
277				keys %matches
278				];
279				}
280
281				sub _analyze_terms {
282				my ( $self, $schema, $field, $analyzed, $terms ) = @_;
283
284				return if exists $analyzed->{$field};
285
286				my $analyzer = $schema->fetch_analyzer($field);
287				for my $t ( keys %$terms ) {
288				my $baked = $analyzer ? $analyzer->split($t)->[0] : $t;
289				next if length $baked == 1; # too much noise
290				$analyzed->{$field}->{$baked} = $t;
291				}
292				}
293
294				1;
295
296				__END__