File Coverage

blib/lib/Search/Tools/Query.pm

Criterion	Covered	Total	%
statement	114	126	90.4
branch	15	24	62.5
condition			n/a
subroutine	24	25	96.0
pod	11	11	100.0
total	164	186	88.1

line	stmt	bran	sub	pod	time	code
1						package Search::Tools::Query;
2	26		26		1143	use Moo;
	26				9095
	26				128
3						extends 'Search::Tools::Object';
4						use overload
5	9		9		766	'""' => sub { $_[0]->str; },
6	36		36		5327	'bool' => sub {1},
7	26		26		8887	fallback => 1;
	26				59
	26				239
8	26		26		1908	use Carp;
	26				56
	26				1342
9	26		26		145	use Data::Dump qw( dump );
	26				65
	26				1001
10	26		26		8772	use Search::Tools::RegEx;
	26				66
	26				702
11	26		26		3874	use Search::Tools::UTF8;
	26				61
	26				2362
12	26		26		4046	use Search::Tools::Tokenizer;
	26				67
	26				668
13	26		26		4666	use Search::Tools::XML;
	26				59
	26				725
14
15	26		26		153	use namespace::autoclean;
	26				51
	26				102
16
17						our $VERSION = '1.006';
18
19						has 'terms' => ( is => 'ro' );
20						has 'fields' => ( is => 'ro' );
21						has 'dialect' => ( is => 'ro' );
22						has 'str' => ( is => 'ro' );
23						has 'regex' => ( is => 'ro' );
24						has 'qp' => ( is => 'ro' );
25
26						=head1 NAME
27
28						Search::Tools::Query - objectified string for highlighting, snipping, etc.
29
30						=head1 SYNOPSIS
31
32						use Search::Tools::QueryParser;
33						my $qparser = Search::Tools::QueryParser->new;
34						my $query = $qparser->parse(q(the quick color:brown "fox jumped"));
35						my $fields = $query->fields; # ['color']
36						my $terms = $query->terms; # ['quick', 'brown', '"fox jumped"']
37						my $regex = $query->regex_for($terms->[0]); # S::T::RegEx
38						my $tree = $query->tree; # the Search::Query::Dialect tree()
39						print "$query\n"; # the quick color:brown "fox jumped"
40						print $query->str . "\n"; # same thing
41
42
43						=head1 DESCRIPTION
44
45
46						=head1 METHODS
47
48						=head2 fields
49
50						Array ref of fields from the original query string.
51						See Search::Tools::QueryParser for controls over ignore_fields().
52
53						=head2 terms
54
55						Array ref of key words from the original query string.
56						See Search::Tools::QueryParser for controls over ignore_fields()
57						and tokenizing regex.
58
59						B
60						Only positive words are extracted by QueryParser.
61						In other words, if you search for:
62
63						foo not bar
64
65						then only C is returned. Likewise:
66
67						+foo -bar
68
69						would return only C.
70
71						=head2 str
72
73						The original string.
74
75						=head2 regex
76
77						The hash ref of terms to Search::Tools::RegEx objects.
78
79						=head2 dialect
80
81						The internal Search::Query::Dialect object. See tree()
82						and str_clean() which delegate to the dialect object.
83
84						=head2 qp
85
86						The Search::Tools::QueryParser object used to generate the Query.
87
88						=head2 num_terms
89
90						Returns the number of terms().
91
92						=cut
93
94						sub num_terms {
95	38		38	1	1511	return scalar @{ shift->{terms} };
	38				109
96						}
97
98						=head2 unique_terms
99
100						Returns array ref of unique terms from query.
101						If stemming was on in the QueryParser,
102						all terms have already been stemmed as part
103						of the parsing process.
104
105						=cut
106
107						sub unique_terms {
108	2		2	1	3	my $self = shift;
109	2				3	my @t = @{ $self->{terms} };
	2				7
110	2				3	my %uniq;
111	2				4	for my $t (@t) {
112	4				8	my $re = $self->regex_for($t);
113	4	50			9	if ( $re->is_phrase ) {
114	4				5	for my $pt ( @{ $re->phrase_terms } ) {
	4				9
115	8				20	$uniq{ $pt->term }++;
116						}
117						}
118						else {
119	0				0	$uniq{ $re->term }++;
120						}
121						}
122	2				17	return [ keys %uniq ];
123						}
124
125						=head2 num_unique_terms
126
127						Returns number of unique_terms().
128
129						=cut
130
131						sub num_unique_terms {
132	2		2	1	4	return scalar( @{ $_[0]->unique_terms } );
	2				5
133						}
134
135						=head2 phrases
136
137						Return array ref of RegEx objects for all terms where is_phrase
138						is true.
139
140						=cut
141
142						sub phrases {
143	30		30	1	53	my $self = shift;
144	30				53	my @p;
145	30				47	for my $t ( keys %{ $self->{regex} } ) {
	30				129
146	57	100			205	if ( $self->{regex}->{$t}->is_phrase ) {
147	21				49	push @p, $self->{regex}->{$t};
148						}
149						}
150	30				106	return \@p;
151						}
152
153						=head2 non_phrases
154
155						Return array ref of RegEx objects for all terms where is_phrase
156						is false.
157
158						=cut
159
160						sub non_phrases {
161	0		0	1	0	my $self = shift;
162	0				0	my @p;
163	0				0	for my $t ( keys %{ $self->{regex} } ) {
	0				0
164	0	0			0	if ( !$self->{regex}->{$t}->is_phrase ) {
165	0				0	push @p, $self->{regex}->{$t};
166						}
167						}
168	0				0	return \@p;
169						}
170
171						=head2 tree
172
173						Returns the internal Search::Query::Dialect tree().
174
175						=cut
176
177						sub tree {
178	1		1	1	514	my $self = shift;
179	1				5	return $self->dialect->tree();
180						}
181
182						=head2 str_clean
183
184						Returns the internal Search::Query::Dialect stringify().
185
186						=cut
187
188						sub str_clean {
189	1		1	1	2	my $self = shift;
190	1				7	return $self->dialect->stringify();
191						}
192
193						=head2 regex_for(I)
194
195						Returns a Search::Tools::RegEx object for I.
196
197						=cut
198
199						sub regex_for {
200	116		116	1	25219	my $self = shift;
201	116				164	my $term = shift;
202	116	50			247	unless ( defined $term ) {
203	0				0	croak "term required";
204						}
205	116	50			254	my $regex = $self->{regex} or croak "regex not defined for query";
206	116	50			254	if ( !exists $regex->{$term} ) {
207	0				0	croak "no regex for $term";
208						}
209	116				348	return $regex->{$term};
210						}
211
212						=head2 regexp_for
213
214						Alias for regex_for(). The author has come to prefer "regex"
215						instead of "regexp" because it's one less keystroke.
216
217						=cut
218
219						*regexp_for = \®ex_for;
220
221						=head2 matches_text( I )
222
223						Returns the number of matches for the query against I.
224
225						=head2 matches_html( I )
226
227						Returns the number of matches for the query against I.
228
229						=cut
230
231						sub _matches_stemmed {
232	4		4		7	my $self = shift;
233	4				14	my $text = to_utf8( $_[0] );
234	4				7	my $count = 0;
235	4				8	my $qp = $self->qp;
236	4				10	my $stemmer = $qp->stemmer;
237	4				8	my $wildcard = $qp->wildcard;
238	4				88	my $tokenizer = Search::Tools::Tokenizer->new(
239						re => $qp->term_re,
240						debug => $self->debug,
241						);
242
243						# stem the whole text, creating a new buffer to
244						# match against. This covers both the cases where
245						# a term is a phrase and where it is not.
246	4				7	my @buf;
247						my $buf_maker = sub {
248	30		30		290	push @buf, $stemmer->( $qp, $_[0]->str );
249	4				18	};
250	4				148	$tokenizer->tokenize( $text, $buf_maker );
251	4				106	my $new_text = join( " ", @buf );
252
253	4				7	for my $term ( @{ $self->{terms} } ) {
	4				12
254	4				10	my $re = $self->{regex}->{$term}->{plain};
255	4				57	$count += $new_text =~ m/$re/;
256						}
257	4				44	return $count;
258						}
259
260						sub _matches {
261	2		2		4	my $self = shift;
262	2				3	my $style = shift;
263	2				7	my $text = to_utf8( $_[0] );
264	2				3	my $count = 0;
265	2				4	for my $term ( @{ $self->{terms} } ) {
	2				4
266	2				6	my $regex = $self->{regex}->{$term}->{$style};
267	2				31	$count += $text =~ m/$regex/;
268						}
269	2				8	return $count;
270						}
271
272						sub matches_text {
273	3		3	1	730	my $self = shift;
274	3				6	my $text = shift;
275	3	50			10	if ( !defined $text ) {
276	0				0	croak "text required";
277						}
278	3	100			17	return $self->_matches_stemmed($text) if $self->qp->stemmer;
279	1				4	return $self->_matches( 'plain', $text );
280						}
281
282						sub matches_html {
283	3		3	1	219	my $self = shift;
284	3				5	my $html = shift;
285	3	50			10	if ( !defined $html ) {
286	0				0	croak "html required";
287						}
288	3	100			15	if ( $self->qp->stemmer ) {
289	2				15	return $self->_matches_stemmed( Search::Tools::XML->no_html($html) );
290						}
291	1				3	return $self->_matches( 'html', $html );
292						}
293
294						=head2 terms_as_regex([I])
295
296						Returns all terms() as a single qr// regex, pipe-joined in a "OR"
297						logic.
298
299						=cut
300
301						sub terms_as_regex {
302	31		31	1	56	my $self = shift;
303	31				42	my $treat_phrases_as_singles = shift;
304	31	50			76	$treat_phrases_as_singles = 1 unless defined $treat_phrases_as_singles;
305	31				91	my $wildcard = $self->qp->wildcard;
306	31				74	my $wild_esc = quotemeta($wildcard);
307	31				72	my $wc = $self->qp->word_characters;
308	31				45	my @re;
309	31				44	for my $term ( @{ $self->{terms} } ) {
	31				104
310
311	65				152	my $q = quotemeta($term); # quotemeta speeds up the match, too
312						# even though we have to unquote below
313
314	65				288	$q =~ s/\\$wild_esc/[$wc]*/g; # wildcard match is very approximate
315
316						# treat phrases like OR'd words
317						# since that will just create more matches.
318						# if hiliting later, the phrase will be treated as such.
319	65	100			129	if ($treat_phrases_as_singles) {
320	46				121	$q =~ s/(\\ )+/\\|/g;
321						}
322
323						# if keeping phrases together use a less-naive regex instead of a space.
324						else {
325
326						#$q = $self->regex_for($term)->plain();
327						#$q =~ s/(\\ )+/[^$wc]+/g;
328						}
329
330	65				173	push( @re, $q );
331						}
332
333	31				149	my $j = sprintf( '(%s)', join( '\|', @re ) );
334	31				1061	return qr/$j/i;
335						}
336
337						1;
338
339						__END__