File Coverage

blib/lib/Search/Tools/QueryParser.pm

Criterion	Covered	Total	%
statement	233	260	89.6
branch	82	126	65.0
condition	17	27	62.9
subroutine	19	21	90.4
pod	3	3	100.0
total	354	437	81.0

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::QueryParser;
2	26			26		37562	use Moo;
	26					97702
	26					130
3							extends 'Search::Tools::Object';
4	26			26		14375	use Carp;
	26					44
	26					1369
5	26			26		2648	use Data::Dump qw( dump );
	26					20465
	26					972
6	26			26		12873	use Search::Query::Parser;
	26					2398028
	26					844
7	26			26		4830	use Encode;
	26					59574
	26					1793
8	26			26		117	use Data::Dump;
	26					33
	26					1015
9	26			26		10465	use Search::Tools::Query;
	26					56
	26					842
10	26			26		141	use Search::Tools::UTF8;
	26					31
	26					2189
11	26			26		103	use Search::Tools::XML;
	26					37
	26					410
12	26			26		84	use Search::Tools::RegEx;
	26					31
	26					381
13
14	26			26		73	use namespace::autoclean;
	26					30
	26					113
15
16							our $VERSION = '1.004';
17
18							my $XML = Search::Tools::XML->new();
19							my $C2E = $XML->char2ent_map;
20
21							# we turn locale pragma on in a small block
22							# because we don't want it to mess up our regex building
23							# or taint vars in other areas. We just want to use setlocale()
24							# and make sure we get correct ->utf8 encoding
25							my ( $locale, $lang, $charset );
26							{
27	26			26		13171	use POSIX qw(locale_h);
	26					102521
	26					135
28							$locale = setlocale(LC_CTYPE);
29							( $lang, $charset ) = split( m/\./, $locale );
30							$charset \|\|= q/UTF-8/; #
31							$lang = q/en_US/ if $lang =~ m/^(posix\|c)$/i;
32							}
33
34							my %Defaults = (
35							and_word => q/and\|near\d*/,
36							charset => $charset,
37							default_field => "",
38							ignore_case => 1,
39							ignore_fields => {},
40							ignore_first_char => quotemeta(q/'-/),
41							ignore_last_char => quotemeta(q/'-/),
42							lang => $lang,
43							locale => $locale,
44							not_word => q/not/,
45							or_word => q/or/,
46							phrase_delim => q/"/,
47							query_class => 'Search::Tools::Query',
48							query_dialect => "Search::Query::Dialect::Native",
49							stemmer => undef,
50							stopwords => [],
51							tag_re => $XML->tag_re,
52							term_re => qr/\w+(?:[\'\-]\w+)*/,
53							term_min_length => 1,
54							treat_uris_like_phrases => 1,
55							whitespace => $XML->html_whitespace,
56							wildcard => q/*/,
57							word_characters => q/\w/ . quotemeta(q/'-/),
58							);
59
60							for my $attr ( keys %Defaults ) {
61							has( $attr => ( is => 'rw', default => sub { $Defaults{$attr} } ) );
62							}
63							has 'start_bound' => ( is => 'ro' );
64							has 'end_bound' => ( is => 'ro' );
65							has 'plain_phrase_bound' => ( is => 'ro' );
66							has 'html_phrase_bound' => ( is => 'ro' );
67
68							sub get_defaults {
69	0			0	1	0	return {%Defaults};
70							}
71
72							sub BUILD {
73	51			51	1	302	my $self = shift;
74
75							# TODO handle case where both term_re and word_characters are defined
76
77							# charset/locale/lang are a bit interdependent
78							# so make sure charset/lang are set if locale is explicitly passed.
79	51	100				196	if ( $self->{locale} ne $Defaults{locale} ) {
80	1					3	( $self->{lang}, $self->{charset} ) = split( m/\./, $self->{locale} );
81	1	50				4	$self->{lang} = 'en_US' if $self->{lang} =~ m/^(posix\|c)$/i;
82	1		33			2	$self->{charset} \|\|= $Defaults{charset};
83							}
84
85							# make sure ignore_fields is a hash ref
86	51	50				165	if ( ref( $self->{ignore_fields} ) eq 'ARRAY' ) {
87							$self->{ignore_fields}
88	0					0	= { map { $_ => $_ } @{ $self->{ignore_fields} } };
	0					0
	0					0
89							}
90
91	51					162	$self->_setup_regex_builder;
92
93	51					1139	return $self;
94							}
95
96							sub parse {
97	63			63	1	11137	my $self = shift;
98	63					97	my $query_str = shift;
99	63	50				176	confess "query required" unless defined $query_str;
100	63	50				165	if ( ref $query_str ) {
101	0					0	croak "query must be a scalar string";
102							}
103
104							#$query_str = to_utf8( $query_str, $self->charset );
105	63					157	my $extracted = $self->_extract_terms($query_str);
106	63					118	my %regex;
107	63					79	TERM: for my $term ( @{ $extracted->{terms} } ) {
	63					160
108	145					2221	my ( $plain, $html, $escaped ) = $self->_build_regex($term);
109	145					424	my $is_phrase = $term =~ m/\ /;
110	145					155	my @phrase_terms;
111
112							# if the term is a phrase,
113							# build regex for each term in the phrase
114	145	100				325	if ($is_phrase) {
115	32					147	my @pts = split( /\ /, $term );
116	32					72	for my $pt (@pts) {
117	87					1303	my ( $pt_plain, $pt_html, $pt_esc )
118							= $self->_build_regex($pt);
119	87					2908	push @phrase_terms,
120							Search::Tools::RegEx->new(
121							plain => $pt_plain,
122							html => $pt_html,
123							term => $pt,
124							term_re => qr/$pt_esc/i,
125							is_phrase => 0,
126							);
127							}
128							}
129	145					6676	$regex{$term} = Search::Tools::RegEx->new(
130							plain => $plain,
131							html => $html,
132							term => $term,
133							term_re => qr/$escaped/i,
134							is_phrase => $is_phrase,
135							phrase_terms => \@phrase_terms,
136							);
137
138							}
139							return $self->{query_class}->new(
140							dialect => $extracted->{dialect},
141							terms => $extracted->{terms},
142							fields => $extracted->{fields},
143	63					1846	str => to_utf8( $query_str, $self->charset ),
144							regex => \%regex,
145							qp => $self,
146							);
147							}
148
149							sub _extract_terms {
150	63			63		77	my $self = shift;
151	63					75	my $query = shift;
152	63	50				140	confess "need query to extract terms" unless defined $query;
153	63					168	my $stopwords = $self->stopwords;
154	63					129	my $and_word = $self->and_word;
155	63					144	my $or_word = $self->or_word;
156	63					146	my $not_word = $self->not_word;
157	63					247	my $wildcard = $self->wildcard;
158	63					214	my $phrase = $self->phrase_delim;
159	63					122	my $igf = $self->ignore_first_char;
160	63					107	my $igl = $self->ignore_last_char;
161	63					126	my $wordchar = $self->word_characters;
162	63					128	my $default_field = $self->default_field;
163	63					97	my $esc_wildcard = quotemeta($wildcard);
164	63					2860	my $word_re = qr/(($esc_wildcard)?[$wordchar]+($esc_wildcard)?)/;
165	63					323	my $min_length = $self->term_min_length;
166	63					93	my $raw_query = $query;
167
168	63	100				243	$stopwords = [ split( /\s+/, $stopwords ) ] unless ref $stopwords;
169	63					160	my %stophash = map { to_utf8( lc($_), $self->charset ) => 1 } @$stopwords;
	15					61
170	63					135	my ( %words, %uniq, $c );
171	63					1831	my $parser = Search::Query::Parser->new(
172							and_regex => qr{$and_word}i,
173							or_regex => qr{$or_word}i,
174							not_regex => qr{$not_word}i,
175							default_field => $default_field,
176							query_class => $self->query_dialect,
177							);
178
179	63					84616	my $baked_query = $raw_query;
180	63	50				358	$baked_query = lc($baked_query) if $self->ignore_case;
181	63					3455	$baked_query = to_utf8( $baked_query, $self->charset );
182	63	50				216	my $dialect = $parser->parse($baked_query) or croak $parser->error;
183	63	50				111004	$self->debug && carp "parsetree: " . Data::Dump::dump( $dialect->tree );
184	63					9748	my $fields_searched
185							= $self->_get_value_from_tree( \%uniq, $dialect->tree, $c );
186
187	63	50				1382	$self->debug && carp "parsed: " . Data::Dump::dump( \%uniq );
188
189	63					365	my $count = scalar( keys %uniq );
190
191							# parse uniq into word tokens
192							# including removing stop words
193
194	63	50				859	$self->debug && carp "word_re: $word_re";
195
196	63					492	U: for my $u ( sort { $uniq{$a} <=> $uniq{$b} } keys %uniq ) {
	164					272
197
198	152					207	my $n = $uniq{$u};
199
200							# only phrases have space
201							# but due to our word_re, a single non-spaced string
202							# might actually be multiple word tokens
203	152		100			601	my $isphrase = $u =~ m/\s/ \|\| 0;
204
205	152	50				387	if ( $self->treat_uris_like_phrases ) {
206
207							# special case: treat email addresses, uris, as phrase
208	152		100			1751	$isphrase \|\|= $u =~ m/[$wordchar][\@\.\\\/][$wordchar]/ \|\| 0;
			100
209							}
210
211	152	50				2379	$self->debug && carp "$u -> isphrase = $isphrase";
212
213	152					731	my @w = ();
214
215	152					470	TOK: for my $w ( split( m/\s+/, to_utf8( $u, $self->charset ) ) ) {
216
217	199	50				500	next TOK unless $w =~ m/\S/;
218
219	199					559	$w =~ s/\Q$phrase\E//g;
220
221	199					903	while ( $w =~ m/$word_re/g ) {
222	208					300	my $tok = _untaint($1);
223
224							# strip ignorable chars
225	208	50				944	$tok =~ s/^[$igf]+// if length($igf);
226	208	50				802	$tok =~ s/[$igl]+$// if length($igl);
227
228	208	50				322	unless ($tok) {
229	0	0				0	$self->debug && carp "no token for '$w' $word_re";
230	0					0	next TOK;
231							}
232
233	208	50				3124	$self->debug && carp "found token: $tok";
234
235	208	100				1251	if ( exists $stophash{ lc($tok) } ) {
236	13	50				188	$self->debug && carp "$tok = stopword";
237	13	100				81	next TOK unless $isphrase;
238							}
239
240	202	100				345	unless ($isphrase) {
241	115	50				1061	next TOK if $tok =~ m/^($and_word\|$or_word\|$not_word)$/i;
242							}
243
244							# if tainting was on, odd things can happen.
245							# so check one more time
246	202					529	$tok = to_utf8( $tok, $self->charset );
247
248							# final sanity check
249	202	50				430	if ( !Encode::is_utf8($tok) ) {
250	0					0	carp "$tok is NOT utf8";
251	0					0	next TOK;
252							}
253
254							#$self->debug && carp "pushing $tok into wordlist";
255	202					755	push( @w, $tok );
256
257							}
258
259							}
260
261	152	100				375	next U unless @w;
262
263							#$self->debug && carp "joining \@w: " . Data::Dump::dump(\@w);
264	146	100				320	if ($isphrase) {
265	32					145	$words{ join( ' ', @w ) } = $n + $count++;
266							}
267							else {
268	114					188	for (@w) {
269	115					350	$words{$_} = $n + $count++;
270							}
271							}
272
273							}
274
275	63	50				1010	$self->debug && carp "tokenized: " . Data::Dump::dump( \%words );
276
277							# make sure we don't have 'foo' and 'foo*'
278	63					418	for ( keys %words ) {
279	147	100				472	if ( $_ =~ m/$esc_wildcard/ ) {
280	12					87	( my $copy = $_ ) =~ s,$esc_wildcard,,g;
281
282							# delete the more exact of the two
283							# since the * will match both
284	12					25	delete( $words{$copy} );
285							}
286
287	147	100				343	if ( length $_ < $min_length ) {
288	1	50				16	$self->debug and carp "token too short: '$_'";
289	1					6	delete $words{$_};
290							}
291
292							}
293
294	63	50				1381	$self->debug && carp "wildcards removed: " . Data::Dump::dump( \%words );
295
296							# if any words need to be stemmed
297	63	100				430	if ( $self->stemmer ) {
298
299							# split each $word into words
300							# stem each word
301							# if stem ne word, break into chars and find first N common
302							# rejoin $uniq
303
304							#carp "stemming ON\n";
305
306	8					24	K: for ( keys %words ) {
307	15					45	my (@w) = split /\s+/;
308	15					19	W: for my $w (@w) {
309	27					37	my $func = $self->stemmer;
310	27					47	my $f = &$func( $self, $w );
311	27	50	33			149	if ( !defined $f or !length $f ) {
312	0					0	next W;
313							}
314	27					39	$f = to_utf8($f);
315
316							#warn "w: $w\nf: $f\n";
317
318							# add wildcard to indicate chars were lost
319	27					55	$w = $f . $wildcard;
320
321							}
322	15					22	my $new = join ' ', @w;
323	15	50				29	if ( $new ne $_ ) {
324	15					31	$words{$new} = $words{$_};
325	15					32	delete $words{$_};
326							}
327							}
328
329							}
330
331	63	50				896	$self->debug && carp "stemming done: " . Data::Dump::dump( \%words );
332
333							# sort keeps query in same order as we entered
334							return {
335	63					595	terms => [ sort { $words{$a} <=> $words{$b} } keys %words ],
	145					489
336							fields => [ keys %$fields_searched ],
337							dialect => $dialect,
338							query => $raw_query,
339							};
340
341							}
342
343							# stolen nearly verbatim from Taint::Runtime
344							# apparently regex can be tainted when running under 'use locale'.
345							# as of version 0.24 this should not be needed but until I can find a way
346							# to easily test the Taint feature, we just do this. It's low overhead.
347							sub _untaint {
348	208			208		308	my $str = shift;
349	208	50				344	my $ref = ref($str) ? $str : \$str;
350	208	50				335	if ( !defined $$ref ) {
351	0					0	$$ref = undef;
352							}
353							else {
354							$$ref
355							= ( $$ref =~ /(.*)/ )
356							? $1
357	208	50				592	: do { confess("Couldn't find data to untaint") };
	0					0
358							}
359	208	50				406	return ref($str) ? 1 : $str;
360							}
361
362							sub _get_value_from_tree {
363	63			63		2622	my $self = shift;
364	63					78	my $uniq = shift;
365	63					65	my $parseTree = shift;
366	63					79	my $c = shift;
367	63					110	my %fields = ();
368
369							# we only want the values from non minus queries
370	63					105	for my $node ( '+', '' ) {
371	126	100				295	next unless exists $parseTree->{$node};
372
373	63					73	my @branches = @{ $parseTree->{$node} };
	63					207
374
375							#warn dump \@branches;
376
377	63					383	for my $leaf (@branches) {
378	150					182	my $v = $leaf->{value};
379	150	50				236	if ( !defined $v ) {
380	0					0	croak "undefined value in query tree: " . dump($leaf);
381							}
382	150	50	66			401	if ( defined $leaf->{field}
383							and exists $self->ignore_fields->{ $leaf->{field} } )
384							{
385	0					0	next;
386							}
387	150					137	my $field = $leaf->{field};
388	150	100				236	if ( defined $field ) {
389	3					9	$fields{$field}++;
390							}
391	150	50				372	if ( ref $v eq 'HASH' ) {
		100
392	0					0	my $f = $self->_get_value_from_tree( $uniq, $v, $c );
393	0					0	$fields{$_} = $f->{$_} for ( keys %$f );
394							}
395							elsif ( ref $v eq 'ARRAY' ) {
396	1					3	for my $value (@$v) {
397	2					28	$value =~ s/\s+/ /g;
398	2					7	$uniq->{$value} = ++$c;
399							}
400							}
401							else {
402
403							# if the $leaf is a proximity query,
404							# ignore the "phrase-ness" of it and split
405							# on whitespace. This is a compromise,
406							# mitigated by the tendency of HeatMap
407							# to reward proximity anyway.
408	149	100	66			316	if ( $leaf->{proximity} and $leaf->{proximity} > 1 ) {
409	1					7	my @tokens = split( m/\ +/, $v );
410	1					5	$uniq->{$_} = ++$c for @tokens;
411	1					3	next;
412							}
413
414							# collapse any whitespace
415	148					321	$v =~ s,\s+,\ ,g;
416
417	148					385	$uniq->{$v} = ++$c;
418							}
419							}
420							}
421	63					124	return \%fields;
422							}
423
424							sub _setup_regex_builder {
425	51			51		63	my $self = shift;
426
427							# TODO optional for term_re
428
429							# a search for a '<' or '>' should still highlight,
430							# since < or > can be indexed as literal < and >
431							# but this causes a great deal of hassle
432							# so we just ignore them.
433	51					284	my $wordchars = $self->word_characters;
434	51					117	$wordchars =~ s,[<>&],,g;
435	51					89	$self->{html_safe_wordchars} = $wordchars; # remember for build
436	51					128	my $ignore_first = $self->ignore_first_char;
437	51					108	my $ignore_last = $self->ignore_last_char;
438	51					105	my $html_whitespace = $self->whitespace;
439
440							# what's the boundary between a word and a not-word?
441							# by default:
442							# the beginning of a string
443							# the end of a string
444							# whatever we've defined as WhiteSpace
445							# any character that is not a WordChar
446							# any character we explicitly ignore at start or end of word
447							#
448							# the \A and \Z (beginning and end) should help if the word butts up
449							# against the beginning or end of a tagset
450							# like <p>Word or Word</p>
451
452	51					182	my @start_bound = (
453							'\A',
454							'[>]',
455							'(?:&[\w\#]+;)', # because a ; might be a legitimate wordchar
456							# and we treat a char entity like a single char.
457							# if &char; resolves to a legit wordchar
458							# this might give unexpected results.
459							# NOTE that &nbsp; etc is in $WhiteSpace
460							$html_whitespace,
461							'[^' . $wordchars . ']'
462							);
463	51	50				709	push( @start_bound, qr/[$ignore_first]+/i ) if length $ignore_first;
464
465							my @end_bound
466	51					266	= ( '\Z', '[<&]', $html_whitespace, '[^' . $wordchars . ']' );
467	51	50				332	push( @end_bound, qr/[$ignore_last]+/i ) if length $ignore_last;
468
469	51		33			397	$self->{start_bound} \|\|= join( '\|', @start_bound );
470
471	51		33			274	$self->{end_bound} \|\|= join( '\|', @end_bound );
472
473							# the whitespace in a query phrase might be:
474							# any ignore_last_char, followed by
475							# one or more nonwordchar or whitespace, followed by
476							# any ignore_first_char
477							# define for both text and html
478							# NOTE the first/last swap for plain vs html
479							# is intentional because of how regex are built.
480
481	51	50				13321	my @plain_phrase_bound = (
		50
482							( length($ignore_last) ? qr/[$ignore_last]*/i : '' ),
483							qr/(?:[\s\x20]\|[^$wordchars])+/is,
484							( length($ignore_first) ? qr/[$ignore_first]?/i : '' ),
485							);
486	51					2442	$self->{plain_phrase_bound} = join( '', @plain_phrase_bound );
487
488	51	50				2381	my @html_phrase_bound = (
		50
489							( length($ignore_first) ? qr/[$ignore_first]*/i : '' ),
490							qr/(?:$html_whitespace\|[^$wordchars])+/is,
491							( length($ignore_last) ? qr/[$ignore_last]?/i : '' ),
492							);
493	51					149527	$self->{html_phrase_bound} = join( '', @html_phrase_bound );
494
495							}
496
497							sub _build_regex {
498	232			232		254	my $self = shift;
499	232	50				451	my $q = shift or croak "need query to build()";
500	232					306	my $wild = $self->{html_safe_wordchars};
501	232					235	my $st_bound = $self->{start_bound};
502	232					202	my $end_bound = $self->{end_bound};
503	232					219	my $wc = $self->{html_safe_wordchars};
504	232					210	my $ppb = $self->{plain_phrase_bound};
505	232					212	my $hpb = $self->{html_phrase_bound};
506	232					417	my $wildcard = $self->wildcard;
507	232					305	my $wild_esc = quotemeta($wildcard);
508	232					335	my $tag_re = $self->tag_re;
509
510							# define simple pattern for plain text
511							# and complex pattern for HTML markup
512	232					194	my ( $plain, $html );
513	232					251	my $escaped = quotemeta($q);
514	232					985	$escaped =~ s/\\[$wild_esc]/[$wc]*/g; # wildcard
515	232					493	$escaped =~ s/\\[\s]/$ppb/g; # whitespace
516
517	232					71028	$plain = qr/
518							(
519							\A\|$ppb
520							)
521							(
522							${escaped}
523							)
524							(
525							\Z\|$ppb
526							)
527							/xis;
528
529	232					9852	my (@char) = split( m//, $q );
530
531	232					282	my $counter = -1;
532
533	232					409	CHAR: foreach my $c (@char) {
534	1461					1007	$counter++;
535
536	1461		100			3040	my $ent = $C2E->{$c} \|\| undef;
537	1461					1409	my $num = ord($c);
538
539							# if this is a special regexp char, protect it
540	1461					1144	$c = quotemeta($c);
541
542							# if it's a *, replace it with the Wild class
543	1461	100				2113	$c = "[$wild]*" if $c eq $wild_esc;
544
545	1461	100				1804	if ( $c eq '\ ' ) {
546	55					132	$c = $hpb . $tag_re . '*';
547	55					101	next CHAR;
548							}
549
550	1406					872	my $aka;
551	1406	100				1519	if ($ent) {
552	1404	100				2284	$aka = $ent eq "&#$num;" ? $ent : "$ent\|&#$num;";
553							}
554							else {
555	2					5	$aka = "&#$num;";
556							}
557
558							# make $c into a regexp
559	1406	100				12524	$c = qr/$c\|$aka/i unless $c eq "[$wild]*";
560
561							# any char might be followed by zero or more tags, unless it's the last char
562	1406	100				4770	$c .= $tag_re . '*' unless $counter == $#char;
563
564							}
565
566							# re-join the chars into a single string
567	232					674	my $safe = join( "\n", @char ); # use \n to make it legible in debugging
568
569							# for debugging legibility we include newlines, so make sure we s//x in matches
570	232					147590	$html = qr/
571							(
572							${st_bound}
573							)
574							(
575							${safe}
576							)
577							(
578							${end_bound}
579							)
580							/xis;
581
582	232					6855	return ( $plain, $html, $escaped );
583							}
584
585							sub _build_term_re {
586
587							# this based on SWISH::PhraseHighlight::set_match_regexp()
588
589	0			0			my $self = shift;
590
591							#dump $self;
592
593	0						my $wc = $self->word_characters;
594							$self->{_wc_regexp}
595	0						= qr/[^$wc]+/io; # regexp for splitting into swish-words
596
597	0						my $igf = $self->ignore_first_char;
598	0						my $igl = $self->ignore_last_char;
599	0						for ( $igf, $igl ) {
600	0	0					if ($_) {
601	0						$_ = "[$_]*";
602							}
603							else {
604	0						$_ = '';
605							}
606							}
607
608	0						$self->{_ignoreFirst} = $igf;
609	0						$self->{_ignoreLast} = $igl;
610
611							}
612
613							1;
614
615							__END__