File Coverage

blib/lib/Search/Tools/QueryParser.pm
Criterion Covered Total %
statement 233 260 89.6
branch 82 126 65.0
condition 17 27 62.9
subroutine 19 21 90.4
pod 3 3 100.0
total 354 437 81.0


line stmt bran cond sub pod time code
1             package Search::Tools::QueryParser;
2 26     26   37562 use Moo;
  26         97702  
  26         130  
3             extends 'Search::Tools::Object';
4 26     26   14375 use Carp;
  26         44  
  26         1369  
5 26     26   2648 use Data::Dump qw( dump );
  26         20465  
  26         972  
6 26     26   12873 use Search::Query::Parser;
  26         2398028  
  26         844  
7 26     26   4830 use Encode;
  26         59574  
  26         1793  
8 26     26   117 use Data::Dump;
  26         33  
  26         1015  
9 26     26   10465 use Search::Tools::Query;
  26         56  
  26         842  
10 26     26   141 use Search::Tools::UTF8;
  26         31  
  26         2189  
11 26     26   103 use Search::Tools::XML;
  26         37  
  26         410  
12 26     26   84 use Search::Tools::RegEx;
  26         31  
  26         381  
13              
14 26     26   73 use namespace::autoclean;
  26         30  
  26         113  
15              
16             our $VERSION = '1.004';
17              
18             my $XML = Search::Tools::XML->new();
19             my $C2E = $XML->char2ent_map;
20              
21             # we turn locale pragma on in a small block
22             # because we don't want it to mess up our regex building
23             # or taint vars in other areas. We just want to use setlocale()
24             # and make sure we get correct ->utf8 encoding
25             my ( $locale, $lang, $charset );
26             {
27 26     26   13171 use POSIX qw(locale_h);
  26         102521  
  26         135  
28             $locale = setlocale(LC_CTYPE);
29             ( $lang, $charset ) = split( m/\./, $locale );
30             $charset ||= q/UTF-8/; #
31             $lang = q/en_US/ if $lang =~ m/^(posix|c)$/i;
32             }
33              
34             my %Defaults = (
35             and_word => q/and|near\d*/,
36             charset => $charset,
37             default_field => "",
38             ignore_case => 1,
39             ignore_fields => {},
40             ignore_first_char => quotemeta(q/'-/),
41             ignore_last_char => quotemeta(q/'-/),
42             lang => $lang,
43             locale => $locale,
44             not_word => q/not/,
45             or_word => q/or/,
46             phrase_delim => q/"/,
47             query_class => 'Search::Tools::Query',
48             query_dialect => "Search::Query::Dialect::Native",
49             stemmer => undef,
50             stopwords => [],
51             tag_re => $XML->tag_re,
52             term_re => qr/\w+(?:[\'\-]\w+)*/,
53             term_min_length => 1,
54             treat_uris_like_phrases => 1,
55             whitespace => $XML->html_whitespace,
56             wildcard => q/*/,
57             word_characters => q/\w/ . quotemeta(q/'-/),
58             );
59              
60             for my $attr ( keys %Defaults ) {
61             has( $attr => ( is => 'rw', default => sub { $Defaults{$attr} } ) );
62             }
63             has 'start_bound' => ( is => 'ro' );
64             has 'end_bound' => ( is => 'ro' );
65             has 'plain_phrase_bound' => ( is => 'ro' );
66             has 'html_phrase_bound' => ( is => 'ro' );
67              
68             sub get_defaults {
69 0     0 1 0 return {%Defaults};
70             }
71              
72             sub BUILD {
73 51     51 1 302 my $self = shift;
74              
75             # TODO handle case where both term_re and word_characters are defined
76              
77             # charset/locale/lang are a bit interdependent
78             # so make sure charset/lang are set if locale is explicitly passed.
79 51 100       196 if ( $self->{locale} ne $Defaults{locale} ) {
80 1         3 ( $self->{lang}, $self->{charset} ) = split( m/\./, $self->{locale} );
81 1 50       4 $self->{lang} = 'en_US' if $self->{lang} =~ m/^(posix|c)$/i;
82 1   33     2 $self->{charset} ||= $Defaults{charset};
83             }
84              
85             # make sure ignore_fields is a hash ref
86 51 50       165 if ( ref( $self->{ignore_fields} ) eq 'ARRAY' ) {
87             $self->{ignore_fields}
88 0         0 = { map { $_ => $_ } @{ $self->{ignore_fields} } };
  0         0  
  0         0  
89             }
90              
91 51         162 $self->_setup_regex_builder;
92              
93 51         1139 return $self;
94             }
95              
96             sub parse {
97 63     63 1 11137 my $self = shift;
98 63         97 my $query_str = shift;
99 63 50       176 confess "query required" unless defined $query_str;
100 63 50       165 if ( ref $query_str ) {
101 0         0 croak "query must be a scalar string";
102             }
103              
104             #$query_str = to_utf8( $query_str, $self->charset );
105 63         157 my $extracted = $self->_extract_terms($query_str);
106 63         118 my %regex;
107 63         79 TERM: for my $term ( @{ $extracted->{terms} } ) {
  63         160  
108 145         2221 my ( $plain, $html, $escaped ) = $self->_build_regex($term);
109 145         424 my $is_phrase = $term =~ m/\ /;
110 145         155 my @phrase_terms;
111              
112             # if the term is a phrase,
113             # build regex for each term in the phrase
114 145 100       325 if ($is_phrase) {
115 32         147 my @pts = split( /\ /, $term );
116 32         72 for my $pt (@pts) {
117 87         1303 my ( $pt_plain, $pt_html, $pt_esc )
118             = $self->_build_regex($pt);
119 87         2908 push @phrase_terms,
120             Search::Tools::RegEx->new(
121             plain => $pt_plain,
122             html => $pt_html,
123             term => $pt,
124             term_re => qr/$pt_esc/i,
125             is_phrase => 0,
126             );
127             }
128             }
129 145         6676 $regex{$term} = Search::Tools::RegEx->new(
130             plain => $plain,
131             html => $html,
132             term => $term,
133             term_re => qr/$escaped/i,
134             is_phrase => $is_phrase,
135             phrase_terms => \@phrase_terms,
136             );
137              
138             }
139             return $self->{query_class}->new(
140             dialect => $extracted->{dialect},
141             terms => $extracted->{terms},
142             fields => $extracted->{fields},
143 63         1846 str => to_utf8( $query_str, $self->charset ),
144             regex => \%regex,
145             qp => $self,
146             );
147             }
148              
149             sub _extract_terms {
150 63     63   77 my $self = shift;
151 63         75 my $query = shift;
152 63 50       140 confess "need query to extract terms" unless defined $query;
153 63         168 my $stopwords = $self->stopwords;
154 63         129 my $and_word = $self->and_word;
155 63         144 my $or_word = $self->or_word;
156 63         146 my $not_word = $self->not_word;
157 63         247 my $wildcard = $self->wildcard;
158 63         214 my $phrase = $self->phrase_delim;
159 63         122 my $igf = $self->ignore_first_char;
160 63         107 my $igl = $self->ignore_last_char;
161 63         126 my $wordchar = $self->word_characters;
162 63         128 my $default_field = $self->default_field;
163 63         97 my $esc_wildcard = quotemeta($wildcard);
164 63         2860 my $word_re = qr/(($esc_wildcard)?[$wordchar]+($esc_wildcard)?)/;
165 63         323 my $min_length = $self->term_min_length;
166 63         93 my $raw_query = $query;
167              
168 63 100       243 $stopwords = [ split( /\s+/, $stopwords ) ] unless ref $stopwords;
169 63         160 my %stophash = map { to_utf8( lc($_), $self->charset ) => 1 } @$stopwords;
  15         61  
170 63         135 my ( %words, %uniq, $c );
171 63         1831 my $parser = Search::Query::Parser->new(
172             and_regex => qr{$and_word}i,
173             or_regex => qr{$or_word}i,
174             not_regex => qr{$not_word}i,
175             default_field => $default_field,
176             query_class => $self->query_dialect,
177             );
178              
179 63         84616 my $baked_query = $raw_query;
180 63 50       358 $baked_query = lc($baked_query) if $self->ignore_case;
181 63         3455 $baked_query = to_utf8( $baked_query, $self->charset );
182 63 50       216 my $dialect = $parser->parse($baked_query) or croak $parser->error;
183 63 50       111004 $self->debug && carp "parsetree: " . Data::Dump::dump( $dialect->tree );
184 63         9748 my $fields_searched
185             = $self->_get_value_from_tree( \%uniq, $dialect->tree, $c );
186              
187 63 50       1382 $self->debug && carp "parsed: " . Data::Dump::dump( \%uniq );
188              
189 63         365 my $count = scalar( keys %uniq );
190              
191             # parse uniq into word tokens
192             # including removing stop words
193              
194 63 50       859 $self->debug && carp "word_re: $word_re";
195              
196 63         492 U: for my $u ( sort { $uniq{$a} <=> $uniq{$b} } keys %uniq ) {
  164         272  
197              
198 152         207 my $n = $uniq{$u};
199              
200             # only phrases have space
201             # but due to our word_re, a single non-spaced string
202             # might actually be multiple word tokens
203 152   100     601 my $isphrase = $u =~ m/\s/ || 0;
204              
205 152 50       387 if ( $self->treat_uris_like_phrases ) {
206              
207             # special case: treat email addresses, uris, as phrase
208 152   100     1751 $isphrase ||= $u =~ m/[$wordchar][\@\.\\\/][$wordchar]/ || 0;
      100        
209             }
210              
211 152 50       2379 $self->debug && carp "$u -> isphrase = $isphrase";
212              
213 152         731 my @w = ();
214              
215 152         470 TOK: for my $w ( split( m/\s+/, to_utf8( $u, $self->charset ) ) ) {
216              
217 199 50       500 next TOK unless $w =~ m/\S/;
218              
219 199         559 $w =~ s/\Q$phrase\E//g;
220              
221 199         903 while ( $w =~ m/$word_re/g ) {
222 208         300 my $tok = _untaint($1);
223              
224             # strip ignorable chars
225 208 50       944 $tok =~ s/^[$igf]+// if length($igf);
226 208 50       802 $tok =~ s/[$igl]+$// if length($igl);
227              
228 208 50       322 unless ($tok) {
229 0 0       0 $self->debug && carp "no token for '$w' $word_re";
230 0         0 next TOK;
231             }
232              
233 208 50       3124 $self->debug && carp "found token: $tok";
234              
235 208 100       1251 if ( exists $stophash{ lc($tok) } ) {
236 13 50       188 $self->debug && carp "$tok = stopword";
237 13 100       81 next TOK unless $isphrase;
238             }
239              
240 202 100       345 unless ($isphrase) {
241 115 50       1061 next TOK if $tok =~ m/^($and_word|$or_word|$not_word)$/i;
242             }
243              
244             # if tainting was on, odd things can happen.
245             # so check one more time
246 202         529 $tok = to_utf8( $tok, $self->charset );
247              
248             # final sanity check
249 202 50       430 if ( !Encode::is_utf8($tok) ) {
250 0         0 carp "$tok is NOT utf8";
251 0         0 next TOK;
252             }
253              
254             #$self->debug && carp "pushing $tok into wordlist";
255 202         755 push( @w, $tok );
256              
257             }
258              
259             }
260              
261 152 100       375 next U unless @w;
262              
263             #$self->debug && carp "joining \@w: " . Data::Dump::dump(\@w);
264 146 100       320 if ($isphrase) {
265 32         145 $words{ join( ' ', @w ) } = $n + $count++;
266             }
267             else {
268 114         188 for (@w) {
269 115         350 $words{$_} = $n + $count++;
270             }
271             }
272              
273             }
274              
275 63 50       1010 $self->debug && carp "tokenized: " . Data::Dump::dump( \%words );
276              
277             # make sure we don't have 'foo' and 'foo*'
278 63         418 for ( keys %words ) {
279 147 100       472 if ( $_ =~ m/$esc_wildcard/ ) {
280 12         87 ( my $copy = $_ ) =~ s,$esc_wildcard,,g;
281              
282             # delete the more exact of the two
283             # since the * will match both
284 12         25 delete( $words{$copy} );
285             }
286              
287 147 100       343 if ( length $_ < $min_length ) {
288 1 50       16 $self->debug and carp "token too short: '$_'";
289 1         6 delete $words{$_};
290             }
291              
292             }
293              
294 63 50       1381 $self->debug && carp "wildcards removed: " . Data::Dump::dump( \%words );
295              
296             # if any words need to be stemmed
297 63 100       430 if ( $self->stemmer ) {
298              
299             # split each $word into words
300             # stem each word
301             # if stem ne word, break into chars and find first N common
302             # rejoin $uniq
303              
304             #carp "stemming ON\n";
305              
306 8         24 K: for ( keys %words ) {
307 15         45 my (@w) = split /\s+/;
308 15         19 W: for my $w (@w) {
309 27         37 my $func = $self->stemmer;
310 27         47 my $f = &$func( $self, $w );
311 27 50 33     149 if ( !defined $f or !length $f ) {
312 0         0 next W;
313             }
314 27         39 $f = to_utf8($f);
315              
316             #warn "w: $w\nf: $f\n";
317              
318             # add wildcard to indicate chars were lost
319 27         55 $w = $f . $wildcard;
320              
321             }
322 15         22 my $new = join ' ', @w;
323 15 50       29 if ( $new ne $_ ) {
324 15         31 $words{$new} = $words{$_};
325 15         32 delete $words{$_};
326             }
327             }
328              
329             }
330              
331 63 50       896 $self->debug && carp "stemming done: " . Data::Dump::dump( \%words );
332              
333             # sort keeps query in same order as we entered
334             return {
335 63         595 terms => [ sort { $words{$a} <=> $words{$b} } keys %words ],
  145         489  
336             fields => [ keys %$fields_searched ],
337             dialect => $dialect,
338             query => $raw_query,
339             };
340              
341             }
342              
343             # stolen nearly verbatim from Taint::Runtime
344             # apparently regex can be tainted when running under 'use locale'.
345             # as of version 0.24 this should not be needed but until I can find a way
346             # to easily test the Taint feature, we just do this. It's low overhead.
347             sub _untaint {
348 208     208   308 my $str = shift;
349 208 50       344 my $ref = ref($str) ? $str : \$str;
350 208 50       335 if ( !defined $$ref ) {
351 0         0 $$ref = undef;
352             }
353             else {
354             $$ref
355             = ( $$ref =~ /(.*)/ )
356             ? $1
357 208 50       592 : do { confess("Couldn't find data to untaint") };
  0         0  
358             }
359 208 50       406 return ref($str) ? 1 : $str;
360             }
361              
362             sub _get_value_from_tree {
363 63     63   2622 my $self = shift;
364 63         78 my $uniq = shift;
365 63         65 my $parseTree = shift;
366 63         79 my $c = shift;
367 63         110 my %fields = ();
368              
369             # we only want the values from non minus queries
370 63         105 for my $node ( '+', '' ) {
371 126 100       295 next unless exists $parseTree->{$node};
372              
373 63         73 my @branches = @{ $parseTree->{$node} };
  63         207  
374              
375             #warn dump \@branches;
376              
377 63         383 for my $leaf (@branches) {
378 150         182 my $v = $leaf->{value};
379 150 50       236 if ( !defined $v ) {
380 0         0 croak "undefined value in query tree: " . dump($leaf);
381             }
382 150 50 66     401 if ( defined $leaf->{field}
383             and exists $self->ignore_fields->{ $leaf->{field} } )
384             {
385 0         0 next;
386             }
387 150         137 my $field = $leaf->{field};
388 150 100       236 if ( defined $field ) {
389 3         9 $fields{$field}++;
390             }
391 150 50       372 if ( ref $v eq 'HASH' ) {
    100          
392 0         0 my $f = $self->_get_value_from_tree( $uniq, $v, $c );
393 0         0 $fields{$_} = $f->{$_} for ( keys %$f );
394             }
395             elsif ( ref $v eq 'ARRAY' ) {
396 1         3 for my $value (@$v) {
397 2         28 $value =~ s/\s+/ /g;
398 2         7 $uniq->{$value} = ++$c;
399             }
400             }
401             else {
402              
403             # if the $leaf is a proximity query,
404             # ignore the "phrase-ness" of it and split
405             # on whitespace. This is a compromise,
406             # mitigated by the tendency of HeatMap
407             # to reward proximity anyway.
408 149 100 66     316 if ( $leaf->{proximity} and $leaf->{proximity} > 1 ) {
409 1         7 my @tokens = split( m/\ +/, $v );
410 1         5 $uniq->{$_} = ++$c for @tokens;
411 1         3 next;
412             }
413              
414             # collapse any whitespace
415 148         321 $v =~ s,\s+,\ ,g;
416              
417 148         385 $uniq->{$v} = ++$c;
418             }
419             }
420             }
421 63         124 return \%fields;
422             }
423              
424             sub _setup_regex_builder {
425 51     51   63 my $self = shift;
426              
427             # TODO optional for term_re
428              
429             # a search for a '<' or '>' should still highlight,
430             # since < or > can be indexed as literal < and >
431             # but this causes a great deal of hassle
432             # so we just ignore them.
433 51         284 my $wordchars = $self->word_characters;
434 51         117 $wordchars =~ s,[<>&],,g;
435 51         89 $self->{html_safe_wordchars} = $wordchars; # remember for build
436 51         128 my $ignore_first = $self->ignore_first_char;
437 51         108 my $ignore_last = $self->ignore_last_char;
438 51         105 my $html_whitespace = $self->whitespace;
439              
440             # what's the boundary between a word and a not-word?
441             # by default:
442             # the beginning of a string
443             # the end of a string
444             # whatever we've defined as WhiteSpace
445             # any character that is not a WordChar
446             # any character we explicitly ignore at start or end of word
447             #
448             # the \A and \Z (beginning and end) should help if the word butts up
449             # against the beginning or end of a tagset
450             # like <p>Word or Word</p>
451              
452 51         182 my @start_bound = (
453             '\A',
454             '[>]',
455             '(?:&[\w\#]+;)', # because a ; might be a legitimate wordchar
456             # and we treat a char entity like a single char.
457             # if &char; resolves to a legit wordchar
458             # this might give unexpected results.
459             # NOTE that &nbsp; etc is in $WhiteSpace
460             $html_whitespace,
461             '[^' . $wordchars . ']'
462             );
463 51 50       709 push( @start_bound, qr/[$ignore_first]+/i ) if length $ignore_first;
464              
465             my @end_bound
466 51         266 = ( '\Z', '[<&]', $html_whitespace, '[^' . $wordchars . ']' );
467 51 50       332 push( @end_bound, qr/[$ignore_last]+/i ) if length $ignore_last;
468              
469 51   33     397 $self->{start_bound} ||= join( '|', @start_bound );
470              
471 51   33     274 $self->{end_bound} ||= join( '|', @end_bound );
472              
473             # the whitespace in a query phrase might be:
474             # any ignore_last_char, followed by
475             # one or more nonwordchar or whitespace, followed by
476             # any ignore_first_char
477             # define for both text and html
478             # NOTE the first/last swap for plain vs html
479             # is intentional because of how regex are built.
480              
481 51 50       13321 my @plain_phrase_bound = (
    50          
482             ( length($ignore_last) ? qr/[$ignore_last]*/i : '' ),
483             qr/(?:[\s\x20]|[^$wordchars])+/is,
484             ( length($ignore_first) ? qr/[$ignore_first]?/i : '' ),
485             );
486 51         2442 $self->{plain_phrase_bound} = join( '', @plain_phrase_bound );
487              
488 51 50       2381 my @html_phrase_bound = (
    50          
489             ( length($ignore_first) ? qr/[$ignore_first]*/i : '' ),
490             qr/(?:$html_whitespace|[^$wordchars])+/is,
491             ( length($ignore_last) ? qr/[$ignore_last]?/i : '' ),
492             );
493 51         149527 $self->{html_phrase_bound} = join( '', @html_phrase_bound );
494              
495             }
496              
497             sub _build_regex {
498 232     232   254 my $self = shift;
499 232 50       451 my $q = shift or croak "need query to build()";
500 232         306 my $wild = $self->{html_safe_wordchars};
501 232         235 my $st_bound = $self->{start_bound};
502 232         202 my $end_bound = $self->{end_bound};
503 232         219 my $wc = $self->{html_safe_wordchars};
504 232         210 my $ppb = $self->{plain_phrase_bound};
505 232         212 my $hpb = $self->{html_phrase_bound};
506 232         417 my $wildcard = $self->wildcard;
507 232         305 my $wild_esc = quotemeta($wildcard);
508 232         335 my $tag_re = $self->tag_re;
509              
510             # define simple pattern for plain text
511             # and complex pattern for HTML markup
512 232         194 my ( $plain, $html );
513 232         251 my $escaped = quotemeta($q);
514 232         985 $escaped =~ s/\\[$wild_esc]/[$wc]*/g; # wildcard
515 232         493 $escaped =~ s/\\[\s]/$ppb/g; # whitespace
516              
517 232         71028 $plain = qr/
518             (
519             \A|$ppb
520             )
521             (
522             ${escaped}
523             )
524             (
525             \Z|$ppb
526             )
527             /xis;
528              
529 232         9852 my (@char) = split( m//, $q );
530              
531 232         282 my $counter = -1;
532              
533 232         409 CHAR: foreach my $c (@char) {
534 1461         1007 $counter++;
535              
536 1461   100     3040 my $ent = $C2E->{$c} || undef;
537 1461         1409 my $num = ord($c);
538              
539             # if this is a special regexp char, protect it
540 1461         1144 $c = quotemeta($c);
541              
542             # if it's a *, replace it with the Wild class
543 1461 100       2113 $c = "[$wild]*" if $c eq $wild_esc;
544              
545 1461 100       1804 if ( $c eq '\ ' ) {
546 55         132 $c = $hpb . $tag_re . '*';
547 55         101 next CHAR;
548             }
549              
550 1406         872 my $aka;
551 1406 100       1519 if ($ent) {
552 1404 100       2284 $aka = $ent eq "&#$num;" ? $ent : "$ent|&#$num;";
553             }
554             else {
555 2         5 $aka = "&#$num;";
556             }
557              
558             # make $c into a regexp
559 1406 100       12524 $c = qr/$c|$aka/i unless $c eq "[$wild]*";
560              
561             # any char might be followed by zero or more tags, unless it's the last char
562 1406 100       4770 $c .= $tag_re . '*' unless $counter == $#char;
563              
564             }
565              
566             # re-join the chars into a single string
567 232         674 my $safe = join( "\n", @char ); # use \n to make it legible in debugging
568              
569             # for debugging legibility we include newlines, so make sure we s//x in matches
570 232         147590 $html = qr/
571             (
572             ${st_bound}
573             )
574             (
575             ${safe}
576             )
577             (
578             ${end_bound}
579             )
580             /xis;
581              
582 232         6855 return ( $plain, $html, $escaped );
583             }
584              
585             sub _build_term_re {
586              
587             # this based on SWISH::PhraseHighlight::set_match_regexp()
588              
589 0     0     my $self = shift;
590              
591             #dump $self;
592              
593 0           my $wc = $self->word_characters;
594             $self->{_wc_regexp}
595 0           = qr/[^$wc]+/io; # regexp for splitting into swish-words
596              
597 0           my $igf = $self->ignore_first_char;
598 0           my $igl = $self->ignore_last_char;
599 0           for ( $igf, $igl ) {
600 0 0         if ($_) {
601 0           $_ = "[$_]*";
602             }
603             else {
604 0           $_ = '';
605             }
606             }
607              
608 0           $self->{_ignoreFirst} = $igf;
609 0           $self->{_ignoreLast} = $igl;
610              
611             }
612              
613             1;
614              
615             __END__