File Coverage

blib/lib/Search/Query/Parser.pm
Criterion Covered Total %
statement 306 340 90.0
branch 137 176 77.8
condition 72 101 71.2
subroutine 24 24 100.0
pod 7 7 100.0
total 546 648 84.2


line stmt bran cond sub pod time code
1             package Search::Query::Parser;
2 9     9   7626 use Moo;
  9         114039  
  9         57  
3 9     9   12457 use Carp;
  9         16  
  9         616  
4 9     9   2292 use Data::Dump qw( dump );
  9         31141  
  9         559  
5 9     9   1235 use Search::Query;
  9         12  
  9         218  
6 9     9   4329 use Search::Query::Dialect::Native;
  9         42  
  9         404  
7 9     9   4944 use Search::Query::Clause;
  9         20  
  9         321  
8 9     9   4147 use Search::Query::Field;
  9         18  
  9         318  
9 9     9   47 use Scalar::Util qw( blessed weaken );
  9         11  
  9         456  
10 9     9   37 use namespace::autoclean;
  9         10  
  9         51  
11              
12             our $VERSION = '0.307';
13              
14             has 'and_regex' => ( is => 'rw', default => sub {qr/\&|AND|ET|UND|E/i} );
15             has 'clause_class' =>
16             ( is => 'rw', default => sub {'Search::Query::Clause'} );
17             has 'croak_on_error' => ( is => 'rw', default => sub {0} );
18             has 'default_boolop' => ( is => 'rw', default => sub {'+'} );
19             has 'default_field' => ( is => 'rw' );
20             has 'default_op' => ( is => 'rw', default => sub {':'} );
21             has 'field_class' => ( is => 'rw', default => sub {'Search::Query::Field'} );
22              
23             # match prefix.field: or field
24             has 'field_regex' => ( is => 'rw', default => sub {qr/[\.\w]+/}, );
25              
26             has 'fixup' => ( is => 'rw', default => sub {0} );
27             has 'near_regex' => ( is => 'rw', default => sub {qr/NEAR\d+/i}, );
28             has 'not_regex' => ( is => 'rw', default => sub {qr/NOT|PAS|NICHT|NON/i}, );
29             has 'null_term' => ( is => 'rw', );
30              
31             # ops that admit an empty left operand
32             has 'op_nofield_regex' => ( is => 'rw', default => sub {qr/=~|!~|[~:#]/}, );
33              
34             # longest ops first !
35             has 'op_regex' =>
36             ( is => 'rw', default => sub {qr/~\d+|==|<=|>=|!=|!:|=~|!~|[:=<>~#]/}, );
37              
38             has 'or_regex' => ( is => 'rw', default => sub {qr/\||OR|OU|ODER|O/i}, );
39             has 'phrase_delim' => ( is => 'rw', default => sub {q/"/}, );
40             has 'query_class' =>
41             ( is => 'rw', default => sub {'Search::Query::Dialect::Native'} );
42             has 'query_class_opts' => ( is => 'rw', default => sub { {} } );
43             has 'range_regex' => ( is => 'rw', default => sub {qr/\.\./}, );
44             has 'sloppy' => ( is => 'rw', default => sub {0} );
45             has 'sloppy_term_regex' => ( is => 'rw', default => sub {qr/[\.\w]+/}, );
46             has 'term_expander' => ( is => 'rw' );
47             has 'term_regex' => ( is => 'rw', default => sub {qr/[^\s()]+/}, );
48             has 'error' => ( is => 'ro' );
49             has 'fields' => ( is => 'ro' );
50              
51             my %SQPCOMPAT = (
52             rxAnd => 'and_regex',
53             rxOr => 'or_regex',
54             rxNot => 'not_regex',
55             defField => 'default_field',
56             rxTerm => 'term_regex',
57             rxField => 'field_regex',
58             rxOp => 'op_regex',
59             rxOpNoField => 'op_nofield_regex',
60             dialect => 'query_class', # our own compat
61             );
62              
63             =head1 NAME
64              
65             Search::Query::Parser - convert query strings into query objects
66              
67             =head1 SYNOPSIS
68              
69             use Search::Query;
70             my $parser = Search::Query->parser(
71             term_regex => qr/[^\s()]+/,
72             field_regex => qr/\w+/,
73             op_regex => qr/==|<=|>=|!=|=~|!~|[:=<>~#]/,
74              
75             # ops that admit an empty left operand
76             op_nofield_regex => qr/=~|!~|[~:#]/,
77              
78             # case insensitive
79             and_regex => qr/\&|AND|ET|UND|E/i,
80             or_regex => qr/\||OR|OU|ODER|O/i,
81             not_regex => qr/NOT|PAS|NICHT|NON/i,
82              
83             default_field => 'myfield', # or ['myfield', 'myfield2']
84             phrase_delim => q/"/,
85             default_boolop => '+',
86             query_class => 'Search::Query::Dialect::Native',
87             field_class => 'Search::Query::Field',
88             query_class_opts => {
89             default_field => 'foo', # or ['foo', 'bar']
90             },
91            
92             # a generous mode, overlooking boolean-parser syntax errors
93             sloppy => 0,
94             sloppy_term_regex => qr/[\.\w]+/,
95             fixup => 0,
96            
97             # if set, this special term indicates a NULL query
98             null_term => 'NULL',
99             );
100              
101             my $query = $parser->parse('+hello -world now');
102             print $query;
103              
104             =head1 DESCRIPTION
105              
106             Search::Query::Parser is a fork of Search::QueryParser
107             that supports multiple query dialects.
108              
109             The Parser class transforms a query string into a Dialect object structure
110             to be handled by external search engines.
111              
112             The query string can contain simple terms, "exact phrases", field
113             names and comparison operators, '+/-' prefixes, parentheses, and
114             boolean connectors.
115              
116             The parser can be customized using regular expressions for specific
117             notions of "term", "field name" or "operator" -- see the L
118             method.
119              
120             The Dialect object resulting from a parsed query is a tree of terms
121             and operators. Each Dialect can be re-serialized as a string
122             using the stringify() method, or simply by printing the Dialect object,
123             since the string-related Perl operations are overloaded using stringify().
124              
125             =head1 QUERY STRING
126              
127             The query string is decomposed into Clause objects, where
128             each Clause has an optional sign prefix,
129             an optional field name and comparison operator,
130             and a mandatory value.
131              
132             =head2 Sign prefix
133              
134             Prefix '+' means that the item is mandatory.
135             Prefix '-' means that the item must be excluded.
136             No prefix means that the item will be searched
137             for, but is not mandatory.
138              
139             See also section L below, which is another
140             way to combine items into a query.
141              
142             =head2 Field name and comparison operator
143              
144             Internally, each query item has a field name and comparison
145             operator; if not written explicitly in the query, these
146             take default values C<''> (empty field name) and
147             C<':'> (colon operator).
148              
149             Operators have a left operand (the field name) and
150             a right operand (the value to be compared with);
151             for example, C means "search documents containing
152             term 'bar' in field 'foo'", whereas C means
153             "search documents where field 'foo' has exact value 'bar'".
154              
155             Here is the list of admitted operators with their intended meaning:
156              
157             =over
158              
159             =item C<:>
160              
161             treat value as a term to be searched within field.
162             This is the default operator.
163              
164             =item C<~> or C<=~>
165              
166             treat value as a regex; match field against the regex.
167              
168             Note that C<~>
169             after a phrase indicates a proximity assertion:
170              
171             "foo bar"~5
172              
173             means "match 'foo' and 'bar' within 5 positions of each other."
174              
175             =item C
176              
177             negation of above
178              
179             =item C<==> or C<=>, C=>, C=>, C, C>, C>
180              
181             classical relational operators
182              
183             =item C<#>
184              
185             Inclusion in the set of comma-separated integers supplied
186             on the right-hand side.
187              
188             =back
189              
190             Operators C<:>, C<~>, C<=~>, C and C<#> admit an empty
191             left operand (so the field name will be C<''>).
192             Search engines will usually interpret this as
193             "any field" or "the whole data record". But see the B
194             feature.
195              
196             =head2 Value
197              
198             A value (right operand to a comparison operator) can be
199              
200             =over
201              
202             =item *
203              
204             A term (as recognized by regex C, see L method below).
205              
206             =item *
207              
208             A quoted phrase, i.e. a collection of terms within
209             single or double quotes.
210              
211             Quotes can be used not only for "exact phrases", but also
212             to prevent misinterpretation of some values : for example
213             C<-2> would mean "value '2' with prefix '-'",
214             in other words "exclude term '2'", so if you want to search for
215             value -2, you should write C<"-2"> instead.
216              
217             Note that C<~>
218             after a phrase indicates a proximity assertion:
219              
220             "foo bar"~5
221              
222             means "match 'foo' and 'bar' within 5 positions of each other."
223              
224             =item *
225              
226             A subquery within parentheses.
227             Field names and operators distribute over parentheses, so for
228             example C is equivalent to
229             C.
230              
231             Nested field names such as C are not allowed.
232              
233             Sign prefixes do not distribute : C<+(foo bar) +bie> is not
234             equivalent to C<+foo +bar +bie>.
235              
236             =back
237              
238             =head2 Boolean connectors
239              
240             Queries can contain boolean connectors 'AND', 'OR', 'NOT'
241             (or their equivalent in some other languages -- see the *_regex
242             features in new()).
243             This is mere syntactic sugar for the '+' and '-' prefixes :
244             C is equivalent to C<+a +b>;
245             C is equivalent to C<(a b)>;
246             C is equivalent to C<-a>.
247             C<+a OR b> does not make sense,
248             but it is translated into C<(a b)>, under the assumption
249             that the user understands "OR" better than a
250             '+' prefix.
251             C<-a OR b> does not make sense either,
252             but has no meaningful approximation, so it is rejected.
253              
254             Combinations of AND/OR clauses must be surrounded by
255             parentheses, i.e. C<(a AND b) OR c> or C are
256             allowed, but C is not.
257              
258             The C connector is treated like the proximity phrase assertion.
259              
260             foo NEAR5 bar
261              
262             is treated as if it were:
263              
264             "foo bar"~5
265              
266             See the B option.
267              
268             =head1 METHODS
269              
270             =head2 new
271              
272             The following attributes may be initialized in new().
273             These are also available as get/set methods on the returned
274             Parser object.
275              
276             =over
277              
278             =item default_boolop
279              
280             =item term_regex
281              
282             =item field_regex
283              
284             =item op_regex
285              
286             =item op_nofield_regex
287              
288             =item and_regex
289              
290             =item or_regex
291              
292             =item not_regex
293              
294             =item near_regex
295              
296             =item range_regex
297              
298             =item default_field
299              
300             Applied to all terms where no field is defined.
301             The default value is undef (no default).
302              
303             =item default_op
304              
305             The operator used when default_field is applied.
306              
307             =item fields
308              
309             =item phrase_delim
310              
311             =item query_class
312              
313             C is an alias for C.
314              
315             =item field_class
316              
317             =item clause_class
318              
319             =item query_class_opts
320              
321             Will be passed to I new() method each time a query is parse()'d.
322              
323             =item dialect_opts
324              
325             Alias for query_class_opts.
326              
327             =item croak_on_error
328              
329             Default value is false (0). Set to true to automatically throw an exception
330             via Carp::croak() if parse() would return undef.
331              
332             =item term_expander
333              
334             A function reference for transforming query terms after they have been parsed.
335             Examples might include adding alternate spellings, synonyms, or
336             expanding wildcards based on lexicon listings.
337              
338             Example:
339              
340             my $parser = Search::Query->parser(
341             term_expander => sub {
342             my ($term, $field) = @_;
343             return ($term) if ref $term; # skip ranges
344             return ( qw( one two three ), $term );
345             }
346             );
347              
348             my $query = $parser->parse("foo=bar")
349             print "$query\n"; # +foo=(one OR two OR three OR bar)
350              
351             The term_expander reference should expect two arguments: the term value
352             and, if available, the term field name. It should return an array of values.
353              
354             The term_expander reference is called internally during the parse() method,
355             B any field alias expansion or validation is performed.
356              
357             =item sloppy( 0|1 )
358              
359             If the string passed to parse() has any incorrect or unsupported syntax
360             in it, the default behavior is for parsing to stop immediately, error()
361             to be set, and for parse() to return undef.
362              
363             In certain cases (as on a web form) this is undesirable. Set sloppy
364             mode to true to fallback to non-boolean evaluation of the string,
365             which in most cases should still return a Dialect object.
366              
367             Example:
368              
369             $parser->parse('foo -- OR bar'); # if sloppy==0, returns undef
370             $parser->parse('foo -- OR bar'); # if sloppy==1, equivalent to 'foo bar'
371              
372             =item sloppy_term_regex
373              
374             The regex definition used to match a term when sloppy==1.
375              
376             =item fixup( 0|1 )
377              
378             Attempt to fix syntax errors like the lack of a closing parenthesis
379             or a missing double-quote. Different than sloppy() which will not
380             attempt to fix broken syntax, but should probably be used together
381             if you really do not care about strict syntax checking.
382              
383             =item null_term
384              
385             If set to I, the B feature will treat field value
386             of I as if it was undefined. Example:
387              
388             $parser->parse('foo='); # throws fatal error
389             $parser->null_term('NULL');
390             $parser->parse('foo=NULL'); # field foo has NULL value
391              
392             This feature is most useful with the SQL dialect, where you might want to
393             find NULL values. Use it like:
394              
395             my $parser = Search::Query->parser(
396             dialect => 'SQL',
397             null_term => 'NULL'
398             );
399             my $query = $parser->parse('foo!=NULL');
400             print $query; # prints "foo is not NULL"
401              
402              
403             =back
404              
405             =head2 BUILDARGS
406              
407             Internal method for mangling constructor params.
408              
409             =cut
410              
411             sub BUILDARGS {
412 30     30 1 43319 my ( $class, %args ) = @_;
413              
414             # Search::QueryParser compatability
415 30 50       110 if ( exists $args{dialect_opts} ) {
416 0         0 $args{query_class_opts} = delete $args{dialect_opts};
417             }
418 30         119 for my $key ( keys %args ) {
419 93 100       200 if ( exists $SQPCOMPAT{$key} ) {
420 19         65 $args{ $SQPCOMPAT{$key} } = delete $args{$key};
421             }
422             }
423 30         648 return \%args;
424             }
425              
426             =head2 BUILD
427              
428             Called internally to initialize the object.
429              
430             =cut
431              
432             sub BUILD {
433 30     30 1 194 my $self = shift;
434              
435             # query class can be shortcut
436             $self->{query_class}
437 30         165 = Search::Query->get_query_class( $self->{query_class} );
438              
439             # use field class if query class defines one
440             # and we weren't passed one explicitly
441 30 100 100     257 if ( $self->{field_class} eq 'Search::Query::Field'
442             && $self->{query_class}->field_class ne $self->{field_class} )
443             {
444 18         52 $self->{field_class} = $self->{query_class}->field_class;
445             }
446              
447 30 100       129 $self->set_fields( $self->{fields} ) if $self->{fields};
448              
449 30         553 return $self;
450             }
451              
452             =head2 error
453              
454             Returns the last error message.
455              
456             =cut
457              
458             =head2 clear_error
459              
460             Sets error message to undef.
461              
462             =cut
463              
464             sub clear_error {
465 2     2 1 84 $_->{error} = undef;
466             }
467              
468             =head2 get_field( I )
469              
470             Returns Field object for I or undef if there isn't one
471             defined.
472              
473             =cut
474              
475             sub get_field {
476 230     230 1 236 my $self = shift;
477 230 50       405 my $name = shift or croak "name required";
478 230 100       485 if ( !exists $self->{fields}->{$name} ) {
479 8         22 return undef;
480             }
481 222         491 return $self->{fields}->{$name};
482             }
483              
484             =head2 set_fields( I )
485              
486             Set the I structure. Called internally by BUILD()
487             if you pass a C key/value pair to new().
488              
489             The structure of I may be one of the following:
490              
491             my $fields = {
492             field1 => 1,
493             field2 => { alias_for => 'field1' },
494             field3 => Search::Query::Field->new( name => 'field3' ),
495             field4 => { alias_for => [qw( field1 field3 )] },
496             };
497              
498             # or
499              
500             my $fields = [
501             'field1',
502             { name => 'field2', alias_for => 'field1' },
503             Search::Query::Field->new( name => 'field3' ),
504             { name => 'field4', alias_for => [qw( field1 field3 )] },
505             ];
506              
507              
508             =cut
509              
510             sub set_fields {
511 25     25 1 44 my $self = shift;
512 25         31 my $origfields = shift;
513 25 50       63 if ( !defined $origfields ) {
514 0         0 croak "fields required";
515             }
516              
517 25         25 my %fields;
518 25         38 my $field_class = $self->{field_class};
519              
520 25         49 my $reftype = ref($origfields);
521 25 50 66     139 if ( !$reftype or ( $reftype ne 'ARRAY' and $reftype ne 'HASH' ) ) {
      33        
522 0         0 croak "fields must be an ARRAY or HASH ref";
523             }
524              
525             # convert simple array to hash
526 25 100       67 if ( $reftype eq 'ARRAY' ) {
    50          
527 21         55 for my $name (@$origfields) {
528 37 50       1066 if ( blessed($name) ) {
    100          
529 0         0 $fields{ $name->name } = $name;
530             }
531             elsif ( ref($name) eq 'HASH' ) {
532 1 50       4 if ( !exists $name->{name} ) {
533 0         0 croak "'name' required in hashref: " . dump($name);
534             }
535 1         19 $fields{ $name->{name} } = $field_class->new(%$name);
536             }
537             else {
538 36         681 $fields{$name} = $field_class->new( name => $name, );
539             }
540             }
541             }
542             elsif ( $reftype eq 'HASH' ) {
543 4         15 for my $name ( keys %$origfields ) {
544 12         17 my $val = $origfields->{$name};
545 12         12 my $obj;
546 12 50       67 if ( blessed($val) ) {
    50          
    0          
547 0         0 $obj = $val;
548             }
549             elsif ( ref($val) eq 'HASH' ) {
550 12 50       28 if ( !exists $val->{name} ) {
551 12         19 $val->{name} = $name;
552             }
553 12         236 $obj = $field_class->new(%$val);
554             }
555             elsif ( !ref $val ) {
556 0         0 $obj = $field_class->new( name => $name );
557             }
558             else {
559 0         0 croak
560             "field value for $name must be a field name, hashref or Field object";
561             }
562 12         1073 $fields{$name} = $obj;
563             }
564             }
565              
566 25         2587 $self->{fields} = \%fields;
567 25         50 return $self->{fields};
568             }
569              
570             =head2 set_field( I => I )
571              
572             Sets field I to Field object I.
573              
574             =cut
575              
576             sub set_field {
577 8     8 1 44 my $self = shift;
578 8         12 my ( $name, $field ) = @_;
579 8 50       20 confess "name required" unless $name;
580 8 50       15 confess "field object required" unless $field;
581 8 50       33 confess "field not an object: $field" unless blessed($field);
582 8         24 $self->{fields}->{$name} = $field;
583             }
584              
585             =head2 parse( I )
586              
587             Returns a Search::Query::Dialect object of type
588             I.
589              
590             If there is a syntax error in I,
591             parse() will return C and set error().
592              
593             =cut
594              
595             sub parse {
596 84     84 1 14252 my $self = shift;
597 84         112 my $q = shift;
598 84 50       199 croak "query required" unless defined $q;
599 84   33     488 my $class = shift || $self->query_class;
600              
601             # reset state in case we are called multiple times
602 84         132 $self->{error} = undef;
603 84         122 $self->{_paren_count} = 0;
604              
605 84         355 $q = $class->preprocess($q);
606 84         202 my ($query) = $self->_parse( $q, undef, undef, $class );
607 84 100 66     212 if ( !defined $query && !$self->sloppy ) {
608 2 100       180 croak $self->error if $self->croak_on_error;
609 1         5 return $query;
610             }
611              
612             # if in sloppy mode and we failed to parse,
613             # extract what looks like terms and re-parse.
614 82 100 66     206 if ( !defined $query && $self->sloppy ) {
615 1         4 return $self->_sloppify( $q, $class );
616             }
617              
618 81 100       162 if ( $self->{term_expander} ) {
619 3         10 $self->_call_term_expander($query);
620             }
621              
622 81 100       198 if ( $self->{fields} ) {
623 68         140 $self->_expand($query);
624 68         882 $self->_validate($query);
625             }
626              
627             # if in sloppy mode and we failed to parse,
628             # extract what looks like terms and re-parse.
629 78 100 66     276 if ( $self->error && $self->sloppy ) {
630 1         2 return $self->_sloppify( $q, $class );
631             }
632              
633 77         100 $query->{parser} = $self;
634              
635             #warn dump $query;
636              
637             # if the query isn't re-parse-able once stringified
638             # then it is broken, somehow.
639 77 100 33     578 if ( defined $query
      66        
640             and !$self->error
641             and $self->croak_on_error )
642             {
643 27         77 my ($reparsed) = $self->_parse( "$query", undef, undef, $class );
644 27 50       193 if ( !defined $reparsed ) {
645 0         0 croak sprintf( "Error: unable to parse '%s'. Reason: '%s'.",
646             $q, $self->error );
647             }
648             }
649              
650             #weaken( $query->{parser} ); # TODO leaks possible?
651              
652 77         362 return $query;
653             }
654              
655             sub _sloppify {
656 2     2   3 my ( $self, $q, $class ) = @_;
657 2         3 my $term = $self->{sloppy_term_regex};
658 2         2 my $and = $self->{and_regex};
659 2         3 my $or = $self->{or_regex};
660 2         2 my $not = $self->{not_regex};
661 2         2 my $near = $self->{near_regex};
662 2         3 my $ops = $self->{op_regex};
663 2         91 my $bools = qr/($and|$or|$not|$near|$ops)/;
664 2         6 my @terms;
665              
666 2         32 while ( $q =~ m/($term)/ig ) {
667 14         18 my $t = $1;
668              
669             #warn "$t =~ $bools\n";
670 14 100       124 if ( $t =~ m/^$bools$/ ) {
671 7         28 next;
672             }
673 7         42 push @terms, split( /$ops/, $t );
674             }
675              
676             #dump \@terms;
677              
678             # reset errors since we will re-parse
679 2         4 $self->{error} = undef;
680 2         7 my ($query) = $self->_parse( join( ' ', @terms ), undef, undef, $class );
681 2 50       5 if ( !defined $query ) {
682 0 0       0 $self->croak_on_error and croak $self->error;
683             }
684             else {
685 2         3 $query->{parser} = $self;
686             }
687 2         15 return $query;
688             }
689              
690             sub _call_term_expander {
691 3     3   5 my ( $self, $query ) = @_;
692 3         6 my $expander = $self->{term_expander};
693 3 50       26 if ( ref($expander) ne 'CODE' ) {
694 0         0 croak "term_expander must be a CODE reference";
695             }
696              
697 3         6 my $query_class = $self->{query_class};
698              
699             $query->walk(
700             sub {
701 3     3   8 my ( $clause, $tree, $code, $prefix ) = @_;
702 3 50       13 if ( $clause->is_tree ) {
703 0         0 $clause->value->walk($code);
704 0         0 return;
705             }
706              
707 3         20 my @newterms = $expander->( $clause->value, $clause->field );
708 3 50 33     49 if ( ref $newterms[0] and ref $clause->value ) {
    100          
709 0         0 $clause->value( $newterms[0] );
710             }
711             elsif ( @newterms > 1 ) {
712              
713             # turn $clause into a tree
714 1         6 my $class = blessed($clause);
715 1         5 my $op = $clause->op;
716 1         5 my $field = $clause->field;
717 1         4 my $proximity = $clause->proximity;
718 1         5 my $quote = $clause->quote;
719              
720             #warn "before tree: " . dump $tree;
721              
722             #warn "code clause: " . dump $clause;
723 1         3 my @subclauses;
724 1         3 for my $term (@newterms) {
725 4         234 push(
726             @subclauses,
727             $class->new(
728             field => $field,
729             op => $op,
730             value => $term,
731             quote => $quote,
732             proximity => $proximity,
733             )
734             );
735             }
736              
737             # OR the fields together. TODO optional?
738              
739             # we must set "" key here explicitly, because
740             # our bool op keys are not methods.
741             my $subclause
742 1         33 = $query_class->new( %{ $self->query_class_opts },
  1         36  
743             parser => $self );
744 1         28 $subclause->{""} = \@subclauses;
745              
746 1         6 $clause->op('()');
747 1         10 $clause->value($subclause);
748             }
749             else {
750 2         13 $clause->value( $newterms[0] );
751             }
752              
753             }
754 3         41 );
755              
756             }
757              
758             sub _expand {
759 68     68   82 my ( $self, $query ) = @_;
760              
761 68 50       137 return if !exists $self->{fields};
762 68         75 my $fields = $self->{fields};
763 68         81 my $query_class = $self->{query_class};
764 68         76 my $default_field = $self->{default_field};
765              
766             #dump $fields;
767              
768             $query->walk(
769             sub {
770 168     168   224 my ( $clause, $tree, $code, $prefix ) = @_;
771              
772             #warn "code clause: " . dump $clause;
773              
774             #warn "code tree: " . dump $tree;
775              
776 168 100       361 if ( $clause->is_tree ) {
777 46         167 $clause->value->walk($code);
778 46         111 return;
779             }
780 122 100 66     537 if ( ( !defined $clause->field || !length $clause->field )
      66        
781             && !defined $default_field )
782             {
783 14         37 return;
784             }
785              
786             # make sure clause has an op
787 108 100       232 if ( !$clause->op ) {
788 16         54 $clause->op( $self->default_op );
789             }
790              
791             # even if $clause has a field defined,
792             # it may be aliased to multiple others,
793             # so check field def and default_field to determine.
794 108         135 my @field_names;
795              
796             # first, which field name to start with?
797             my @clause_fields; # could be plural
798 108 100       195 if ( !defined $clause->field ) {
799             @clause_fields
800 16 100       52 = ref($default_field)
801             ? @$default_field
802             : ($default_field);
803             }
804             else {
805 92         204 @clause_fields = ( $clause->field );
806             }
807              
808             # second, resolve any aliases
809 108         135 for my $cfield (@clause_fields) {
810              
811             # if we have no definition for $cfield, it's invalid
812 110 100       218 if ( !exists $fields->{$cfield} ) {
813 3         69 return;
814             }
815              
816 107         122 my $field_def = $fields->{$cfield};
817 107 100       234 if ( $field_def->alias_for ) {
818             my @aliases
819             = ref $field_def->alias_for
820 10 100       32 ? @{ $field_def->alias_for }
  2         8  
821             : ( $field_def->alias_for );
822 10         25 push @field_names, @aliases;
823             }
824             else {
825 97         186 push @field_names, $cfield;
826             }
827             }
828              
829             #warn "resolved field_names: " . dump( \@field_names );
830              
831             # third, apply our canonical names to the $clause
832 105 100       189 if ( @field_names > 1 ) {
833              
834             # turn $clause into a tree
835 4         13 my $class = blessed($clause);
836 4         9 my $op = $clause->op;
837              
838             #warn "before tree: " . dump $tree;
839              
840             #warn "code clause: " . dump $clause;
841 4         4 my @newfields;
842 4         7 for my $name (@field_names) {
843 8         291 push(
844             @newfields,
845             $class->new(
846             field => $name,
847             op => $op,
848             value => $clause->value,
849             quote => $clause->quote,
850             proximity => $clause->proximity,
851             )
852             );
853             }
854              
855             # OR the fields together. TODO optional?
856              
857             # we must bless here because
858             # our bool op keys are not methods.
859             my $newfield
860 4         69 = $query_class->new( %{ $self->query_class_opts },
  4         71  
861             parser => $self );
862 4         60 $newfield->{""} = \@newfields;
863              
864 4         10 $clause->op('()');
865 4         9 $clause->value($newfield);
866              
867             #warn "after tree: " . dump $tree;
868              
869             }
870             else {
871              
872             # if no field defined in clause, or it differs, override.
873 101 100 100     434 if ( !defined $clause->field
874             or $field_names[0] ne $clause->field )
875             {
876 20         45 $clause->field( $field_names[0] );
877             }
878             }
879              
880 105         317 return $clause;
881             }
882 68         544 );
883             }
884              
885             sub _validate {
886 68     68   90 my ( $self, $query ) = @_;
887              
888 68         87 my $fields = $self->{fields};
889             my $validator = sub {
890 176     176   220 my ( $clause, $tree, $code, $prefix ) = @_;
891 176 100       326 if ( $clause->is_tree ) {
892 50         128 $clause->value->walk($code);
893             }
894             else {
895 126 100 100     572 return unless defined $clause->field and length $clause->field;
896 112         155 my $field_name = $clause->field;
897 112         153 my $field_value = $clause->value;
898 112         130 my $field = $fields->{$field_name};
899 112 100       175 if ( !$field ) {
900 3 100       18 if ( $self->croak_on_error ) {
901 2         502 croak "No such field: $field_name";
902             }
903             else {
904 1         3 $self->{error} = "No such field: $field_name";
905 1         3 return;
906             }
907             }
908 109 100       274 if ( !$field->validate($field_value) ) {
909 1 50       16 if ( $self->croak_on_error ) {
910 1         3 my $err = $field->error;
911 1         195 croak
912             "Invalid field value for $field_name: $field_value ($err)";
913             }
914             }
915             }
916 68         298 };
917 68         174 $query->walk($validator);
918             }
919              
920             sub _parse {
921 176     176   267 my $self = shift;
922 176         178 my $str = shift;
923 176         155 my $parent_field = shift; # only for recursive calls
924 176         161 my $parent_op = shift; # only for recursive calls
925 176         171 my $query_class = shift;
926              
927             #warn "_parse: " . dump [ $str, $parent_field, $parent_op, $query_class ];
928              
929             #dump $self;
930              
931 176         197 my $q = {};
932 176         184 my $pre_bool = '';
933 176         187 my $err = undef;
934 176         144 my $s_orig = $str;
935 176         214 my $phrase_delim = $self->{phrase_delim};
936 176         184 my $field_regex = $self->{field_regex};
937 176         178 my $and_regex = $self->{and_regex};
938 176         174 my $or_regex = $self->{or_regex};
939 176         180 my $not_regex = $self->{not_regex};
940 176         165 my $op_regex = $self->{op_regex};
941 176         171 my $op_nofield_regex = $self->{op_nofield_regex};
942 176         155 my $term_regex = $self->{term_regex};
943 176         456 my $phrase_regex = qr/[^"()]+/;
944 176         188 my $near_regex = $self->{near_regex};
945 176         172 my $range_regex = $self->{range_regex};
946 176         225 my $clause_class = $self->{clause_class};
947 176         176 my $fixup = $self->{fixup};
948 176         168 my $null_term = $self->{null_term};
949              
950 176         386 $str =~ s/^\s+//; # remove leading spaces
951              
952             LOOP:
953 176         395 while ( length $str ) { # while query string is not empty
954 355         505 for ($str) { # temporary alias to $_ for easier regex application
955              
956             #warn "LOOP start: " . dump [ $str, $parent_field, $parent_op ];
957              
958 355         411 my $sign = $self->{default_boolop};
959 355         314 my $field = $parent_field;
960 355   100     990 my $op = $parent_op || "";
961              
962             #warn "LOOP after start: " . dump [ $sign, $field, $op ];
963              
964 355 100       761 if (m/^\)/) {
965 64         74 $self->{_paren_count}--;
966              
967             #warn "leaving loop on ) [paren_count==$self->{_paren_count}]";
968 64 100       110 if ( $self->{_paren_count} < 0 ) {
969 4 100       6 if ( !$fixup ) {
970              
971             #warn "unbalanced parens -- extra right-hand )";
972 1         1 $err = "unbalanced parentheses -- extra right-hand )";
973 1         2 last LOOP;
974             }
975             else {
976 3         9 s/^[\)\s]+//; # trim all trailing ) and space
977 3         8 next LOOP;
978             }
979             }
980             else {
981 60         128 last LOOP; # return from recursive call if meeting a ')'
982             }
983             }
984              
985             # try to parse sign prefix ('+', '-' or '!|NOT')
986 291 100       2185 if (s/^(\+|-)\s*//) { $sign = $1; }
  11 100       26  
    100          
987 5         10 elsif (s/^($not_regex)\b\s*//) { $sign = '-'; }
988              
989             # special check because of \b above
990 3         7 elsif (s/^\!\s*([^:=~])/$1/) { $sign = '-'; }
991              
992             # try to parse field name and operator
993 291 100 66     8841 if (s/^"($field_regex)"\s*($op_regex)\s*// # "field name" and op
      100        
994             or
995             s/^'?($field_regex)'?\s*($op_regex)\s*// # 'field name' and op
996             or s/^()($op_nofield_regex)\s*// # no field, just op
997             )
998             {
999 135         365 ( $field, $op ) = ( $1, $2 );
1000              
1001             #warn "matched field+op = " . dump [ $field, $op ];
1002 135 50       296 if ($parent_field) {
1003 0         0 $err = "field '$field' inside '$parent_field' (op=$op)";
1004 0         0 last LOOP;
1005             }
1006             }
1007              
1008             # parse a value (single term or quoted list or parens)
1009 291         374 my $clause = undef;
1010              
1011 291 100 100     4094 if ( s/^(")([^"]*?)"~(\d+)\s*//
    100 100        
    100 100        
    100          
    50          
1012             or s/^(")([^"]*?)"\s*//
1013             or s/^(')([^']*?)'\s*// )
1014             { # parse a quoted string.
1015 31         75 my ( $quote, $val, $proximity ) = ( $1, $2, $3 );
1016 31   66     798 $clause = $clause_class->new(
1017             field => $field,
1018             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1019             value => $val,
1020             quote => $quote,
1021             proximity => $proximity
1022             );
1023             }
1024              
1025             # fixup mode allows for a partially quoted string.
1026             elsif ( $fixup and s/^(")([^"]*?)\s*$// ) {
1027 1         4 my ( $quote, $val, $proximity ) = ( $1, $2, $3 );
1028 1   33     26 $clause = $clause_class->new(
1029             field => $field,
1030             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1031             value => $val,
1032             quote => $quote,
1033             proximity => $proximity
1034             );
1035             }
1036              
1037             # special case for range grouped with () since we do not
1038             # want the op of record to be the ().
1039             elsif (
1040             s/^\(\s*"?($phrase_regex)"?$range_regex"?($phrase_regex)"?\s*\)\s*//
1041             )
1042             {
1043 7         17 my ( $t1, $t2 ) = ( $1, $2 );
1044              
1045             # trim any spaces since phrase_regex includes it
1046 7         18 $t1 =~ s/^\ +|\ +$//g;
1047 7         14 $t2 =~ s/^\ +|\ +$//g;
1048              
1049 7 100       24 my $this_op = $op =~ m/\!/ ? '!..' : '..';
1050 7         10 my $has_spaces = 0;
1051 7 100 66     41 if ( index( $t1, ' ' ) != -1 or index( $t2, ' ' ) != -1 ) {
1052 1         2 $has_spaces = 1;
1053             }
1054 7 100       169 $clause = $clause_class->new(
1055             field => $field,
1056             op => $this_op,
1057             value => [ $t1, $t2 ],
1058             quote => ( $has_spaces ? '"' : undef ),
1059             );
1060             }
1061             elsif (s/^\(\s*//) { # parse parentheses
1062 63         91 $self->{_paren_count}++;
1063 63         203 my ( $r, $s2 )
1064             = $self->_parse( $str, $field, $op, $query_class );
1065 63 50       206 if ( !$r ) {
1066 0         0 $err = $self->error;
1067 0         0 last LOOP;
1068             }
1069 63         87 $str = $s2;
1070 63 100 33     376 if ( !defined($str) or !( $str =~ s/^\)\s*// ) ) {
1071 4 100 66     16 if ( defined($str) and $fixup ) {
1072 2         4 $str = ') ' . $str;
1073             }
1074             else {
1075 2         2 $err = "no matching ) ";
1076 2         10 last LOOP;
1077             }
1078             }
1079              
1080 61         1175 $clause = $clause_class->new(
1081             field => '',
1082             op => '()',
1083             value => bless( $r, $query_class ), # re-bless
1084             );
1085              
1086             }
1087             elsif (s/^($term_regex)\s*//) { # parse a single term
1088 189         316 my $term = $1;
1089 189 50 66     1042 if ( $term =~ m/^($term_regex)$range_regex($term_regex)$/ ) {
    100          
1090 0         0 my $t1 = $1;
1091 0         0 my $t2 = $2;
1092              
1093             #warn "found range ($op $parent_op): $term => $t1 .. $t2";
1094 0 0       0 my $this_op = $op =~ m/\!/ ? '!..' : '..';
1095 0         0 $clause = $clause_class->new(
1096             field => $field,
1097             op => $this_op,
1098             value => [ $t1, $t2 ],
1099             );
1100             }
1101             elsif ( $null_term and $term eq $null_term ) {
1102 5   33     134 $clause = $clause_class->new(
1103             field => $field,
1104             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1105             value => undef, # mimic NULL
1106             );
1107              
1108             }
1109             else {
1110              
1111 184   66     4573 $clause = $clause_class->new(
1112             field => $field,
1113             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1114             value => $term,
1115             );
1116              
1117             }
1118             }
1119              
1120 289 100       14875 if (s/^($near_regex)\s+//) {
1121              
1122             # modify the existing clause
1123             # and treat what comes next like a phrase
1124             # matching the syntax "foo bar"~\d+
1125 2         7 my ($prox_match) = ($1);
1126 2         6 my ($proximity) = $prox_match;
1127 2         7 $proximity =~ s/\D+//; # leave only number
1128 2 50       133 if (s/^($term_regex)\s*//) {
1129 2         7 my $term = $1;
1130 2         6 $clause->{value} .= ' ' . $term;
1131 2         6 $clause->{proximity} = $proximity;
1132 2         9 $clause->{quote} = '"';
1133             }
1134             else {
1135 0         0 $err = "missing term after $prox_match";
1136 0         0 last LOOP;
1137             }
1138              
1139             }
1140              
1141             # deal with boolean connectors
1142 289         325 my $post_bool = '';
1143 289 100       1918 if (s/^($and_regex)\s+//) {
    100          
1144 26         37 $post_bool = 'AND';
1145             }
1146             elsif (s/^($or_regex)\s+//) {
1147 43         55 $post_bool = 'OR';
1148             }
1149              
1150 289 50 100     795 if ( $pre_bool
      66        
1151             and $post_bool
1152             and $pre_bool ne $post_bool )
1153             {
1154 0         0 $err = "cannot mix AND/OR in requests; use parentheses";
1155 0         0 last LOOP;
1156             }
1157              
1158 289   100     799 my $bool = $pre_bool || $post_bool;
1159 289         256 $pre_bool = $post_bool; # for next loop
1160              
1161             # insert clause in query structure
1162 289 50       905 if ($clause) {
1163 289 100 100     1015 $sign = '' if $sign eq '+' and $bool eq 'OR';
1164 289 100 100     776 $sign = '+' if $sign eq '' and $bool eq 'AND';
1165 289 50 66     603 if ( $sign eq '-' and $bool eq 'OR' ) {
1166 0         0 $err = 'operands of "OR" cannot have "-" or "NOT" prefix';
1167 0         0 last LOOP;
1168             }
1169 289         270 push @{ $q->{$sign} }, $clause;
  289         1610  
1170             }
1171             else {
1172 0 0       0 if ($_) {
1173 0         0 $err = "unexpected string in query: '$_'";
1174 0         0 last LOOP;
1175             }
1176 0 0       0 if ($field) {
1177 0         0 $err = "missing value after $field $op";
1178 0         0 last LOOP;
1179             }
1180             }
1181             }
1182             }
1183              
1184             # handle error
1185 176 100       357 if ($err) {
1186 3         7 $self->{error} = "[$s_orig] : $err";
1187 3         4 $q = undef;
1188             }
1189              
1190             #dump $q;
1191              
1192 176 100       323 if ( !defined $q ) {
1193 3         10 return ( $q, $str );
1194             }
1195             my $query
1196 173         155 = $query_class->new( %{ $self->query_class_opts }, parser => $self );
  173         3838  
1197 173         16099 $query->{$_} = $q->{$_} for keys %$q;
1198 173         726 return ( $query, $str );
1199             }
1200              
1201             1;
1202              
1203             __END__