File Coverage

blib/lib/Search/Query/Parser.pm
Criterion Covered Total %
statement 303 340 89.1
branch 135 176 76.7
condition 69 98 70.4
subroutine 24 24 100.0
pod 7 7 100.0
total 538 645 83.4


line stmt bran cond sub pod time code
1             package Search::Query::Parser;
2 8     8   10646 use Moo;
  8         129577  
  8         55  
3 8     8   14388 use Carp;
  8         18  
  8         594  
4 8     8   2415 use Data::Dump qw( dump );
  8         22935  
  8         431  
5 8     8   2133 use Search::Query;
  8         20  
  8         233  
6 8     8   5498 use Search::Query::Dialect::Native;
  8         91  
  8         303  
7 8     8   5570 use Search::Query::Clause;
  8         24  
  8         272  
8 8     8   5026 use Search::Query::Field;
  8         23  
  8         283  
9 8     8   50 use Scalar::Util qw( blessed weaken );
  8         17  
  8         565  
10 8     8   45 use namespace::autoclean;
  8         16  
  8         42  
11              
12             our $VERSION = '0.306';
13              
14             has 'and_regex' => ( is => 'rw', default => sub {qr/\&|AND|ET|UND|E/i} );
15             has 'clause_class' =>
16             ( is => 'rw', default => sub {'Search::Query::Clause'} );
17             has 'croak_on_error' => ( is => 'rw', default => sub {0} );
18             has 'default_boolop' => ( is => 'rw', default => sub {'+'} );
19             has 'default_field' => ( is => 'rw' );
20             has 'default_op' => ( is => 'rw', default => sub {':'} );
21             has 'field_class' => ( is => 'rw', default => sub {'Search::Query::Field'} );
22              
23             # match prefix.field: or field
24             has 'field_regex' => ( is => 'rw', default => sub {qr/[\.\w]+/}, );
25              
26             has 'fixup' => ( is => 'rw', default => sub {0} );
27             has 'near_regex' => ( is => 'rw', default => sub {qr/NEAR\d+/i}, );
28             has 'not_regex' => ( is => 'rw', default => sub {qr/NOT|PAS|NICHT|NON/i}, );
29             has 'null_term' => ( is => 'rw', );
30              
31             # ops that admit an empty left operand
32             has 'op_nofield_regex' => ( is => 'rw', default => sub {qr/=~|!~|[~:#]/}, );
33              
34             # longest ops first !
35             has 'op_regex' =>
36             ( is => 'rw', default => sub {qr/~\d+|==|<=|>=|!=|!:|=~|!~|[:=<>~#]/}, );
37              
38             has 'or_regex' => ( is => 'rw', default => sub {qr/\||OR|OU|ODER|O/i}, );
39             has 'phrase_delim' => ( is => 'rw', default => sub {q/"/}, );
40             has 'query_class' =>
41             ( is => 'rw', default => sub {'Search::Query::Dialect::Native'} );
42             has 'query_class_opts' => ( is => 'rw', default => sub { {} } );
43             has 'range_regex' => ( is => 'rw', default => sub {qr/\.\./}, );
44             has 'sloppy' => ( is => 'rw', default => sub {0} );
45             has 'sloppy_term_regex' => ( is => 'rw', default => sub {qr/[\.\w]+/}, );
46             has 'term_expander' => ( is => 'rw' );
47             has 'term_regex' => ( is => 'rw', default => sub {qr/[^\s()]+/}, );
48             has 'error' => ( is => 'ro' );
49             has 'fields' => ( is => 'ro' );
50              
51             my %SQPCOMPAT = (
52             rxAnd => 'and_regex',
53             rxOr => 'or_regex',
54             rxNot => 'not_regex',
55             defField => 'default_field',
56             rxTerm => 'term_regex',
57             rxField => 'field_regex',
58             rxOp => 'op_regex',
59             rxOpNoField => 'op_nofield_regex',
60             dialect => 'query_class', # our own compat
61             );
62              
63             =head1 NAME
64              
65             Search::Query::Parser - convert query strings into query objects
66              
67             =head1 SYNOPSIS
68              
69             use Search::Query;
70             my $parser = Search::Query->parser(
71             term_regex => qr/[^\s()]+/,
72             field_regex => qr/\w+/,
73             op_regex => qr/==|<=|>=|!=|=~|!~|[:=<>~#]/,
74              
75             # ops that admit an empty left operand
76             op_nofield_regex => qr/=~|!~|[~:#]/,
77              
78             # case insensitive
79             and_regex => qr/\&|AND|ET|UND|E/i,
80             or_regex => qr/\||OR|OU|ODER|O/i,
81             not_regex => qr/NOT|PAS|NICHT|NON/i,
82              
83             default_field => 'myfield', # or ['myfield', 'myfield2']
84             phrase_delim => q/"/,
85             default_boolop => '+',
86             query_class => 'Search::Query::Dialect::Native',
87             field_class => 'Search::Query::Field',
88             query_class_opts => {
89             default_field => 'foo', # or ['foo', 'bar']
90             },
91            
92             # a generous mode, overlooking boolean-parser syntax errors
93             sloppy => 0,
94             sloppy_term_regex => qr/[\.\w]+/,
95             fixup => 0,
96            
97             # if set, this special term indicates a NULL query
98             null_term => 'NULL',
99             );
100              
101             my $query = $parser->parse('+hello -world now');
102             print $query;
103              
104             =head1 DESCRIPTION
105              
106             Search::Query::Parser is a fork of Search::QueryParser
107             that supports multiple query dialects.
108              
109             The Parser class transforms a query string into a Dialect object structure
110             to be handled by external search engines.
111              
112             The query string can contain simple terms, "exact phrases", field
113             names and comparison operators, '+/-' prefixes, parentheses, and
114             boolean connectors.
115              
116             The parser can be customized using regular expressions for specific
117             notions of "term", "field name" or "operator" -- see the L
118             method.
119              
120             The Dialect object resulting from a parsed query is a tree of terms
121             and operators. Each Dialect can be re-serialized as a string
122             using the stringify() method, or simply by printing the Dialect object,
123             since the string-related Perl operations are overloaded using stringify().
124              
125             =head1 QUERY STRING
126              
127             The query string is decomposed into Clause objects, where
128             each Clause has an optional sign prefix,
129             an optional field name and comparison operator,
130             and a mandatory value.
131              
132             =head2 Sign prefix
133              
134             Prefix '+' means that the item is mandatory.
135             Prefix '-' means that the item must be excluded.
136             No prefix means that the item will be searched
137             for, but is not mandatory.
138              
139             See also section L below, which is another
140             way to combine items into a query.
141              
142             =head2 Field name and comparison operator
143              
144             Internally, each query item has a field name and comparison
145             operator; if not written explicitly in the query, these
146             take default values C<''> (empty field name) and
147             C<':'> (colon operator).
148              
149             Operators have a left operand (the field name) and
150             a right operand (the value to be compared with);
151             for example, C means "search documents containing
152             term 'bar' in field 'foo'", whereas C means
153             "search documents where field 'foo' has exact value 'bar'".
154              
155             Here is the list of admitted operators with their intended meaning:
156              
157             =over
158              
159             =item C<:>
160              
161             treat value as a term to be searched within field.
162             This is the default operator.
163              
164             =item C<~> or C<=~>
165              
166             treat value as a regex; match field against the regex.
167              
168             Note that C<~>
169             after a phrase indicates a proximity assertion:
170              
171             "foo bar"~5
172              
173             means "match 'foo' and 'bar' within 5 positions of each other."
174              
175             =item C
176              
177             negation of above
178              
179             =item C<==> or C<=>, C=>, C=>, C, C>, C>
180              
181             classical relational operators
182              
183             =item C<#>
184              
185             Inclusion in the set of comma-separated integers supplied
186             on the right-hand side.
187              
188             =back
189              
190             Operators C<:>, C<~>, C<=~>, C and C<#> admit an empty
191             left operand (so the field name will be C<''>).
192             Search engines will usually interpret this as
193             "any field" or "the whole data record". But see the B
194             feature.
195              
196             =head2 Value
197              
198             A value (right operand to a comparison operator) can be
199              
200             =over
201              
202             =item *
203              
204             A term (as recognized by regex C, see L method below).
205              
206             =item *
207              
208             A quoted phrase, i.e. a collection of terms within
209             single or double quotes.
210              
211             Quotes can be used not only for "exact phrases", but also
212             to prevent misinterpretation of some values : for example
213             C<-2> would mean "value '2' with prefix '-'",
214             in other words "exclude term '2'", so if you want to search for
215             value -2, you should write C<"-2"> instead.
216              
217             Note that C<~>
218             after a phrase indicates a proximity assertion:
219              
220             "foo bar"~5
221              
222             means "match 'foo' and 'bar' within 5 positions of each other."
223              
224             =item *
225              
226             A subquery within parentheses.
227             Field names and operators distribute over parentheses, so for
228             example C is equivalent to
229             C.
230              
231             Nested field names such as C are not allowed.
232              
233             Sign prefixes do not distribute : C<+(foo bar) +bie> is not
234             equivalent to C<+foo +bar +bie>.
235              
236             =back
237              
238             =head2 Boolean connectors
239              
240             Queries can contain boolean connectors 'AND', 'OR', 'NOT'
241             (or their equivalent in some other languages -- see the *_regex
242             features in new()).
243             This is mere syntactic sugar for the '+' and '-' prefixes :
244             C is equivalent to C<+a +b>;
245             C is equivalent to C<(a b)>;
246             C is equivalent to C<-a>.
247             C<+a OR b> does not make sense,
248             but it is translated into C<(a b)>, under the assumption
249             that the user understands "OR" better than a
250             '+' prefix.
251             C<-a OR b> does not make sense either,
252             but has no meaningful approximation, so it is rejected.
253              
254             Combinations of AND/OR clauses must be surrounded by
255             parentheses, i.e. C<(a AND b) OR c> or C are
256             allowed, but C is not.
257              
258             The C connector is treated like the proximity phrase assertion.
259              
260             foo NEAR5 bar
261              
262             is treated as if it were:
263              
264             "foo bar"~5
265              
266             See the B option.
267              
268             =head1 METHODS
269              
270             =head2 new
271              
272             The following attributes may be initialized in new().
273             These are also available as get/set methods on the returned
274             Parser object.
275              
276             =over
277              
278             =item default_boolop
279              
280             =item term_regex
281              
282             =item field_regex
283              
284             =item op_regex
285              
286             =item op_nofield_regex
287              
288             =item and_regex
289              
290             =item or_regex
291              
292             =item not_regex
293              
294             =item near_regex
295              
296             =item range_regex
297              
298             =item default_field
299              
300             Applied to all terms where no field is defined.
301             The default value is undef (no default).
302              
303             =item default_op
304              
305             The operator used when default_field is applied.
306              
307             =item fields
308              
309             =item phrase_delim
310              
311             =item query_class
312              
313             C is an alias for C.
314              
315             =item field_class
316              
317             =item clause_class
318              
319             =item query_class_opts
320              
321             Will be passed to I new() method each time a query is parse()'d.
322              
323             =item dialect_opts
324              
325             Alias for query_class_opts.
326              
327             =item croak_on_error
328              
329             Default value is false (0). Set to true to automatically throw an exception
330             via Carp::croak() if parse() would return undef.
331              
332             =item term_expander
333              
334             A function reference for transforming query terms after they have been parsed.
335             Examples might include adding alternate spellings, synonyms, or
336             expanding wildcards based on lexicon listings.
337              
338             Example:
339              
340             my $parser = Search::Query->parser(
341             term_expander => sub {
342             my ($term, $field) = @_;
343             return ($term) if ref $term; # skip ranges
344             return ( qw( one two three ), $term );
345             }
346             );
347              
348             my $query = $parser->parse("foo=bar")
349             print "$query\n"; # +foo=(one OR two OR three OR bar)
350              
351             The term_expander reference should expect two arguments: the term value
352             and, if available, the term field name. It should return an array of values.
353              
354             The term_expander reference is called internally during the parse() method,
355             B any field alias expansion or validation is performed.
356              
357             =item sloppy( 0|1 )
358              
359             If the string passed to parse() has any incorrect or unsupported syntax
360             in it, the default behavior is for parsing to stop immediately, error()
361             to be set, and for parse() to return undef.
362              
363             In certain cases (as on a web form) this is undesirable. Set sloppy
364             mode to true to fallback to non-boolean evaluation of the string,
365             which in most cases should still return a Dialect object.
366              
367             Example:
368              
369             $parser->parse('foo -- OR bar'); # if sloppy==0, returns undef
370             $parser->parse('foo -- OR bar'); # if sloppy==1, equivalent to 'foo bar'
371              
372             =item sloppy_term_regex
373              
374             The regex definition used to match a term when sloppy==1.
375              
376             =item fixup( 0|1 )
377              
378             Attempt to fix syntax errors like the lack of a closing parenthesis
379             or a missing double-quote. Different than sloppy() which will not
380             attempt to fix broken syntax, but should probably be used together
381             if you really do not care about strict syntax checking.
382              
383             =item null_term
384              
385             If set to I, the B feature will treat field value
386             of I as if it was undefined. Example:
387              
388             $parser->parse('foo='); # throws fatal error
389             $parser->null_term('NULL');
390             $parser->parse('foo=NULL'); # field foo has NULL value
391              
392             This feature is most useful with the SQL dialect, where you might want to
393             find NULL values. Use it like:
394              
395             my $parser = Search::Query->parser(
396             dialect => 'SQL',
397             null_term => 'NULL'
398             );
399             my $query = $parser->parse('foo!=NULL');
400             print $query; # prints "foo is not NULL"
401              
402              
403             =back
404              
405             =head2 BUILDARGS
406              
407             Internal method for mangling constructor params.
408              
409             =cut
410              
411             sub BUILDARGS {
412 27     27 1 342958 my ( $class, %args ) = @_;
413              
414             # Search::QueryParser compatability
415 27 50       128 if ( exists $args{dialect_opts} ) {
416 0         0 $args{query_class_opts} = delete $args{dialect_opts};
417             }
418 27         110 for my $key ( keys %args ) {
419 83 100       268 if ( exists $SQPCOMPAT{$key} ) {
420 17         83 $args{ $SQPCOMPAT{$key} } = delete $args{$key};
421             }
422             }
423 27         701 return \%args;
424             }
425              
426             =head2 BUILD
427              
428             Called internally to initialize the object.
429              
430             =cut
431              
432             sub BUILD {
433 27     27 1 219 my $self = shift;
434              
435             # query class can be shortcut
436             $self->{query_class}
437 27         200 = Search::Query->get_query_class( $self->{query_class} );
438              
439             # use field class if query class defines one
440             # and we weren't passed one explicitly
441 27 100       195 if ( $self->{query_class}->field_class ne $self->{field_class} ) {
442 17         75 $self->{field_class} = $self->{query_class}->field_class;
443             }
444              
445 27 100       171 $self->set_fields( $self->{fields} ) if $self->{fields};
446              
447 27         734 return $self;
448             }
449              
450             =head2 error
451              
452             Returns the last error message.
453              
454             =cut
455              
456             =head2 clear_error
457              
458             Sets error message to undef.
459              
460             =cut
461              
462             sub clear_error {
463 2     2 1 109 $_->{error} = undef;
464             }
465              
466             =head2 get_field( I )
467              
468             Returns Field object for I or undef if there isn't one
469             defined.
470              
471             =cut
472              
473             sub get_field {
474 230     230 1 341 my $self = shift;
475 230 50       574 my $name = shift or croak "name required";
476 230 100       669 if ( !exists $self->{fields}->{$name} ) {
477 8         37 return undef;
478             }
479 222         851 return $self->{fields}->{$name};
480             }
481              
482             =head2 set_fields( I )
483              
484             Set the I structure. Called internally by BUILD()
485             if you pass a C key/value pair to new().
486              
487             The structure of I may be one of the following:
488              
489             my $fields = {
490             field1 => 1,
491             field2 => { alias_for => 'field1' },
492             field3 => Search::Query::Field->new( name => 'field3' ),
493             field4 => { alias_for => [qw( field1 field3 )] },
494             };
495              
496             # or
497              
498             my $fields = [
499             'field1',
500             { name => 'field2', alias_for => 'field1' },
501             Search::Query::Field->new( name => 'field3' ),
502             { name => 'field4', alias_for => [qw( field1 field3 )] },
503             ];
504              
505              
506             =cut
507              
508             sub set_fields {
509 22     22 1 43 my $self = shift;
510 22         44 my $origfields = shift;
511 22 50       83 if ( !defined $origfields ) {
512 0         0 croak "fields required";
513             }
514              
515 22         39 my %fields;
516 22         53 my $field_class = $self->{field_class};
517              
518 22         57 my $reftype = ref($origfields);
519 22 50 66     162 if ( !$reftype or ( $reftype ne 'ARRAY' and $reftype ne 'HASH' ) ) {
      33        
520 0         0 croak "fields must be an ARRAY or HASH ref";
521             }
522              
523             # convert simple array to hash
524 22 100       100 if ( $reftype eq 'ARRAY' ) {
    50          
525 18         48 for my $name (@$origfields) {
526 34 50       1560 if ( blessed($name) ) {
    100          
527 0         0 $fields{ $name->name } = $name;
528             }
529             elsif ( ref($name) eq 'HASH' ) {
530 1 50       6 if ( !exists $name->{name} ) {
531 0         0 croak "'name' required in hashref: " . dump($name);
532             }
533 1         128 $fields{ $name->{name} } = $field_class->new(%$name);
534             }
535             else {
536 33         874 $fields{$name} = $field_class->new( name => $name, );
537             }
538             }
539             }
540             elsif ( $reftype eq 'HASH' ) {
541 4         17 for my $name ( keys %$origfields ) {
542 12         23 my $val = $origfields->{$name};
543 12         16 my $obj;
544 12 50       62 if ( blessed($val) ) {
    50          
    0          
545 0         0 $obj = $val;
546             }
547             elsif ( ref($val) eq 'HASH' ) {
548 12 50       38 if ( !exists $val->{name} ) {
549 12         27 $val->{name} = $name;
550             }
551 12         299 $obj = $field_class->new(%$val);
552             }
553             elsif ( !ref $val ) {
554 0         0 $obj = $field_class->new( name => $name );
555             }
556             else {
557 0         0 croak
558             "field value for $name must be a field name, hashref or Field object";
559             }
560 12         1347 $fields{$name} = $obj;
561             }
562             }
563              
564 22         228 $self->{fields} = \%fields;
565 22         65 return $self->{fields};
566             }
567              
568             =head2 set_field( I => I )
569              
570             Sets field I to Field object I.
571              
572             =cut
573              
574             sub set_field {
575 8     8 1 60 my $self = shift;
576 8         21 my ( $name, $field ) = @_;
577 8 50       30 confess "name required" unless $name;
578 8 50       24 confess "field object required" unless $field;
579 8 50       46 confess "field not an object: $field" unless blessed($field);
580 8         42 $self->{fields}->{$name} = $field;
581             }
582              
583             =head2 parse( I )
584              
585             Returns a Search::Query::Dialect object of type
586             I.
587              
588             If there is a syntax error in I,
589             parse() will return C and set error().
590              
591             =cut
592              
593             sub parse {
594 82     82 1 14055 my $self = shift;
595 82         159 my $q = shift;
596 82 50       269 croak "query required" unless defined $q;
597 82   33     621 my $class = shift || $self->query_class;
598              
599             # reset state in case we are called multiple times
600 82         180 $self->{error} = undef;
601 82         156 $self->{_paren_count} = 0;
602              
603 82         402 $q = $class->preprocess($q);
604 82         279 my ($query) = $self->_parse( $q, undef, undef, $class );
605 82 100 66     283 if ( !defined $query && !$self->sloppy ) {
606 2 100       199 croak $self->error if $self->croak_on_error;
607 1         6 return $query;
608             }
609              
610             # if in sloppy mode and we failed to parse,
611             # extract what looks like terms and re-parse.
612 80 100 66     259 if ( !defined $query && $self->sloppy ) {
613 1         6 return $self->_sloppify( $q, $class );
614             }
615              
616 79 100       240 if ( $self->{term_expander} ) {
617 3         8 $self->_call_term_expander($query);
618             }
619              
620 79 100       250 if ( $self->{fields} ) {
621 66         239 $self->_expand($query);
622 66         1153 $self->_validate($query);
623             }
624              
625             # if in sloppy mode and we failed to parse,
626             # extract what looks like terms and re-parse.
627 77 100 66     337 if ( $self->error && $self->sloppy ) {
628 1         4 return $self->_sloppify( $q, $class );
629             }
630              
631 76         156 $query->{parser} = $self;
632              
633             #warn dump $query;
634              
635             # if the query isn't re-parse-able once stringified
636             # then it is broken, somehow.
637 76 100 33     694 if ( defined $query
      66        
638             and !$self->error
639             and $self->croak_on_error )
640             {
641 26         137 my ($reparsed) = $self->_parse( "$query", undef, undef, $class );
642 26 50       282 if ( !defined $reparsed ) {
643 0         0 croak sprintf( "Error: unable to parse '%s'. Reason: '%s'.",
644             $q, $self->error );
645             }
646             }
647              
648             #weaken( $query->{parser} ); # TODO leaks possible?
649              
650 76         434 return $query;
651             }
652              
653             sub _sloppify {
654 2     2   4 my ( $self, $q, $class ) = @_;
655 2         5 my $term = $self->{sloppy_term_regex};
656 2         2 my $and = $self->{and_regex};
657 2         4 my $or = $self->{or_regex};
658 2         3 my $not = $self->{not_regex};
659 2         3 my $near = $self->{near_regex};
660 2         4 my $ops = $self->{op_regex};
661 2         140 my $bools = qr/($and|$or|$not|$near|$ops)/;
662 2         10 my @terms;
663              
664 2         44 while ( $q =~ m/($term)/ig ) {
665 14         25 my $t = $1;
666              
667             #warn "$t =~ $bools\n";
668 14 100       153 if ( $t =~ m/^$bools$/ ) {
669 7         37 next;
670             }
671 7         58 push @terms, split( /$ops/, $t );
672             }
673              
674             #dump \@terms;
675              
676             # reset errors since we will re-parse
677 2         3 $self->{error} = undef;
678 2         9 my ($query) = $self->_parse( join( ' ', @terms ), undef, undef, $class );
679 2 50       9 if ( !defined $query ) {
680 0 0       0 $self->croak_on_error and croak $self->error;
681             }
682             else {
683 2         3 $query->{parser} = $self;
684             }
685 2         18 return $query;
686             }
687              
688             sub _call_term_expander {
689 3     3   6 my ( $self, $query ) = @_;
690 3         4 my $expander = $self->{term_expander};
691 3 50       10 if ( ref($expander) ne 'CODE' ) {
692 0         0 croak "term_expander must be a CODE reference";
693             }
694              
695 3         5 my $query_class = $self->{query_class};
696              
697             $query->walk(
698             sub {
699 3     3   7 my ( $clause, $tree, $code, $prefix ) = @_;
700 3 50       10 if ( $clause->is_tree ) {
701 0         0 $clause->value->walk($code);
702 0         0 return;
703             }
704              
705 3         16 my @newterms = $expander->( $clause->value, $clause->field );
706 3 50 33     33 if ( ref $newterms[0] and ref $clause->value ) {
    100          
707 0         0 $clause->value( $newterms[0] );
708             }
709             elsif ( @newterms > 1 ) {
710              
711             # turn $clause into a tree
712 1         4 my $class = blessed($clause);
713 1         4 my $op = $clause->op;
714 1         4 my $field = $clause->field;
715 1         4 my $proximity = $clause->proximity;
716 1         3 my $quote = $clause->quote;
717              
718             #warn "before tree: " . dump $tree;
719              
720             #warn "code clause: " . dump $clause;
721 1         2 my @subclauses;
722 1         3 for my $term (@newterms) {
723 4         163 push(
724             @subclauses,
725             $class->new(
726             field => $field,
727             op => $op,
728             value => $term,
729             quote => $quote,
730             proximity => $proximity,
731             )
732             );
733             }
734              
735             # OR the fields together. TODO optional?
736              
737             # we must set "" key here explicitly, because
738             # our bool op keys are not methods.
739             my $subclause
740 1         30 = $query_class->new( %{ $self->query_class_opts },
  1         24  
741             parser => $self );
742 1         19 $subclause->{""} = \@subclauses;
743              
744 1         4 $clause->op('()');
745 1         8 $clause->value($subclause);
746             }
747             else {
748 2         13 $clause->value( $newterms[0] );
749             }
750              
751             }
752 3         27 );
753              
754             }
755              
756             sub _expand {
757 66     66   117 my ( $self, $query ) = @_;
758              
759 66 50       199 return if !exists $self->{fields};
760 66         120 my $fields = $self->{fields};
761 66         134 my $query_class = $self->{query_class};
762 66         116 my $default_field = $self->{default_field};
763              
764             #dump $fields;
765              
766             $query->walk(
767             sub {
768 165     165   313 my ( $clause, $tree, $code, $prefix ) = @_;
769              
770             #warn "code clause: " . dump $clause;
771              
772             #warn "code tree: " . dump $tree;
773              
774 165 100       518 if ( $clause->is_tree ) {
775 46         184 $clause->value->walk($code);
776 46         158 return;
777             }
778 119 100 66     684 if ( ( !defined $clause->field || !length $clause->field )
      66        
779             && !defined $default_field )
780             {
781 13         57 return;
782             }
783              
784             # make sure clause has an op
785 106 100       309 if ( !$clause->op ) {
786 16         71 $clause->op( $self->default_op );
787             }
788              
789             # even if $clause has a field defined,
790             # it may be aliased to multiple others,
791             # so check field def and default_field to determine.
792 106         142 my @field_names;
793              
794             # first, which field name to start with?
795             my @clause_fields; # could be plural
796 106 100       286 if ( !defined $clause->field ) {
797             @clause_fields
798 16 100       61 = ref($default_field)
799             ? @$default_field
800             : ($default_field);
801             }
802             else {
803 90         289 @clause_fields = ( $clause->field );
804             }
805              
806             # second, resolve any aliases
807 106         243 for my $cfield (@clause_fields) {
808              
809             # if we have no definition for $cfield, it's invalid
810 108 100       282 if ( !exists $fields->{$cfield} ) {
811 3         19 return;
812             }
813              
814 105         171 my $field_def = $fields->{$cfield};
815 105 100       336 if ( $field_def->alias_for ) {
816             my @aliases
817             = ref $field_def->alias_for
818 10 100       32 ? @{ $field_def->alias_for }
  2         9  
819             : ( $field_def->alias_for );
820 10         27 push @field_names, @aliases;
821             }
822             else {
823 95         280 push @field_names, $cfield;
824             }
825             }
826              
827             #warn "resolved field_names: " . dump( \@field_names );
828              
829             # third, apply our canonical names to the $clause
830 103 100       235 if ( @field_names > 1 ) {
831              
832             # turn $clause into a tree
833 4         14 my $class = blessed($clause);
834 4         11 my $op = $clause->op;
835              
836             #warn "before tree: " . dump $tree;
837              
838             #warn "code clause: " . dump $clause;
839 4         4 my @newfields;
840 4         7 for my $name (@field_names) {
841 8         285 push(
842             @newfields,
843             $class->new(
844             field => $name,
845             op => $op,
846             value => $clause->value,
847             quote => $clause->quote,
848             proximity => $clause->proximity,
849             )
850             );
851             }
852              
853             # OR the fields together. TODO optional?
854              
855             # we must bless here because
856             # our bool op keys are not methods.
857             my $newfield
858 4         102 = $query_class->new( %{ $self->query_class_opts },
  4         91  
859             parser => $self );
860 4         69 $newfield->{""} = \@newfields;
861              
862 4         13 $clause->op('()');
863 4         10 $clause->value($newfield);
864              
865             #warn "after tree: " . dump $tree;
866              
867             }
868             else {
869              
870             # if no field defined in clause, or it differs, override.
871 99 100 100     587 if ( !defined $clause->field
872             or $field_names[0] ne $clause->field )
873             {
874 20         61 $clause->field( $field_names[0] );
875             }
876             }
877              
878 103         445 return $clause;
879             }
880 66         665 );
881             }
882              
883             sub _validate {
884 66     66   122 my ( $self, $query ) = @_;
885              
886 66         132 my $fields = $self->{fields};
887             my $validator = sub {
888 173     173   326 my ( $clause, $tree, $code, $prefix ) = @_;
889 173 100       486 if ( $clause->is_tree ) {
890 50         181 $clause->value->walk($code);
891             }
892             else {
893 123 100 100     832 return unless defined $clause->field and length $clause->field;
894 110         274 my $field_name = $clause->field;
895 110         232 my $field_value = $clause->value;
896 110         190 my $field = $fields->{$field_name};
897 110 100       292 if ( !$field ) {
898 3 100       20 if ( $self->croak_on_error ) {
899 2         508 croak "No such field: $field_name";
900             }
901             else {
902 1         3 $self->{error} = "No such field: $field_name";
903 1         5 return;
904             }
905             }
906 107 50       400 if ( !$field->validate($field_value) ) {
907 0 0       0 if ( $self->croak_on_error ) {
908 0         0 my $err = $field->error;
909 0         0 croak
910             "Invalid field value for $field_name: $field_value ($err)";
911             }
912             }
913             }
914 66         354 };
915 66         251 $query->walk($validator);
916             }
917              
918             sub _parse {
919 173     173   281 my $self = shift;
920 173         252 my $str = shift;
921 173         307 my $parent_field = shift; # only for recursive calls
922 173         234 my $parent_op = shift; # only for recursive calls
923 173         242 my $query_class = shift;
924              
925             #warn "_parse: " . dump [ $str, $parent_field, $parent_op, $query_class ];
926              
927             #dump $self;
928              
929 173         284 my $q = {};
930 173         263 my $pre_bool = '';
931 173         236 my $err = undef;
932 173         221 my $s_orig = $str;
933 173         309 my $phrase_delim = $self->{phrase_delim};
934 173         267 my $field_regex = $self->{field_regex};
935 173         361 my $and_regex = $self->{and_regex};
936 173         260 my $or_regex = $self->{or_regex};
937 173         268 my $not_regex = $self->{not_regex};
938 173         280 my $op_regex = $self->{op_regex};
939 173         262 my $op_nofield_regex = $self->{op_nofield_regex};
940 173         251 my $term_regex = $self->{term_regex};
941 173         611 my $phrase_regex = qr/[^"()]+/;
942 173         302 my $near_regex = $self->{near_regex};
943 173         269 my $range_regex = $self->{range_regex};
944 173         293 my $clause_class = $self->{clause_class};
945 173         270 my $fixup = $self->{fixup};
946 173         265 my $null_term = $self->{null_term};
947              
948 173         441 $str =~ s/^\s+//; # remove leading spaces
949              
950             LOOP:
951 173         449 while ( length $str ) { # while query string is not empty
952 350         672 for ($str) { # temporary alias to $_ for easier regex application
953              
954             #warn "LOOP start: " . dump [ $str, $parent_field, $parent_op ];
955              
956 350         671 my $sign = $self->{default_boolop};
957 350         436 my $field = $parent_field;
958 350   100     1284 my $op = $parent_op || "";
959              
960             #warn "LOOP after start: " . dump [ $sign, $field, $op ];
961              
962 350 100       1020 if (m/^\)/) {
963 64         113 $self->{_paren_count}--;
964              
965             #warn "leaving loop on ) [paren_count==$self->{_paren_count}]";
966 64 100       142 if ( $self->{_paren_count} < 0 ) {
967 4 100       11 if ( !$fixup ) {
968              
969             #warn "unbalanced parens -- extra right-hand )";
970 1         1 $err = "unbalanced parentheses -- extra right-hand )";
971 1         4 last LOOP;
972             }
973             else {
974 3         11 s/^[\)\s]+//; # trim all trailing ) and space
975 3         9 next LOOP;
976             }
977             }
978             else {
979 60         150 last LOOP; # return from recursive call if meeting a ')'
980             }
981             }
982              
983             # try to parse sign prefix ('+', '-' or '!|NOT')
984 286 100       2739 if (s/^(\+|-)\s*//) { $sign = $1; }
  11 100       34  
    100          
985 5         11 elsif (s/^($not_regex)\b\s*//) { $sign = '-'; }
986              
987             # special check because of \b above
988 3         7 elsif (s/^\!\s*([^:=~])/$1/) { $sign = '-'; }
989              
990             # try to parse field name and operator
991 286 100 66     11032 if (s/^"($field_regex)"\s*($op_regex)\s*// # "field name" and op
      100        
992             or
993             s/^'?($field_regex)'?\s*($op_regex)\s*// # 'field name' and op
994             or s/^()($op_nofield_regex)\s*// # no field, just op
995             )
996             {
997 132         413 ( $field, $op ) = ( $1, $2 );
998              
999             #warn "matched field+op = " . dump [ $field, $op ];
1000 132 50       389 if ($parent_field) {
1001 0         0 $err = "field '$field' inside '$parent_field' (op=$op)";
1002 0         0 last LOOP;
1003             }
1004             }
1005              
1006             # parse a value (single term or quoted list or parens)
1007 286         504 my $clause = undef;
1008              
1009 286 100 100     5466 if ( s/^(")([^"]*?)"~(\d+)\s*//
    100 100        
    100 100        
    100          
    50          
1010             or s/^(")([^"]*?)"\s*//
1011             or s/^(')([^']*?)'\s*// )
1012             { # parse a quoted string.
1013 31         106 my ( $quote, $val, $proximity ) = ( $1, $2, $3 );
1014 31   66     988 $clause = $clause_class->new(
1015             field => $field,
1016             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1017             value => $val,
1018             quote => $quote,
1019             proximity => $proximity
1020             );
1021             }
1022              
1023             # fixup mode allows for a partially quoted string.
1024             elsif ( $fixup and s/^(")([^"]*?)\s*$// ) {
1025 1         6 my ( $quote, $val, $proximity ) = ( $1, $2, $3 );
1026 1   33     34 $clause = $clause_class->new(
1027             field => $field,
1028             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1029             value => $val,
1030             quote => $quote,
1031             proximity => $proximity
1032             );
1033             }
1034              
1035             # special case for range grouped with () since we do not
1036             # want the op of record to be the ().
1037             elsif (
1038             s/^\(\s*"?($phrase_regex)"?$range_regex"?($phrase_regex)"?\s*\)\s*//
1039             )
1040             {
1041 7         24 my ( $t1, $t2 ) = ( $1, $2 );
1042              
1043             # trim any spaces since phrase_regex includes it
1044 7         24 $t1 =~ s/^\ +|\ +$//g;
1045 7         25 $t2 =~ s/^\ +|\ +$//g;
1046              
1047 7 100       31 my $this_op = $op =~ m/\!/ ? '!..' : '..';
1048 7         14 my $has_spaces = 0;
1049 7 100 66     64 if ( index( $t1, ' ' ) != -1 or index( $t2, ' ' ) != -1 ) {
1050 1         3 $has_spaces = 1;
1051             }
1052 7 100       237 $clause = $clause_class->new(
1053             field => $field,
1054             op => $this_op,
1055             value => [ $t1, $t2 ],
1056             quote => ( $has_spaces ? '"' : undef ),
1057             );
1058             }
1059             elsif (s/^\(\s*//) { # parse parentheses
1060 63         111 $self->{_paren_count}++;
1061 63         244 my ( $r, $s2 )
1062             = $self->_parse( $str, $field, $op, $query_class );
1063 63 50       262 if ( !$r ) {
1064 0         0 $err = $self->error;
1065 0         0 last LOOP;
1066             }
1067 63         135 $str = $s2;
1068 63 100 33     394 if ( !defined($str) or !( $str =~ s/^\)\s*// ) ) {
1069 4 100 66     22 if ( defined($str) and $fixup ) {
1070 2         5 $str = ') ' . $str;
1071             }
1072             else {
1073 2         4 $err = "no matching ) ";
1074 2         12 last LOOP;
1075             }
1076             }
1077              
1078 61         1487 $clause = $clause_class->new(
1079             field => '',
1080             op => '()',
1081             value => bless( $r, $query_class ), # re-bless
1082             );
1083              
1084             }
1085             elsif (s/^($term_regex)\s*//) { # parse a single term
1086 184         389 my $term = $1;
1087 184 50 66     1326 if ( $term =~ m/^($term_regex)$range_regex($term_regex)$/ ) {
    100          
1088 0         0 my $t1 = $1;
1089 0         0 my $t2 = $2;
1090              
1091             #warn "found range ($op $parent_op): $term => $t1 .. $t2";
1092 0 0       0 my $this_op = $op =~ m/\!/ ? '!..' : '..';
1093 0         0 $clause = $clause_class->new(
1094             field => $field,
1095             op => $this_op,
1096             value => [ $t1, $t2 ],
1097             );
1098             }
1099             elsif ( $null_term and $term eq $null_term ) {
1100 5   33     181 $clause = $clause_class->new(
1101             field => $field,
1102             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1103             value => undef, # mimic NULL
1104             );
1105              
1106             }
1107             else {
1108              
1109 179   66     5451 $clause = $clause_class->new(
1110             field => $field,
1111             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1112             value => $term,
1113             );
1114              
1115             }
1116             }
1117              
1118 284 100       17716 if (s/^($near_regex)\s+//) {
1119              
1120             # modify the existing clause
1121             # and treat what comes next like a phrase
1122             # matching the syntax "foo bar"~\d+
1123 2         7 my ($prox_match) = ($1);
1124 2         6 my ($proximity) = $prox_match;
1125 2         9 $proximity =~ s/\D+//; # leave only number
1126 2 50       95 if (s/^($term_regex)\s*//) {
1127 2         7 my $term = $1;
1128 2         8 $clause->{value} .= ' ' . $term;
1129 2         4 $clause->{proximity} = $proximity;
1130 2         7 $clause->{quote} = '"';
1131             }
1132             else {
1133 0         0 $err = "missing term after $prox_match";
1134 0         0 last LOOP;
1135             }
1136              
1137             }
1138              
1139             # deal with boolean connectors
1140 284         471 my $post_bool = '';
1141 284 100       2437 if (s/^($and_regex)\s+//) {
    100          
1142 26         48 $post_bool = 'AND';
1143             }
1144             elsif (s/^($or_regex)\s+//) {
1145 42         81 $post_bool = 'OR';
1146             }
1147              
1148 284 50 100     1012 if ( $pre_bool
      66        
1149             and $post_bool
1150             and $pre_bool ne $post_bool )
1151             {
1152 0         0 $err = "cannot mix AND/OR in requests; use parentheses";
1153 0         0 last LOOP;
1154             }
1155              
1156 284   100     1038 my $bool = $pre_bool || $post_bool;
1157 284         389 $pre_bool = $post_bool; # for next loop
1158              
1159             # insert clause in query structure
1160 284 50       1120 if ($clause) {
1161 284 100 100     1310 $sign = '' if $sign eq '+' and $bool eq 'OR';
1162 284 100 100     954 $sign = '+' if $sign eq '' and $bool eq 'AND';
1163 284 50 66     808 if ( $sign eq '-' and $bool eq 'OR' ) {
1164 0         0 $err = 'operands of "OR" cannot have "-" or "NOT" prefix';
1165 0         0 last LOOP;
1166             }
1167 284         346 push @{ $q->{$sign} }, $clause;
  284         2065  
1168             }
1169             else {
1170 0 0       0 if ($_) {
1171 0         0 $err = "unexpected string in query: '$_'";
1172 0         0 last LOOP;
1173             }
1174 0 0       0 if ($field) {
1175 0         0 $err = "missing value after $field $op";
1176 0         0 last LOOP;
1177             }
1178             }
1179             }
1180             }
1181              
1182             # handle error
1183 173 100       397 if ($err) {
1184 3         10 $self->{error} = "[$s_orig] : $err";
1185 3         6 $q = undef;
1186             }
1187              
1188             #dump $q;
1189              
1190 173 100       417 if ( !defined $q ) {
1191 3         13 return ( $q, $str );
1192             }
1193             my $query
1194 170         219 = $query_class->new( %{ $self->query_class_opts }, parser => $self );
  170         4799  
1195 170         13517 $query->{$_} = $q->{$_} for keys %$q;
1196 170         966 return ( $query, $str );
1197             }
1198              
1199             1;
1200              
1201             __END__