File Coverage

blib/lib/Search/Query/Parser.pm
Criterion Covered Total %
statement 303 340 89.1
branch 135 176 76.7
condition 69 98 70.4
subroutine 24 24 100.0
pod 7 7 100.0
total 538 645 83.4


line stmt bran cond sub pod time code
1             package Search::Query::Parser;
2 8     8   9862 use Moo;
  8         134618  
  8         55  
3 8     8   13003 use Carp;
  8         21  
  8         742  
4 8     8   1936 use Data::Dump qw( dump );
  8         22721  
  8         572  
5 8     8   1548 use Search::Query;
  8         17  
  8         254  
6 8     8   5205 use Search::Query::Dialect::Native;
  8         23  
  8         342  
7 8     8   4617 use Search::Query::Clause;
  8         23  
  8         283  
8 8     8   3744 use Search::Query::Field;
  8         21  
  8         340  
9 8     8   59 use Scalar::Util qw( blessed weaken );
  8         9  
  8         500  
10 8     8   45 use namespace::autoclean;
  8         12  
  8         51  
11              
12             our $VERSION = '0.305';
13              
14             has 'and_regex' => ( is => 'rw', default => sub {qr/\&|AND|ET|UND|E/i} );
15             has 'clause_class' =>
16             ( is => 'rw', default => sub {'Search::Query::Clause'} );
17             has 'croak_on_error' => ( is => 'rw', default => sub {0} );
18             has 'default_boolop' => ( is => 'rw', default => sub {'+'} );
19             has 'default_field' => ( is => 'rw' );
20             has 'default_op' => ( is => 'rw', default => sub {':'} );
21             has 'field_class' => ( is => 'rw', default => sub {'Search::Query::Field'} );
22              
23             # match prefix.field: or field
24             has 'field_regex' => ( is => 'rw', default => sub {qr/[\.\w]+/}, );
25              
26             has 'fixup' => ( is => 'rw', default => sub {0} );
27             has 'near_regex' => ( is => 'rw', default => sub {qr/NEAR\d+/i}, );
28             has 'not_regex' => ( is => 'rw', default => sub {qr/NOT|PAS|NICHT|NON/i}, );
29             has 'null_term' => ( is => 'rw', );
30              
31             # ops that admit an empty left operand
32             has 'op_nofield_regex' => ( is => 'rw', default => sub {qr/=~|!~|[~:#]/}, );
33              
34             # longest ops first !
35             has 'op_regex' =>
36             ( is => 'rw', default => sub {qr/~\d+|==|<=|>=|!=|!:|=~|!~|[:=<>~#]/}, );
37              
38             has 'or_regex' => ( is => 'rw', default => sub {qr/\||OR|OU|ODER|O/i}, );
39             has 'phrase_delim' => ( is => 'rw', default => sub {q/"/}, );
40             has 'query_class' =>
41             ( is => 'rw', default => sub {'Search::Query::Dialect::Native'} );
42             has 'query_class_opts' => ( is => 'rw', default => sub { {} } );
43             has 'range_regex' => ( is => 'rw', default => sub {qr/\.\./}, );
44             has 'sloppy' => ( is => 'rw', default => sub {0} );
45             has 'sloppy_term_regex' => ( is => 'rw', default => sub {qr/[\.\w]+/}, );
46             has 'term_expander' => ( is => 'rw' );
47             has 'term_regex' => ( is => 'rw', default => sub {qr/[^\s()]+/}, );
48             has 'error' => ( is => 'ro' );
49             has 'fields' => ( is => 'ro' );
50              
51             my %SQPCOMPAT = (
52             rxAnd => 'and_regex',
53             rxOr => 'or_regex',
54             rxNot => 'not_regex',
55             defField => 'default_field',
56             rxTerm => 'term_regex',
57             rxField => 'field_regex',
58             rxOp => 'op_regex',
59             rxOpNoField => 'op_nofield_regex',
60             dialect => 'query_class', # our own compat
61             );
62              
63             =head1 NAME
64              
65             Search::Query::Parser - convert query strings into query objects
66              
67             =head1 SYNOPSIS
68              
69             use Search::Query;
70             my $parser = Search::Query->parser(
71             term_regex => qr/[^\s()]+/,
72             field_regex => qr/\w+/,
73             op_regex => qr/==|<=|>=|!=|=~|!~|[:=<>~#]/,
74              
75             # ops that admit an empty left operand
76             op_nofield_regex => qr/=~|!~|[~:#]/,
77              
78             # case insensitive
79             and_regex => qr/\&|AND|ET|UND|E/i,
80             or_regex => qr/\||OR|OU|ODER|O/i,
81             not_regex => qr/NOT|PAS|NICHT|NON/i,
82              
83             default_field => 'myfield', # or ['myfield', 'myfield2']
84             phrase_delim => q/"/,
85             default_boolop => '+',
86             query_class => 'Search::Query::Dialect::Native',
87             field_class => 'Search::Query::Field',
88             query_class_opts => {
89             default_field => 'foo', # or ['foo', 'bar']
90             },
91            
92             # a generous mode, overlooking boolean-parser syntax errors
93             sloppy => 0,
94             sloppy_term_regex => qr/[\.\w]+/,
95             fixup => 0,
96            
97             # if set, this special term indicates a NULL query
98             null_term => 'NULL',
99             );
100              
101             my $query = $parser->parse('+hello -world now');
102             print $query;
103              
104             =head1 DESCRIPTION
105              
106             Search::Query::Parser is a fork of Search::QueryParser
107             that supports multiple query dialects.
108              
109             The Parser class transforms a query string into a Dialect object structure
110             to be handled by external search engines.
111              
112             The query string can contain simple terms, "exact phrases", field
113             names and comparison operators, '+/-' prefixes, parentheses, and
114             boolean connectors.
115              
116             The parser can be customized using regular expressions for specific
117             notions of "term", "field name" or "operator" -- see the L
118             method.
119              
120             The Dialect object resulting from a parsed query is a tree of terms
121             and operators. Each Dialect can be re-serialized as a string
122             using the stringify() method, or simply by printing the Dialect object,
123             since the string-related Perl operations are overloaded using stringify().
124              
125             =head1 QUERY STRING
126              
127             The query string is decomposed into Clause objects, where
128             each Clause has an optional sign prefix,
129             an optional field name and comparison operator,
130             and a mandatory value.
131              
132             =head2 Sign prefix
133              
134             Prefix '+' means that the item is mandatory.
135             Prefix '-' means that the item must be excluded.
136             No prefix means that the item will be searched
137             for, but is not mandatory.
138              
139             See also section L below, which is another
140             way to combine items into a query.
141              
142             =head2 Field name and comparison operator
143              
144             Internally, each query item has a field name and comparison
145             operator; if not written explicitly in the query, these
146             take default values C<''> (empty field name) and
147             C<':'> (colon operator).
148              
149             Operators have a left operand (the field name) and
150             a right operand (the value to be compared with);
151             for example, C means "search documents containing
152             term 'bar' in field 'foo'", whereas C means
153             "search documents where field 'foo' has exact value 'bar'".
154              
155             Here is the list of admitted operators with their intended meaning:
156              
157             =over
158              
159             =item C<:>
160              
161             treat value as a term to be searched within field.
162             This is the default operator.
163              
164             =item C<~> or C<=~>
165              
166             treat value as a regex; match field against the regex.
167              
168             Note that C<~>
169             after a phrase indicates a proximity assertion:
170              
171             "foo bar"~5
172              
173             means "match 'foo' and 'bar' within 5 positions of each other."
174              
175             =item C
176              
177             negation of above
178              
179             =item C<==> or C<=>, C=>, C=>, C, C>, C>
180              
181             classical relational operators
182              
183             =item C<#>
184              
185             Inclusion in the set of comma-separated integers supplied
186             on the right-hand side.
187              
188             =back
189              
190             Operators C<:>, C<~>, C<=~>, C and C<#> admit an empty
191             left operand (so the field name will be C<''>).
192             Search engines will usually interpret this as
193             "any field" or "the whole data record". But see the B
194             feature.
195              
196             =head2 Value
197              
198             A value (right operand to a comparison operator) can be
199              
200             =over
201              
202             =item *
203              
204             A term (as recognized by regex C, see L method below).
205              
206             =item *
207              
208             A quoted phrase, i.e. a collection of terms within
209             single or double quotes.
210              
211             Quotes can be used not only for "exact phrases", but also
212             to prevent misinterpretation of some values : for example
213             C<-2> would mean "value '2' with prefix '-'",
214             in other words "exclude term '2'", so if you want to search for
215             value -2, you should write C<"-2"> instead.
216              
217             Note that C<~>
218             after a phrase indicates a proximity assertion:
219              
220             "foo bar"~5
221              
222             means "match 'foo' and 'bar' within 5 positions of each other."
223              
224             =item *
225              
226             A subquery within parentheses.
227             Field names and operators distribute over parentheses, so for
228             example C is equivalent to
229             C.
230              
231             Nested field names such as C are not allowed.
232              
233             Sign prefixes do not distribute : C<+(foo bar) +bie> is not
234             equivalent to C<+foo +bar +bie>.
235              
236             =back
237              
238             =head2 Boolean connectors
239              
240             Queries can contain boolean connectors 'AND', 'OR', 'NOT'
241             (or their equivalent in some other languages -- see the *_regex
242             features in new()).
243             This is mere syntactic sugar for the '+' and '-' prefixes :
244             C is equivalent to C<+a +b>;
245             C is equivalent to C<(a b)>;
246             C is equivalent to C<-a>.
247             C<+a OR b> does not make sense,
248             but it is translated into C<(a b)>, under the assumption
249             that the user understands "OR" better than a
250             '+' prefix.
251             C<-a OR b> does not make sense either,
252             but has no meaningful approximation, so it is rejected.
253              
254             Combinations of AND/OR clauses must be surrounded by
255             parentheses, i.e. C<(a AND b) OR c> or C are
256             allowed, but C is not.
257              
258             The C connector is treated like the proximity phrase assertion.
259              
260             foo NEAR5 bar
261              
262             is treated as if it were:
263              
264             "foo bar"~5
265              
266             See the B option.
267              
268             =head1 METHODS
269              
270             =head2 new
271              
272             The following attributes may be initialized in new().
273             These are also available as get/set methods on the returned
274             Parser object.
275              
276             =over
277              
278             =item default_boolop
279              
280             =item term_regex
281              
282             =item field_regex
283              
284             =item op_regex
285              
286             =item op_nofield_regex
287              
288             =item and_regex
289              
290             =item or_regex
291              
292             =item not_regex
293              
294             =item near_regex
295              
296             =item range_regex
297              
298             =item default_field
299              
300             Applied to all terms where no field is defined.
301             The default value is undef (no default).
302              
303             =item default_op
304              
305             The operator used when default_field is applied.
306              
307             =item fields
308              
309             =item phrase_delim
310              
311             =item query_class
312              
313             C is an alias for C.
314              
315             =item field_class
316              
317             =item clause_class
318              
319             =item query_class_opts
320              
321             Will be passed to I new() method each time a query is parse()'d.
322              
323             =item dialect_opts
324              
325             Alias for query_class_opts.
326              
327             =item croak_on_error
328              
329             Default value is false (0). Set to true to automatically throw an exception
330             via Carp::croak() if parse() would return undef.
331              
332             =item term_expander
333              
334             A function reference for transforming query terms after they have been parsed.
335             Examples might include adding alternate spellings, synonyms, or
336             expanding wildcards based on lexicon listings.
337              
338             Example:
339              
340             my $parser = Search::Query->parser(
341             term_expander => sub {
342             my ($term, $field) = @_;
343             return ($term) if ref $term; # skip ranges
344             return ( qw( one two three ), $term );
345             }
346             );
347              
348             my $query = $parser->parse("foo=bar")
349             print "$query\n"; # +foo=(one OR two OR three OR bar)
350              
351             The term_expander reference should expect two arguments: the term value
352             and, if available, the term field name. It should return an array of values.
353              
354             The term_expander reference is called internally during the parse() method,
355             B any field alias expansion or validation is performed.
356              
357             =item sloppy( 0|1 )
358              
359             If the string passed to parse() has any incorrect or unsupported syntax
360             in it, the default behavior is for parsing to stop immediately, error()
361             to be set, and for parse() to return undef.
362              
363             In certain cases (as on a web form) this is undesirable. Set sloppy
364             mode to true to fallback to non-boolean evaluation of the string,
365             which in most cases should still return a Dialect object.
366              
367             Example:
368              
369             $parser->parse('foo -- OR bar'); # if sloppy==0, returns undef
370             $parser->parse('foo -- OR bar'); # if sloppy==1, equivalent to 'foo bar'
371              
372             =item sloppy_term_regex
373              
374             The regex definition used to match a term when sloppy==1.
375              
376             =item fixup( 0|1 )
377              
378             Attempt to fix syntax errors like the lack of a closing parenthesis
379             or a missing double-quote. Different than sloppy() which will not
380             attempt to fix broken syntax, but should probably be used together
381             if you really do not care about strict syntax checking.
382              
383             =item null_term
384              
385             If set to I, the B feature will treat field value
386             of I as if it was undefined. Example:
387              
388             $parser->parse('foo='); # throws fatal error
389             $parser->null_term('NULL');
390             $parser->parse('foo=NULL'); # field foo has NULL value
391              
392             This feature is most useful with the SQL dialect, where you might want to
393             find NULL values. Use it like:
394              
395             my $parser = Search::Query->parser(
396             dialect => 'SQL',
397             null_term => 'NULL'
398             );
399             my $query = $parser->parse('foo!=NULL');
400             print $query; # prints "foo is not NULL"
401              
402              
403             =back
404              
405             =head2 BUILDARGS
406              
407             Internal method for mangling constructor params.
408              
409             =cut
410              
411             sub BUILDARGS {
412 27     27 1 46980 my ( $class, %args ) = @_;
413              
414             # Search::QueryParser compatability
415 27 50       112 if ( exists $args{dialect_opts} ) {
416 0         0 $args{query_class_opts} = delete $args{dialect_opts};
417             }
418 27         104 for my $key ( keys %args ) {
419 83 100       204 if ( exists $SQPCOMPAT{$key} ) {
420 17         69 $args{ $SQPCOMPAT{$key} } = delete $args{$key};
421             }
422             }
423 27         1446 return \%args;
424             }
425              
426             =head2 BUILD
427              
428             Called internally to initialize the object.
429              
430             =cut
431              
432             sub BUILD {
433 27     27 1 167 my $self = shift;
434              
435             # query class can be shortcut
436 27         172 $self->{query_class}
437             = Search::Query->get_query_class( $self->{query_class} );
438              
439             # use field class if query class defines one
440             # and we weren't passed one explicitly
441 27 100       205 if ( $self->{query_class}->field_class ne $self->{field_class} ) {
442 17         67 $self->{field_class} = $self->{query_class}->field_class;
443             }
444              
445 27 100       160 $self->set_fields( $self->{fields} ) if $self->{fields};
446              
447 27         501 return $self;
448             }
449              
450             =head2 error
451              
452             Returns the last error message.
453              
454             =cut
455              
456             =head2 clear_error
457              
458             Sets error message to undef.
459              
460             =cut
461              
462             sub clear_error {
463 2     2 1 155 $_->{error} = undef;
464             }
465              
466             =head2 get_field( I )
467              
468             Returns Field object for I or undef if there isn't one
469             defined.
470              
471             =cut
472              
473             sub get_field {
474 230     230 1 256 my $self = shift;
475 230 50       496 my $name = shift or croak "name required";
476 230 100       641 if ( !exists $self->{fields}->{$name} ) {
477 8         27 return undef;
478             }
479 222         629 return $self->{fields}->{$name};
480             }
481              
482             =head2 set_fields( I )
483              
484             Set the I structure. Called internally by BUILD()
485             if you pass a C key/value pair to new().
486              
487             The structure of I may be one of the following:
488              
489             my $fields = {
490             field1 => 1,
491             field2 => { alias_for => 'field1' },
492             field3 => Search::Query::Field->new( name => 'field3' ),
493             field4 => { alias_for => [qw( field1 field3 )] },
494             };
495              
496             # or
497              
498             my $fields = [
499             'field1',
500             { name => 'field2', alias_for => 'field1' },
501             Search::Query::Field->new( name => 'field3' ),
502             { name => 'field4', alias_for => [qw( field1 field3 )] },
503             ];
504              
505              
506             =cut
507              
508             sub set_fields {
509 22     22 1 44 my $self = shift;
510 22         37 my $origfields = shift;
511 22 50       65 if ( !defined $origfields ) {
512 0         0 croak "fields required";
513             }
514              
515 22         37 my %fields;
516 22         45 my $field_class = $self->{field_class};
517              
518 22         48 my $reftype = ref($origfields);
519 22 50 66     169 if ( !$reftype or ( $reftype ne 'ARRAY' and $reftype ne 'HASH' ) ) {
      33        
520 0         0 croak "fields must be an ARRAY or HASH ref";
521             }
522              
523             # convert simple array to hash
524 22 100       76 if ( $reftype eq 'ARRAY' ) {
    50          
525 18         49 for my $name (@$origfields) {
526 34 50       1156 if ( blessed($name) ) {
    100          
527 0         0 $fields{ $name->name } = $name;
528             }
529             elsif ( ref($name) eq 'HASH' ) {
530 1 50       6 if ( !exists $name->{name} ) {
531 0         0 croak "'name' required in hashref: " . dump($name);
532             }
533 1         30 $fields{ $name->{name} } = $field_class->new(%$name);
534             }
535             else {
536 33         674 $fields{$name} = $field_class->new( name => $name, );
537             }
538             }
539             }
540             elsif ( $reftype eq 'HASH' ) {
541 4         22 for my $name ( keys %$origfields ) {
542 12         23 my $val = $origfields->{$name};
543 12         14 my $obj;
544 12 50       77 if ( blessed($val) ) {
    50          
    0          
545 0         0 $obj = $val;
546             }
547             elsif ( ref($val) eq 'HASH' ) {
548 12 50       37 if ( !exists $val->{name} ) {
549 12         27 $val->{name} = $name;
550             }
551 12         353 $obj = $field_class->new(%$val);
552             }
553             elsif ( !ref $val ) {
554 0         0 $obj = $field_class->new( name => $name );
555             }
556             else {
557 0         0 croak
558             "field value for $name must be a field name, hashref or Field object";
559             }
560 12         1893 $fields{$name} = $obj;
561             }
562             }
563              
564 22         192 $self->{fields} = \%fields;
565 22         52 return $self->{fields};
566             }
567              
568             =head2 set_field( I => I )
569              
570             Sets field I to Field object I.
571              
572             =cut
573              
574             sub set_field {
575 8     8 1 48 my $self = shift;
576 8         17 my ( $name, $field ) = @_;
577 8 50       25 confess "name required" unless $name;
578 8 50       18 confess "field object required" unless $field;
579 8 50       40 confess "field not an object: $field" unless blessed($field);
580 8         33 $self->{fields}->{$name} = $field;
581             }
582              
583             =head2 parse( I )
584              
585             Returns a Search::Query::Dialect object of type
586             I.
587              
588             If there is a syntax error in I,
589             parse() will return C and set error().
590              
591             =cut
592              
593             sub parse {
594 82     82 1 18325 my $self = shift;
595 82         123 my $q = shift;
596 82 50       220 croak "query required" unless defined $q;
597 82   33     513 my $class = shift || $self->query_class;
598              
599             # reset state in case we are called multiple times
600 82         167 $self->{error} = undef;
601 82         148 $self->{_paren_count} = 0;
602              
603 82         383 $q = $class->preprocess($q);
604 82         261 my ($query) = $self->_parse( $q, undef, undef, $class );
605 82 100 66     235 if ( !defined $query && !$self->sloppy ) {
606 2 100       159 croak $self->error if $self->croak_on_error;
607 1         4 return $query;
608             }
609              
610             # if in sloppy mode and we failed to parse,
611             # extract what looks like terms and re-parse.
612 80 100 66     238 if ( !defined $query && $self->sloppy ) {
613 1         3 return $self->_sloppify( $q, $class );
614             }
615              
616 79 100       195 if ( $self->{term_expander} ) {
617 3         7 $self->_call_term_expander($query);
618             }
619              
620 79 100       227 if ( $self->{fields} ) {
621 66         257 $self->_expand($query);
622 66         1198 $self->_validate($query);
623             }
624              
625             # if in sloppy mode and we failed to parse,
626             # extract what looks like terms and re-parse.
627 77 100 66     328 if ( $self->error && $self->sloppy ) {
628 1         3 return $self->_sloppify( $q, $class );
629             }
630              
631 76         152 $query->{parser} = $self;
632              
633             #warn dump $query;
634              
635             # if the query isn't re-parse-able once stringified
636             # then it is broken, somehow.
637 76 100 33     665 if ( defined $query
      66        
638             and !$self->error
639             and $self->croak_on_error )
640             {
641 26         98 my ($reparsed) = $self->_parse( "$query", undef, undef, $class );
642 26 50       389 if ( !defined $reparsed ) {
643 0         0 croak sprintf( "Error: unable to parse '%s'. Reason: '%s'.",
644             $q, $self->error );
645             }
646             }
647              
648             #weaken( $query->{parser} ); # TODO leaks possible?
649              
650 76         470 return $query;
651             }
652              
653             sub _sloppify {
654 2     2   3 my ( $self, $q, $class ) = @_;
655 2         4 my $term = $self->{sloppy_term_regex};
656 2         2 my $and = $self->{and_regex};
657 2         3 my $or = $self->{or_regex};
658 2         4 my $not = $self->{not_regex};
659 2         2 my $near = $self->{near_regex};
660 2         2 my $ops = $self->{op_regex};
661 2         101 my $bools = qr/($and|$or|$not|$near|$ops)/;
662 2         7 my @terms;
663              
664 2         34 while ( $q =~ m/($term)/ig ) {
665 14         15 my $t = $1;
666              
667             #warn "$t =~ $bools\n";
668 14 100       112 if ( $t =~ m/^$bools$/ ) {
669 7         25 next;
670             }
671 7         40 push @terms, split( /$ops/, $t );
672             }
673              
674             #dump \@terms;
675              
676             # reset errors since we will re-parse
677 2         3 $self->{error} = undef;
678 2         6 my ($query) = $self->_parse( join( ' ', @terms ), undef, undef, $class );
679 2 50       5 if ( !defined $query ) {
680 0 0       0 $self->croak_on_error and croak $self->error;
681             }
682             else {
683 2         3 $query->{parser} = $self;
684             }
685 2         12 return $query;
686             }
687              
688             sub _call_term_expander {
689 3     3   3 my ( $self, $query ) = @_;
690 3         4 my $expander = $self->{term_expander};
691 3 50       7 if ( ref($expander) ne 'CODE' ) {
692 0         0 croak "term_expander must be a CODE reference";
693             }
694              
695 3         4 my $query_class = $self->{query_class};
696              
697             $query->walk(
698             sub {
699 3     3   4 my ( $clause, $tree, $code, $prefix ) = @_;
700 3 50       10 if ( $clause->is_tree ) {
701 0         0 $clause->value->walk($code);
702 0         0 return;
703             }
704              
705 3         14 my @newterms = $expander->( $clause->value, $clause->field );
706 3 50 33     30 if ( ref $newterms[0] and ref $clause->value ) {
    100          
707 0         0 $clause->value( $newterms[0] );
708             }
709             elsif ( @newterms > 1 ) {
710              
711             # turn $clause into a tree
712 1         4 my $class = blessed($clause);
713 1         2 my $op = $clause->op;
714 1         4 my $field = $clause->field;
715 1         2 my $proximity = $clause->proximity;
716 1         3 my $quote = $clause->quote;
717              
718             #warn "before tree: " . dump $tree;
719              
720             #warn "code clause: " . dump $clause;
721 1         1 my @subclauses;
722 1         1 for my $term (@newterms) {
723 4         125 push(
724             @subclauses,
725             $class->new(
726             field => $field,
727             op => $op,
728             value => $term,
729             quote => $quote,
730             proximity => $proximity,
731             )
732             );
733             }
734              
735             # OR the fields together. TODO optional?
736              
737             # we must set "" key here explicitly, because
738             # our bool op keys are not methods.
739 1         20 my $subclause
740 1         18 = $query_class->new( %{ $self->query_class_opts },
741             parser => $self );
742 1         16 $subclause->{""} = \@subclauses;
743              
744 1         4 $clause->op('()');
745 1         5 $clause->value($subclause);
746             }
747             else {
748 2         11 $clause->value( $newterms[0] );
749             }
750              
751             }
752 3         25 );
753              
754             }
755              
756             sub _expand {
757 66     66   108 my ( $self, $query ) = @_;
758              
759 66 50       177 return if !exists $self->{fields};
760 66         105 my $fields = $self->{fields};
761 66         125 my $query_class = $self->{query_class};
762 66         121 my $default_field = $self->{default_field};
763              
764             #dump $fields;
765              
766             $query->walk(
767             sub {
768 165     165   252 my ( $clause, $tree, $code, $prefix ) = @_;
769              
770             #warn "code clause: " . dump $clause;
771              
772             #warn "code tree: " . dump $tree;
773              
774 165 100       491 if ( $clause->is_tree ) {
775 46         195 $clause->value->walk($code);
776 46         195 return;
777             }
778 119 100 66     639 if ( ( !defined $clause->field || !length $clause->field )
      66        
779             && !defined $default_field )
780             {
781 13         43 return;
782             }
783              
784             # make sure clause has an op
785 106 100       266 if ( !$clause->op ) {
786 16         67 $clause->op( $self->default_op );
787             }
788              
789             # even if $clause has a field defined,
790             # it may be aliased to multiple others,
791             # so check field def and default_field to determine.
792 106         101 my @field_names;
793              
794             # first, which field name to start with?
795             my @clause_fields; # could be plural
796 106 100       235 if ( !defined $clause->field ) {
797             @clause_fields
798 16 100       54 = ref($default_field)
799             ? @$default_field
800             : ($default_field);
801             }
802             else {
803 90         231 @clause_fields = ( $clause->field );
804             }
805              
806             # second, resolve any aliases
807 106         162 for my $cfield (@clause_fields) {
808              
809             # if we have no definition for $cfield, it's invalid
810 108 100       246 if ( !exists $fields->{$cfield} ) {
811 3         16 return;
812             }
813              
814 105         144 my $field_def = $fields->{$cfield};
815 105 100       277 if ( $field_def->alias_for ) {
816             my @aliases
817 2         10 = ref $field_def->alias_for
818 10 100       36 ? @{ $field_def->alias_for }
819             : ( $field_def->alias_for );
820 10         34 push @field_names, @aliases;
821             }
822             else {
823 95         239 push @field_names, $cfield;
824             }
825             }
826              
827             #warn "resolved field_names: " . dump( \@field_names );
828              
829             # third, apply our canonical names to the $clause
830 103 100       226 if ( @field_names > 1 ) {
831              
832             # turn $clause into a tree
833 4         15 my $class = blessed($clause);
834 4         11 my $op = $clause->op;
835              
836             #warn "before tree: " . dump $tree;
837              
838             #warn "code clause: " . dump $clause;
839 4         7 my @newfields;
840 4         6 for my $name (@field_names) {
841 8         377 push(
842             @newfields,
843             $class->new(
844             field => $name,
845             op => $op,
846             value => $clause->value,
847             quote => $clause->quote,
848             proximity => $clause->proximity,
849             )
850             );
851             }
852              
853             # OR the fields together. TODO optional?
854              
855             # we must bless here because
856             # our bool op keys are not methods.
857 4         110 my $newfield
858 4         113 = $query_class->new( %{ $self->query_class_opts },
859             parser => $self );
860 4         87 $newfield->{""} = \@newfields;
861              
862 4         14 $clause->op('()');
863 4         10 $clause->value($newfield);
864              
865             #warn "after tree: " . dump $tree;
866              
867             }
868             else {
869              
870             # if no field defined in clause, or it differs, override.
871 99 100 100     504 if ( !defined $clause->field
872             or $field_names[0] ne $clause->field )
873             {
874 20         54 $clause->field( $field_names[0] );
875             }
876             }
877              
878 103         388 return $clause;
879             }
880 66         765 );
881             }
882              
883             sub _validate {
884 66     66   95 my ( $self, $query ) = @_;
885              
886 66         118 my $fields = $self->{fields};
887             my $validator = sub {
888 173     173   264 my ( $clause, $tree, $code, $prefix ) = @_;
889 173 100       376 if ( $clause->is_tree ) {
890 50         180 $clause->value->walk($code);
891             }
892             else {
893 123 100 100     706 return unless defined $clause->field and length $clause->field;
894 110         190 my $field_name = $clause->field;
895 110         192 my $field_value = $clause->value;
896 110         151 my $field = $fields->{$field_name};
897 110 100       208 if ( !$field ) {
898 3 100       13 if ( $self->croak_on_error ) {
899 2         448 croak "No such field: $field_name";
900             }
901             else {
902 1         3 $self->{error} = "No such field: $field_name";
903 1         3 return;
904             }
905             }
906 107 50       344 if ( !$field->validate($field_value) ) {
907 0 0       0 if ( $self->croak_on_error ) {
908 0         0 my $err = $field->error;
909 0         0 croak
910             "Invalid field value for $field_name: $field_value ($err)";
911             }
912             }
913             }
914 66         346 };
915 66         212 $query->walk($validator);
916             }
917              
918             sub _parse {
919 173     173   236 my $self = shift;
920 173         215 my $str = shift;
921 173         223 my $parent_field = shift; # only for recursive calls
922 173         193 my $parent_op = shift; # only for recursive calls
923 173         174 my $query_class = shift;
924              
925             #warn "_parse: " . dump [ $str, $parent_field, $parent_op, $query_class ];
926              
927             #dump $self;
928              
929 173         369 my $q = {};
930 173         278 my $pre_bool = '';
931 173         177 my $err = undef;
932 173         186 my $s_orig = $str;
933 173         258 my $phrase_delim = $self->{phrase_delim};
934 173         238 my $field_regex = $self->{field_regex};
935 173         234 my $and_regex = $self->{and_regex};
936 173         201 my $or_regex = $self->{or_regex};
937 173         208 my $not_regex = $self->{not_regex};
938 173         206 my $op_regex = $self->{op_regex};
939 173         205 my $op_nofield_regex = $self->{op_nofield_regex};
940 173         200 my $term_regex = $self->{term_regex};
941 173         643 my $phrase_regex = qr/[^"()]+/;
942 173         247 my $near_regex = $self->{near_regex};
943 173         228 my $range_regex = $self->{range_regex};
944 173         233 my $clause_class = $self->{clause_class};
945 173         217 my $fixup = $self->{fixup};
946 173         195 my $null_term = $self->{null_term};
947              
948 173         496 $str =~ s/^\s+//; # remove leading spaces
949              
950             LOOP:
951 173         449 while ( length $str ) { # while query string is not empty
952 350         543 for ($str) { # temporary alias to $_ for easier regex application
953              
954             #warn "LOOP start: " . dump [ $str, $parent_field, $parent_op ];
955              
956 350         515 my $sign = $self->{default_boolop};
957 350         349 my $field = $parent_field;
958 350   100     1136 my $op = $parent_op || "";
959              
960             #warn "LOOP after start: " . dump [ $sign, $field, $op ];
961              
962 350 100       955 if (m/^\)/) {
963 64         111 $self->{_paren_count}--;
964              
965             #warn "leaving loop on ) [paren_count==$self->{_paren_count}]";
966 64 100       146 if ( $self->{_paren_count} < 0 ) {
967 4 100       7 if ( !$fixup ) {
968              
969             #warn "unbalanced parens -- extra right-hand )";
970 1         1 $err = "unbalanced parentheses -- extra right-hand )";
971 1         2 last LOOP;
972             }
973             else {
974 3         7 s/^[\)\s]+//; # trim all trailing ) and space
975 3         6 next LOOP;
976             }
977             }
978             else {
979 60         149 last LOOP; # return from recursive call if meeting a ')'
980             }
981             }
982              
983             # try to parse sign prefix ('+', '-' or '!|NOT')
984 286 100       2580 if (s/^(\+|-)\s*//) { $sign = $1; }
  11 100       25  
    100          
985 5         10 elsif (s/^($not_regex)\b\s*//) { $sign = '-'; }
986              
987             # special check because of \b above
988 3         6 elsif (s/^\!\s*([^:=~])/$1/) { $sign = '-'; }
989              
990             # try to parse field name and operator
991 286 100 66     7895 if (s/^"($field_regex)"\s*($op_regex)\s*// # "field name" and op
      100        
992             or
993             s/^'?($field_regex)'?\s*($op_regex)\s*// # 'field name' and op
994             or s/^()($op_nofield_regex)\s*// # no field, just op
995             )
996             {
997 132         403 ( $field, $op ) = ( $1, $2 );
998              
999             #warn "matched field+op = " . dump [ $field, $op ];
1000 132 50       272 if ($parent_field) {
1001 0         0 $err = "field '$field' inside '$parent_field' (op=$op)";
1002 0         0 last LOOP;
1003             }
1004             }
1005              
1006             # parse a value (single term or quoted list or parens)
1007 286         429 my $clause = undef;
1008              
1009 286 100 100     4769 if ( s/^(")([^"]*?)"~(\d+)\s*//
    100 100        
    100 100        
    100          
    50          
1010             or s/^(")([^"]*?)"\s*//
1011             or s/^(')([^']*?)'\s*// )
1012             { # parse a quoted string.
1013 31         96 my ( $quote, $val, $proximity ) = ( $1, $2, $3 );
1014 31   66     994 $clause = $clause_class->new(
1015             field => $field,
1016             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1017             value => $val,
1018             quote => $quote,
1019             proximity => $proximity
1020             );
1021             }
1022              
1023             # fixup mode allows for a partially quoted string.
1024             elsif ( $fixup and s/^(")([^"]*?)\s*$// ) {
1025 1         3 my ( $quote, $val, $proximity ) = ( $1, $2, $3 );
1026 1   33     28 $clause = $clause_class->new(
1027             field => $field,
1028             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1029             value => $val,
1030             quote => $quote,
1031             proximity => $proximity
1032             );
1033             }
1034              
1035             # special case for range grouped with () since we do not
1036             # want the op of record to be the ().
1037             elsif (
1038             s/^\(\s*"?($phrase_regex)"?$range_regex"?($phrase_regex)"?\s*\)\s*//
1039             )
1040             {
1041 7         17 my ( $t1, $t2 ) = ( $1, $2 );
1042              
1043             # trim any spaces since phrase_regex includes it
1044 7         20 $t1 =~ s/^\ +|\ +$//g;
1045 7         18 $t2 =~ s/^\ +|\ +$//g;
1046              
1047 7 100       31 my $this_op = $op =~ m/\!/ ? '!..' : '..';
1048 7         11 my $has_spaces = 0;
1049 7 100 66     44 if ( index( $t1, ' ' ) != -1 or index( $t2, ' ' ) != -1 ) {
1050 1         2 $has_spaces = 1;
1051             }
1052 7 100       196 $clause = $clause_class->new(
1053             field => $field,
1054             op => $this_op,
1055             value => [ $t1, $t2 ],
1056             quote => ( $has_spaces ? '"' : undef ),
1057             );
1058             }
1059             elsif (s/^\(\s*//) { # parse parentheses
1060 63         119 $self->{_paren_count}++;
1061 63         331 my ( $r, $s2 )
1062             = $self->_parse( $str, $field, $op, $query_class );
1063 63 50       270 if ( !$r ) {
1064 0         0 $err = $self->error;
1065 0         0 last LOOP;
1066             }
1067 63         114 $str = $s2;
1068 63 100 33     482 if ( !defined($str) or !( $str =~ s/^\)\s*// ) ) {
1069 4 100 66     15 if ( defined($str) and $fixup ) {
1070 2         4 $str = ') ' . $str;
1071             }
1072             else {
1073 2         3 $err = "no matching ) ";
1074 2         8 last LOOP;
1075             }
1076             }
1077              
1078 61         1678 $clause = $clause_class->new(
1079             field => '',
1080             op => '()',
1081             value => bless( $r, $query_class ), # re-bless
1082             );
1083              
1084             }
1085             elsif (s/^($term_regex)\s*//) { # parse a single term
1086 184         360 my $term = $1;
1087 184 50 66     1204 if ( $term =~ m/^($term_regex)$range_regex($term_regex)$/ ) {
    100          
1088 0         0 my $t1 = $1;
1089 0         0 my $t2 = $2;
1090              
1091             #warn "found range ($op $parent_op): $term => $t1 .. $t2";
1092 0 0       0 my $this_op = $op =~ m/\!/ ? '!..' : '..';
1093 0         0 $clause = $clause_class->new(
1094             field => $field,
1095             op => $this_op,
1096             value => [ $t1, $t2 ],
1097             );
1098             }
1099             elsif ( $null_term and $term eq $null_term ) {
1100 5   33     161 $clause = $clause_class->new(
1101             field => $field,
1102             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1103             value => undef, # mimic NULL
1104             );
1105              
1106             }
1107             else {
1108              
1109 179   66     5239 $clause = $clause_class->new(
1110             field => $field,
1111             op => ( $op || $parent_op || ( $field ? ":" : "" ) ),
1112             value => $term,
1113             );
1114              
1115             }
1116             }
1117              
1118 284 100       15335 if (s/^($near_regex)\s+//) {
1119              
1120             # modify the existing clause
1121             # and treat what comes next like a phrase
1122             # matching the syntax "foo bar"~\d+
1123 2         5 my ($prox_match) = ($1);
1124 2         4 my ($proximity) = $prox_match;
1125 2         7 $proximity =~ s/\D+//; # leave only number
1126 2 50       69 if (s/^($term_regex)\s*//) {
1127 2         4 my $term = $1;
1128 2         7 $clause->{value} .= ' ' . $term;
1129 2         4 $clause->{proximity} = $proximity;
1130 2         4 $clause->{quote} = '"';
1131             }
1132             else {
1133 0         0 $err = "missing term after $prox_match";
1134 0         0 last LOOP;
1135             }
1136              
1137             }
1138              
1139             # deal with boolean connectors
1140 284         384 my $post_bool = '';
1141 284 100       2175 if (s/^($and_regex)\s+//) {
    100          
1142 26         43 $post_bool = 'AND';
1143             }
1144             elsif (s/^($or_regex)\s+//) {
1145 42         73 $post_bool = 'OR';
1146             }
1147              
1148 284 50 100     947 if ( $pre_bool
      66        
1149             and $post_bool
1150             and $pre_bool ne $post_bool )
1151             {
1152 0         0 $err = "cannot mix AND/OR in requests; use parentheses";
1153 0         0 last LOOP;
1154             }
1155              
1156 284   100     916 my $bool = $pre_bool || $post_bool;
1157 284         307 $pre_bool = $post_bool; # for next loop
1158              
1159             # insert clause in query structure
1160 284 50       1097 if ($clause) {
1161 284 100 100     1118 $sign = '' if $sign eq '+' and $bool eq 'OR';
1162 284 100 100     792 $sign = '+' if $sign eq '' and $bool eq 'AND';
1163 284 50 66     631 if ( $sign eq '-' and $bool eq 'OR' ) {
1164 0         0 $err = 'operands of "OR" cannot have "-" or "NOT" prefix';
1165 0         0 last LOOP;
1166             }
1167 284         254 push @{ $q->{$sign} }, $clause;
  284         2079  
1168             }
1169             else {
1170 0 0       0 if ($_) {
1171 0         0 $err = "unexpected string in query: '$_'";
1172 0         0 last LOOP;
1173             }
1174 0 0       0 if ($field) {
1175 0         0 $err = "missing value after $field $op";
1176 0         0 last LOOP;
1177             }
1178             }
1179             }
1180             }
1181              
1182             # handle error
1183 173 100       339 if ($err) {
1184 3         9 $self->{error} = "[$s_orig] : $err";
1185 3         4 $q = undef;
1186             }
1187              
1188             #dump $q;
1189              
1190 173 100       453 if ( !defined $q ) {
1191 3         10 return ( $q, $str );
1192             }
1193 170         4791 my $query
1194 170         184 = $query_class->new( %{ $self->query_class_opts }, parser => $self );
1195 170         12344 $query->{$_} = $q->{$_} for keys %$q;
1196 170         869 return ( $query, $str );
1197             }
1198              
1199             1;
1200              
1201             __END__