File Coverage

blib/lib/Search/QueryParser.pm
Criterion Covered Total %
statement 85 87 97.7
branch 43 54 79.6
condition 43 48 89.5
subroutine 10 10 100.0
pod 4 5 80.0
total 185 204 90.6


line stmt bran cond sub pod time code
1             package Search::QueryParser;
2              
3 1     1   71299 use strict;
  1         3  
  1         30  
4 1     1   5 use warnings;
  1         2  
  1         25  
5 1     1   526 use locale;
  1         607  
  1         5  
6              
7             our $VERSION = "0.95";
8              
9             =head1 NAME
10              
11             Search::QueryParser - parses a query string into a data structure
12             suitable for external search engines
13              
14             =head1 SYNOPSIS
15              
16             my $qp = new Search::QueryParser;
17             my $s = '+mandatoryWord -excludedWord +field:word "exact phrase"';
18             my $query = $qp->parse($s) or die "Error in query : " . $qp->err;
19             $someIndexer->search($query);
20              
21             # query with comparison operators and implicit plus (second arg is true)
22             $query = $qp->parse("txt~'^foo.*' date>='01.01.2001' date<='02.02.2002'", 1);
23              
24             # boolean operators (example below is equivalent to "+a +(b c) -d")
25             $query = $qp->parse("a AND (b OR c) AND NOT d");
26              
27             # subset of rows
28             $query = $qp->parse("Id#123,444,555,666 AND (b OR c)");
29              
30              
31             =head1 DESCRIPTION
32              
33             This module parses a query string into a data structure to be handled
34             by external search engines. For examples of such engines, see
35             L and L.
36              
37             The query string can contain simple terms, "exact phrases", field
38             names and comparison operators, '+/-' prefixes, parentheses, and
39             boolean connectors.
40              
41             The parser can be parameterized by regular expressions for specific
42             notions of "term", "field name" or "operator" ; see the L
43             method. The parser has no support for lemmatization or other term
44             transformations : these should be done externally, before passing the
45             query data structure to the search engine.
46              
47             The data structure resulting from a parsed query is a tree of terms
48             and operators, as described below in the L method. The
49             interpretation of the structure is up to the external search engine
50             that will receive the parsed query ; the present module does not make
51             any assumption about what it means to be "equal" or to "contain" a
52             term.
53              
54              
55             =head1 QUERY STRING
56              
57             The query string is decomposed into "items", where
58             each item has an optional sign prefix,
59             an optional field name and comparison operator,
60             and a mandatory value.
61              
62             =head2 Sign prefix
63              
64             Prefix '+' means that the item is mandatory.
65             Prefix '-' means that the item must be excluded.
66             No prefix means that the item will be searched
67             for, but is not mandatory.
68              
69             As far as the result set is concerned,
70             C<+a +b c> is strictly equivalent to C<+a +b> : the search engine will
71             return documents containing both terms 'a' and 'b', and possibly
72             also term 'c'. However, if the search engine also returns
73             relevance scores, query C<+a +b c> might give a better score
74             to documents containing also term 'c'.
75              
76             See also section L below, which is another
77             way to combine items into a query.
78              
79             =head2 Field name and comparison operator
80              
81             Internally, each query item has a field name and comparison
82             operator; if not written explicitly in the query, these
83             take default values C<''> (empty field name) and
84             C<':'> (colon operator).
85              
86             Operators have a left operand (the field name) and
87             a right operand (the value to be compared with);
88             for example, C means "search documents containing
89             term 'bar' in field 'foo'", whereas C means
90             "search documents where field 'foo' has exact value 'bar'".
91              
92             Here is the list of admitted operators with their intended meaning :
93              
94             =over
95              
96             =item C<:>
97              
98             treat value as a term to be searched within field.
99             This is the default operator.
100              
101             =item C<~> or C<=~>
102              
103             treat value as a regex; match field against the regex.
104              
105             =item C
106              
107             negation of above
108              
109             =item C<==> or C<=>, C=>, C=>, C, C>, C>
110              
111             classical relational operators
112              
113             =item C<#>
114              
115             Inclusion in the set of comma-separated integers supplied
116             on the right-hand side.
117              
118              
119             =back
120              
121             Operators C<:>, C<~>, C<=~>, C and C<#> admit an empty
122             left operand (so the field name will be C<''>).
123             Search engines will usually interpret this as
124             "any field" or "the whole data record".
125              
126             =head2 Value
127              
128             A value (right operand to a comparison operator) can be
129              
130             =over
131              
132             =item *
133              
134             just a term (as recognized by regex C, see L method below)
135              
136             =item *
137              
138             A quoted phrase, i.e. a collection of terms within
139             single or double quotes.
140              
141             Quotes can be used not only for "exact phrases", but also
142             to prevent misinterpretation of some values : for example
143             C<-2> would mean "value '2' with prefix '-'",
144             in other words "exclude term '2'", so if you want to search for
145             value -2, you should write C<"-2"> instead. In the
146             last example of the synopsis, quotes were used to
147             prevent splitting of dates into several search terms.
148              
149             =item *
150              
151             a subquery within parentheses.
152             Field names and operators distribute over parentheses, so for
153             example C is equivalent to
154             C.
155             Nested field names such as C are not allowed.
156             Sign prefixes do not distribute : C<+(foo bar) +bie> is not
157             equivalent to C<+foo +bar +bie>.
158              
159              
160             =back
161              
162              
163             =head2 Boolean connectors
164              
165             Queries can contain boolean connectors 'AND', 'OR', 'NOT'
166             (or their equivalent in some other languages).
167             This is mere syntactic sugar for the '+' and '-' prefixes :
168             C is translated into C<+a +b>;
169             C is translated into C<(a b)>;
170             C is translated into C<-a>.
171             C<+a OR b> does not make sense,
172             but it is translated into C<(a b)>, under the assumption
173             that the user understands "OR" better than a
174             '+' prefix.
175             C<-a OR b> does not make sense either,
176             but has no meaningful approximation, so it is rejected.
177              
178             Combinations of AND/OR clauses must be surrounded by
179             parentheses, i.e. C<(a AND b) OR c> or C are
180             allowed, but C is not.
181              
182              
183             =head1 METHODS
184              
185             =over
186              
187             =cut
188              
189 1         1376 use constant DEFAULT => {
190             rxTerm => qr/[^\s()]+/,
191             rxField => qr/\w+/,
192              
193             rxOp => qr/==|<=|>=|!=|=~|!~|[:=<>~#]/, # longest ops first !
194             rxOpNoField => qr/=~|!~|[~:#]/, # ops that admit an empty left operand
195              
196             rxAnd => qr/AND|ET|UND|E/,
197             rxOr => qr/OR|OU|ODER|O/,
198             rxNot => qr/NOT|PAS|NICHT|NON/,
199              
200             defField => "",
201 1     1   255 };
  1         3  
202              
203             =item new
204              
205             new(rxTerm => qr/.../, rxOp => qr/.../, ...)
206              
207             Creates a new query parser, initialized with (optional) regular
208             expressions :
209              
210             =over
211              
212             =item rxTerm
213              
214             Regular expression for matching a term.
215             Of course it should not match the empty string.
216             Default value is C.
217             A term should not be allowed to include parenthesis, otherwise the parser
218             might get into trouble.
219              
220             =item rxField
221              
222             Regular expression for matching a field name.
223             Default value is C (meaning of C<\w> according to C).
224              
225             =item rxOp
226              
227             Regular expression for matching an operator.
228             Default value is C=|E=|!=|=~|!~|:|=|E|E|~/>.
229             Note that the longest operators come first in the regex, because
230             "alternatives are tried from left to right"
231             (see L) :
232             this is to avoid C=3> being parsed as
233             C '=3'>.
234              
235             =item rxOpNoField
236              
237             Regular expression for a subset of the operators
238             which admit an empty left operand (no field name).
239             Default value is C.
240             Such operators can be meaningful for comparisons
241             with "any field" or with "the whole record" ;
242             the precise interpretation depends on the search engine.
243              
244             =item rxAnd
245              
246             Regular expression for boolean connector AND.
247             Default value is C.
248              
249             =item rxOr
250              
251             Regular expression for boolean connector OR.
252             Default value is C.
253              
254             =item rxNot
255              
256             Regular expression for boolean connector NOT.
257             Default value is C.
258              
259             =item defField
260              
261             If no field is specified in the query, use I.
262             The default is the empty string "".
263              
264             =back
265              
266             =cut
267              
268             sub new {
269 2     2 1 86 my $class = shift;
270 2 50       10 my $args = ref $_[0] eq 'HASH' ? $_[0] : {@_};
271              
272             # create object with default values
273 2         5 my $self = bless {}, $class;
274             $self->{$_} = $args->{$_} || DEFAULT->{$_}
275 2   100     45 foreach qw(rxTerm rxField rxOp rxOpNoField rxAnd rxOr rxNot defField);
276 2         13 return $self;
277             }
278              
279             =item parse
280              
281             $q = $queryParser->parse($queryString, $implicitPlus);
282              
283             Returns a data structure corresponding to the parsed string.
284             The second argument is optional; if true, it adds an implicit
285             '+' in front of each term without prefix, so
286             C is equivalent to C.
287             This is often seen in common WWW search engines
288             as an option "match all words".
289              
290             The return value has following structure :
291              
292             { '+' => [{field=>'f1', op=>':', value=>'v1', quote=>'q1'},
293             {field=>'f2', op=>':', value=>'v2', quote=>'q2'}, ...],
294             '' => [...],
295             '-' => [...]
296             }
297              
298             In other words, it is a hash ref with 3 keys C<'+'>, C<''> and C<'-'>,
299             corresponding to the 3 sign prefixes (mandatory, ordinary or excluded
300             items). Each key holds either a ref to an array of items, or
301             C (no items with this prefix in the query).
302              
303             An I is a hash ref containing
304              
305             =over
306              
307             =item C
308              
309             scalar, field name (may be the empty string)
310              
311             =item C
312              
313             scalar, operator
314              
315             =item C
316              
317             scalar, character that was used for quoting the value ('"', "'" or undef)
318              
319             =item C
320              
321             Either
322              
323             =over
324              
325             =item *
326              
327             a scalar (simple term), or
328              
329             =item *
330              
331             a recursive ref to another query structure. In that case,
332             C is necessarily C<'()'> ; this corresponds
333             to a subquery in parentheses.
334              
335             =back
336              
337             =back
338              
339             In case of a parsing error, C returns C;
340             method L can be called to get an explanatory message.
341              
342             =cut
343              
344              
345              
346             sub parse {
347 12     12 1 563 my $self = shift;
348 12         19 my $s_orig = $_[0];
349 12         29 my ($parsedQuery, $restOfString) = $self->_parse(@_);
350 12 100       25 if ($restOfString) {
351 2   66     10 $self->{err} ||= "[$s_orig] : parsed into " . $self->unparse($parsedQuery)
352             . ", but unable to parse [$restOfString]";
353 2         12 return undef;
354             }
355              
356 10         43 return $parsedQuery;
357             }
358              
359              
360             sub _parse{ # returns ($parsedQuery, $restOfString)
361 17     17   37 my ($self, $s, $implicitPlus, $parentField, $parentOp) = @_; # last 2 args only for recursive calls
362              
363 17         30 my $q = {};
364 17         24 my $preBool = '';
365 17         24 my $err = undef;
366 17         26 my $s_orig = $s;
367              
368 17         55 $s =~ s/^\s+//; # remove leading spaces
369              
370             LOOP :
371 17         37 while ($s) { # while query string is not empty
372 42         81 for ($s) { # temporary alias to $_ for easier regex application
373 42 100       78 my $sign = $implicitPlus ? "+" : "";
374 42         58 my $explicit_field;
375 42   100     107 my $op = $parentOp || ":";
376              
377 42 100       98 last LOOP if m/^\)/; # return from recursive call if meeting a ')'
378              
379             # try to parse sign prefix ('+', '-' or 'NOT')
380 37 100       220 if (s/^(\+|-)\s*//) { $sign = $1; }
  7 100       15  
381 1         3 elsif (s/^($self->{rxNot})\b\s*//) { $sign = '-'; }
382              
383             # try to parse field name and operator
384 37 100 100     584 if (s/^"($self->{rxField})"\s*($self->{rxOp})\s*// # "field name" and op
      100        
      100        
385             or
386             s/^'($self->{rxField})'\s*($self->{rxOp})\s*// # 'field name' and op
387             or
388             s/^($self->{rxField})\s*($self->{rxOp})\s*// # field name and op
389             or
390             s/^()($self->{rxOpNoField})\s*//) { # no field, just op
391 15         46 ($explicit_field, $op) = ($1, $2);
392 15 100       34 $err = "field '$explicit_field' inside '$parentField'", last LOOP if $parentField;
393             }
394              
395             # target field, either explicit or implicit
396 36   100     158 my $field = $explicit_field || $parentField || $self->{defField};
397              
398              
399             # parse a value (single term or quoted list or parens)
400 36         51 my $subQ = undef;
401              
402 36 100 100     277 if (s/^(")([^"]*?)"\s*// or
    100          
    50          
403             s/^(')([^']*?)'\s*//) { # parse a quoted string.
404 6         19 my ($quote, $val) = ($1, $2);
405 6         23 $subQ = {field=>$field, op=>$op, value=>$val, quote=>$quote};
406             }
407             elsif (s/^\(\s*//) { # parse parentheses
408 5         20 my ($r, $s2) = $self->_parse($s, $implicitPlus, $explicit_field, $op);
409 5 100       13 $err = $self->err, last LOOP if not $r;
410 4         55 $s = $s2;
411 4 50       22 $s =~ s/^\)\s*// or $err = "no matching ) ", last LOOP;
412 4         14 $subQ = {field=>'', op=>'()', value=>$r};
413             }
414             elsif (s/^($self->{rxTerm})\s*//) { # parse a single term
415 25         107 $subQ = {field=>$field, op=>$op, value=>$1};
416             }
417              
418             # deal with boolean connectors
419 35         66 my $postBool = '';
420 35 100       243 if (s/^($self->{rxAnd})\b\s*//) { $postBool = 'AND' }
  3 100       5  
421 2         4 elsif (s/^($self->{rxOr})\b\s*//) { $postBool = 'OR' }
422 35 50 100     91 $err = "cannot mix AND/OR in requests; use parentheses", last LOOP
      66        
423             if $preBool and $postBool and $preBool ne $postBool;
424 35   100     96 my $bool = $preBool || $postBool;
425 35         48 $preBool = $postBool; # for next loop
426              
427             # insert subquery in query structure
428 35 50       80 if ($subQ) {
429 35 50 66     84 $sign = '' if $sign eq '+' and $bool eq 'OR';
430 35 100 100     90 $sign = '+' if $sign eq '' and $bool eq 'AND';
431 35 50 66     70 $err = 'operands of "OR" cannot have "-" or "NOT" prefix', last LOOP
432             if $sign eq '-' and $bool eq 'OR';
433 35         45 push @{$q->{$sign}}, $subQ;
  35         167  
434             }
435             else {
436 0 0       0 $err = "unexpected string in query : $_", last LOOP if $_;
437 0 0       0 $err = "missing value after $field $op" , last LOOP if $field;
438             }
439             }
440             }
441              
442 17 100 50     70 $err ||= "no positive value in query" unless $q->{'+'} or $q->{''};
      100        
443 17 100       38 $self->{err} = $err ? "[$s_orig] : $err" : "";
444 17 100       34 $q = undef if $err;
445 17         47 return ($q, $s);
446             }
447              
448              
449             =item err
450              
451             $msg = $queryParser->err;
452              
453             Message describing the last parse error
454              
455             =cut
456              
457             sub err {
458 3     3 1 596 my $self = shift;
459 3         18 return $self->{err};
460             }
461              
462              
463             =item unparse
464              
465             $s = $queryParser->unparse($query);
466              
467             Returns a string representation of the C<$query> data structure.
468              
469             =cut
470              
471             sub unparse {
472 15     15 1 431 my $self = shift;
473 15         21 my $q = shift;
474              
475 15         22 my @subQ;
476 15         25 foreach my $prefix ('+', '', '-') {
477 45 100       89 next if not $q->{$prefix};
478 21         29 push @subQ, $prefix . $self->unparse_subQ($_) foreach @{$q->{$prefix}};
  21         57  
479             }
480 15         79 return join " ", @subQ;
481             }
482              
483             sub unparse_subQ {
484 35     35 0 54 my $self = shift;
485 35         53 my $subQ = shift;
486              
487 35 100       75 return "(" . $self->unparse($subQ->{value}) . ")" if $subQ->{op} eq '()';
488 31   100     70 my $quote = $subQ->{quote} || "";
489 31         114 return "$subQ->{field}$subQ->{op}$quote$subQ->{value}$quote";
490             }
491              
492             =back
493              
494             =head1 AUTHOR
495              
496             Laurent Dami, Elaurent.dami AT etat ge chE
497              
498             =head1 COPYRIGHT AND LICENSE
499              
500             Copyright (C) 2005, 2007 by Laurent Dami.
501              
502             This library is free software; you can redistribute it and/or modify
503             it under the same terms as Perl itself.
504              
505             =cut
506              
507             1;
508