File Coverage

blib/lib/CQL/Parser.pm
Criterion Covered Total %
statement 176 186 94.6
branch 40 52 76.9
condition 24 36 66.6
subroutine 29 29 100.0
pod 3 17 17.6
total 272 320 85.0


line stmt bran cond sub pod time code
1             package CQL::Parser;
2              
3 7     7   46822 use strict;
  7         19  
  7         296  
4 7     7   38 use warnings;
  7         13  
  7         218  
5 7     7   4888 use CQL::Lexer;
  7         19  
  7         207  
6 7     7   6066 use CQL::Relation;
  7         26  
  7         666  
7 7     7   50 use CQL::Token;
  7         18  
  7         1503  
8 7     7   5340 use CQL::TermNode;
  7         19  
  7         390  
9 7     7   4629 use CQL::AndNode;
  7         20  
  7         362  
10 7     7   4015 use CQL::OrNode;
  7         18  
  7         300  
11 7     7   3972 use CQL::NotNode;
  7         17  
  7         288  
12 7     7   13912 use CQL::PrefixNode;
  7         20  
  7         1061  
13 7     7   14890 use CQL::ProxNode;
  7         18  
  7         312  
14 7     7   43 use Carp qw( croak );
  7         14  
  7         22723  
15              
16             our $VERSION = '1.13';
17              
18             my $lexer;
19             my $token;
20              
21             =head1 NAME
22              
23             CQL::Parser - compiles CQL strings into parse trees of Node subtypes.
24              
25             =head1 SYNOPSIS
26              
27             use CQL::Parser;
28             my $parser = CQL::Parser->new();
29             my $root = $parser->parse( $cql );
30              
31             =head1 DESCRIPTION
32              
33             CQL::Parser provides a mechanism to parse Common Query Language (CQL)
34             statements. The best description of CQL comes from the CQL homepage
35             at the Library of Congress L
36              
37             CQL is a formal language for representing queries to information
38             retrieval systems such as web indexes, bibliographic catalogs and museum
39             collection information. The CQL design objective is that queries be
40             human readable and human writable, and that the language be intuitive
41             while maintaining the expressiveness of more complex languages.
42              
43             A CQL statement can be as simple as a single keyword, or as complicated as a set
44             of compoenents indicating search indexes, relations, relational modifiers,
45             proximity clauses and boolean logic. CQL::Parser will parse CQL statements
46             and return the root node for a tree of nodes which describes the CQL statement.
47             This data structure can then be used by a client application to analyze the
48             statement, and possibly turn it into a query for a local repository.
49              
50             Each CQL component in the tree inherits from L and can be one
51             of the following: L, L, L,
52             L, L, L. See the
53             documentation for those modules for their respective APIs.
54              
55             Here are some examples of CQL statements:
56              
57             =over 4
58              
59             =item * george
60              
61             =item * dc.creator=george
62              
63             =item * dc.creator="George Clinton"
64              
65             =item * clinton and funk
66              
67             =item * clinton and parliament and funk
68              
69             =item * (clinton or bootsy) and funk
70              
71             =item * dc.creator="clinton" and dc.date="1976"
72              
73             =back
74              
75             =head1 METHODS
76              
77             =head2 new()
78              
79             =cut
80              
81             ## for convenience the lexer is located at the package level
82             ## just need to be sure to reinitialize it in very call to parse()
83              
84             sub new {
85 7     7 1 6121 my ( $class, $debug ) = @_;
86 7 50       267 $CQL::DEBUG = $debug ? 1 : 0;
87 7   33     101 return bless { }, ref($class) || $class;
88             }
89              
90             =head2 parse( $query )
91              
92             Pass in a CQL query and you'll get back the root node for the CQL parse tree.
93             If the CQL is invalid an exception will be thrown.
94              
95             =cut
96              
97             sub parse {
98 64     64 1 2322 my ($self,$query) = @_;
99              
100             ## initialize lexer
101 64         345 $lexer = CQL::Lexer->new();
102              
103 64         889 debug( "about to parse query: $query" );
104            
105             ## create the lexer and get the first token
106 64         206 $lexer->tokenize( $query );
107 64         208 $token = $lexer->nextToken();
108              
109 64         342 my $root = parseQuery( 'srw.ServerChoice', CQL::Relation->new( 'scr' ) );
110 55 50       204 if ( $token->getType() != CQL_EOF ) {
111 0         0 croak( "junk after end ".$token->getString() );
112             }
113            
114 55         233 return $root;
115             }
116              
117             =head2 parseSafe( $query )
118              
119             Pass in a CQL query and you'll get back the root node for the CQL parse tree.
120             If the CQL is invalid, an error code from the SRU Diagnostics List
121             will be returned.
122              
123             =cut
124              
125             my @cql_errors = (
126             { regex => qr/does not support relational modifiers/, code => 20 },
127             { regex => qr/expected boolean got /, code => 37 },
128             { regex => qr/expected relation modifier got /, code => 20 },
129             { regex => qr/unknown first-class relation modifier: /, code => 20 },
130             { regex => qr/missing term/, code => 27 },
131             { regex => qr/expected proximity relation got /, code => 40 },
132             { regex => qr/expected proximity distance got /, code => 41 },
133             { regex => qr/expected proximity unit got/, code => 42 },
134             { regex => qr/expected proximity ordering got /, code => 43 },
135             { regex => qr/unknown first class relation: /, code => 19 },
136             { regex => qr/must supply name/, code => 15 },
137             { regex => qr/must supply identifier/, code => 15 },
138             { regex => qr/must supply subtree/, code => 15 },
139             { regex => qr/must supply term parameter/, code => 27 },
140             { regex => qr/doesn\'t support relations other than/, code => 20 },
141             );
142              
143             sub parseSafe {
144 2     2 1 6300 my ($self,$query) = @_;
145              
146 2         5 my $root = eval { $self->parse( $query ); };
  2         9  
147              
148 2 50       1147 if ( my $error = $@ ) {
149 2         4 my $code = 10;
150 2         6 for( @cql_errors ) {
151 30 100       293 $code = $_->{ code } if $error =~ $_->{ regex };
152             }
153 2         20 return $code;
154             }
155              
156 0         0 return $root;
157             }
158              
159             sub parseQuery {
160 70     70 0 119 my ( $qualifier, $relation ) = @_;
161 70         188 debug( "in parseQuery() with term=" . $token->getString() );
162 70         182 my $term = parseTerm( $qualifier, $relation );
163              
164 67         182 my $type = $token->getType();
165 67   100     258 while ( $type != CQL_EOF and $type != CQL_RPAREN ) {
166 34 100       128 if ( $type == CQL_AND ) {
    100          
    100          
    50          
167 12         30 match($token);
168 12         30 my $term2 = parseTerm( $qualifier, $relation );
169 10         88 $term = CQL::AndNode->new( left=>$term, right=>$term2 );
170             }
171             elsif ( $type == CQL_OR ) {
172 9         19 match($token);
173 9         90 my $term2 = parseTerm( $qualifier, $relation );
174 9         105 $term = CQL::OrNode->new( left=>$term, right=>$term2 );
175             }
176             elsif ( $type == CQL_NOT ) {
177 2         8 match($token);
178 2         7 my $term2 = parseTerm( $qualifier, $relation );
179 2         26 $term = CQL::NotNode->new( left=>$term, right=>$term2 );
180             }
181             elsif ( $type == CQL_PROX ) {
182 11         25 match($token);
183 11         53 my $proxNode = CQL::ProxNode->new( $term );
184 11         27 gatherProxParameters( $proxNode );
185 7         16 my $term2 = parseTerm( $qualifier, $relation );
186 7         25 $proxNode->addSecondTerm( $term2 );
187 7         10 $term = $proxNode;
188             }
189             else {
190 0         0 croak( "expected boolean got ".$token->getString() );
191             }
192 28         80 $type = $token->getType();
193             }
194 61         124 debug( "no more ops" );
195 61         111 return( $term );
196             }
197              
198             sub parseTerm {
199 100     100 0 162 my ( $qualifier, $relation ) = @_;
200 100         164 debug( "in parseTerm()" );
201 100         295 my $word;
202 100         111 while ( 1 ) {
203 124 100       371 if ( $token->getType() == CQL_LPAREN ) {
    100          
204 5         15 debug( "parenthesized term" );
205 5         22 match( CQL::Token->new('(') );
206 5         49 my $expr = parseQuery( $qualifier, $relation );
207 5         21 match( CQL::Token->new(')') );
208 5         17 return $expr;
209             }
210             elsif ( $token->getType() == CQL_GT ) {
211 1         4 match( $token );
212 1         4 return parsePrefix( $qualifier, $relation );
213             }
214              
215 118         214 debug( "non-parenthesised term" );
216 118         601 $word = matchSymbol( "qualifier or term" );
217              
218 118 100       249 last if ! isBaseRelation();
219              
220 24         43 $qualifier = $word;
221 24         70 debug( "creating relation with word=$word" );
222 24         70 $relation = CQL::Relation->new( $token->getString() );
223 24         60 match( $token );
224              
225 24         76 while ($token->getType() == CQL_MODIFIER ) {
226 12         24 match( $token );
227 12 50       29 if ( !isRelationModifier() ) {
228 0         0 croak( "expected relation modifier got " . $token->getString() );
229             }
230 12         38 $relation->addModifier( $token->getString() );
231 12         49 match( $token );
232             }
233             }
234              
235 91         767 debug( "qualifier=$qualifier relation=$relation term=$word" );
236 91 100 66     416 croak( "missing term" ) if ! defined($word) or $word eq '';
237              
238 89         363 my $node = CQL::TermNode->new(
239             qualifier => $qualifier,
240             relation => $relation,
241             term => $word
242             );
243 89         401 debug( "made term node: ".$node->toCQL() );
244 89         255 return $node;
245             }
246              
247             sub parsePrefix {
248 1     1 0 4 my ( $qualifier, $relation ) = @_;
249 1         3 debug( "prefix mapping" );
250 1         1 my $name = undef;
251 1         4 my $identifier = matchSymbol( "prefix name" );
252 1 50       4 if ( $token->getType() == CQL_EQ ) {
253 1         3 match( $token );
254 1         2 $name = $identifier;
255 1         3 $identifier = matchSymbol( "prefix identifier" );
256             }
257 1         4 my $node = parseQuery( $qualifier, $relation );
258 1         11 return CQL::PrefixNode->new(
259             name => $name,
260             identifier => $identifier,
261             subtree => $node
262             );
263             }
264              
265             sub gatherProxParameters {
266 11     11 0 20 my $node = shift;
267 11         14 if (0) { # CQL 1.0 (obsolete)
268             for (my $i=0; $i<4; $i++ ) {
269             if ( $token->getType() != CQL_MODIFIER ) {
270             ## end of proximity parameters
271             return;
272             }
273             match($token);
274             if ( $token->getType() != CQL_MODIFIER ) {
275             if ( $i==0 ) { gatherProxRelation($node); }
276             elsif ( $i==1 ) { gatherProxDistance($node); }
277             elsif ( $i==2 ) { gatherProxUnit($node); }
278             elsif ( $i==3 ) { gatherProxOrdering($node); }
279             }
280             }
281             } else {
282 11         33 while ( $token->getType() == CQL_MODIFIER ) {
283 15         34 match( $token );
284 15 100 66     43 if ( $token->getType() == CQL_DISTANCE ) {
    100          
    100          
285 8         33 match( $token );
286 8         22 gatherProxRelation( $node );
287 8         20 gatherProxDistance( $node );
288             } elsif ( $token->getType() == CQL_UNIT ) {
289 4         20 match( $token );
290 4 50       14 if ( $token->getType() != CQL_EQ ) {
291 0         0 croak( "expected proximity unit parameter got ".$token->getString() );
292             }
293 4         10 match( $token );
294 4         29 gatherProxUnit( $node );
295             } elsif ( $token->getType() == CQL_ORDERED
296             || $token->getType() == CQL_UNORDERED ) {
297 1         4 gatherProxOrdering( $node );
298             } else {
299 2         9 croak( "expected proximity parameter got ". $token->getString() ."(". $token->getType() .")" );
300             }
301             }
302             }
303             }
304              
305             sub gatherProxRelation {
306 8     8 0 15 my $node = shift;
307 8 50       15 if ( ! isProxRelation() ) {
308 0         0 croak( "expected proximity relation got ".$token->getString() );
309             }
310 8         30 $node->addModifier( "relation", $token->getString() );
311 8         19 match( $token );
312 8         28 debug( "gatherProxRelation matched ".$token->getString() );
313             }
314              
315             sub gatherProxDistance {
316 8     8 0 12 my $node = shift;
317 8 100       23 if ( $token->getString() !~ /^\d+$/ ) {
318 2         6 croak( "expected proximity distance got ".$token->getString() );
319             }
320 6         21 $node->addModifier( "distance", $token->getString() );
321 6         13 match( $token );
322 6         22 debug( "gatherProxDistance matched ".$token->getString() );
323             }
324              
325             sub gatherProxUnit {
326 4     4 0 6 my $node = shift;
327 4         10 my $type = $token->getType();
328 4 50 66     26 if( $type != CQL_PWORD and $type != CQL_SENTENCE and $type != CQL_PARAGRAPH
      66        
      33        
329             and $type != CQL_ELEMENT ) {
330 0         0 croak( "expected proximity unit got ".$token->getString() );
331             }
332 4         14 $node->addModifier( "unit", $token->getString() );
333 4         10 match( $token );
334 4         35 debug( "gatherProxUnit matched ".$token->getString() );
335             }
336              
337             sub gatherProxOrdering {
338 1     1 0 2 my $node = shift;
339 1         5 my $type = $token->getType();
340 1 50 33     6 if ( $type != CQL_ORDERED and $type != CQL_UNORDERED ) {
341 0         0 croak( "expected proximity ordering got ".$token->getString() );
342             }
343 1         4 $node->addModifier( "ordering", $token->getString() );
344 1         4 match( $token );
345             }
346              
347             sub isBaseRelation {
348 118     118 0 364 debug( "inside base relation: checking ttype=".$token->getType()." sval=".
349             $token->getString() );
350 118 100 66     1619 if( $token->getType() == CQL_WORD and $token->getString() !~ /\./ ) {
351 3         1438 croak( "unknown first class relation: ".$token->getString() );
352             }
353 115         495 my $type = $token->getType();
354 115   100     211 return( isProxRelation() or $type==CQL_ANY or $type==CQL_ALL
355             or $type==CQL_EXACT or $type==CQL_SCR or $type==CQL_WORD
356             or $type==CQL_WITHIN or $type==CQL_ENCLOSES);
357             }
358              
359             sub isProxRelation {
360 123     123 0 275 debug( "isProxRelation: checking ttype=".$token->getType()." sval=".
361             $token->getString() );
362 123         327 my $type = $token->getType();
363 123   100     3892 return( $type==CQL_LT or $type==CQL_GT or $type==CQL_EQ or $type==CQL_LE
364             or $type==CQL_GE or $type==CQL_NE );
365             }
366              
367             sub isRelationModifier {
368 12     12 0 35 my $type = $token->getType();
369 12 100       36 if ($type == CQL_WORD) {
370 1         4 return $token->getString() =~ /\./;
371             }
372 11   66     246 return ($type==CQL_RELEVANT or $type==CQL_FUZZY or $type==CQL_STEM
373             or $type==CQL_PHONETIC or $type==CQL_PWORD or $type==CQL_STRING
374             or $type==CQL_ISODATE or $type==CQL_NUMBER or $type==CQL_URI
375             or $type==CQL_PARTIAL or $type==CQL_MASKED or $type==CQL_UNMASKED
376             or $type==CQL_NWSE);
377             }
378              
379             sub match {
380 264     264 0 374 my $expected = shift;
381 264         3056 debug( "in match(".$expected->getString().")" );
382 264 50       710 if ( $token->getType() != $expected->getType() ) {
383 0         0 croak( "expected ".$expected->getString() .
384             " but got " . $token->getString() );
385             }
386 264         877 $token = $lexer->nextToken();
387 264         1207 debug( "got token type=".$token->getType()." string=".$token->getString() );
388             }
389              
390             sub matchSymbol {
391 120     120 0 263 debug( "in match symbol" );
392 120         297 my $return = $token->getString();
393 120         247 match( $token );
394 120         286 return $return;
395             }
396              
397             sub debug {
398 1530 50   1530 0 5154 return unless $CQL::DEBUG;
399 0           print STDERR "CQL::Parser: ", shift, "\n";
400             }
401              
402             =head1 XCQL
403              
404             CQL has an XML representation which you can generate from a CQL parse
405             tree. Just call the toXCQL() method on the root node you get back
406             from a call to parse().
407              
408             =head1 ERRORS AND DIAGNOSTICS
409              
410             As mentioned above, a CQL syntax error will result in an exception being
411             thrown. So if you have any doubts about the CQL that you are parsing you
412             should wrap the call to parse() in an eval block, and check $@
413             afterwards to make sure everything went ok.
414              
415             eval {
416             my $node = $parser->parse( $cql );
417             };
418             if ( $@ ) {
419             print "uhoh, exception $@\n";
420             }
421              
422             If you'd like to see blow by blow details while your CQL is being parsed
423             set $CQL::DEBUG equal to 1, and you will get details on STDERR. This is
424             useful if the parse tree is incorrect and you want to locate where things
425             are going wrong. Hopefully this won't happen, but if it does please notify the
426             author.
427              
428             =head1 TODO
429              
430             =over 4
431              
432             =item * toYourEngineHere() please feel free to add functionality and send in
433             patches!
434              
435             =back
436              
437             =head1 THANKYOUS
438              
439             CQL::Parser is essentially a Perl port of Mike Taylor's cql-java package
440             http://zing.z3950.org/cql/java/. Mike and IndexData were kind enough
441             to allow the author to write this port, and to make it available under
442             the terms of the Artistic License. Thanks Mike!
443              
444             The CQL::Lexer package relies heavily on Stevan Little's excellent
445             String::Tokenizer. Thanks Stevan!
446              
447             CQL::Parser was developed as a component of the Ockham project,
448             which is funded by the National Science Foundation. See http://www.ockham.org
449             for more information about Ockham.
450              
451             =head1 AUTHOR
452              
453             =over 4
454              
455             =item * Ed Summers - ehs at pobox dot com
456              
457             =item * Brian Cassidy - bricas at cpan dot org
458              
459             =item * Wilbert Hengst - W.Hengst at uva dot nl
460              
461             =back
462              
463             =head1 COPYRIGHT AND LICENSE
464              
465             Copyright 2004-2009 by Ed Summers
466              
467             This library is free software; you can redistribute it and/or modify
468             it under the same terms as Perl itself.
469              
470             =cut
471              
472             1;