File Coverage

blib/lib/XML/YYLex.pm
Criterion Covered Total %
statement 9 80 11.2
branch 0 44 0.0
condition 0 15 0.0
subroutine 3 11 27.2
pod 5 5 100.0
total 17 155 10.9


line stmt bran cond sub pod time code
1             #!/usr/local/bin/perl -w
2             ##
3             ## XML/YYLex.pm
4             ##
5             ## Daniel Bößwetter, Thu Nov 14 21:06:34 CET 2002
6             ## boesswetter@peppermind.de
7             ##
8             ## $Log: YYLex.pm,v $
9             ## Revision 1.4 2003/01/11 00:50:18 daniel
10             ## Oops, forgor versions numbers for CPAN compatibility and added homepage url
11             ## in several places (all PODs and README)
12             ##
13             ## Revision 1.3 2003/01/10 22:30:50 daniel
14             ## version 0.3 (perl 5.6 and sablot 0.90)
15             ##
16             ## Revision 1.2 2002/11/24 17:33:06 daniel
17             ## KNOWN BUGS added
18             ##
19             ## Revision 1.1.1.1 2002/11/24 17:18:15 daniel
20             ## initial checkin
21             ##
22             ##
23              
24             package XML::YYLex;
25             our $VERSION = '0.04';
26 1     1   41538 use strict qw(subs vars);
  1         3  
  1         55  
27 1         102 use vars qw(
28             $STATE_NEW_NODE
29             $STATE_DESCEND_NODE
30             $STATE_NODE_DONE
31             $PREFIX_OPENING
32             $PREFIX_CLOSING
33             $TOKEN_OTHER_TAG
34             $TOKEN_TEXT
35 1     1   7 );
  1         3  
36              
37 1     1   5 use Carp;
  1         6  
  1         6748  
38              
39             $STATE_NEW_NODE = 0;
40             $STATE_DESCEND_NODE = 1;
41             $STATE_NODE_DONE = 2;
42              
43             $PREFIX_CLOSING = "_";
44             $PREFIX_OPENING = "";
45             $TOKEN_OTHER_TAG = "OTHER";
46             $TOKEN_TEXT = "TEXT";
47              
48             =pod
49              
50             =head1 NAME
51              
52             XML::YYLex - Perl module for using perl-byacc with XML-data
53              
54             =head1 SYNOPSIS
55              
56             use XML::YYLex;
57              
58             ## create an object of a sublass of XML::YYLex suitable for your
59             ## DOM-parser:
60              
61             my $parser = XML::YYLex::create_object(
62             document => $xmldom_or_sablotron_dom_object
63             debug => 0, # or 1
64             ignore_empty_text => 1, # probably what you would expect
65             yydebug => \&some_func, # defaults to a croak
66             yyerror => \&other_func # defaults to a carp
67             );
68              
69             ## return the result of yyparse
70             my $result = $parser->run( "ByaccPackage" );
71              
72             =head1 ABSTRACT
73              
74             C is a perl module that helps you build XML-parsers with
75             perl-byacc (a version of Berkeley Yacc that can produce perl-code).
76             It uses a regular DOM-parser (currently C or C)
77             as what would normally be called a scanner (hence the name 'yylex' which
78             is what scanner-functions are traditionally called). You can then specify
79             grammars in byacc in which XML-tags or text-blocks appear as tokens and
80             thus simplifies interpretation of XML-data (sometimes :).
81              
82             =head1 DESCRIPTION
83              
84             XML::YYLex implements an abstract base-class that can be subclassed for
85             specific DOM-parsers. As of this writing, C and C
86             are supported, but others might be easily added. If you want to add
87             support for another DOM-parser, copy one of the modules C
88             or C to an appropriate name and modify it
89             to work with your DOM-parser.
90              
91             C contains two public functions:
92              
93             =over 4
94              
95             =item C
96              
97             serves as a static factory method that creates an instance of the approptiate
98             subclass. The possible keye for %args are
99              
100             =over 4
101              
102             =item C
103              
104             a reference to your DOM-document (whichever class that may be). This
105             is used for determining which parser-specific subclass to create. This
106             argument must be given. If you pass a single scalar to C
107             instead of a hash, it is assumed to be the C.
108              
109             =item C
110              
111             when set to a true value, produces lots of debug information, as well
112             from the yacc-parser as from C itself. Defaults to false.
113              
114             =item C and C
115              
116             code-refs with the same purpose as in byacc itself: called with a single
117             argument which is a warning or an error. Defaults are the functions
118             C and C (see below).
119              
120             =item C
121              
122             when set to a true value, emtpy text-nodes are not considered to be tokens
123             (which reduces your grammars complexity a lot). True by default.
124              
125             =back
126              
127             =cut
128              
129             sub create_object {
130              
131 0     0 1   my %args;
132 0 0         if ( $#_ == 0 ) {
133 0           $args{document} = shift;
134             } else {
135 0           %args = @_;
136             }
137              
138 0 0         if ( !$args{debug} ) {
139             # omit those ugly warnings
140 0 0   0     $SIG{__WARN__} = sub{ warn @_ if $_[0] !~ /^yy/ };
  0            
141             }
142              
143 0 0         die "no DOM-document given" if ref !$args{document};
144              
145 0 0         if ( ref( $args{document} ) =~ /^XML::Sablotron/ ) {
146 0           eval "use XML::Sablotron::DOM::YYLex;";
147 0           return new XML::Sablotron::DOM::YYLex( \%args );
148             } else {
149 0           eval "use XML::DOM::YYLex;";
150 0           return new XML::DOM::YYLex( \%args );
151             }
152             }
153              
154             =pod
155              
156             =item C
157              
158             which calls the byacc-generated C() function with the appropriate
159             parameters and returns it's value. C<$namespace_of_parser> is (you
160             won't believe it) the namespace of the parser generated by perl-byacc
161             (actually the same string that you specified with C<-P> on the byacc
162             command line).
163              
164             =back
165              
166             =cut
167              
168             sub run {
169 0     0 1   my ( $self, $class ) = @_;
170              
171 0           eval "use $class";
172 0 0         die( $@ ) if $@;
173 0           $self->{parser_package} = $class;
174 0           my $ref = &{$class."::new"}( $class, \&_yylex, $self->{yyerror}, $self->{yydebug} );
  0            
175              
176 0           return $ref->yyparse( [ $self, $self->{document} ] );
177             }
178              
179             =pod
180              
181             Furthermore the following functions are implemented in this package,
182             but you will most likely never call them directly. However, knowledge
183             of these might be necassary when subclassing C.
184              
185             =over 4
186              
187             =item C<_yylex( $self, $doc )>
188              
189             This function implements the traversal of the DOM-tree in an order that
190             would be the order of nodes in the XML-file (why don't we use a SAX-parser
191             right-away? Because SAX-parsers don't implement nice objects for
192             Nodes of the tree and their attributes like DOM-parsers do, that's why).
193             This one's where the magic happens.
194              
195             =cut
196              
197             sub _yylex {
198 0     0     my $ref = shift;
199 0           my ( $self, $doc ) = @$ref;
200              
201 0 0         print STDERR "entering _yylex with state $self->{state}\n"
202             if $self->{debug};
203              
204             ##
205             ## initialization
206             ##
207 0 0         if ( !defined( $self->{current_node} ) ) {
208 0           $self->{current_node} = $self->_xml_getDocumentElement( $doc );
209 0           $self->{state} = $STATE_NEW_NODE;
210             }
211              
212 0           my @res;
213             ##
214             ## new node
215             ##
216 0 0         if ( $self->{state} == $STATE_NEW_NODE ) {
    0          
    0          
217              
218 0 0         $self->{state} = !$self->_xml_isTextNode( $self->{current_node} )
219             ? $STATE_DESCEND_NODE : $STATE_NODE_DONE;
220              
221 0           @res = ( $self->_node_to_token( $self->{current_node},
222             $PREFIX_OPENING ), $self->{current_node} );
223              
224             ##
225             ## node's children
226             ##
227             } elsif ( $self->{state} == $STATE_DESCEND_NODE ) {
228              
229             ## has children ?
230 0 0         if ( my @c = @{$self->{current_node}->getChildNodes} ) {
  0            
231              
232             ## yes
233              
234             #$self->{state} = $STATE_DESCEND_NODE;
235 0           $self->{current_node} = $c[0];
236 0 0         $self->{state} = !$self->_xml_isTextNode( $self->{current_node} )
237             ? $STATE_DESCEND_NODE : $STATE_NODE_DONE;
238              
239             #while ( $self->_xml_isElementNode( $c[0] ) ) { shift @c }
240 0           @res = ( $self->_node_to_token( $c[0], $PREFIX_OPENING ), $c[0] );
241              
242             } else {
243              
244             ## no children
245            
246 0           $self->{state} = $STATE_NODE_DONE;
247 0           @res =( $self->_node_to_token( $self->{current_node}, $PREFIX_CLOSING ) );
248             }
249             ##
250             ## node done
251             ##
252             } elsif ( $self->{state} == $STATE_NODE_DONE ) {
253              
254 0 0         if ( defined( my $c = $self->{current_node}->getNextSibling ) ) {
    0          
255             ## same as STATE_NEW_NODE above:
256 0           $self->{current_node} = $c;
257 0           $self->{state} = $STATE_DESCEND_NODE;
258 0           @res = ( $self->_node_to_token( $self->{current_node}, $PREFIX_OPENING ),
259             $self->{current_node} );
260             } elsif ( !$self->_xml_isDocumentNode( $self->{current_node}->getParentNode ) )
261             {
262 0           $self->{current_node} = $self->{current_node}->getParentNode;
263 0           $self->{state} = $STATE_NODE_DONE;
264 0           @res =( $self->_node_to_token( $self->{current_node}, $PREFIX_CLOSING ) );
265             } else {
266             ## end of document
267 0           @res = ( 0 );
268             }
269             }
270              
271             #print STDERR "res=".$res[0]." ".( defined( $res[1] ) ? $res[1]->getNodeName : "" )."\n";
272 0 0         print STDERR "leaving _yylex with state $self->{state}\n"
273             if $self->{debug};
274              
275 0 0 0       if ( $self->{ignore_empty_text} and defined( $res[1] )
      0        
      0        
276             and $self->_xml_isTextNode( $res[1] )
277             and $res[1]->getNodeValue =~ /^\s*$/ )
278             {
279             ## ignore empty text-nodes
280 0           $self->{state} = $STATE_NODE_DONE;
281 0           return &_yylex( [ $self, $doc ] ); ## recursion
282             } else {
283 0 0         print "res=".join( ", ", @res )."\n" if ( $self->{debug} );
284 0           return @res;
285             }
286             }
287              
288             =pod
289              
290             =item C<_node_to_token( $self, $node, $prefix )>
291              
292             This function determines the token-number for a given node. C<$prefix>
293             equals C<$XML::YYLex::PREFIX_OPENING> (usually empty) for opening
294             tags and C<$XML::YYLex::PREFIX_CLOSING> (the underscore "_" by
295             default). The default behaviour is to look for a symbol with the name
296             of the node (for elements) in the namespace of your byacc-generated
297             parser. C<$XML::YYLex::TOKEN_TEXT> is used for text-nodes and
298             C<$XML::YYLex::TOKEN_OTHER> (I by default) is used for unknwon
299             tags (when no token with that name exists). For closing elements, the
300             prefix is prepended to the tagname (i.e. C<_html> for C<>).
301              
302             =cut
303              
304             sub _node_to_token {
305 0     0     my $self = shift;
306 0           my $node = shift;
307 0           my $prefix = shift;
308              
309             #print STDERR "_node_to_token: ".$node->getNodeName." ".$prefix."\n";
310 0           my $res;
311 0 0         if ( $self->_xml_isTextNode( $node ) ) {
  0 0          
312             #print STDERR ">>>>>Täxt ".$node->getNodeValue."\n";
313 0           $res = ${$self->{parser_package}."::".$TOKEN_TEXT};
  0            
314             } elsif ( !defined( $res =
315             ${$self->{parser_package}."::".$prefix.$node->getNodeName} ) )
316             {
317 0           $res = ${$self->{parser_package}."::".$prefix.$TOKEN_OTHER_TAG};
  0            
318             }
319              
320 0 0         print STDERR "_node_to_token: res=$res\n" if ( $self->{debug} );
321 0           return $res;
322             }
323              
324             =pod
325              
326             =item C and C
327              
328             These are the default debug- and error-handlers respectively if no
329             other functions are given to create_object. C croaks and
330             C carps it's arguments.
331              
332             =cut
333              
334             sub yyerror {
335 0     0 1   croak( "yyerror: @_" );
336             }
337              
338             sub yydebug {
339 0     0 1   carp( "yyerror: @_" );
340             }
341              
342             =pod
343              
344             =item C
345              
346             Don't call it. It serves as constructor for child-classes and must be
347             given an almost initialized object (an C<$unblessed_hashref>). See the code
348             for details.
349              
350             =cut
351              
352             sub new {
353             #print "new: ".join( ",", @_ )."\n";
354 0     0 1   my ( $class, $args_ref ) = @_;
355             #print $args_ref."\n";
356 0           my $self = bless $args_ref, $class;
357 0   0       $self->{yyerror} = $self->{yyerror} || \&yyerror;
358 0   0       $self->{yydebug} = $self->{yydebug} || \&yydebug;
359 0   0       $self->{ignore_empty_text} = $self->{ignore_empty_text} || 1;
360 0           return $self;
361             }
362              
363             1;
364             __END__