File Coverage

blib/lib/XML/Descent.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package XML::Descent;
2              
3 7     7   157275 use warnings;
  7         17  
  7         325  
4 7     7   35 use strict;
  7         16  
  7         215  
5 7     7   34 use Carp;
  7         17  
  7         588  
6 7     7   9951 use XML::TokeParser;
  0            
  0            
7              
8             =head1 NAME
9              
10             XML::Descent - Recursive descent XML parsing
11              
12             =head1 VERSION
13              
14             This document describes XML::Descent version 1.04
15              
16             =head1 SYNOPSIS
17              
18             use XML::Descent;
19              
20             # Create parser
21             my $p = XML::Descent->new( { Input => \$xml } );
22              
23             # Setup handlers
24             $p->on(
25             folder => sub {
26             my ( $elem, $attr ) = @_;
27              
28             $p->on(
29             url => sub {
30             my ( $elem, $attr ) = @_;
31             my $link = {
32             name => $attr->{name},
33             url => $p->text
34             };
35             }
36             );
37              
38             my $folder = $p->walk;
39             $folder->{name} = $attr->{name};
40             }
41             );
42              
43             # Parse
44             my $res = $p->walk;
45              
46             =head1 DESCRIPTION
47              
48             The conventional models for parsing XML are either DOM (a data structure
49             representing the entire document tree is created) or SAX (callbacks are
50             issued for each element in the XML).
51              
52             XML grammar is recursive - so it's nice to be able to write recursive
53             parsers for it. XML::Descent allows such parsers to be created.
54              
55             Typically a new XML::Descent is created and handlers are defined for
56             elements we're interested in
57              
58             my $p = XML::Descent->new( { Input => \$xml } );
59             $p->on(
60             link => sub {
61             my ( $elem, $attr ) = @_;
62             print "Found link: ", $attr->{url}, "\n";
63             $p->walk; # recurse
64             }
65             );
66             $p->walk; # parse
67              
68             A handler provides a convenient lexical scope that lasts until the
69             closing tag of the element that triggered the handler is reached.
70              
71             When called at the top level the parsing methods walk, text and
72             xml parse the whole XML document. When called recursively within a
73             handler they parse the portion of the document nested inside node that
74             triggered the handler.
75              
76             New handlers may be defined within a handler and their scope will be
77             limited to the XML inside the node that triggered the handler.
78              
79             =cut
80              
81             our $VERSION = '1.04';
82              
83             =head1 INTERFACE
84              
85             =head2 C
86              
87             Create a new XML::Descent. Options are supplied has a hash reference.
88             The only option recognised directly by XML::Descent is C which
89             should be reference to the object that provides the XML source. Any
90             value that can be passed as the first argument to
91             C<< XML::TokeParser->new >> is allowed.
92              
93             The remaining options are passed directly to C. Consult
94             that module's documentation for more details.
95              
96             =cut
97              
98             sub new {
99             my $class = shift;
100              
101             my %args = ();
102             my @opt = ();
103             for my $arg ( @_ ) {
104             if ( 'HASH' eq ref $arg ) {
105             %args = ( %args, %$arg );
106             }
107             else {
108             push @opt, $arg;
109             }
110             }
111             croak "Expected a number of name => value pairs"
112             if @opt % 2;
113             %args = ( %args, @opt );
114              
115             my $parser
116             = XML::TokeParser->new( delete $args{Input}
117             || croak( "No Input arg" ), %args )
118             || croak( "Failed to create XML::TokeParser" );
119              
120             return bless {
121             parser => $parser,
122             context => {
123             parent => undef,
124             rules => {},
125             obj => undef
126             },
127             token => undef,
128             path => [],
129              
130             }, $class;
131             }
132              
133             sub _get_rule_handler {
134             my ( $self, $tos, $elem ) = @_;
135             croak "It is not possible to register an explicit handler for '*'"
136             if '*' eq $elem;
137             while ( $tos ) {
138             if ( my $h = $tos->{rules}{$elem} || $tos->{rules}{'*'} ) {
139             return $h;
140             }
141             $tos = $tos->{parent};
142             }
143             return;
144             }
145              
146             sub _depth { scalar @{ shift->{path} } }
147              
148             =head2 C
149              
150             Parse part of the XML document tree triggering any handlers that
151             correspond with elements it contains. When called recursively within a
152             handler C visits all the elements below the element that triggered
153             the handler and then returns.
154              
155             =cut
156              
157             sub walk {
158             my $self = shift;
159              
160             TOKEN: while ( my $tok = $self->get_token ) {
161             if ( $tok->[0] eq 'S' ) {
162             my $tos = $self->{context};
163             my $handler = $self->_get_rule_handler( $tos, $tok->[1] );
164             if ( defined $handler ) {
165             my $stopat = $self->_depth;
166              
167             # Push context
168             $self->{context} = {
169             parent => $tos,
170             stopat => $stopat,
171             obj => $tos->{obj}
172             };
173              
174             # Call handler
175             $handler->( $tok->[1], $tok->[2], $tos->{obj} );
176              
177             # If handler didn't recursively parse the content of
178             # this node we need to discard it.
179             1 while $self->_depth >= $stopat
180             && ( $tok = $self->get_token );
181              
182             # Pop context
183             $self->{context} = $tos;
184             }
185             else {
186             $self->walk;
187             }
188             }
189             elsif ( $tok->[0] eq 'E' ) {
190             last TOKEN;
191             }
192             }
193             }
194              
195             =head2 C
196              
197             Register a handler to be called when the named element is encountered.
198             Multiple element names may be supplied as an array reference. Multiple
199             handlers may be registered with one call to C by supplying a number
200             of element, handler pairs.
201              
202             Calling C within a handler defines a nested local handler whose
203             scope is limited to the containing element. Handlers are called with
204             three arguments: the name of the element that triggered the handler, a
205             hash of the element's attributes and a user defined context value - see
206             C for more about that.
207              
208             For example:
209              
210             $p = XML::Descent->new( { Input => \$some_xml } );
211              
212             # Global handler - trigger anywhere an tag is found
213             $p->on(
214             options => sub {
215             my ( $elem, $attr, $ctx ) = @_;
216              
217             # Define a nested handler for elements that only
218             # applies within the handler.
219             $p->on(
220             name => sub {
221             my ( $elem, $attr, $ctx ) = @_;
222             # Get the inner text of the name element
223             my $name = $p->text;
224             print "Name: $name\n";
225             }
226             );
227              
228             # Recursively walk elements inside triggering
229             # any handlers
230             $p->walk;
231             }
232             );
233              
234             # Start parsing
235             $p->walk;
236              
237             A handler may call one of the parsing methods (C, C, C
238             or C) to consume any nested XML before returning. If none of
239             the parsing methods are called nested XML is automatically discarded so
240             that the parser can properly move past the current element.
241              
242             Nested handlers temporarily override another handler with the same name.
243             A handler named '*' will trigger for all elements for which there is no
244             explicit handler. A nested '*' handler hides all handlers defined in
245             containing scopes.
246              
247             As a shorthand you may specify a path to a nested element:
248              
249             $p->on( 'a/b/c' => sub {
250             print "Woo!\n";
251             })->walk;
252              
253             That's equivalent to:
254              
255             $p->on( a => sub {
256             $p->on( b => sub {
257             $p->on( c => sub {
258             print "Woo!\n";
259             })->walk;
260             })->walk;
261             })->walk;
262              
263             Note that this shorthand only applies to C - not to other methods
264             that accept element names.
265              
266             =cut
267              
268             sub on {
269             my $self = shift;
270             croak "Please supply a number of path => handler pairs"
271             if @_ % 2;
272              
273             while ( my ( $spec, $cb ) = splice @_, 0, 2 ) {
274             $spec = [$spec] unless ref $spec eq 'ARRAY';
275             for my $el ( @$spec ) {
276             my ( $name, $tail ) = split /\//, $el, 2;
277             if ( defined $tail ) {
278             $self->{context}{rules}{$name} = sub {
279             $self->on( $tail => $cb )->walk;
280             };
281             }
282             else {
283             $self->{context}{rules}{$el} = $cb;
284             }
285             }
286             }
287             return $self;
288             }
289              
290             =head2 C
291              
292             Inherit handlers from the containing scope. Typically used to import
293             handlers that would otherwise be masked by a catch all '*' handler.
294              
295             $p->on(
296             'a' => sub {
297             my ( $elem, $attr, $ctx ) = @_;
298             my $link = $attr->{href} || '';
299             my $text = $p->text;
300             print "Link: $text ($link)\n";
301             }
302             );
303              
304             $p->on(
305             'special' => sub {
306             my ( $elem, $attr, $ctx ) = @_;
307              
308             # Within we want to handle all
309             # tags apart from by printing them out
310             $p->on(
311             '*' => sub {
312             my ( $elem, $attr, $ctx ) = @_;
313             print "Found: $elem\n";
314             }
315             );
316              
317             # Get the handler for from our containing
318             # scope.
319             $p->inherit( 'a' );
320             $p->walk;
321             }
322             );
323              
324             The inherited handler is the handler that would have applied in the
325             containing scope for an element with the given name. For example:
326              
327             $p->on( '*' => sub { print "Whatever\n"; $p->walk; } );
328             $p->on(
329             'interesting' => sub {
330             # Inherits the default 'Whatever' handler because that's the
331             # handler that would have been called for in the
332             # containing scope
333             $p->inherit( 'frob' );
334             # Handle everything else ourselves
335             #p->on('*', sub { $p->walk; });
336             }
337             );
338              
339             =cut
340              
341             sub inherit {
342             my $self = shift;
343             my ( $path ) = @_;
344              
345             $path = [$path] unless ref $path eq 'ARRAY';
346             my $par = $self->{context}{parent};
347             $self->on( $_, $self->_get_rule_handler( $par, $_ ) ) for @$path;
348             return $self;
349             }
350              
351             sub _filter {
352             my ( $self, $mk_wrapper ) = splice @_, 0, 2;
353             croak "Please supply a number of path => handler pairs"
354             if @_ % 2;
355              
356             my $context = $self->{context};
357             while ( my ( $path, $cb ) = splice @_, 0, 2 ) {
358             $path = [$path] unless ref $path eq 'ARRAY';
359             for my $elem ( @$path ) {
360             my $h = $self->_get_rule_handler( $context, $elem )
361             or croak "No existing handler for $elem";
362             $self->{context}{rules}{$elem} = $mk_wrapper->( $h, $cb );
363             }
364             }
365             return $self;
366             }
367              
368             =head2 C
369              
370             Register a handler to be called before the existing handler for an
371             element. As with C multiple elements may be targetted by providing
372             an array ref.
373              
374             =cut
375              
376             sub before {
377             return shift->_filter(
378             sub {
379             my ( $h, $cb ) = @_;
380             sub { $cb->( @_ ); $h->( @_ ) }
381             },
382             @_
383             );
384             }
385              
386             =head2 C
387              
388             Register a handler to be called after the existing handler for an
389             element. As with C multiple elements may be targetted by providing
390             an array ref.
391              
392             =cut
393              
394             sub after {
395             return shift->_filter(
396             sub {
397             my ( $h, $cb ) = @_;
398             sub { $h->( @_ ); $cb->( @_ ) }
399             },
400             @_
401             );
402             }
403              
404             =head2 C
405              
406             Every time a handler is called a new scope is created for it. This
407             allows nested handlers to be defined. The current scope contains a user
408             context variable which can be used, for example, to keep track of an
409             object that is being filled with values parsed from the XML. The context
410             value is inherited from the parent scope but may be overridden locally.
411              
412             For example:
413              
414             my $root = {};
415              
416             # Set the outermost context
417             $p->context( $root );
418              
419             # Handle HTML links /anywhere/
420             $p->on(
421             'a' => sub {
422             my ( $elem, $attr, $ctx ) = @_;
423             my $link = {
424             href => $attr->{href},
425             text => $p->text
426             };
427             push @{ $ctx->{links} }, $link;
428             }
429             );
430              
431             # Links in the body are stored in a nested
432             # object.
433             $p->on(
434             'body' => sub {
435             my ( $elem, $attr, $ctx ) = @_;
436             my $body = {};
437             # Set the context
438             $p->context( $body );
439             $p->walk;
440             $ctx->{body} = $body;
441             }
442             );
443              
444             $p->walk;
445              
446             Note that the handler for tags stores its results in the
447             current context object - whatever that happens to be. That means
448             that outside of any tag links will be stored in C<$root> but
449             within a they will be stored in a nested object
450             (C<< $root->{body} >>). The handler itself need know nothing of
451             this.
452              
453             With no parameter C returns the current context. The current
454             context is also passed as the third argument to handlers.
455              
456             =cut
457              
458             sub context {
459             my $self = shift;
460             $self->{context}->{obj} = shift if @_;
461             return $self->{context}{obj};
462             }
463              
464             =head2 C
465              
466             Return any text contained within the current element. XML markup is
467             discarded.
468              
469             =cut
470              
471             sub text {
472             my $self = shift;
473             my @txt = ();
474              
475             TOKEN: while ( my $tok = $self->get_token ) {
476             if ( $tok->[0] eq 'S' ) {
477             push @txt, $self->text;
478             }
479             elsif ( $tok->[0] eq 'E' ) {
480             last TOKEN;
481             }
482             elsif ( $tok->[0] eq 'T' ) {
483             push @txt, $tok->[1];
484             }
485             }
486              
487             return join '', @txt;
488             }
489              
490             =head2 C
491              
492             Return the unparsed inner XML of the current element. For example:
493              
494             $p->on(
495             'item' => sub {
496             my ( $elem, $attr, $ctx ) = @_;
497             my $item_source = $p->xml;
498             print "Item: $item_source\n";
499             }
500             );
501              
502             If contains XHTML (for example) the above handler would correctly
503             capture it without recursively parsing any elements it contains. Parsing
504              
505            
506             This is the first story.
507             This is another story.
508            
509            
510             would print
511              
512             Item: This is the first story.
513             Item: This is another story.
514              
515             =cut
516              
517             sub xml {
518             my $self = shift;
519              
520             my @xml = ();
521              
522             TOKEN: while ( my $tok = $self->get_token ) {
523             if ( $tok->[0] eq 'S' ) {
524             push @xml, $tok->[4], $self->xml, $self->{token}->[2];
525             }
526             elsif ( $tok->[0] eq 'E' ) {
527             last TOKEN;
528             }
529             elsif ( $tok->[0] eq 'T' || $tok->[0] eq 'C' ) {
530             push @xml, $tok->[2];
531             }
532             elsif ( $tok->[0] eq 'PI' ) {
533             push @xml, $tok->[3];
534             }
535             else {
536             die "Unhandled token type: $tok->[0]";
537             }
538             }
539              
540             return join '', @xml;
541             }
542              
543             =head2 C
544              
545             Called within a handler returns the path that leads to the current
546             element. For example:
547              
548             $p->on(
549             'here' => sub {
550             my ( $elem, $attr, $ctx ) = @_;
551             print "I am here: ", $p->get_path, "\n";
552             $p->walk;
553             }
554             );
555              
556             would, if applied to this XML
557              
558            
559            
560            
561            
562            
563            
564            
565             print
566              
567             I am here: /outer/inner/here
568             I am here: /outer/here
569              
570             =cut
571              
572             sub get_path { '/' . join '/', @{ shift->{path} } }
573              
574             =head2 C
575              
576             XML::Descent is built on C which splits an XML document
577             into a stream of tokens representing start tags, end tags, literal text,
578             comment and processing instructions. Within an element C
579             returns the same stream of tokens that C would produce.
580             Returns C once all the tokens contained within the current
581             element have been read (i.e. it's impossible to read past the end of the
582             enclosed XML).
583              
584             =cut
585              
586             sub get_token {
587             my $self = shift;
588             my $p = $self->{parser};
589              
590             my $tok = $self->{token} = $p->get_token;
591              
592             if ( defined( $tok ) ) {
593             if ( $tok->[0] eq 'S' ) {
594             push @{ $self->{path} }, $tok->[1];
595             }
596             elsif ( $tok->[0] eq 'E' ) {
597             my $tos = pop @{ $self->{path} };
598             die "$tos <> $tok->[1]"
599             unless $tos eq $tok->[1];
600             }
601             }
602              
603             my $stopat = $self->{context}{stopat};
604             return if defined $stopat && $self->_depth < $stopat;
605             return $tok;
606             }
607              
608             =head2 C
609              
610             Get a list of all handlers that are registered locally to the current
611             scope. The returned list won't include '*' if a wildcard handler has
612             been registered.
613              
614             =cut
615              
616             sub scope_handlers {
617             sort grep { $_ ne '*' } keys %{ shift->{context}{rules} || {} };
618             }
619              
620             =head2 C
621              
622             Get a list of all registered handlers in all scopes. The returned list
623             won't include the '*' wildcard handler.
624              
625             =cut
626              
627             sub all_handlers {
628             my $self = shift;
629             my %seen = ();
630             my @h = ();
631              
632             my $tos = $self->{context};
633             while ( $tos ) {
634             push @h, grep { !$seen{$_}++ }
635             grep { $_ ne '*' } keys %{ $tos->{rules} || {} };
636             $tos = $tos->{parent};
637             }
638              
639             return sort @h;
640             }
641              
642             1;
643              
644             __END__