File Coverage

blib/lib/XML/Essex.pm
Criterion Covered Total %
statement 48 150 32.0
branch 13 74 17.5
condition 4 17 23.5
subroutine 11 41 26.8
pod 18 24 75.0
total 94 306 30.7


line stmt bran cond sub pod time code
1             package XML::Essex;
2              
3             $VERSION = 0.000_1;
4              
5             =head1 NAME
6              
7             XML::Essex - Essex XML processing primitives
8              
9             =head1 SYNOPSIS
10              
11             TODO
12              
13             =head1 DESCRIPTION
14              
15             =head2 Result Value
16              
17             The return value will be returned to the caller. For handlers, this is
18             usually a "1" for success or some other value, such as a data
19             structure that has been built or the result of a query.
20              
21             For generators and filters, it is important that the result of the next
22             filter's end_document() is returned at the end of your Essex script, so
23             that it may be used upstream of such modules as XML::Simple.
24              
25             Errors should be reported using die().
26              
27             =cut
28              
29             #=head2 A short word on abbreviations
30             #
31             #A goal of essex is to allow code to be as terse or verbose as
32             #appropriate for the job at hand. So almost every object may be
33             #abbreviated. For instance, C may be abbreviated as
34             #C when using L|/isa> to check o.
35             #
36             #Most of the examples use the abbreviated form, though you can spell
37             #them out longhand if you like. Here's a list of abbreviations:
38             #
39             # document doc
40             # element elt
41             # characters chars
42             # processing instruction pi
43             # namespace ns
44             # attribute attr
45             #
46             #Class names, functions, and parameters to the Essex C
47             #function/method are all encouraged to use these abbreviations.
48              
49             =head2 Result Values
50              
51             =for Document maintainers: if you edit this section, copy and paste it
52             over the first section of lib/XML/Essex/ResultValues.pod. Thanks.
53              
54             Essex is designed to Do The Right Thing for the vast majority of
55             uses, so it manages result values automatically unless you take
56             control. Below is a set of detailed rules for how it manages
57             the result value for a filter's processing run, but the overview is:
58              
59             =over
60              
61             =item *
62              
63             Filters normally do not need to manage a result. The result from the
64             next filter downstream will be returned automatically, or an exception
65             will be thrown if an incomplete document is sent downstream.
66              
67             =item *
68              
69             Generators act like filters mostly, except that if a generator decides
70             not to send any results downstream, it should either set a result value
71             by calling C with it, or C that result normally, just
72             like a handler.
73              
74             =item *
75              
76             Handlers should either set a result value
77             by calling C with it, or C that result normally.
78              
79             =item *
80              
81             Generators, filters and handlers should all die() on unexpected
82             conditions and most error conditions (a FALSE or undefined result is not
83             necessarily an error condition for a handler).
84              
85             Generators and filters generally should not return a value of their own
86             because this will surprise calling code which is expecting a return
87             value of the type that the final SAX handler returns.
88              
89             =for Document maintainers: if you edit this section, copy and paste it
90             over the first section of lib/XML/Essex/ResultValues.pod. Thanks.
91              
92             =cut
93              
94 2     2   5333 use Carp;
  2         4  
  2         146  
95 2     2   11 use Exporter;
  2         4  
  2         70  
96 2     2   2319 use Filter::Util::Call;
  2         2425  
  2         149  
97 2     2   2398 use UNIVERSAL;
  2         26  
  2         10  
98              
99 2     2   53 use strict;
  2         2  
  2         60  
100 2     2   9 use vars qw( %EXPORT_TAGS @EXPORT ); ## Set by the controlling process
  2         4  
  2         91  
101 2     2   1236 use XML::Essex::Constants qw( EOD );
  2         4  
  2         13  
102              
103             sub EOL() { "XML::Essex: End of last document" }
104              
105             {
106             %EXPORT_TAGS = (
107             read => [qw(
108             get
109             read_from
110             parse_doc
111             isa
112             next_event
113             path
114             type
115             xeof
116             )],
117             rules => [qw(
118             on
119             xvalue
120              
121             xpush
122             xpop
123             xset
124             xadd
125             )],
126             write => [qw(
127             put
128             write_to
129             push_output_filters
130              
131             characters
132             chars
133              
134             end_document
135             end_doc
136             end_element
137             end_elt
138             start_document
139             start_doc
140             start_element
141             start_elt
142             xml_decl
143             )],
144             );
145              
146             my %seen;
147             $EXPORT_TAGS{filter} = [
148             grep !$seen{$_}++,
149             map @$_,
150             @EXPORT_TAGS{qw( read write )}
151             ];
152              
153             %seen = ();
154              
155             @EXPORT = grep !$seen{$_}++, map @$_, values %EXPORT_TAGS;
156              
157             $EXPORT_TAGS{all} = \@EXPORT;
158             }
159              
160              
161             sub import {
162 2     2   10 my ( undef, @args ) = @_;
163              
164             ## Figure out whether to read, write, or do both.
165 2         3 my %exports;
166 2 0       39 $exports{$_} = 1 for
  0 0       0  
    50          
167             @args
168             ? map s/^://
169             ? exists $EXPORT_TAGS{$_}
170             ? @{$EXPORT_TAGS{$_}}
171             : croak "Unkown export tag ':$_' for ", __PACKAGE__
172             : $_, @args
173             : @EXPORT;
174              
175 2   33     14 my $is_reader = exists $exports{get} || exists $exports{on} || exists $exports{parse_doc};
176 2         5 my $is_writer = exists $exports{put};
177              
178 2 50 33     8 croak "XML::Essex not used as a reader (with :read, get(), :rules or on()) or a writer (with :write put())\n"
179             unless $is_reader || $is_writer;
180              
181 2 50       8 my $sax_processor_type =
    50          
182             ! $is_reader ? "XML::Generator::Essex" :
183             ! $is_writer ? "XML::Handler::Essex" :
184             "XML::Filter::Essex";
185              
186 2         2 my $state = 0; # 0=init; 1=code; 2=pod; 3=EOF
187             filter_add(
188             sub {
189 83 100   83   173 if ( $state == 0 ) {
190 2         6 $_ = join '',
191             "XML::Essex::_init '",
192             $sax_processor_type,
193             "'; XML::Essex::_cleanup eval {";
194 2         3 ++$state;
195 2         10 return 1;
196             }
197              
198 81 100       1788 return 0 if $state > 2;
199 79         193 my $status = filter_read;
200              
201 79 50 66     317 if ( $status > 0 && substr( $_, 0, 1 ) eq "=" ) {
202 0 0       0 if ( $state == 1 ) { $state = 2 }
  0 0       0  
203 0         0 elsif ( substr( $_, 0, 4 ) eq "=cut" ) { $state = 1 }
204             }
205              
206 79 100       863 return $status if $status != 0;
207              
208 2 50       8 $_ .= "\n\n=cut\n" if $state == 2;
209 2         3 $_ .= "\n;1};";
210              
211 2         5 $state = 3;
212              
213 2         8 return 1;
214             },
215 2         15 );
216              
217 2         334 goto \&Exporter::import;
218             }
219              
220             my @self_stack;
221              
222             sub _init_new {
223 0     0   0 my ( $sax_processor_type ) = @_;
224              
225 0         0 push @self_stack, $XML::Essex::Base::self;
226 0         0 my $self = $XML::Essex::Base::self = $sax_processor_type->new;
227 0         0 $self->{NoExecute} = 1;
228              
229             ## The first part of XML::Essex::Base::execute();
230 0         0 $self->reset;
231             }
232              
233             sub _init {
234 1     1   8 my ( $sax_processor_type ) = @_;
235 1 50       87 eval "require $sax_processor_type" or croak $@;
236 0           _init_new $sax_processor_type;
237             };
238              
239              
240             sub _cleanup {
241 0     0     my ( $ok ) = @_;
242 0           my $self = $XML::Essex::Base::self;
243              
244 0           my $x = $@;
245              
246             ## The last part of XML::Essex::Base::execute();
247 0           my ( $ok2, $result, $result_set ) = eval {
248 0           ( 1, $self->finish( $ok, $x ) );
249             };
250 0           $XML::Essex::Base::self = pop @self_stack;
251              
252 0 0         die $@ unless $ok2;
253 0 0         return $result if $result_set;
254 0           return 1;
255             }
256              
257             sub _reinit {
258 0     0     my $type = ref $XML::Essex::Base::self;
259 0           _cleanup 1;
260 0           _init_new $type;
261             }
262              
263             =back
264              
265             =head1 Exported Functions
266              
267             These are exported by default, use the C syntax to
268             avoid exporting any of these or export only the ones you want.
269              
270             The following export tags are also defined:
271              
272             :read get read_from parse_doc isa next_event path type xeof
273             :rules on xvalue xpush xpop xset xadd
274             :write put write_to start_doc end_doc start_elt chars ...
275              
276             so you can
277              
278             use XML::Essex qw( :read :rules );
279              
280             for an Essex script that just handles input and uses some rules, or
281             even:
282              
283             use XML::Essex qw( parse_doc :rules );
284              
285             for a purely rule-based script.
286              
287             Importing only what you need is a little quicker and more memory
288             efficient, but it can also allow XML::Essex to run more efficiently. If
289             you don't import any output functions (see C<:write> above), it will not
290             load the output routines. Same for the input and rule based APIs.
291              
292             =over
293              
294             =item get
295              
296             my $e = get;
297              
298             Returns the next SAX event. Sets $_ as an EXPERIMENTAL feature.
299              
300             Throws an exception (which is silently caught outside the main code)
301             on end of input.
302              
303             See C and C functions and method (in
304             L) for how to test what was just gotten.
305              
306             =cut
307              
308             sub get {
309 0     0 1   my $self = $XML::Essex::Base::self;
310              
311 0 0         $self->_read_from_default unless $self->{Reader};
312              
313 0           $XML::Essex::Base::self->get( @_ );
314             }
315              
316             =item read_from
317              
318             read_from \*STDIN; ## From a filehandle
319             read_from "-"; ## From \*STDIN
320             read_from "foo.xml"; ## From a file or URI (URI support is parser dependent)
321             read_from \$xml_string; ## From a string.
322             read_from undef; ## STDIN or files named in @ARGV, as appropriate
323              
324             Tells the next get() or parse_doc() to read from the indicated source.
325              
326             Calling read_from automatically disassembles the current processing chain
327             and builds a new one (just like Perl's open() closes an already open
328             filehandle).
329              
330             =cut
331              
332             sub XML::Essex::Base::_read_from_default {
333 0     0     my $self = shift;
334              
335 0 0 0       if ( @ARGV || $self->{FromARGV} ) {
336 0           $self->{FromARGV} = 1;
337 0 0         die EOL."\n" unless @ARGV;
338 0           read_from( shift @ARGV );
339             }
340             else {
341 0           read_from( \*STDIN );
342             }
343             }
344              
345             ## TODO: move this in to XML::Handler::Essex as a set of standard
346             ## SAX parse_foo() APIs.
347             sub read_from {
348             ## Shut down the old processing chain if a Reader was already
349             ## created.
350             ## NOTE: This ASSumes that there is only one instance of the Essex
351             ## scripting env. in play at once. This is ok for now, but it does
352             ## contradict the idea of @self_stack. Perhaps having the source
353             ## filter set a secretly named global to point us to the right
354             ## $self would help. The goal is to enable handling of multiple
355             ## inputs at the same time: get from this, get from that.
356 0 0   0 1   _reinit if $XML::Essex::Base::self->{Reader};
357              
358 0           my $self = $XML::Essex::Base::self;
359 0           my ( $what ) = @_;
360              
361 0 0         if ( ! defined $what ) {
362 0           return delete $self->{Reader};
363             }
364              
365             $self->{Reader} = sub {
366 0     0     require XML::SAX::PurePerl; ## ugh. need XML::LibXMl to support SAX2
367 0           my $p = XML::SAX::PurePerl->new( Handler => $self );
368              
369 0           my $type = ref $what;
370              
371             ## This is purely a non-threading implementation.
372             ## TODO: build the parser and save the reference to be parsed, then
373             ## use an appropriate driver for the parser that is called when
374             ## there are no more events in @{$self->{Events}}.
375 0 0 0       if ( ! $type ) {
    0          
    0          
376 0 0         $what eq "-"
377             ? $p->parse_file( \*STDIN )
378             : $p->parse_uri( $what );
379             }
380             elsif ( $type eq "GLOB" || UNIVERSAL::isa( $what, "IO::Handle" ) ) {
381 0           $p->parse_file( $what );
382             }
383             elsif ( $type eq "SCALAR" ) {
384 0           $p->parse_string( $$what );
385             }
386             else {
387 0           croak "Don't know how to read from a $type";
388             }
389 0           };
390             }
391              
392             =item push_output_filters
393              
394             Adds an output filter to the end of the current list (and before the
395             eventual writer). Can be a class name (which will be C<require>d
396             unless the class can already new()) or a reference to a filter.
397              
398             =cut
399              
400             sub push_output_filters {
401 0     0 1   my $self = $XML::Essex::Base::self;
402              
403 0           push @{$self->{OutputFilters}}, @_;
  0            
404             }
405              
406             =item parse_doc
407              
408             Parses a single document from the current input. Morally equivalent to C<get
409             while 1;> but exits normally (as opposed to throwing an exception) when the
410             end of document is reached. Also slightly faster now and hopefully moreso
411             when optimizations can be made.
412              
413             Used to read to the end of a document, primarily in rule-based processing
414             (L).
415              
416             TODO: Allow parse_doc to take rules.
417              
418             =cut
419              
420             sub parse_doc {
421 0     0 1   my $self = $XML::Essex::Base::self;
422              
423 0 0         $self->_read_from_default unless $self->{Reader};
424              
425 0 0         write_to( \*STDOUT ) unless $self->{Writer};
426              
427             ## The result is undocumented; what should be returned is the
428             ## normal XML::Filter::Dispatcher
429 0           my $result;
430 0           eval {
431 0           $result = $self->get while 1; ## I did say I ;)
432             };
433              
434 0 0         die $@ unless $@ eq EOD . "\n";
435              
436 0           return $result;
437             }
438              
439             =item put
440              
441             Output one or more events. Usually these events are created by
442             constructors like C (see
443             L for details) or
444             are objects returned C method.
445              
446             =cut
447              
448             sub put {
449 0     0 1   my $self = $XML::Essex::Base::self;
450              
451 0 0         write_to( \*STDOUT ) unless $self->{Writer};
452 0           $self->put( @_ );
453             }
454              
455             =item write_to
456              
457             write_to \*STDOUT; ## To a filehandle
458             write_to "-"; ## To \*STDOUT
459             write_to "foo.xml"; ## To a file or URI (URI support is parser dependent)
460             write_to \$xml_string; ## To a string.
461              
462             Tells the next put() to write to the indicated destination.
463              
464             =cut
465              
466             sub write_to {
467 0     0 1   my $self = $XML::Essex::Base::self;
468 0           my ( $what ) = @_;
469              
470 0 0         croak "Can't write to an undefined output" unless defined $what;
471              
472 0           require XML::SAX::Writer;
473             $self->{Writer} = sub {
474 0     0     my $h = XML::SAX::Writer->new( Output => $what );
475 0 0         for ( reverse @{$self->{OutputFilters} || [] } ) {
  0            
476 0 0         unless ( ref $_ ) {
477 0 0 0       eval "require $_" or die $@ unless $_->can( "new" );
478 0           $_ = $_->new( Handler => $h );
479             }
480             else {
481 0           $_->set_handler( $h );
482             }
483 0           $h = $_;
484             }
485 0           return $h;
486 0           };
487             }
488              
489             =back
490              
491             =head2 Miscellaneous
492              
493             =over
494              
495             =item isa
496              
497             get until isa "start_elt" and $_->name eq "foo";
498             $r = get until isa $r, "start_elt" and $_->name eq "foo";
499              
500             Returns true if the parameter is of the indicated object type. Tests $_
501             unless more than one parameter is passed.
502              
503             =cut
504              
505             sub isa($) {
506 0 0   0 1   local $_ = shift if @_ >= 2;
507 0 0         UNIVERSAL::can( $_, "isa" )
508             ? $_->isa( @_ )
509             : UNIVERSAL::isa( $_, @_ );
510             }
511              
512             =item next_event
513              
514             Like C<get> (see L</get>), but does not remove the next event
515             from the input stream.
516              
517             get "start_document::*";
518             get if next_event->isa( "xml_decl" );
519             ...process remainder of document...
520              
521             =cut
522              
523             sub next_event {
524 0     0 1   my $self = $XML::Essex::Base::self;
525 0 0         die "No XML input defined\n" unless $self->{Reader};
526 0           $self->{Reader}->peek;
527             }
528              
529             =item path
530              
531             get "start_element::*" until path eq "/path/to/foo:bar"
532              
533             Returns the path to the current element as a string.
534              
535             =cut
536              
537             sub path {
538 0     0 1   my $self = $XML::Essex::Base::self;
539 0           return join "/", "", map $_->name, @{$self->{Stack}};
  0            
540             }
541              
542             =for import XML::Generator::Essex/put
543              
544             =item type
545              
546             get until type eq "start_document";
547             $r = get until type $r eq "start_document";
548              
549              
550             Return the type name of the object. This is the class name with a
551             leading XML::Essex:: stripped off. This is a wrapper around the
552             event's C method.
553              
554             =cut
555              
556             sub type {
557 0     0 1   my $self = $XML::Essex::Base::self;
558 0           $self->type( @_ );
559             }
560              
561             =item xeof
562              
563             Return TRUE if the last event has been read.
564              
565             =cut
566              
567             sub xeof {
568 0     0 1   my $self = $XML::Essex::Base::self;
569 0 0         die "No XML input defined\n" unless $self->{Reader};
570 0           $self->{Reader}->eof;
571             }
572              
573             =back
574              
575             =head2 Namespaces
576              
577             If this section doesn't make any sense, see
578             L
579             for your next dose of XML koolaid. If it still doesn't make any sense
580             then ding me for writing gibberish.
581              
582             Element names, attribute names, and PI targets returned by Essex are
583             generated in one of three forms, depending on whether the named item
584             has a namespace URI associated with it and whether the filter program
585             has mapped that namespace URI to a prefix. You may also use any of
586             these three forms when passing a name to Essex:
587              
588             =over
589              
590             =item "id"
591              
592             If an attribute has no NamespaceURI or an empty string for a
593             NamespaceURI, it will be returned as a simple string.
594              
595             TODO: Add an option to enable this for the default namespace or
596             for attrs in the element's namespace.
597              
598             =item "foo:id"
599              
600             If the attribute is in a namespace and a namespace -> prefix
601             mapping has been declared by the filter.
602              
603             =item "{http://foo/}id"
604              
605             If the attribute is in a namespace with no prefix mapped to it by
606             the filter.
607              
608             =back
609              
610             Namespace prefixes from the source document are ignored; there's no
611             telling what prefix somebody might have used. Intercept the
612             start_prefix_mapping and end_prefix_mapping events to follow the weave
613             of source document namespace mappings.
614              
615             When outputting events that belong to a namespace not in the source
616             document, you need to C the start_prefix_mapping and
617             end_prefix_mapping events manually, and be careful to avoid existing
618             prefixes from the document if need be while doing so. Future additions
619             to Essex should make this easier and perhaps automatic.
620              
621             Essex lets you manage namespace mappings by mapping, hiding, and
622             destroying ( $namespace => $prefix ) pairs using the functions:
623              
624             =over
625              
626             =cut
627              
628             =item namespace_map
629              
630             aka: ns_map
631              
632             my $map = ns_map(
633             $ns1 => $prefix1,
634             $ns2 => $prefix2,
635             ...
636             );
637              
638             Creates a new set of mappings in addition to any that are already in
639             effect. If a namespace is mapped to multiple prefixes, the last one
640             created is used. The mappings stay in effect until the map object
641             referred to by C<$map> is destroyed.
642              
643             =cut
644              
645             sub ns_map {
646 0     0 0   my $self = $XML::Essex::Base::self;
647 0           return $self->new( @_ );
648             }
649              
650              
651             =back
652              
653             =head2 Rule Based Processing
654              
655             It is often advantageous to declare exceptional events that should
656             be processed as they occur in the stream rather than testing for them
657             explicitly everywhere they might occur in the script. This is done
658             using the "on" function.
659              
660             =cut
661              
662             =over
663              
664             =item on
665              
666             on(
667             "start_document::*" => sub { warn "start of document reached" },
668             "end_document::*" => sub { warn "end of document reached" },
669             );
670              
671             =for TODO
672             my $rule = on $pat1 => sub { ... }, ...;
673             ...time passes with rules in effect...
674             disable_rule $rule;
675             ...time passes with rules I in effect...
676             enable_rule $rule;
677             ...time passes with rules in effect again...
678              
679             This declares that a rule should be in effect until the end of the
680             document
681              
682             =for TODO or it is disabled.
683              
684             =for TODO Returns a handle that may be used to enable or disable all
685             rules passed in.
686              
687             For now, this must be called before the first get() for predictable
688             results.
689              
690             Rules remain in effect after the main() routine has exited to facilitate
691             pure rule based processing.
692              
693             =cut
694              
695             sub on {
696 0     0 1   my $self = $XML::Essex::Base::self;
697 0           $self->on( @_ );
698             }
699              
700             =item xvalue
701              
702             Returns the result of the expression that fired an action. Valid only
703             within rules.
704              
705             =cut
706              
707             sub xvalue {
708 0     0 1   my $self = $XML::Essex::Base::self;
709 0           $self->xvalue;
710             }
711              
712             =item xpush
713              
714             Returns the result of the expression that fired an action. Valid only
715             within rules.
716              
717             =item xpop
718              
719             Returns the result of the expression that fired an action. Valid only
720             within rules.
721              
722             =item xset
723              
724             Returns the result of the expression that fired an action. Valid only
725             within rules.
726              
727             =item xadd
728              
729             Returns the result of the expression that fired an action. Valid only
730             within rules.
731              
732             =cut
733              
734 0     0 1   sub xpush { XML::Filter::Dispatcher::xpush( @_ ) }
735 0     0 1   sub xpop { XML::Filter::Dispatcher::xpop( @_ ) }
736 0     0 1   sub xadd { XML::Filter::Dispatcher::xadd( @_ ) }
737 0     0 1   sub xset { XML::Filter::Dispatcher::xset( @_ ) }
738              
739             =back
740              
741             =head2 Event Constructors
742              
743             These are exported by :write (in addition to being available individually).
744              
745             =over
746              
747             =cut
748              
749 2     2   15 no warnings "once";
  2         3  
  2         714  
750              
751             =item chars
752              
753             aka: characters
754              
755             =cut
756              
757             sub characters {
758 0     0 0   XML::Essex::Event::characters->new( @_ );
759             }
760              
761             *chars = \&characters;
762              
763              
764             =item end_doc
765              
766             aka: end_document
767              
768             =cut
769              
770             sub end_document {
771 0     0 0   XML::Essex::Event::end_doc->new( @_ );
772             }
773              
774             *end_doc = \&end_document;
775              
776             =item end_elt
777              
778             aka: end_element
779              
780             =cut
781              
782             sub end_element {
783 0     0 0   XML::Essex::Event::end_element->new( @_ );
784             }
785              
786             *end_elt = \&end_element;
787              
788              
789             =item start_doc
790              
791             aka: start_document
792              
793             =cut
794              
795             sub start_document {
796 0     0 0   XML::Essex::Event::start_document->new( @_ );
797             }
798              
799              
800             *start_doc = \&start_document;
801              
802              
803             =item start_elt
804              
805             aka: start_element
806              
807             =cut
808              
809             sub start_element {
810 0     0 0   XML::Essex::Event::start_element->new( @_ );
811             }
812              
813              
814             *start_elt = \&start_element;
815              
816             =item xml_decl
817              
818             =cut
819              
820             sub xml_decl {
821 0     0 1   XML::Essex::Event::xml_decl->new( @_ );
822             }
823              
824              
825             =back
826              
827             =head1 IMPLEMENTATION NOTES
828              
829             XML::Essex is a source filter that wraps from the C<use XML::Essex> line to the
830             end of the file in an eval { ... } block.
831              
832             =head1 LIMITATIONS
833              
834             Stay tuned.
835              
836             =head1 COPYRIGHT
837              
838             Copyright 2002, R. Barrie Slaymaker, Jr., All Rights Reserved
839              
840             =head1 LICENSE
841              
842             You may use this module under the terms of the BSD, Artistic, or GPL licenses,
843             any version.
844              
845             =head1 AUTHOR
846              
847             Barrie Slaymaker
848              
849             =cut
850              
851             1;