File Coverage

lib/MP3/PodcastFetch/XML/SimpleParser.pm
Criterion Covered Total %
statement 59 65 90.7
branch 12 18 66.6
condition 3 6 50.0
subroutine 18 21 85.7
pod 15 15 100.0
total 107 125 85.6


line stmt bran cond sub pod time code
1             package MP3::PodcastFetch::XML::SimpleParser;
2 1     1   1102 use HTML::Parser;
  1         7493  
  1         90  
3              
4             =head1 XML::SimpleParser -- a simple sax-based parser
5              
6             =head1 SYNOPSIS
7              
8             package MyFooParser;
9             use base 'MP3::PodcastFetch::XML::SimpleParser';
10              
11             # process tags
12             sub t_foo {
13             my $self = shift;
14             my $attrs = shift;
15             if ($attrs) { # tag is starting
16             # do something
17             }
18             else {
19             # do something else
20             }
21             }
22              
23             my $parser = MyFooParser->new();
24             $parser->parse_file('/path/to/an/XML/file')
25             my @results = $parser->results;
26              
27             =head1 DESCRIPTION
28              
29             This package provides a very simple stream-based XML parser. It
30             handles open and close tags and attributes. It does not handle
31             namespaces very well. It was written to support a variety of projects
32             that do not need sophisticated processing, including
33             MP3::PodcastFetch.
34              
35             Do not confuse this with XML::SimpleParser, which is a DOM-based
36             parser.
37              
38             To use this module, create a new subclass of
39             MP3::PodcastFetch::XML::SimpleParser, and define a new method for each
40             tag that you wish to process (all other tags will be ignored). The
41             method should be named t_method_name, where "method_name" should be
42             replaced by the name of the tag you wish to handle. Tag names are case
43             sensitive. For exammple, if the XML file you wish to parse looks like
44             this:
45              
46            
47             Some char data
48             Some more char data
49            
50              
51             You could define a t_foo() and a t_bar() method to handle each of
52             these tags. If a tag name has a funny character in it, such as "-",
53             use a method that has an underscore there instead. The same goes for
54             namespace tags: for a tag like , define a method named
55             podcast_foo(). Sorry, but dynamic resolution of namespaces is not
56             supported.
57              
58             Methods should look like this:
59              
60             sub t_foo {
61             my $self = shift;
62             my $attrs = shift;
63             if ($attrs) {
64             # do something to handle the start tag
65             }
66             else {
67             # do something to handle the end tag
68             }
69             }
70              
71             When the start tag is encountered, a hash reference containing
72             the start tag's attributes are passed as the second argument (if there
73             are no attributes, then an empty hash is provided). When the end tag
74             is encountered, $attrs will be undef. This allows you to distinguish
75             between start and end tags.
76              
77             Ordinarily you will want to set up objects when encountering the start
78             tag and close and clean them up when encountering the end tag. The
79             following example shows how to transform the snippet of XML shown
80             above into the following data structure:
81              
82             { size => 3,
83             bar_list => ['Some char data','Some more char data']
84             }
85              
86             Here's the code:
87              
88             sub t_foo {
89             my $self = shift;
90             my $attrs = shift;
91             if ($attrs) { # starting
92             $self->{current} = { size => $attrs->{size}
93             bar_list => []
94             }
95             }
96             else {
97             $self->add_object($self->{current});
98             undef $self->{current};
99             }
100             }
101              
102             sub t_bar {
103             my $self = shift;
104             my $attrs = shift;
105             if ($attrs) { # starting
106             }
107             else {
108             my $list = $self->{current}{bar_list};
109             die "ERROR: got a without an enclosing " unless $list;
110             my $data = $self->char_data; # get contents
111             push @$list,@data;
112             }
113             }
114              
115             When t_foo() encounters the start of the tag, it creates a new
116             hash and stores it in a temporary hash key called "current". When it
117             encounters the tag (indicated by an undefined $attrs argument),
118             it fetches this hash and calls the inherited add_object() method to
119             add this result to the list of results to return at the end of the
120             parse. It then undefs the {current} key.
121              
122             The t_bar method does nothing when the opening is encountered,
123             but when is seen, it fetches the array ref pointed to by
124             $self->{current}{bar_list} and adds the text content of the
125             section to the list. The inherited char_data() method
126             makes it possible to get at this data. It then pushes the character
127             data onto the end of the list.
128              
129             When working with this subclass, you would call parse_file() to parse
130             an entire file at once or parse() to parse a data stream a bit at a
131             time. When the parse is finished, you'd call result() to get the list
132             of data objects (in this case, a single hash) added by add_object().
133              
134             You can also define a callback that will be invoked each time
135             add_object() is called in order to process each object as it comes in,
136             rather than storing it for later retrieval.
137              
138             You may also override the do_tag() method in order to process
139             unexpected tags that do not have a named method to process them.
140              
141             =head1 METHODS
142              
143             =over 4
144              
145             =cut
146              
147 1     1   9 use warnings;
  1         1  
  1         31  
148 1     1   5 use strict;
  1         1  
  1         917  
149              
150             =item $parser = MyParserSubclass->new()
151              
152             This method creates a new parser object in the current subclass. It takes no arguments.
153              
154             =cut
155              
156             sub new {
157 4     4 1 6 my $class = shift;
158 4   33     27 my $self = bless {},ref $class || $class;
159             my $parser = HTML::Parser->new(api_version => 3,
160 124     124   258 start_h => [ sub { $self->tag_starts(@_) },'tagname,attr' ],
161 124     124   322 end_h => [ sub { $self->tag_stops(@_) },'tagname' ],
162 4     244   78 text_h => [ sub { $self->char_data(@_) },'dtext' ]);
  244         1190  
163 4         288 $parser->xml_mode(1);
164 4         7 eval { $parser->utf8_mode(1); };
  4         12  
165 4         19 $self->parser($parser);
166 4         10 return $self;
167             }
168              
169             =item $low_level_parser = $parser->parser([$new_low_level_parser])
170              
171             MP3::PodcastFetch::XML::SimpleParser uses HTML::Parser (in xml_mode)
172             to do its low-level parsing. This method sets or gets that parser.
173              
174             =cut
175              
176             sub parser {
177 12     12 1 20 my $self = shift;
178 12         116 my $d = $self->{'XML::SimpleParser::parser'};
179 12 100       41 $self->{'XML::SimpleParser::parser'} = shift if @_;
180 12         87 $d;
181             }
182              
183             =item $parser->parse_file($path)
184              
185             This method fully parses the file given at the indicated path.
186              
187             =cut
188              
189             sub parse_file {
190 0     0 1 0 shift->parser->parse_file(@_);
191             }
192              
193             =item $parser->parse($partial_data)
194              
195             This method parses the partial XML data given by the string
196             $partial_data. This allows incremental parsing of web data using,
197             e.g., the LWP library. Call this method with each bit of partial data,
198             then call eof() at the end to allow the parser to clean up its
199             internal data structures.
200              
201             =cut
202              
203             sub parse {
204 4     4 1 15 shift->parser->parse(@_);
205             }
206              
207             =item $parser->eof()
208              
209             Tell the parser to finish the parse. Use at the end of a series of
210             parse() calls.
211              
212             =cut
213              
214             sub eof {
215 4     4 1 11 shift->parser->eof;
216             }
217              
218             =item $parser->tag_starts
219              
220             This method is called during the parse to handle a start tag. It
221             should not ordinarily be overridden or called directly.
222              
223             =cut
224              
225             # tags will be handled by a method named t_TAGNAME
226             sub tag_starts {
227 124     124 1 166 my $self = shift;
228 124         169 my ($tag,$attrs) = @_;
229 124         217 $tag =~ s/[^\w]/_/g;
230 124         166 my $method = "t_$tag";
231 124         159 $self->{'XML::SimpleParser::char_data'} = ''; # clear char data
232 124 100       586 $self->can($method)
233             ? $self->$method($attrs)
234             : $self->do_tag($tag,$attrs);
235             }
236              
237             =item $parser->tag_stops
238              
239             This method is called during the parse to handle a stop tag. It should
240             not ordinarily be overridden or called directly.
241              
242             =cut
243              
244             # tags will be handled by a method named t_TAGNAME
245             sub tag_stops {
246 124     124 1 134 my $self = shift;
247 124         133 my $tag = shift;
248 124         244 $tag =~ s/[^\w]/_/g;
249 124         1362 my $method = "t_$tag";
250 124 100       509 $self->can($method)
251             ? $self->$method()
252             : $self->do_tag($tag);
253             }
254              
255             =item $parser->char_data
256              
257             This method is called internally during the parse to handle character
258             data. It should not ordinarily be overridden or called directly.
259              
260             =cut
261              
262             sub char_data {
263 332     332 1 351 my $self = shift;
264 332 100 66     1184 if (@_ && length(my $text = shift)>0) {
265 244         1019 $self->{'XML::SimpleParser::char_data'} .= $text;
266             } else {
267 88         184 $self->trim($self->{'XML::SimpleParser::char_data'});
268             }
269             }
270              
271             =item $parser->cleanup
272              
273             This method is provided to be called at the end of the parse to handle
274             any cleanup that is needed. The default behavior is to do nothing,
275             but it can be overridden by a subclass to provide more sophisticated
276             processing.
277              
278             =cut
279              
280             sub cleanup {
281 0     0 1 0 my $self = shift;
282             }
283              
284             =item $parser->clear_results
285              
286             This method is called internally at the start of the parse to clear
287             any accumulated results and to get ready for a new parse.
288              
289             =cut
290              
291             sub clear_results {
292 0     0 1 0 shift->{'XML::SimpleParser::results'} = [];
293             }
294              
295             =item $parser->add_object(@objects)
296              
297             This method can be called during the parse to add one or more objects
298             to the results list.
299              
300             =cut
301              
302             # add one or more objects to our results list
303             sub add_object {
304 4     4 1 6 my $self = shift;
305 4 50       13 if (my $cb = $self->callback) {
306 0         0 eval {$cb->(@_)};
  0         0  
307 0 0       0 warn $@ if $@;
308             } else {
309 4         5 push @{$self->{'XML::SimpleParser::results'}},@_;
  4         25  
310             }
311             }
312              
313             =item @results = $parser->results
314              
315             In a list context this method returns the accumulated results from the
316             parse.
317              
318             In a scalar context, this method will return an array reference.
319              
320             =cut
321              
322             sub results {
323 4     4 1 6 my $self = shift;
324 4 50       13 my $r = $self->{'XML::SimpleParser::results'} or return;
325 4 50       160 return wantarray ? @$r : $r;
326             }
327              
328             =item $parser->do_tag
329              
330             This method is called whenver the parse encounters a tag that does not
331             have a specific method to handle it. The call signature is identical
332             to t_TAGNAME methods. By default, it does nothing.
333              
334             =cut
335              
336             sub do_tag {
337 16     16 1 19 my $self = shift;
338 16         80 my ($tag,$attrs) = @_;
339             # do nothing
340             }
341              
342             =item $callback = $parser->callback([$new_callback])
343              
344             This accessor allows you to get or set a callback code that will be
345             used to process objects generated by the parse. If a callback is
346             defined, then add_object() will not add the object to the results
347             list, but will instead pass it to the callback for processing. If
348             multiple objects are passed to add_object, then they will be passed to
349             the callback as one long argument list.
350              
351             =cut
352              
353             # get/set callback
354             sub callback {
355 4     4 1 7 my $self = shift;
356 4         14 my $d = $self->{'XML::SimpleParser::callback'};
357 4 50       39 $self->{'XML::SimpleParser::callback'} = shift if @_;
358 4         11 $d;
359             }
360              
361             =item $trimmed_string = $parser->trim($untrimmed_string)
362              
363             This internal method strips leading and trailing whitespace from a
364             string.
365              
366             =cut
367              
368             # utilities
369             sub trim {
370 88     88 1 86 my $self = shift;
371 88         100 my $string = shift;
372 88         139 $string =~ s/^\s+//;
373 88         175 $string =~ s/\s+$//;
374 88         1788 $string;
375             }
376              
377             1;
378              
379             =back
380              
381             =head1 SEE ALSO
382              
383             L,
384             L,
385             L,
386             L,
387             L,
388             L,
389              
390             =head1 AUTHOR
391              
392             Lincoln Stein .
393              
394             Copyright (c) 2006 Cold Spring Harbor Laboratory
395              
396             This library is free software; you can redistribute it and/or modify
397             it under the same terms as Perl itself. See DISCLAIMER.txt for
398             disclaimers of warranty.
399              
400             =cut
401