File Coverage

lib/Pod/Simple/PullParser.pm
Criterion Covered Total %
statement 209 232 90.0
branch 76 102 74.5
condition 42 67 62.6
subroutine 26 32 81.2
pod 13 17 76.4
total 366 450 81.3


line stmt bran cond sub pod time code
1             require 5;
2             package Pod::Simple::PullParser;
3             $VERSION = '3.43';
4 10     10   18214 use Pod::Simple ();
  10         26  
  10         325  
5 10     10   379 BEGIN {@ISA = ('Pod::Simple')}
6              
7 10     10   61 use strict;
  10         16  
  10         171  
8 10     10   43 use Carp ();
  10         18  
  10         155  
9              
10 10     10   4622 use Pod::Simple::PullParserStartToken;
  10         24  
  10         294  
11 10     10   3866 use Pod::Simple::PullParserEndToken;
  10         23  
  10         281  
12 10     10   3897 use Pod::Simple::PullParserTextToken;
  10         23  
  10         398  
13              
14 10 50   10   1635 BEGIN { *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG }
15              
16             __PACKAGE__->_accessorize(
17             'source_fh', # the filehandle we're reading from
18             'source_scalar_ref', # the scalarref we're reading from
19             'source_arrayref', # the arrayref we're reading from
20             );
21              
22             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
23             #
24             # And here is how we implement a pull-parser on top of a push-parser...
25              
26             sub filter {
27 0     0 1 0 my($self, $source) = @_;
28 0 0       0 $self = $self->new unless ref $self;
29              
30 0 0       0 $source = *STDIN{IO} unless defined $source;
31 0         0 $self->set_source($source);
32 0         0 $self->output_fh(*STDOUT{IO});
33              
34 0         0 $self->run; # define run() in a subclass if you want to use filter()!
35 0         0 return $self;
36             }
37              
38             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
39              
40             sub parse_string_document {
41 49     49 1 75 my $this = shift;
42 49         138 $this->set_source(\ $_[0]);
43 49         132 $this->run;
44             }
45              
46             sub parse_file {
47 13     13 1 27 my($this, $filename) = @_;
48 13         49 $this->set_source($filename);
49 13         42 $this->run;
50             }
51              
52             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
53             # In case anyone tries to use them:
54              
55             sub run {
56 10     10   68 use Carp ();
  10         18  
  10         963  
57 0 0 0 0 0 0 if( __PACKAGE__ eq ref($_[0]) || $_[0]) { # I'm not being subclassed!
58 0         0 Carp::croak "You can call run() only on subclasses of "
59             . __PACKAGE__;
60             } else {
61 0   0     0 Carp::croak join '',
62             "You can't call run() because ",
63             ref($_[0]) || $_[0], " didn't define a run() method";
64             }
65             }
66              
67             sub parse_lines {
68 10     10   62 use Carp ();
  10         21  
  10         471  
69 0     0 1 0 Carp::croak "Use set_source with ", __PACKAGE__,
70             " and subclasses, not parse_lines";
71             }
72              
73             sub parse_line {
74 10     10   57 use Carp ();
  10         19  
  10         27471  
75 0     0 0 0 Carp::croak "Use set_source with ", __PACKAGE__,
76             " and subclasses, not parse_line";
77             }
78              
79             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
80              
81             sub new {
82 89     89 1 2428 my $class = shift;
83 89         346 my $self = $class->SUPER::new(@_);
84 89 50       210 die "Couldn't construct for $class" unless $self;
85              
86 89   50     400 $self->{'token_buffer'} ||= [];
87 89   50     357 $self->{'start_token_class'} ||= 'Pod::Simple::PullParserStartToken';
88 89   50     354 $self->{'text_token_class'} ||= 'Pod::Simple::PullParserTextToken';
89 89   50     407 $self->{'end_token_class'} ||= 'Pod::Simple::PullParserEndToken';
90              
91 89         117 DEBUG > 1 and print STDERR "New pullparser object: $self\n";
92              
93 89         181 return $self;
94             }
95              
96             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
97              
98             sub get_token {
99 1398     1398 1 2586 my $self = shift;
100 1398         1479 DEBUG > 1 and print STDERR "\nget_token starting up on $self.\n";
101             DEBUG > 2 and print STDERR " Items in token-buffer (",
102             scalar( @{ $self->{'token_buffer'} } ) ,
103             ") :\n", map(
104             " " . $_->dump . "\n", @{ $self->{'token_buffer'} }
105             ),
106 1398         1479 @{ $self->{'token_buffer'} } ? '' : ' (no tokens)',
107             "\n"
108             ;
109              
110 1398         1628 until( @{ $self->{'token_buffer'} } ) {
  1985         3807  
111 587         701 DEBUG > 3 and print STDERR "I need to get something into my empty token buffer...\n";
112 587 100       1627 if($self->{'source_dead'}) {
    100          
    100          
    50          
113 79         120 DEBUG and print STDERR "$self 's source is dead.\n";
114 79         112 push @{ $self->{'token_buffer'} }, undef;
  79         166  
115             } elsif(exists $self->{'source_fh'}) {
116 23         33 my @lines;
117 23   33     71 my $fh = $self->{'source_fh'}
118             || Carp::croak('You have to call set_source before you can call get_token');
119            
120 23         32 DEBUG and print STDERR "$self 's source is filehandle $fh.\n";
121             # Read those many lines at a time
122 23         67 for(my $i = Pod::Simple::MANY_LINES; $i--;) {
123 298         303 DEBUG > 3 and print STDERR " Fetching a line from source filehandle $fh...\n";
124 298         609 local $/ = $Pod::Simple::NL;
125 298         1249 push @lines, scalar(<$fh>); # readline
126 298         471 DEBUG > 3 and print STDERR " Line is: ",
127             defined($lines[-1]) ? $lines[-1] : "\n";
128 298 100       753 unless( defined $lines[-1] ) {
129 17         21 DEBUG and print STDERR "That's it for that source fh! Killing.\n";
130 17         40 delete $self->{'source_fh'}; # so it can be GC'd
131 17         55 last;
132             }
133             # but pass thru the undef, which will set source_dead to true
134              
135             # TODO: look to see if $lines[-1] is =encoding, and if so,
136             # do horribly magic things
137              
138             }
139            
140 23         44 if(DEBUG > 8) {
141             print STDERR "* I've gotten ", scalar(@lines), " lines:\n";
142             foreach my $l (@lines) {
143             if(defined $l) {
144             print STDERR " line {$l}\n";
145             } else {
146             print STDERR " line undef\n";
147             }
148             }
149             print STDERR "* end of ", scalar(@lines), " lines\n";
150             }
151              
152 23         122 $self->SUPER::parse_lines(@lines);
153            
154             } elsif(exists $self->{'source_arrayref'}) {
155             DEBUG and print STDERR "$self 's source is arrayref $self->{'source_arrayref'}, with ",
156 2         4 scalar(@{$self->{'source_arrayref'}}), " items left in it.\n";
157              
158 2         3 DEBUG > 3 and print STDERR " Fetching ", Pod::Simple::MANY_LINES, " lines.\n";
159             $self->SUPER::parse_lines(
160 2         5 splice @{ $self->{'source_arrayref'} },
  2         15  
161             0,
162             Pod::Simple::MANY_LINES
163             );
164 2 50       4 unless( @{ $self->{'source_arrayref'} } ) {
  2         9  
165 2         4 DEBUG and print STDERR "That's it for that source arrayref! Killing.\n";
166 2         7 $self->SUPER::parse_lines(undef);
167 2         7 delete $self->{'source_arrayref'}; # so it can be GC'd
168             }
169             # to make sure that an undef is always sent to signal end-of-stream
170              
171             } elsif(exists $self->{'source_scalar_ref'}) {
172              
173             DEBUG and print STDERR "$self 's source is scalarref $self->{'source_scalar_ref'}, with ",
174             length(${ $self->{'source_scalar_ref'} }) -
175 483         622 (pos(${ $self->{'source_scalar_ref'} }) || 0),
176             " characters left to parse.\n";
177              
178 483         533 DEBUG > 3 and print STDERR " Fetching a line from source-string...\n";
179 483 100       583 if( ${ $self->{'source_scalar_ref'} } =~
  483         2047  
180             m/([^\n\r]*)((?:\r?\n)?)/g
181             ) {
182             #print(">> $1\n"),
183             $self->SUPER::parse_lines($1)
184             if length($1) or length($2)
185 56         118 or pos( ${ $self->{'source_scalar_ref'} })
186 427 100 100     2111 != length( ${ $self->{'source_scalar_ref'} });
  56   66     184  
187             # I.e., unless it's a zero-length "empty line" at the very
188             # end of "foo\nbar\n" (i.e., between the \n and the EOS).
189             } else { # that's the end. Byebye
190 56         169 $self->SUPER::parse_lines(undef);
191 56         107 delete $self->{'source_scalar_ref'};
192 56         103 DEBUG and print STDERR "That's it for that source scalarref! Killing.\n";
193             }
194              
195            
196             } else {
197 0         0 die "What source??";
198             }
199             }
200             DEBUG and print STDERR "get_token about to return ",
201             Pod::Simple::pretty( @{$self->{'token_buffer'}}
202 1398         1662 ? $self->{'token_buffer'}[-1] : undef
203             ), "\n";
204 1398         1595 return shift @{$self->{'token_buffer'}}; # that's an undef if empty
  1398         3981  
205             }
206              
207             sub unget_token {
208 96     96 1 196 my $self = shift;
209 96         122 DEBUG and print STDERR "Ungetting ", scalar(@_), " tokens: ",
210             @_ ? "@_\n" : "().\n";
211 96         171 foreach my $t (@_) {
212 656 50       1028 Carp::croak "Can't unget that, because it's not a token -- it's undef!"
213             unless defined $t;
214 656 50       1140 Carp::croak "Can't unget $t, because it's not a token -- it's a string!"
215             unless ref $t;
216 656 50       1549 Carp::croak "Can't unget $t, because it's not a token object!"
217             unless UNIVERSAL::can($t, 'type');
218             }
219            
220 96         133 unshift @{$self->{'token_buffer'}}, @_;
  96         281  
221             DEBUG > 1 and print STDERR "Token buffer now has ",
222 96         127 scalar(@{$self->{'token_buffer'}}), " items in it.\n";
223 96         192 return;
224             }
225              
226             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
227              
228             # $self->{'source_filename'} = $source;
229              
230             sub set_source {
231 90     90 1 1001327 my $self = shift @_;
232 90 50       193 return $self->{'source_fh'} unless @_;
233             Carp::croak("Cannot assign new source to pull parser; create a new instance, instead")
234 90 100 66     741 if $self->{'source_fh'} || $self->{'source_scalar_ref'} || $self->{'source_arrayref'};
      66        
235 89         138 my $handle;
236 89 50       458 if(!defined $_[0]) {
    100          
    100          
    100          
    100          
    50          
237 0         0 Carp::croak("Can't use empty-string as a source for set_source");
238             } elsif(ref(\( $_[0] )) eq 'GLOB') {
239 1         9 $self->{'source_filename'} = '' . ($handle = $_[0]);
240 1         2 DEBUG and print STDERR "$self 's source is glob $_[0]\n";
241             # and fall thru
242             } elsif(ref( $_[0] ) eq 'SCALAR') {
243 70         118 $self->{'source_scalar_ref'} = $_[0];
244 70         91 DEBUG and print STDERR "$self 's source is scalar ref $_[0]\n";
245 70         136 return;
246             } elsif(ref( $_[0] ) eq 'ARRAY') {
247 2         7 $self->{'source_arrayref'} = $_[0];
248 2         3 DEBUG and print STDERR "$self 's source is array ref $_[0]\n";
249 2         16 return;
250             } elsif(ref $_[0]) {
251 2         9 $self->{'source_filename'} = '' . ($handle = $_[0]);
252 2         5 DEBUG and print STDERR "$self 's source is fh-obj $_[0]\n";
253             } elsif(!length $_[0]) {
254 0         0 Carp::croak("Can't use empty-string as a source for set_source");
255             } else { # It's a filename!
256 14         19 DEBUG and print STDERR "$self 's source is filename $_[0]\n";
257             {
258 14         23 local *PODSOURCE;
  14         38  
259 14 50       622 open(PODSOURCE, "<$_[0]") || Carp::croak "Can't open $_[0]: $!";
260 14         89 $handle = *PODSOURCE{IO};
261             }
262 14         44 $self->{'source_filename'} = $_[0];
263 14         20 DEBUG and print STDERR " Its name is $_[0].\n";
264              
265             # TODO: file-discipline things here!
266             }
267              
268 17         39 $self->{'source_fh'} = $handle;
269 17         23 DEBUG and print STDERR " Its handle is $handle\n";
270 17         30 return 1;
271             }
272              
273             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
274              
275 0     0 0 0 sub get_title_short { shift->get_short_title(@_) } # alias
276              
277             sub get_short_title {
278 21     21 1 84 my $title = shift->get_title(@_);
279 21 100       96 $title = $1 if $title =~ m/^(\S{1,60})\s+--?\s+./s;
280             # turn "Foo::Bar -- bars for your foo" into "Foo::Bar"
281 21         63 return $title;
282             }
283              
284             sub get_title { shift->_get_titled_section(
285 39     39 1 268 'NAME', max_token => 50, desperate => 1, @_)
286             }
287             sub get_version { shift->_get_titled_section(
288 3     3 1 18 'VERSION',
289             max_token => 400,
290             accept_verbatim => 1,
291             max_content_length => 3_000,
292             @_,
293             );
294             }
295             sub get_description { shift->_get_titled_section(
296 7     7 1 184 'DESCRIPTION',
297             max_token => 400,
298             max_content_length => 3_000,
299             @_,
300             ) }
301              
302 0     0 0 0 sub get_authors { shift->get_author(@_) } # a harmless alias
303              
304             sub get_author {
305 2     2 1 122 my $this = shift;
306             # Max_token is so high because these are
307             # typically at the end of the document:
308 2 100       10 $this->_get_titled_section('AUTHOR' , max_token => 10_000, @_) ||
309             $this->_get_titled_section('AUTHORS', max_token => 10_000, @_);
310             }
311              
312             #--------------------------------------------------------------------------
313              
314             sub _get_titled_section {
315             # Based on a get_title originally contributed by Graham Barr
316 52     52   184 my($self, $titlename, %options) = (@_);
317            
318 52         127 my $max_token = delete $options{'max_token'};
319 52         86 my $desperate_for_title = delete $options{'desperate'};
320 52         89 my $accept_verbatim = delete $options{'accept_verbatim'};
321 52         81 my $max_content_length = delete $options{'max_content_length'};
322 52         80 my $nocase = delete $options{'nocase'};
323 52 100       124 $max_content_length = 120 unless defined $max_content_length;
324              
325 52 0       135 Carp::croak( "Unknown " . ((1 == keys %options) ? "option: " : "options: ")
    50          
326             . join " ", map "[$_]", sort keys %options
327             )
328             if keys %options;
329              
330 52         69 my %content_containers;
331 52         97 $content_containers{'Para'} = 1;
332 52 100       99 if($accept_verbatim) {
333 3         7 $content_containers{'Verbatim'} = 1;
334 3         6 $content_containers{'VerbatimFormatted'} = 1;
335             }
336              
337 52         102 my $token_count = 0;
338 52         88 my $title;
339             my @to_unget;
340 52         66 my $state = 0;
341 52         67 my $depth = 0;
342              
343 52 50 33     421 Carp::croak "What kind of titlename is \"$titlename\"?!" unless
344             defined $titlename and $titlename =~ m/^[A-Z ]{1,60}$/s; #sanity
345 52         108 my $titlename_re = quotemeta($titlename);
346              
347 52         114 my $head1_text_content;
348             my $para_text_content;
349 52         0 my $skipX;
350              
351 52   50     233 while(
      66        
352             ++$token_count <= ($max_token || 1_000_000)
353             and defined(my $token = $self->get_token)
354             ) {
355 563         848 push @to_unget, $token;
356              
357 563 100       1102 if ($state == 0) { # seeking =head1
    100          
    100          
    50          
358 276 100 100     623 if( $token->is_start and $token->tagname eq 'head1' ) {
359 62         90 DEBUG and print STDERR " Found head1. Seeking content...\n";
360 62         93 ++$state;
361 62         212 $head1_text_content = '';
362             }
363             }
364              
365             elsif($state == 1) { # accumulating text until end of head1
366 131 100 100     349 if( $token->is_text ) {
    100          
    100          
367 65 100       145 unless ($skipX) {
368 64         77 DEBUG and print STDERR " Adding \"", $token->text, "\" to head1-content.\n";
369 64         163 $head1_text_content .= $token->text;
370             }
371             } elsif( $token->is_tagname('X') ) {
372             # We're going to want to ignore X<> stuff.
373 2         6 $skipX = $token->is_start;
374 2         8 DEBUG and print STDERR +($skipX ? 'Start' : 'End'), 'ing ignoring of X<> tag';
375             } elsif( $token->is_end and $token->tagname eq 'head1' ) {
376 62         87 DEBUG and print STDERR " Found end of head1. Considering content...\n";
377 62 100       133 $head1_text_content = uc $head1_text_content if $nocase;
378 62 50 100     952 if($head1_text_content eq $titlename
    100 66        
    100 100        
      100        
      66        
379             or $head1_text_content =~ m/\($titlename_re\)/s
380             # We accept "=head1 Nomen Modularis (NAME)" for sake of i18n
381             ) {
382 36         56 DEBUG and print STDERR " Yup, it was $titlename. Seeking next para-content...\n";
383 36         128 ++$state;
384             } elsif(
385             $desperate_for_title
386             # if we're so desperate we'll take the first
387             # =head1's content as a title
388             and $head1_text_content =~ m/\S/
389             and $head1_text_content !~ m/^[ A-Z]+$/s
390             and $head1_text_content !~
391             m/\((?:
392             NAME | TITLE | VERSION | AUTHORS? | DESCRIPTION | SYNOPSIS
393             | COPYRIGHT | LICENSE | NOTES? | FUNCTIONS? | METHODS?
394             | CAVEATS? | BUGS? | SEE\ ALSO | SWITCHES | ENVIRONMENT
395             )\)/sx
396             # avoid accepting things like =head1 Thingy Thongy (DESCRIPTION)
397             and ($max_content_length
398             ? (length($head1_text_content) <= $max_content_length) # sanity
399             : 1)
400             ) {
401             # Looks good; trim it
402 6         27 ($title = $head1_text_content) =~ s/\s+$//;
403 6         9 DEBUG and print STDERR " It looks titular: \"$title\".\n\n Using that.\n";
404 6         17 last;
405             } else {
406 20         47 --$state;
407 20         89 DEBUG and print STDERR " Didn't look titular ($head1_text_content).\n",
408             "\n Dropping back to seeking-head1-content mode...\n";
409             }
410             }
411             }
412            
413             elsif($state == 2) {
414             # seeking start of para (which must immediately follow)
415 36 50 33     98 if($token->is_start and $content_containers{ $token->tagname }) {
416 36         51 DEBUG and print STDERR " Found start of Para. Accumulating content...\n";
417 36         105 $para_text_content = '';
418 36         140 ++$state;
419             } else {
420 0         0 DEBUG and print
421             " Didn't see an immediately subsequent start-Para. Reseeking H1\n";
422 0         0 $state = 0;
423             }
424             }
425            
426             elsif($state == 3) {
427             # accumulating text until end of Para
428 120 100 100     256 if( $token->is_text ) {
    100          
429 60         80 DEBUG and print STDERR " Adding \"", $token->text, "\" to para-content.\n";
430 60         156 $para_text_content .= $token->text;
431             # and keep looking
432            
433             } elsif( $token->is_end and $content_containers{ $token->tagname } ) {
434 36         48 DEBUG and print STDERR " Found end of Para. Considering content: ",
435             $para_text_content, "\n";
436              
437 36 50 33     289 if( $para_text_content =~ m/\S/
    50          
438             and ($max_content_length
439             ? (length($para_text_content) <= $max_content_length)
440             : 1)
441             ) {
442             # Some minimal sanity constraints, I think.
443 36         51 DEBUG and print STDERR " It looks contentworthy, I guess. Using it.\n";
444 36         59 $title = $para_text_content;
445 36         84 last;
446             } else {
447 0         0 DEBUG and print STDERR " Doesn't look at all contentworthy!\n Giving up.\n";
448 0         0 undef $title;
449 0         0 last;
450             }
451             }
452             }
453            
454             else {
455 0         0 die "IMPOSSIBLE STATE $state!\n"; # should never happen
456             }
457            
458             }
459            
460             # Put it all back!
461 52         210 $self->unget_token(@to_unget);
462            
463 52         63 if(DEBUG) {
464             if(defined $title) { print STDERR " Returning title <$title>\n" }
465             else { print STDERR "Returning title <>\n" }
466             }
467            
468 52 100       144 return '' unless defined $title;
469 42         123 $title =~ s/^\s+//;
470 42         291 return $title;
471             }
472              
473             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
474             #
475             # Methods that actually do work at parse-time:
476              
477             sub _handle_element_start {
478 346     346   492 my $self = shift; # leaving ($element_name, $attr_hash_r)
479 346         430 DEBUG > 2 and print STDERR "++ $_[0] (", map("<$_> ", %{$_[1]}), ")\n";
480            
481 346         1378 push @{ $self->{'token_buffer'} },
482 346         421 $self->{'start_token_class'}->new(@_);
483 346         679 return;
484             }
485              
486             sub _handle_text {
487 243     243   325 my $self = shift; # leaving ($text)
488 243         292 DEBUG > 2 and print STDERR "== $_[0]\n";
489 243         1276 push @{ $self->{'token_buffer'} },
490 243         282 $self->{'text_token_class'}->new(@_);
491 243         630 return;
492             }
493              
494             sub _handle_element_end {
495 332     332   433 my $self = shift; # leaving ($element_name);
496 332         385 DEBUG > 2 and print STDERR "-- $_[0]\n";
497 332         1112 push @{ $self->{'token_buffer'} },
498 332         412 $self->{'end_token_class'}->new(@_);
499 332         593 return;
500             }
501              
502             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
503              
504             1;
505              
506              
507             __END__