File Coverage

lib/Pod/Simple/PullParser.pm
Criterion Covered Total %
statement 209 232 90.0
branch 76 102 74.5
condition 42 67 62.6
subroutine 26 32 81.2
pod 13 17 76.4
total 366 450 81.3


line stmt bran cond sub pod time code
1             require 5;
2             package Pod::Simple::PullParser;
3             $VERSION = '3.42';
4 10     10   13932 use Pod::Simple ();
  10         25  
  10         305  
5 10     10   370 BEGIN {@ISA = ('Pod::Simple')}
6              
7 10     10   60 use strict;
  10         19  
  10         215  
8 10     10   46 use Carp ();
  10         16  
  10         153  
9              
10 10     10   4670 use Pod::Simple::PullParserStartToken;
  10         25  
  10         311  
11 10     10   4127 use Pod::Simple::PullParserEndToken;
  10         23  
  10         283  
12 10     10   3706 use Pod::Simple::PullParserTextToken;
  10         25  
  10         388  
13              
14 10 50   10   1629 BEGIN { *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG }
15              
16             __PACKAGE__->_accessorize(
17             'source_fh', # the filehandle we're reading from
18             'source_scalar_ref', # the scalarref we're reading from
19             'source_arrayref', # the arrayref we're reading from
20             );
21              
22             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
23             #
24             # And here is how we implement a pull-parser on top of a push-parser...
25              
26             sub filter {
27 0     0 1 0 my($self, $source) = @_;
28 0 0       0 $self = $self->new unless ref $self;
29              
30 0 0       0 $source = *STDIN{IO} unless defined $source;
31 0         0 $self->set_source($source);
32 0         0 $self->output_fh(*STDOUT{IO});
33              
34 0         0 $self->run; # define run() in a subclass if you want to use filter()!
35 0         0 return $self;
36             }
37              
38             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
39              
40             sub parse_string_document {
41 49     49 1 78 my $this = shift;
42 49         146 $this->set_source(\ $_[0]);
43 49         145 $this->run;
44             }
45              
46             sub parse_file {
47 13     13 1 37 my($this, $filename) = @_;
48 13         89 $this->set_source($filename);
49 13         51 $this->run;
50             }
51              
52             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
53             # In case anyone tries to use them:
54              
55             sub run {
56 10     10   70 use Carp ();
  10         18  
  10         1014  
57 0 0 0 0 0 0 if( __PACKAGE__ eq ref($_[0]) || $_[0]) { # I'm not being subclassed!
58 0         0 Carp::croak "You can call run() only on subclasses of "
59             . __PACKAGE__;
60             } else {
61 0   0     0 Carp::croak join '',
62             "You can't call run() because ",
63             ref($_[0]) || $_[0], " didn't define a run() method";
64             }
65             }
66              
67             sub parse_lines {
68 10     10   63 use Carp ();
  10         23  
  10         460  
69 0     0 1 0 Carp::croak "Use set_source with ", __PACKAGE__,
70             " and subclasses, not parse_lines";
71             }
72              
73             sub parse_line {
74 10     10   58 use Carp ();
  10         25  
  10         27220  
75 0     0 0 0 Carp::croak "Use set_source with ", __PACKAGE__,
76             " and subclasses, not parse_line";
77             }
78              
79             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
80              
81             sub new {
82 89     89 1 1835 my $class = shift;
83 89         368 my $self = $class->SUPER::new(@_);
84 89 50       328 die "Couldn't construct for $class" unless $self;
85              
86 89   50     392 $self->{'token_buffer'} ||= [];
87 89   50     357 $self->{'start_token_class'} ||= 'Pod::Simple::PullParserStartToken';
88 89   50     309 $self->{'text_token_class'} ||= 'Pod::Simple::PullParserTextToken';
89 89   50     400 $self->{'end_token_class'} ||= 'Pod::Simple::PullParserEndToken';
90              
91 89         108 DEBUG > 1 and print STDERR "New pullparser object: $self\n";
92              
93 89         185 return $self;
94             }
95              
96             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
97              
98             sub get_token {
99 1398     1398 1 2486 my $self = shift;
100 1398         1464 DEBUG > 1 and print STDERR "\nget_token starting up on $self.\n";
101             DEBUG > 2 and print STDERR " Items in token-buffer (",
102             scalar( @{ $self->{'token_buffer'} } ) ,
103             ") :\n", map(
104             " " . $_->dump . "\n", @{ $self->{'token_buffer'} }
105             ),
106 1398         1531 @{ $self->{'token_buffer'} } ? '' : ' (no tokens)',
107             "\n"
108             ;
109              
110 1398         1579 until( @{ $self->{'token_buffer'} } ) {
  1985         3706  
111 587         1072 DEBUG > 3 and print STDERR "I need to get something into my empty token buffer...\n";
112 587 100       1520 if($self->{'source_dead'}) {
    100          
    100          
    50          
113 79         99 DEBUG and print STDERR "$self 's source is dead.\n";
114 79         114 push @{ $self->{'token_buffer'} }, undef;
  79         158  
115             } elsif(exists $self->{'source_fh'}) {
116 23         33 my @lines;
117 23   33     66 my $fh = $self->{'source_fh'}
118             || Carp::croak('You have to call set_source before you can call get_token');
119            
120 23         33 DEBUG and print STDERR "$self 's source is filehandle $fh.\n";
121             # Read those many lines at a time
122 23         54 for(my $i = Pod::Simple::MANY_LINES; $i--;) {
123 298         314 DEBUG > 3 and print STDERR " Fetching a line from source filehandle $fh...\n";
124 298         647 local $/ = $Pod::Simple::NL;
125 298         1169 push @lines, scalar(<$fh>); # readline
126 298         434 DEBUG > 3 and print STDERR " Line is: ",
127             defined($lines[-1]) ? $lines[-1] : "\n";
128 298 100       727 unless( defined $lines[-1] ) {
129 17         24 DEBUG and print STDERR "That's it for that source fh! Killing.\n";
130 17         45 delete $self->{'source_fh'}; # so it can be GC'd
131 17         67 last;
132             }
133             # but pass thru the undef, which will set source_dead to true
134              
135             # TODO: look to see if $lines[-1] is =encoding, and if so,
136             # do horribly magic things
137              
138             }
139            
140 23         31 if(DEBUG > 8) {
141             print STDERR "* I've gotten ", scalar(@lines), " lines:\n";
142             foreach my $l (@lines) {
143             if(defined $l) {
144             print STDERR " line {$l}\n";
145             } else {
146             print STDERR " line undef\n";
147             }
148             }
149             print STDERR "* end of ", scalar(@lines), " lines\n";
150             }
151              
152 23         126 $self->SUPER::parse_lines(@lines);
153            
154             } elsif(exists $self->{'source_arrayref'}) {
155             DEBUG and print STDERR "$self 's source is arrayref $self->{'source_arrayref'}, with ",
156 2         3 scalar(@{$self->{'source_arrayref'}}), " items left in it.\n";
157              
158 2         3 DEBUG > 3 and print STDERR " Fetching ", Pod::Simple::MANY_LINES, " lines.\n";
159             $self->SUPER::parse_lines(
160 2         3 splice @{ $self->{'source_arrayref'} },
  2         9  
161             0,
162             Pod::Simple::MANY_LINES
163             );
164 2 50       5 unless( @{ $self->{'source_arrayref'} } ) {
  2         4  
165 2         4 DEBUG and print STDERR "That's it for that source arrayref! Killing.\n";
166 2         5 $self->SUPER::parse_lines(undef);
167 2         5 delete $self->{'source_arrayref'}; # so it can be GC'd
168             }
169             # to make sure that an undef is always sent to signal end-of-stream
170              
171             } elsif(exists $self->{'source_scalar_ref'}) {
172              
173             DEBUG and print STDERR "$self 's source is scalarref $self->{'source_scalar_ref'}, with ",
174             length(${ $self->{'source_scalar_ref'} }) -
175 483         543 (pos(${ $self->{'source_scalar_ref'} }) || 0),
176             " characters left to parse.\n";
177              
178 483         556 DEBUG > 3 and print STDERR " Fetching a line from source-string...\n";
179 483 100       557 if( ${ $self->{'source_scalar_ref'} } =~
  483         1936  
180             m/([^\n\r]*)((?:\r?\n)?)/g
181             ) {
182             #print(">> $1\n"),
183             $self->SUPER::parse_lines($1)
184             if length($1) or length($2)
185 56         125 or pos( ${ $self->{'source_scalar_ref'} })
186 427 100 100     1970 != length( ${ $self->{'source_scalar_ref'} });
  56   66     180  
187             # I.e., unless it's a zero-length "empty line" at the very
188             # end of "foo\nbar\n" (i.e., between the \n and the EOS).
189             } else { # that's the end. Byebye
190 56         195 $self->SUPER::parse_lines(undef);
191 56         110 delete $self->{'source_scalar_ref'};
192 56         93 DEBUG and print STDERR "That's it for that source scalarref! Killing.\n";
193             }
194              
195            
196             } else {
197 0         0 die "What source??";
198             }
199             }
200             DEBUG and print STDERR "get_token about to return ",
201             Pod::Simple::pretty( @{$self->{'token_buffer'}}
202 1398         1636 ? $self->{'token_buffer'}[-1] : undef
203             ), "\n";
204 1398         1524 return shift @{$self->{'token_buffer'}}; # that's an undef if empty
  1398         3861  
205             }
206              
207             sub unget_token {
208 96     96 1 147 my $self = shift;
209 96         114 DEBUG and print STDERR "Ungetting ", scalar(@_), " tokens: ",
210             @_ ? "@_\n" : "().\n";
211 96         179 foreach my $t (@_) {
212 656 50       976 Carp::croak "Can't unget that, because it's not a token -- it's undef!"
213             unless defined $t;
214 656 50       1075 Carp::croak "Can't unget $t, because it's not a token -- it's a string!"
215             unless ref $t;
216 656 50       1420 Carp::croak "Can't unget $t, because it's not a token object!"
217             unless UNIVERSAL::can($t, 'type');
218             }
219            
220 96         122 unshift @{$self->{'token_buffer'}}, @_;
  96         242  
221             DEBUG > 1 and print STDERR "Token buffer now has ",
222 96         119 scalar(@{$self->{'token_buffer'}}), " items in it.\n";
223 96         191 return;
224             }
225              
226             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
227              
228             # $self->{'source_filename'} = $source;
229              
230             sub set_source {
231 90     90 1 1000940 my $self = shift @_;
232 90 50       195 return $self->{'source_fh'} unless @_;
233             Carp::croak("Cannot assign new source to pull parser; create a new instance, instead")
234 90 100 66     656 if $self->{'source_fh'} || $self->{'source_scalar_ref'} || $self->{'source_arrayref'};
      66        
235 89         134 my $handle;
236 89 50       439 if(!defined $_[0]) {
    100          
    100          
    100          
    100          
    50          
237 0         0 Carp::croak("Can't use empty-string as a source for set_source");
238             } elsif(ref(\( $_[0] )) eq 'GLOB') {
239 1         7 $self->{'source_filename'} = '' . ($handle = $_[0]);
240 1         3 DEBUG and print STDERR "$self 's source is glob $_[0]\n";
241             # and fall thru
242             } elsif(ref( $_[0] ) eq 'SCALAR') {
243 70         115 $self->{'source_scalar_ref'} = $_[0];
244 70         90 DEBUG and print STDERR "$self 's source is scalar ref $_[0]\n";
245 70         123 return;
246             } elsif(ref( $_[0] ) eq 'ARRAY') {
247 2         3 $self->{'source_arrayref'} = $_[0];
248 2         3 DEBUG and print STDERR "$self 's source is array ref $_[0]\n";
249 2         5 return;
250             } elsif(ref $_[0]) {
251 2         8 $self->{'source_filename'} = '' . ($handle = $_[0]);
252 2         3 DEBUG and print STDERR "$self 's source is fh-obj $_[0]\n";
253             } elsif(!length $_[0]) {
254 0         0 Carp::croak("Can't use empty-string as a source for set_source");
255             } else { # It's a filename!
256 14         24 DEBUG and print STDERR "$self 's source is filename $_[0]\n";
257             {
258 14         22 local *PODSOURCE;
  14         43  
259 14 50       627 open(PODSOURCE, "<$_[0]") || Carp::croak "Can't open $_[0]: $!";
260 14         82 $handle = *PODSOURCE{IO};
261             }
262 14         48 $self->{'source_filename'} = $_[0];
263 14         26 DEBUG and print STDERR " Its name is $_[0].\n";
264              
265             # TODO: file-discipline things here!
266             }
267              
268 17         74 $self->{'source_fh'} = $handle;
269 17         22 DEBUG and print STDERR " Its handle is $handle\n";
270 17         39 return 1;
271             }
272              
273             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
274              
275 0     0 0 0 sub get_title_short { shift->get_short_title(@_) } # alias
276              
277             sub get_short_title {
278 21     21 1 77 my $title = shift->get_title(@_);
279 21 100       133 $title = $1 if $title =~ m/^(\S{1,60})\s+--?\s+./s;
280             # turn "Foo::Bar -- bars for your foo" into "Foo::Bar"
281 21         78 return $title;
282             }
283              
284             sub get_title { shift->_get_titled_section(
285 39     39 1 223 'NAME', max_token => 50, desperate => 1, @_)
286             }
287             sub get_version { shift->_get_titled_section(
288 3     3 1 13 'VERSION',
289             max_token => 400,
290             accept_verbatim => 1,
291             max_content_length => 3_000,
292             @_,
293             );
294             }
295             sub get_description { shift->_get_titled_section(
296 7     7 1 146 'DESCRIPTION',
297             max_token => 400,
298             max_content_length => 3_000,
299             @_,
300             ) }
301              
302 0     0 0 0 sub get_authors { shift->get_author(@_) } # a harmless alias
303              
304             sub get_author {
305 2     2 1 88 my $this = shift;
306             # Max_token is so high because these are
307             # typically at the end of the document:
308 2 100       7 $this->_get_titled_section('AUTHOR' , max_token => 10_000, @_) ||
309             $this->_get_titled_section('AUTHORS', max_token => 10_000, @_);
310             }
311              
312             #--------------------------------------------------------------------------
313              
314             sub _get_titled_section {
315             # Based on a get_title originally contributed by Graham Barr
316 52     52   191 my($self, $titlename, %options) = (@_);
317            
318 52         109 my $max_token = delete $options{'max_token'};
319 52         101 my $desperate_for_title = delete $options{'desperate'};
320 52         83 my $accept_verbatim = delete $options{'accept_verbatim'};
321 52         71 my $max_content_length = delete $options{'max_content_length'};
322 52         71 my $nocase = delete $options{'nocase'};
323 52 100       115 $max_content_length = 120 unless defined $max_content_length;
324              
325 52 0       135 Carp::croak( "Unknown " . ((1 == keys %options) ? "option: " : "options: ")
    50          
326             . join " ", map "[$_]", sort keys %options
327             )
328             if keys %options;
329              
330 52         66 my %content_containers;
331 52         83 $content_containers{'Para'} = 1;
332 52 100       117 if($accept_verbatim) {
333 3         5 $content_containers{'Verbatim'} = 1;
334 3         5 $content_containers{'VerbatimFormatted'} = 1;
335             }
336              
337 52         80 my $token_count = 0;
338 52         74 my $title;
339             my @to_unget;
340 52         63 my $state = 0;
341 52         61 my $depth = 0;
342              
343 52 50 33     380 Carp::croak "What kind of titlename is \"$titlename\"?!" unless
344             defined $titlename and $titlename =~ m/^[A-Z ]{1,60}$/s; #sanity
345 52         100 my $titlename_re = quotemeta($titlename);
346              
347 52         115 my $head1_text_content;
348             my $para_text_content;
349 52         0 my $skipX;
350              
351 52   50     225 while(
      66        
352             ++$token_count <= ($max_token || 1_000_000)
353             and defined(my $token = $self->get_token)
354             ) {
355 563         723 push @to_unget, $token;
356              
357 563 100       966 if ($state == 0) { # seeking =head1
    100          
    100          
    50          
358 276 100 100     552 if( $token->is_start and $token->tagname eq 'head1' ) {
359 62         83 DEBUG and print STDERR " Found head1. Seeking content...\n";
360 62         74 ++$state;
361 62         181 $head1_text_content = '';
362             }
363             }
364              
365             elsif($state == 1) { # accumulating text until end of head1
366 131 100 100     299 if( $token->is_text ) {
    100          
    100          
367 65 100       148 unless ($skipX) {
368 64         75 DEBUG and print STDERR " Adding \"", $token->text, "\" to head1-content.\n";
369 64         152 $head1_text_content .= $token->text;
370             }
371             } elsif( $token->is_tagname('X') ) {
372             # We're going to want to ignore X<> stuff.
373 2         5 $skipX = $token->is_start;
374 2         5 DEBUG and print STDERR +($skipX ? 'Start' : 'End'), 'ing ignoring of X<> tag';
375             } elsif( $token->is_end and $token->tagname eq 'head1' ) {
376 62         75 DEBUG and print STDERR " Found end of head1. Considering content...\n";
377 62 100       103 $head1_text_content = uc $head1_text_content if $nocase;
378 62 50 100     640 if($head1_text_content eq $titlename
    100 66        
    100 100        
      100        
      66        
379             or $head1_text_content =~ m/\($titlename_re\)/s
380             # We accept "=head1 Nomen Modularis (NAME)" for sake of i18n
381             ) {
382 36         50 DEBUG and print STDERR " Yup, it was $titlename. Seeking next para-content...\n";
383 36         122 ++$state;
384             } elsif(
385             $desperate_for_title
386             # if we're so desperate we'll take the first
387             # =head1's content as a title
388             and $head1_text_content =~ m/\S/
389             and $head1_text_content !~ m/^[ A-Z]+$/s
390             and $head1_text_content !~
391             m/\((?:
392             NAME | TITLE | VERSION | AUTHORS? | DESCRIPTION | SYNOPSIS
393             | COPYRIGHT | LICENSE | NOTES? | FUNCTIONS? | METHODS?
394             | CAVEATS? | BUGS? | SEE\ ALSO | SWITCHES | ENVIRONMENT
395             )\)/sx
396             # avoid accepting things like =head1 Thingy Thongy (DESCRIPTION)
397             and ($max_content_length
398             ? (length($head1_text_content) <= $max_content_length) # sanity
399             : 1)
400             ) {
401             # Looks good; trim it
402 6         26 ($title = $head1_text_content) =~ s/\s+$//;
403 6         13 DEBUG and print STDERR " It looks titular: \"$title\".\n\n Using that.\n";
404 6         13 last;
405             } else {
406 20         35 --$state;
407 20         68 DEBUG and print STDERR " Didn't look titular ($head1_text_content).\n",
408             "\n Dropping back to seeking-head1-content mode...\n";
409             }
410             }
411             }
412            
413             elsif($state == 2) {
414             # seeking start of para (which must immediately follow)
415 36 50 33     85 if($token->is_start and $content_containers{ $token->tagname }) {
416 36         59 DEBUG and print STDERR " Found start of Para. Accumulating content...\n";
417 36         47 $para_text_content = '';
418 36         114 ++$state;
419             } else {
420 0         0 DEBUG and print
421             " Didn't see an immediately subsequent start-Para. Reseeking H1\n";
422 0         0 $state = 0;
423             }
424             }
425            
426             elsif($state == 3) {
427             # accumulating text until end of Para
428 120 100 100     218 if( $token->is_text ) {
    100          
429 60         76 DEBUG and print STDERR " Adding \"", $token->text, "\" to para-content.\n";
430 60         117 $para_text_content .= $token->text;
431             # and keep looking
432            
433             } elsif( $token->is_end and $content_containers{ $token->tagname } ) {
434 36         44 DEBUG and print STDERR " Found end of Para. Considering content: ",
435             $para_text_content, "\n";
436              
437 36 50 33     231 if( $para_text_content =~ m/\S/
    50          
438             and ($max_content_length
439             ? (length($para_text_content) <= $max_content_length)
440             : 1)
441             ) {
442             # Some minimal sanity constraints, I think.
443 36         47 DEBUG and print STDERR " It looks contentworthy, I guess. Using it.\n";
444 36         57 $title = $para_text_content;
445 36         65 last;
446             } else {
447 0         0 DEBUG and print STDERR " Doesn't look at all contentworthy!\n Giving up.\n";
448 0         0 undef $title;
449 0         0 last;
450             }
451             }
452             }
453            
454             else {
455 0         0 die "IMPOSSIBLE STATE $state!\n"; # should never happen
456             }
457            
458             }
459            
460             # Put it all back!
461 52         189 $self->unget_token(@to_unget);
462            
463 52         56 if(DEBUG) {
464             if(defined $title) { print STDERR " Returning title <$title>\n" }
465             else { print STDERR "Returning title <>\n" }
466             }
467            
468 52 100       157 return '' unless defined $title;
469 42         112 $title =~ s/^\s+//;
470 42         212 return $title;
471             }
472              
473             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
474             #
475             # Methods that actually do work at parse-time:
476              
477             sub _handle_element_start {
478 346     346   494 my $self = shift; # leaving ($element_name, $attr_hash_r)
479 346         404 DEBUG > 2 and print STDERR "++ $_[0] (", map("<$_> ", %{$_[1]}), ")\n";
480            
481 346         1401 push @{ $self->{'token_buffer'} },
482 346         448 $self->{'start_token_class'}->new(@_);
483 346         683 return;
484             }
485              
486             sub _handle_text {
487 243     243   337 my $self = shift; # leaving ($text)
488 243         285 DEBUG > 2 and print STDERR "== $_[0]\n";
489 243         913 push @{ $self->{'token_buffer'} },
490 243         279 $self->{'text_token_class'}->new(@_);
491 243         602 return;
492             }
493              
494             sub _handle_element_end {
495 332     332   418 my $self = shift; # leaving ($element_name);
496 332         360 DEBUG > 2 and print STDERR "-- $_[0]\n";
497 332         1025 push @{ $self->{'token_buffer'} },
498 332         403 $self->{'end_token_class'}->new(@_);
499 332         618 return;
500             }
501              
502             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
503              
504             1;
505              
506              
507             __END__