File Coverage

blib/lib/Pod/Simple/PullParser.pm
Criterion Covered Total %
statement 209 232 90.0
branch 76 102 74.5
condition 42 67 62.6
subroutine 26 32 81.2
pod 13 17 76.4
total 366 450 81.3


line stmt bran cond sub pod time code
1             package Pod::Simple::PullParser;
2 10     10   139158 use strict;
  10         35  
  10         396  
3             our $VERSION = '3.45';
4 10     10   4770 use Pod::Simple ();
  10         24  
  10         375  
5 10     10   359 BEGIN {our @ISA = ('Pod::Simple')}
6              
7 10     10   60 use Carp ();
  10         21  
  10         187  
8              
9 10     10   4520 use Pod::Simple::PullParserStartToken;
  10         26  
  10         303  
10 10     10   4139 use Pod::Simple::PullParserEndToken;
  10         23  
  10         292  
11 10     10   4205 use Pod::Simple::PullParserTextToken;
  10         24  
  10         414  
12              
13 10 50   10   2371 BEGIN { *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG }
14              
15             __PACKAGE__->_accessorize(
16             'source_fh', # the filehandle we're reading from
17             'source_scalar_ref', # the scalarref we're reading from
18             'source_arrayref', # the arrayref we're reading from
19             );
20              
21             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
22             #
23             # And here is how we implement a pull-parser on top of a push-parser...
24              
25             sub filter {
26 0     0 1 0 my($self, $source) = @_;
27 0 0       0 $self = $self->new unless ref $self;
28              
29 0 0       0 $source = *STDIN{IO} unless defined $source;
30 0         0 $self->set_source($source);
31 0         0 $self->output_fh(*STDOUT{IO});
32              
33 0         0 $self->run; # define run() in a subclass if you want to use filter()!
34 0         0 return $self;
35             }
36              
37             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
38              
39             sub parse_string_document {
40 49     49 1 89 my $this = shift;
41 49         150 $this->set_source(\ $_[0]);
42 49         149 $this->run;
43             }
44              
45             sub parse_file {
46 13     13 1 40 my($this, $filename) = @_;
47 13         51 $this->set_source($filename);
48 13         50 $this->run;
49             }
50              
51             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
52             # In case anyone tries to use them:
53              
54             sub run {
55 10     10   70 use Carp ();
  10         20  
  10         1158  
56 0 0 0 0 0 0 if( __PACKAGE__ eq (ref($_[0]) || $_[0])) { # I'm not being subclassed!
57 0         0 Carp::croak "You can call run() only on subclasses of "
58             . __PACKAGE__;
59             } else {
60 0   0     0 Carp::croak join '',
61             "You can't call run() because ",
62             ref($_[0]) || $_[0], " didn't define a run() method";
63             }
64             }
65              
66             sub parse_lines {
67 10     10   99 use Carp ();
  10         19  
  10         494  
68 0     0 1 0 Carp::croak "Use set_source with ", __PACKAGE__,
69             " and subclasses, not parse_lines";
70             }
71              
72             sub parse_line {
73 10     10   57 use Carp ();
  10         17  
  10         29903  
74 0     0 0 0 Carp::croak "Use set_source with ", __PACKAGE__,
75             " and subclasses, not parse_line";
76             }
77              
78             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
79              
80             sub new {
81 89     89 1 12999 my $class = shift;
82 89         343 my $self = $class->SUPER::new(@_);
83 89 50       238 die "Couldn't construct for $class" unless $self;
84              
85 89   50     417 $self->{'token_buffer'} ||= [];
86 89   50     380 $self->{'start_token_class'} ||= 'Pod::Simple::PullParserStartToken';
87 89   50     343 $self->{'text_token_class'} ||= 'Pod::Simple::PullParserTextToken';
88 89   50     411 $self->{'end_token_class'} ||= 'Pod::Simple::PullParserEndToken';
89              
90 89         113 DEBUG > 1 and print STDERR "New pullparser object: $self\n";
91              
92 89         220 return $self;
93             }
94              
95             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
96              
97             sub get_token {
98 1398     1398 1 3473 my $self = shift;
99 1398         1729 DEBUG > 1 and print STDERR "\nget_token starting up on $self.\n";
100             DEBUG > 2 and print STDERR " Items in token-buffer (",
101             scalar( @{ $self->{'token_buffer'} } ) ,
102             ") :\n", map(
103             " " . $_->dump . "\n", @{ $self->{'token_buffer'} }
104             ),
105 1398         1655 @{ $self->{'token_buffer'} } ? '' : ' (no tokens)',
106             "\n"
107             ;
108              
109 1398         1800 until( @{ $self->{'token_buffer'} } ) {
  1985         4255  
110 587         788 DEBUG > 3 and print STDERR "I need to get something into my empty token buffer...\n";
111 587 100       1721 if($self->{'source_dead'}) {
    100          
    100          
    50          
112 79         122 DEBUG and print STDERR "$self 's source is dead.\n";
113 79         156 push @{ $self->{'token_buffer'} }, undef;
  79         185  
114             } elsif(exists $self->{'source_fh'}) {
115 23         43 my @lines;
116 23   33     127 my $fh = $self->{'source_fh'}
117             || Carp::croak('You have to call set_source before you can call get_token');
118              
119 23         32 DEBUG and print STDERR "$self 's source is filehandle $fh.\n";
120             # Read those many lines at a time
121 23         72 for(my $i = Pod::Simple::MANY_LINES; $i--;) {
122 298         410 DEBUG > 3 and print STDERR " Fetching a line from source filehandle $fh...\n";
123 298         711 local $/ = $Pod::Simple::NL;
124 298         1383 push @lines, scalar(<$fh>); # readline
125 298         542 DEBUG > 3 and print STDERR " Line is: ",
126             defined($lines[-1]) ? $lines[-1] : "\n";
127 298 100       875 unless( defined $lines[-1] ) {
128 17         35 DEBUG and print STDERR "That's it for that source fh! Killing.\n";
129 17         46 delete $self->{'source_fh'}; # so it can be GC'd
130 17         64 last;
131             }
132             # but pass thru the undef, which will set source_dead to true
133              
134             # TODO: look to see if $lines[-1] is =encoding, and if so,
135             # do horribly magic things
136              
137             }
138              
139 23         40 if(DEBUG > 8) {
140             print STDERR "* I've gotten ", scalar(@lines), " lines:\n";
141             foreach my $l (@lines) {
142             if(defined $l) {
143             print STDERR " line {$l}\n";
144             } else {
145             print STDERR " line undef\n";
146             }
147             }
148             print STDERR "* end of ", scalar(@lines), " lines\n";
149             }
150              
151 23         127 $self->SUPER::parse_lines(@lines);
152              
153             } elsif(exists $self->{'source_arrayref'}) {
154             DEBUG and print STDERR "$self 's source is arrayref $self->{'source_arrayref'}, with ",
155 2         3 scalar(@{$self->{'source_arrayref'}}), " items left in it.\n";
156              
157 2         3 DEBUG > 3 and print STDERR " Fetching ", Pod::Simple::MANY_LINES, " lines.\n";
158             $self->SUPER::parse_lines(
159 2         5 splice @{ $self->{'source_arrayref'} },
  2         12  
160             0,
161             Pod::Simple::MANY_LINES
162             );
163 2 50       14 unless( @{ $self->{'source_arrayref'} } ) {
  2         7  
164 2         3 DEBUG and print STDERR "That's it for that source arrayref! Killing.\n";
165 2         6 $self->SUPER::parse_lines(undef);
166 2         5 delete $self->{'source_arrayref'}; # so it can be GC'd
167             }
168             # to make sure that an undef is always sent to signal end-of-stream
169              
170             } elsif(exists $self->{'source_scalar_ref'}) {
171              
172             DEBUG and print STDERR "$self 's source is scalarref $self->{'source_scalar_ref'}, with ",
173             length(${ $self->{'source_scalar_ref'} }) -
174 483         604 (pos(${ $self->{'source_scalar_ref'} }) || 0),
175             " characters left to parse.\n";
176              
177 483         584 DEBUG > 3 and print STDERR " Fetching a line from source-string...\n";
178 483 100       609 if( ${ $self->{'source_scalar_ref'} } =~
  483         2159  
179             m/([^\n\r]*)((?:\r?\n)?)/g
180             ) {
181             #print(">> $1\n"),
182             $self->SUPER::parse_lines($1)
183             if length($1) or length($2)
184 56         123 or pos( ${ $self->{'source_scalar_ref'} })
185 427 100 100     2191 != length( ${ $self->{'source_scalar_ref'} });
  56   66     188  
186             # I.e., unless it's a zero-length "empty line" at the very
187             # end of "foo\nbar\n" (i.e., between the \n and the EOS).
188             } else { # that's the end. Byebye
189 56         207 $self->SUPER::parse_lines(undef);
190 56         127 delete $self->{'source_scalar_ref'};
191 56         114 DEBUG and print STDERR "That's it for that source scalarref! Killing.\n";
192             }
193              
194              
195             } else {
196 0         0 die "What source??";
197             }
198             }
199             DEBUG and print STDERR "get_token about to return ",
200             Pod::Simple::pretty( @{$self->{'token_buffer'}}
201 1398         1871 ? $self->{'token_buffer'}[-1] : undef
202             ), "\n";
203 1398         1779 return shift @{$self->{'token_buffer'}}; # that's an undef if empty
  1398         4453  
204             }
205              
206             sub unget_token {
207 96     96 1 165 my $self = shift;
208 96         122 DEBUG and print STDERR "Ungetting ", scalar(@_), " tokens: ",
209             @_ ? "@_\n" : "().\n";
210 96         173 foreach my $t (@_) {
211 656 50       1127 Carp::croak "Can't unget that, because it's not a token -- it's undef!"
212             unless defined $t;
213 656 50       1144 Carp::croak "Can't unget $t, because it's not a token -- it's a string!"
214             unless ref $t;
215 656 50       1607 Carp::croak "Can't unget $t, because it's not a token object!"
216             unless UNIVERSAL::can($t, 'type');
217             }
218              
219 96         192 unshift @{$self->{'token_buffer'}}, @_;
  96         288  
220             DEBUG > 1 and print STDERR "Token buffer now has ",
221 96         137 scalar(@{$self->{'token_buffer'}}), " items in it.\n";
222 96         203 return;
223             }
224              
225             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
226              
227             # $self->{'source_filename'} = $source;
228              
229             sub set_source {
230 90     90 1 1003015 my $self = shift @_;
231 90 50       241 return $self->{'source_fh'} unless @_;
232             Carp::croak("Cannot assign new source to pull parser; create a new instance, instead")
233 90 100 66     694 if $self->{'source_fh'} || $self->{'source_scalar_ref'} || $self->{'source_arrayref'};
      66        
234 89         140 my $handle;
235 89 50       479 if(!defined $_[0]) {
    100          
    100          
    100          
    100          
    50          
236 0         0 Carp::croak("Can't use empty-string as a source for set_source");
237             } elsif(ref(\( $_[0] )) eq 'GLOB') {
238 1         7 $self->{'source_filename'} = '' . ($handle = $_[0]);
239 1         3 DEBUG and print STDERR "$self 's source is glob $_[0]\n";
240             # and fall thru
241             } elsif(ref( $_[0] ) eq 'SCALAR') {
242 70         130 $self->{'source_scalar_ref'} = $_[0];
243 70         103 DEBUG and print STDERR "$self 's source is scalar ref $_[0]\n";
244 70         146 return;
245             } elsif(ref( $_[0] ) eq 'ARRAY') {
246 2         4 $self->{'source_arrayref'} = $_[0];
247 2         4 DEBUG and print STDERR "$self 's source is array ref $_[0]\n";
248 2         4 return;
249             } elsif(ref $_[0]) {
250 2         11 $self->{'source_filename'} = '' . ($handle = $_[0]);
251 2         7 DEBUG and print STDERR "$self 's source is fh-obj $_[0]\n";
252             } elsif(!length $_[0]) {
253 0         0 Carp::croak("Can't use empty-string as a source for set_source");
254             } else { # It's a filename!
255 14         26 DEBUG and print STDERR "$self 's source is filename $_[0]\n";
256             {
257 14         27 local *PODSOURCE;
  14         49  
258 14 50       722 open(PODSOURCE, "<$_[0]") || Carp::croak "Can't open $_[0]: $!";
259 14         90 $handle = *PODSOURCE{IO};
260             }
261 14         46 $self->{'source_filename'} = $_[0];
262 14         31 DEBUG and print STDERR " Its name is $_[0].\n";
263              
264             # TODO: file-discipline things here!
265             }
266              
267 17         37 $self->{'source_fh'} = $handle;
268 17         28 DEBUG and print STDERR " Its handle is $handle\n";
269 17         37 return 1;
270             }
271              
272             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
273              
274 0     0 0 0 sub get_title_short { shift->get_short_title(@_) } # alias
275              
276             sub get_short_title {
277 21     21 1 68 my $title = shift->get_title(@_);
278 21 100       123 $title = $1 if $title =~ m/^(\S{1,60})\s+--?\s+./s;
279             # turn "Foo::Bar -- bars for your foo" into "Foo::Bar"
280 21         62 return $title;
281             }
282              
283             sub get_title { shift->_get_titled_section(
284 39     39 1 732 'NAME', max_token => 50, desperate => 1, @_)
285             }
286             sub get_version { shift->_get_titled_section(
287 3     3 1 22 'VERSION',
288             max_token => 400,
289             accept_verbatim => 1,
290             max_content_length => 3_000,
291             @_,
292             );
293             }
294             sub get_description { shift->_get_titled_section(
295 7     7 1 617 'DESCRIPTION',
296             max_token => 400,
297             max_content_length => 3_000,
298             @_,
299             ) }
300              
301 0     0 0 0 sub get_authors { shift->get_author(@_) } # a harmless alias
302              
303             sub get_author {
304 2     2 1 647 my $this = shift;
305             # Max_token is so high because these are
306             # typically at the end of the document:
307 2 100       9 $this->_get_titled_section('AUTHOR' , max_token => 10_000, @_) ||
308             $this->_get_titled_section('AUTHORS', max_token => 10_000, @_);
309             }
310              
311             #--------------------------------------------------------------------------
312              
313             sub _get_titled_section {
314             # Based on a get_title originally contributed by Graham Barr
315 52     52   180 my($self, $titlename, %options) = (@_);
316              
317 52         111 my $max_token = delete $options{'max_token'};
318 52         90 my $desperate_for_title = delete $options{'desperate'};
319 52         85 my $accept_verbatim = delete $options{'accept_verbatim'};
320 52         92 my $max_content_length = delete $options{'max_content_length'};
321 52         86 my $nocase = delete $options{'nocase'};
322 52 100       152 $max_content_length = 120 unless defined $max_content_length;
323              
324 52 0       129 Carp::croak( "Unknown " . ((1 == keys %options) ? "option: " : "options: ")
    50          
325             . join " ", map "[$_]", sort keys %options
326             )
327             if keys %options;
328              
329 52         71 my %content_containers;
330 52         135 $content_containers{'Para'} = 1;
331 52 100       129 if($accept_verbatim) {
332 3         12 $content_containers{'Verbatim'} = 1;
333 3         9 $content_containers{'VerbatimFormatted'} = 1;
334             }
335              
336 52         76 my $token_count = 0;
337 52         93 my $title;
338             my @to_unget;
339 52         74 my $state = 0;
340 52         77 my $depth = 0;
341              
342 52 50 33     402 Carp::croak "What kind of titlename is \"$titlename\"?!" unless
343             defined $titlename and $titlename =~ m/^[A-Z ]{1,60}$/s; #sanity
344 52         129 my $titlename_re = quotemeta($titlename);
345              
346 52         132 my $head1_text_content;
347             my $para_text_content;
348 52         0 my $skipX;
349              
350 52   50     252 while(
      66        
351             ++$token_count <= ($max_token || 1_000_000)
352             and defined(my $token = $self->get_token)
353             ) {
354 563         861 push @to_unget, $token;
355              
356 563 100       1091 if ($state == 0) { # seeking =head1
    100          
    100          
    50          
357 276 100 100     691 if( $token->is_start and $token->tagname eq 'head1' ) {
358 62         94 DEBUG and print STDERR " Found head1. Seeking content...\n";
359 62         80 ++$state;
360 62         217 $head1_text_content = '';
361             }
362             }
363              
364             elsif($state == 1) { # accumulating text until end of head1
365 131 100 100     380 if( $token->is_text ) {
    100          
    100          
366 65 100       136 unless ($skipX) {
367 64         87 DEBUG and print STDERR " Adding \"", $token->text, "\" to head1-content.\n";
368 64         190 $head1_text_content .= $token->text;
369             }
370             } elsif( $token->is_tagname('X') ) {
371             # We're going to want to ignore X<> stuff.
372 2         6 $skipX = $token->is_start;
373 2         7 DEBUG and print STDERR +($skipX ? 'Start' : 'End'), 'ing ignoring of X<> tag';
374             } elsif( $token->is_end and $token->tagname eq 'head1' ) {
375 62         87 DEBUG and print STDERR " Found end of head1. Considering content...\n";
376 62 100       147 $head1_text_content = uc $head1_text_content if $nocase;
377 62 50 100     778 if($head1_text_content eq $titlename
    100 66        
    100 100        
      100        
      66        
378             or $head1_text_content =~ m/\($titlename_re\)/s
379             # We accept "=head1 Nomen Modularis (NAME)" for sake of i18n
380             ) {
381 36         59 DEBUG and print STDERR " Yup, it was $titlename. Seeking next para-content...\n";
382 36         130 ++$state;
383             } elsif(
384             $desperate_for_title
385             # if we're so desperate we'll take the first
386             # =head1's content as a title
387             and $head1_text_content =~ m/\S/
388             and $head1_text_content !~ m/^[ A-Z]+$/s
389             and $head1_text_content !~
390             m/\((?:
391             NAME | TITLE | VERSION | AUTHORS? | DESCRIPTION | SYNOPSIS
392             | COPYRIGHT | LICENSE | NOTES? | FUNCTIONS? | METHODS?
393             | CAVEATS? | BUGS? | SEE\ ALSO | SWITCHES | ENVIRONMENT
394             )\)/sx
395             # avoid accepting things like =head1 Thingy Thongy (DESCRIPTION)
396             and ($max_content_length
397             ? (length($head1_text_content) <= $max_content_length) # sanity
398             : 1)
399             ) {
400             # Looks good; trim it
401 6         27 ($title = $head1_text_content) =~ s/\s+$//;
402 6         10 DEBUG and print STDERR " It looks titular: \"$title\".\n\n Using that.\n";
403 6         18 last;
404             } else {
405 20         35 --$state;
406 20         79 DEBUG and print STDERR " Didn't look titular ($head1_text_content).\n",
407             "\n Dropping back to seeking-head1-content mode...\n";
408             }
409             }
410             }
411              
412             elsif($state == 2) {
413             # seeking start of para (which must immediately follow)
414 36 50 33     102 if($token->is_start and $content_containers{ $token->tagname }) {
415 36         54 DEBUG and print STDERR " Found start of Para. Accumulating content...\n";
416 36         66 $para_text_content = '';
417 36         134 ++$state;
418             } else {
419 0         0 DEBUG and print
420             " Didn't see an immediately subsequent start-Para. Reseeking H1\n";
421 0         0 $state = 0;
422             }
423             }
424              
425             elsif($state == 3) {
426             # accumulating text until end of Para
427 120 100 100     251 if( $token->is_text ) {
    100          
428 60         74 DEBUG and print STDERR " Adding \"", $token->text, "\" to para-content.\n";
429 60         147 $para_text_content .= $token->text;
430             # and keep looking
431              
432             } elsif( $token->is_end and $content_containers{ $token->tagname } ) {
433 36         56 DEBUG and print STDERR " Found end of Para. Considering content: ",
434             $para_text_content, "\n";
435              
436 36 50 33     256 if( $para_text_content =~ m/\S/
    50          
437             and ($max_content_length
438             ? (length($para_text_content) <= $max_content_length)
439             : 1)
440             ) {
441             # Some minimal sanity constraints, I think.
442 36         51 DEBUG and print STDERR " It looks contentworthy, I guess. Using it.\n";
443 36         64 $title = $para_text_content;
444 36         77 last;
445             } else {
446 0         0 DEBUG and print STDERR " Doesn't look at all contentworthy!\n Giving up.\n";
447 0         0 undef $title;
448 0         0 last;
449             }
450             }
451             }
452              
453             else {
454 0         0 die "IMPOSSIBLE STATE $state!\n"; # should never happen
455             }
456              
457             }
458              
459             # Put it all back!
460 52         213 $self->unget_token(@to_unget);
461              
462 52         64 if(DEBUG) {
463             if(defined $title) { print STDERR " Returning title <$title>\n" }
464             else { print STDERR "Returning title <>\n" }
465             }
466              
467 52 100       154 return '' unless defined $title;
468 42         142 $title =~ s/^\s+//;
469 42         330 return $title;
470             }
471              
472             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
473             #
474             # Methods that actually do work at parse-time:
475              
476             sub _handle_element_start {
477 346     346   558 my $self = shift; # leaving ($element_name, $attr_hash_r)
478 346         455 DEBUG > 2 and print STDERR "++ $_[0] (", map("<$_> ", %{$_[1]}), ")\n";
479              
480 346         1498 push @{ $self->{'token_buffer'} },
481 346         483 $self->{'start_token_class'}->new(@_);
482 346         774 return;
483             }
484              
485             sub _handle_text {
486 243     243   346 my $self = shift; # leaving ($text)
487 243         301 DEBUG > 2 and print STDERR "== $_[0]\n";
488 243         938 push @{ $self->{'token_buffer'} },
489 243         336 $self->{'text_token_class'}->new(@_);
490 243         722 return;
491             }
492              
493             sub _handle_element_end {
494 332     332   485 my $self = shift; # leaving ($element_name);
495 332         430 DEBUG > 2 and print STDERR "-- $_[0]\n";
496 332         1117 push @{ $self->{'token_buffer'} },
497 332         420 $self->{'end_token_class'}->new(@_);
498 332         706 return;
499             }
500              
501             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
502              
503             1;
504              
505              
506             __END__