File Coverage

blib/lib/Pod/Stupid.pm
Criterion Covered Total %
statement 53 54 98.1
branch 12 16 75.0
condition 7 9 77.7
subroutine 10 11 90.9
pod 3 3 100.0
total 85 93 91.4


line stmt bran cond sub pod time code
1 1     1   2627 use strict;
  1         4  
  1         48  
2 1     1   7 use warnings;
  1         3  
  1         68  
3             package Pod::Stupid;
4             BEGIN {
5 1     1   35 $Pod::Stupid::VERSION = '0.005';
6             }
7             BEGIN {
8 1     1   25 $Pod::Stupid::DIST = 'Pod-Stupid';
9             }
10             # ABSTRACT: The simplest, stupidest 'pod parser' possible
11              
12 1     1   1236 use English qw( -no_match_vars );
  1         2777  
  1         8  
13 1     1   516 use Carp qw( croak );
  1         2  
  1         54  
14 1     1   6 use Data::Dumper;
  1         2  
  1         50  
15 1     1   5 use Scalar::Util qw( blessed );
  1         2  
  1         1254  
16             #use Encode;
17              
18             # right now, I've hard-coded unix EOL into these regexen... I probably
19             # should use \R, however it's not supported on older perls, though the
20             # docs say it's equivalent to this:
21             my $eol = qr{ (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}]) };
22              
23              
24             # match the end of any pod paragraph (pp). I'm being generous by allowing
25             # a pp to end by detecting another command pp with the lookahead thus not
26             # enforcing the "must end with blank line" part of the spec.
27             my $pod_paragraph_end_qr = qr{ (?: [\n]{2,} | [\n]+(?= ^=\w+) | \z ) }msx;
28              
29             # match a command paragraph. Note: the 'cut' directive is handled
30             # specially because it signifies the end of a block of pod and the
31             # spec states that it need not be followed by a blank line. If any
32             # other directives should be parsed the same way, put them in the
33             # qw() list below. Still, only 'cut' will end a block of pod.
34             my $cut_like_cmds_qr = join '|', qw( cut );
35             my $pod_command_qr = qr{
36             ( # capture everything as $1
37             (?:
38             ^[=] # start of pod command
39             (?!$cut_like_cmds_qr) # exclude cut, to be handled below
40             (\w+?) # command type (directive) as $2
41             (\d*) # optional command 'level' as $3
42             (?: # optionally followed by...
43             (?:
44             [ \t]+ | \n # blank space OR single newline
45             )
46             (.*?) # and 'command text' as $4
47             )?
48             $pod_paragraph_end_qr # followed by paragraph end pattern
49             )
50             | # OR... special case for cut
51             (?: # (and cut-like) commands...
52             ^[=] # start of pod command
53             ($cut_like_cmds_qr) # capture command as $5
54             (?: # if followed by...
55             [ \t]+ # horizontal white space
56             (.*?) # and grab anything else as $6
57             [\n] # up to the end of line
58             | # OR
59             [\n] # just the end of the line
60             )
61             [\n]? # and possibly one more newline
62             )
63             )
64             }msx;
65              
66             # match a non-command paragraph. this only applies when
67             # already within a pod block.
68             my $pod_paragraph_qr = qr{
69             ( # grab everything as $1...
70             ( # but just the paragraph contents as $2
71             (?:
72             ^[^=].*?$ # any lines that do not begin with =
73             )+? # until...
74             )
75             $pod_paragraph_end_qr # two newlines or end of string
76             )
77             }msx;
78              
79              
80 0     0 1 0 sub new { return bless {}, shift; }
81              
82              
83             # NOTE: the 'c' modifiers on the regexes in this sub are *critical!* NO TOUCH!
84             sub parse_string {
85 31     31 1 20728 my ($self, $text) = @_;
86              
87 31 50       88 croak "missing \$text parameter" if ! defined $text;
88              
89             # collect the parsed pieces here:
90 31         38 my @parsed_pieces;
91              
92             # find the beginning of the next pod block in the text
93             # (which, by definition, is any pod command)
94 31         1252 while ( $text =~ m{ \G (.*?) $pod_command_qr }msxgc ) {
95 161         298 my $non_pod_txt = $1;
96 161         412 my $pod_txt = $2;
97 161   66     441 my $cmd_type = $3 || $6;
98 161   100     523 my $cmd_level = $4 || '';
99 161   50     485 my $cmd_text = $5 || $7 || '';
100              
101             #print "COMMAND: [=$cmd_type$cmd_level $cmd_text]\n\n"; ### DEBUG
102              
103             # record the text that wasn't pod, if any
104 161 100       437 push @parsed_pieces, {
105             non_pod => 1,
106             orig_txt => $non_pod_txt,
107             start_pos => $LAST_MATCH_START[1],
108             end_pos => $LAST_MATCH_END[1],
109             } if $non_pod_txt;
110              
111             # record the pod found
112 161         1203 push @parsed_pieces, {
113             is_pod => 1,
114             cmd_type => $cmd_type,
115             cmd_level => $cmd_level,
116             cmd_txt => $cmd_text,
117             orig_txt => $pod_txt,
118             start_pos => $LAST_MATCH_START[2],
119             end_pos => $LAST_MATCH_END[2],
120             };
121              
122             # cut *always* signifies the end of a block of pod
123 161 100       585 next if $cmd_type eq 'cut';
124              
125             #if ( $cmd_type eq 'encoding' ) {
126             # # change encoding of string - this feels wonky to me..
127             # my $pos = pos( $text );
128             # $cmd_text =~ s/\A\s*|\s*\z//gms;
129             # $text = Encode::decode( $cmd_text, $text );
130             # pos( $text ) = $pos;
131             #}
132              
133             # look for paragraphs within the current pod block
134 133         1673 while ( $text =~ m{ \G $pod_paragraph_qr }msxgc ) {
135 311         513 my $orig_txt = $1;
136 311         415 my $paragraph = $2;
137              
138             #print "PARAGRAPH: [$paragraph]\n\n"; ### DEBUG
139              
140 311         5422 push @parsed_pieces, {
141             is_pod => 1,
142             paragraph => $paragraph,
143             orig_txt => $orig_txt,
144             start_pos => $LAST_MATCH_START[1],
145             end_pos => $LAST_MATCH_END[1],
146             };
147             }
148             }
149              
150             # Take care of any remaining text in the string
151 31   100     84 my $last_pos = pos( $text ) || 0;
152 31         52 my $end_pos = length( $text ) - 1;
153 31         56 my $remainder = substr( $text, $last_pos );
154 31 100       175 push @parsed_pieces, {
155             non_pod => 1,
156             orig_txt => $remainder,
157             start_pos => $last_pos,
158             end_pos => $end_pos,
159             } if $remainder;
160              
161 31         238 return \@parsed_pieces;
162             }
163              
164              
165              
166             sub strip_string {
167 31     31 1 134 my ( $self, $text_ref, $pod_pieces ) = @_;
168              
169 31 50       64 croak "missing \$text_ref parameter" unless defined $text_ref;
170              
171             # make a copy of the text if necessary.
172 31 50       217 $text_ref = \"$text_ref" unless ref $text_ref;
173              
174             # get the pieces if we don't already have them
175 31 50       74 $pod_pieces = $self->parse_string( $$text_ref ) unless ref $pod_pieces;
176              
177 31         35 my $shrinkage = 0;
178 31         55 for my $pp ( @$pod_pieces ) {
179              
180 520 100       959 next unless $pp->{is_pod};
181              
182 472         589 my $length = $pp->{end_pos} - $pp->{start_pos};
183 472         608 my $new_start = $pp->{start_pos} - $shrinkage;
184 472         659 substr( $$text_ref, $new_start, $length, '' );
185              
186 472         572 $shrinkage += $length;
187             }
188 31         132 return $$text_ref;
189             }
190              
191             1 && q{Beauty is in the eye of the beholder}; # Truth.
192              
193              
194              
195             =pod
196              
197             =head1 NAME
198              
199             Pod::Stupid - The simplest, stupidest 'pod parser' possible
200              
201             =head1 VERSION
202              
203             version 0.005
204              
205             =head1 SYNOPSIS
206              
207             use Pod::Stupid;
208            
209             my $file = shift; # '/some/file/with/pod.pl';
210             my $original_text = do { local( @ARGV, $/ ) = $file; <> }; # slurp
211            
212             my $ps = Pod::Stupid->new();
213            
214             # in scalar context returns an array of hashes.
215             my $pieces = $ps->parse_string( $original_text );
216            
217             # get your text sans all POD
218             my $stripped_text = $ps->strip_string( $original_text );
219            
220             # reconstruct the original text from the pieces...
221             substr( $stripped_text, $_->{start_pos}, 0, $_->{orig_txt} )
222             for grep { $_->{is_pod} } @$pieces;
223            
224             print $stripped_text eq $original_text ? "ok - $file\n" : "not ok - $file\n";
225              
226             =head1 DESCRIPTION
227              
228             This module was written to do one B thing: Given some text
229             as input, split it up into pieces of POD "paragraphs" and non-POD
230             "whatever" and output an AoH describing each piece found, in order.
231              
232             The end user can do whatever s?he wishes with the output AoH. It is
233             trivially simple to reconstruct the input from the output, and
234             hopefully I've included enough information in the inner hashes that
235             one can easily perform just about any other manipulation desired.
236              
237             =head1 INDESCRIPTION
238              
239             There are a bunch of things this module will B do:
240              
241             =over 4
242              
243             =item *
244              
245             Create a "parse tree"
246              
247             =item *
248              
249             Pod validation (it either parses or not)
250              
251             =item *
252              
253             Pod cleanup
254              
255             =item *
256              
257             "Handle" encoded text (but it I still parse)
258              
259             =item *
260              
261             Feed your cat
262              
263             =back
264              
265             However, it may make it easier to do any of the above, with a lot
266             less time and effort spent trying to grok many of the other POD
267             parsing solutions out there.
268              
269             A particular design decision I've made is to avoid needing to save
270             any state. This means there's no need or advantage to instantiating
271             an object, except for your own preferences. You can use any method
272             as either an object method or a class method and it will work the
273             same way for both. This design should also discourage me from trying
274             to bloat Pod::Stupid with every feature that tickles my fancy (or
275             yours!) but still, B
276              
277             =head1 METHODS
278              
279             =head2 new
280              
281             the most basic object constructor possible. Currently takes no
282             options because the object neither has nor needs to keep any state.
283              
284             This is only here if you want to use this module with an OO interface.
285              
286             =head2 parse_string
287              
288             Given a string, parses for pod and, in scalar context, returns an AoH
289             describing each pod paragraph found, as well as any non-pod.
290              
291             # typical usage
292             my $pieces = $ps->parse_string( $text );
293            
294             # to separate pod and non-pod
295             my @pod_pieces = grep { $_->{is_pod} } @$pieces;
296             my @non_pod_pieces = grep { $_->{non_pod} } @$pieces;
297              
298             =head2 strip_string
299              
300             given a string or string ref, and (optionally) an array of pod pieces,
301             return a copy of the string with all pod stripped out and an AoH
302             containing the pod pieces. If passed a string ref, that string is
303             modified in-place. In any case you can still always get the stripped
304             string and the array of pod parts as return values.
305              
306             # most typical usage
307             my $txt_nopod = $ps->strip_string( $text );
308            
309             # pass in a ref to change string in-place...
310             $ps->strip_string( \$text ); # $text no longer contains any pod
311            
312             # if you need the pieces...
313             my ( $txt_nopod, $pieces ) = $ps->strip_string( $text );
314            
315             # if you already have the pod pieces...
316             my $txt_nopod = $ps->strip_string( $text, $pod_pieces );
317              
318             =head1 KNOWN LIMITATIONS
319              
320             =over 4
321              
322             =item *
323              
324             Currently only works on files with unix-style line endings.
325              
326             =back
327              
328             =head1 TODO
329              
330             This is only what I've thought of... B
331              
332             =over 4
333              
334             =item *
335              
336             Fix aforementioned limitation
337              
338             =item *
339              
340             More comprehensive tests
341              
342             =item *
343              
344             A utility module to do common things with the output
345              
346             =back
347              
348             =head1 CREDITS
349              
350             Uri Guttman for giving me the task that led to my shaving this particular yak
351              
352             =head1 SEE ALSO
353              
354             =over 4
355              
356             =item *
357              
358             L
359              
360             =item *
361              
362             L
363              
364             =item *
365              
366             L
367              
368             =item *
369              
370             L
371              
372             =item *
373              
374             L
375              
376             =item *
377              
378             L
379              
380             =item *
381              
382             L
383              
384             =item *
385              
386             and about a million other things...
387              
388             =back
389              
390             =head1 POD TERMINOLOGY FOR DUMMIES (aka: me)
391              
392             =head2 paragraphs
393              
394             In Pod, everything is a paragraph. A paragraph is simply one or more
395             consecutive lines of text. Multiple paragraphs are separated from each other
396             by one or more blank lines.
397              
398             Some paragraphs have special meanings, as explained below.
399              
400             =head2 command
401              
402             A command (aka directive) is a paragraph whose first line begins with a
403             character sequence matching the regex m/^=([a-zA-Z]\S*)/
404              
405             I've actually been a bit more generous, matching m/^=(\w+)/ instead.
406             Don't rely on that though. I may have to change to be closer to the spec
407             someday.
408              
409             In the above regex, the type of command would be in $1. Different types of
410             commands have different semantics and validation rules yadda yadda.
411              
412             Currently, the following command types (directives) are described in the
413             Pod Spec L and technically,
414             a proper Pod parser should consider anything else an error. (I won't though)
415              
416             =over 4
417              
418             =item *
419              
420             head[\d] (\d is a number from 1-4)
421              
422             =item *
423              
424             pod
425              
426             =item *
427              
428             cut
429              
430             =item *
431              
432             over
433              
434             =item *
435              
436             item
437              
438             =item *
439              
440             back
441              
442             =item *
443              
444             begin
445              
446             =item *
447              
448             end
449              
450             =item *
451              
452             for
453              
454             =item *
455              
456             encoding
457              
458             =back
459              
460             =head2 directive
461              
462             Ostensibly a synonym for a command paragraph, I consider it a subset of that,
463             specifically the "command type" as described above.
464              
465             =head2 verbatim paragraph
466              
467             This is a paragraph where each line begins with whitespace.
468              
469             =head2 ordinary paragraph
470              
471             This is a prargraph where each line does B begin with whitespace
472              
473             =head2 data paragraph
474              
475             This is a paragraph that is between a pair of "=begin identifier" ...
476             "=end identifier" directives where "identifier" does not begin with a
477             literal colon (":")
478              
479             I do not plan on handling this type of paragraph in any special way.
480              
481             =head2 block
482              
483             A Pod block is a series of paragraphs beginning with any directive except
484             "=cut" and ending with the first occurence of a "=cut" directive or the
485             end of the input, whichever comes first.
486              
487             =head2 piece
488              
489             This is a term I'm introducting myself. A piece is just a hash containing info
490             on a parsed piece of the original string. Each piece is either pod or not pod.
491             If it's pod it describes the kind of pod. If it's not, it contains a 'non_pod'
492             entry. All pieces also include the start and end offsets into the original
493             string (starting at 0) as 'start_pos' and 'end_pos', respectively.
494              
495             =head1 AUTHOR
496              
497             Stephen R. Scaffidi
498              
499             =head1 COPYRIGHT AND LICENSE
500              
501             This software is copyright (c) 2010 by Stephen R. Scaffidi.
502              
503             This is free software; you can redistribute it and/or modify it under
504             the same terms as the Perl 5 programming language system itself.
505              
506             =cut
507              
508              
509             __END__