File Coverage

blib/lib/Pod/Stupid.pm

Criterion	Covered	Total	%
statement	53	54	98.1
branch	12	16	75.0
condition	7	9	77.7
subroutine	10	11	90.9
pod	3	3	100.0
total	85	93	91.4

line	stmt	bran	cond	sub	pod	time	code
1	1			1		2627	use strict;
	1					4
	1					48
2	1			1		7	use warnings;
	1					3
	1					68
3							package Pod::Stupid;
4							BEGIN {
5	1			1		35	$Pod::Stupid::VERSION = '0.005';
6							}
7							BEGIN {
8	1			1		25	$Pod::Stupid::DIST = 'Pod-Stupid';
9							}
10							# ABSTRACT: The simplest, stupidest 'pod parser' possible
11
12	1			1		1236	use English qw( -no_match_vars );
	1					2777
	1					8
13	1			1		516	use Carp qw( croak );
	1					2
	1					54
14	1			1		6	use Data::Dumper;
	1					2
	1					50
15	1			1		5	use Scalar::Util qw( blessed );
	1					2
	1					1254
16							#use Encode;
17
18							# right now, I've hard-coded unix EOL into these regexen... I probably
19							# should use \R, however it's not supported on older perls, though the
20							# docs say it's equivalent to this:
21							my $eol = qr{ (?>\x0D\x0A?\|[\x0A-\x0C\x85\x{2028}\x{2029}]) };
22
23
24							# match the end of any pod paragraph (pp). I'm being generous by allowing
25							# a pp to end by detecting another command pp with the lookahead thus not
26							# enforcing the "must end with blank line" part of the spec.
27							my $pod_paragraph_end_qr = qr{ (?: [\n]{2,} \| [\n]+(?= ^=\w+) \| \z ) }msx;
28
29							# match a command paragraph. Note: the 'cut' directive is handled
30							# specially because it signifies the end of a block of pod and the
31							# spec states that it need not be followed by a blank line. If any
32							# other directives should be parsed the same way, put them in the
33							# qw() list below. Still, only 'cut' will end a block of pod.
34							my $cut_like_cmds_qr = join '\|', qw( cut );
35							my $pod_command_qr = qr{
36							( # capture everything as $1
37							(?:
38							^[=] # start of pod command
39							(?!$cut_like_cmds_qr) # exclude cut, to be handled below
40							(\w+?) # command type (directive) as $2
41							(\d*) # optional command 'level' as $3
42							(?: # optionally followed by...
43							(?:
44							[ \t]+ \| \n # blank space OR single newline
45							)
46							(.*?) # and 'command text' as $4
47							)?
48							$pod_paragraph_end_qr # followed by paragraph end pattern
49							)
50							\| # OR... special case for cut
51							(?: # (and cut-like) commands...
52							^[=] # start of pod command
53							($cut_like_cmds_qr) # capture command as $5
54							(?: # if followed by...
55							[ \t]+ # horizontal white space
56							(.*?) # and grab anything else as $6
57							[\n] # up to the end of line
58							\| # OR
59							[\n] # just the end of the line
60							)
61							[\n]? # and possibly one more newline
62							)
63							)
64							}msx;
65
66							# match a non-command paragraph. this only applies when
67							# already within a pod block.
68							my $pod_paragraph_qr = qr{
69							( # grab everything as $1...
70							( # but just the paragraph contents as $2
71							(?:
72							^[^=].*?$ # any lines that do not begin with =
73							)+? # until...
74							)
75							$pod_paragraph_end_qr # two newlines or end of string
76							)
77							}msx;
78
79
80	0			0	1	0	sub new { return bless {}, shift; }
81
82
83							# NOTE: the 'c' modifiers on the regexes in this sub are critical! NO TOUCH!
84							sub parse_string {
85	31			31	1	20728	my ($self, $text) = @_;
86
87	31	50				88	croak "missing \$text parameter" if ! defined $text;
88
89							# collect the parsed pieces here:
90	31					38	my @parsed_pieces;
91
92							# find the beginning of the next pod block in the text
93							# (which, by definition, is any pod command)
94	31					1252	while ( $text =~ m{ \G (.*?) $pod_command_qr }msxgc ) {
95	161					298	my $non_pod_txt = $1;
96	161					412	my $pod_txt = $2;
97	161		66			441	my $cmd_type = $3 \|\| $6;
98	161		100			523	my $cmd_level = $4 \|\| '';
99	161		50			485	my $cmd_text = $5 \|\| $7 \|\| '';
100
101							#print "COMMAND: [=$cmd_type$cmd_level $cmd_text]\n\n"; ### DEBUG
102
103							# record the text that wasn't pod, if any
104	161	100				437	push @parsed_pieces, {
105							non_pod => 1,
106							orig_txt => $non_pod_txt,
107							start_pos => $LAST_MATCH_START[1],
108							end_pos => $LAST_MATCH_END[1],
109							} if $non_pod_txt;
110
111							# record the pod found
112	161					1203	push @parsed_pieces, {
113							is_pod => 1,
114							cmd_type => $cmd_type,
115							cmd_level => $cmd_level,
116							cmd_txt => $cmd_text,
117							orig_txt => $pod_txt,
118							start_pos => $LAST_MATCH_START[2],
119							end_pos => $LAST_MATCH_END[2],
120							};
121
122							# cut always signifies the end of a block of pod
123	161	100				585	next if $cmd_type eq 'cut';
124
125							#if ( $cmd_type eq 'encoding' ) {
126							# # change encoding of string - this feels wonky to me..
127							# my $pos = pos( $text );
128							# $cmd_text =~ s/\A\s\|\s\z//gms;
129							# $text = Encode::decode( $cmd_text, $text );
130							# pos( $text ) = $pos;
131							#}
132
133							# look for paragraphs within the current pod block
134	133					1673	while ( $text =~ m{ \G $pod_paragraph_qr }msxgc ) {
135	311					513	my $orig_txt = $1;
136	311					415	my $paragraph = $2;
137
138							#print "PARAGRAPH: [$paragraph]\n\n"; ### DEBUG
139
140	311					5422	push @parsed_pieces, {
141							is_pod => 1,
142							paragraph => $paragraph,
143							orig_txt => $orig_txt,
144							start_pos => $LAST_MATCH_START[1],
145							end_pos => $LAST_MATCH_END[1],
146							};
147							}
148							}
149
150							# Take care of any remaining text in the string
151	31		100			84	my $last_pos = pos( $text ) \|\| 0;
152	31					52	my $end_pos = length( $text ) - 1;
153	31					56	my $remainder = substr( $text, $last_pos );
154	31	100				175	push @parsed_pieces, {
155							non_pod => 1,
156							orig_txt => $remainder,
157							start_pos => $last_pos,
158							end_pos => $end_pos,
159							} if $remainder;
160
161	31					238	return \@parsed_pieces;
162							}
163
164
165
166							sub strip_string {
167	31			31	1	134	my ( $self, $text_ref, $pod_pieces ) = @_;
168
169	31	50				64	croak "missing \$text_ref parameter" unless defined $text_ref;
170
171							# make a copy of the text if necessary.
172	31	50				217	$text_ref = \"$text_ref" unless ref $text_ref;
173
174							# get the pieces if we don't already have them
175	31	50				74	$pod_pieces = $self->parse_string( $$text_ref ) unless ref $pod_pieces;
176
177	31					35	my $shrinkage = 0;
178	31					55	for my $pp ( @$pod_pieces ) {
179
180	520	100				959	next unless $pp->{is_pod};
181
182	472					589	my $length = $pp->{end_pos} - $pp->{start_pos};
183	472					608	my $new_start = $pp->{start_pos} - $shrinkage;
184	472					659	substr( $$text_ref, $new_start, $length, '' );
185
186	472					572	$shrinkage += $length;
187							}
188	31					132	return $$text_ref;
189							}
190
191							1 && q{Beauty is in the eye of the beholder}; # Truth.
192
193
194
195							=pod
196
197							=head1 NAME
198
199							Pod::Stupid - The simplest, stupidest 'pod parser' possible
200
201							=head1 VERSION
202
203							version 0.005
204
205							=head1 SYNOPSIS
206
207							use Pod::Stupid;
208
209							my $file = shift; # '/some/file/with/pod.pl';
210							my $original_text = do { local( @ARGV, $/ ) = $file; <> }; # slurp
211
212							my $ps = Pod::Stupid->new();
213
214							# in scalar context returns an array of hashes.
215							my $pieces = $ps->parse_string( $original_text );
216
217							# get your text sans all POD
218							my $stripped_text = $ps->strip_string( $original_text );
219
220							# reconstruct the original text from the pieces...
221							substr( $stripped_text, $_->{start_pos}, 0, $_->{orig_txt} )
222							for grep { $_->{is_pod} } @$pieces;
223
224							print $stripped_text eq $original_text ? "ok - $file\n" : "not ok - $file\n";
225
226							=head1 DESCRIPTION
227
228							This module was written to do one B thing: Given some text
229							as input, split it up into pieces of POD "paragraphs" and non-POD
230							"whatever" and output an AoH describing each piece found, in order.
231
232							The end user can do whatever s?he wishes with the output AoH. It is
233							trivially simple to reconstruct the input from the output, and
234							hopefully I've included enough information in the inner hashes that
235							one can easily perform just about any other manipulation desired.
236
237							=head1 INDESCRIPTION
238
239							There are a bunch of things this module will B do:
240
241							=over 4
242
243							=item *
244
245							Create a "parse tree"
246
247							=item *
248
249							Pod validation (it either parses or not)
250
251							=item *
252
253							Pod cleanup
254
255							=item *
256
257							"Handle" encoded text (but it I still parse)
258
259							=item *
260
261							Feed your cat
262
263							=back
264
265							However, it may make it easier to do any of the above, with a lot
266							less time and effort spent trying to grok many of the other POD
267							parsing solutions out there.
268
269							A particular design decision I've made is to avoid needing to save
270							any state. This means there's no need or advantage to instantiating
271							an object, except for your own preferences. You can use any method
272							as either an object method or a class method and it will work the
273							same way for both. This design should also discourage me from trying
274							to bloat Pod::Stupid with every feature that tickles my fancy (or
275							yours!) but still, B
276
277							=head1 METHODS
278
279							=head2 new
280
281							the most basic object constructor possible. Currently takes no
282							options because the object neither has nor needs to keep any state.
283
284							This is only here if you want to use this module with an OO interface.
285
286							=head2 parse_string
287
288							Given a string, parses for pod and, in scalar context, returns an AoH
289							describing each pod paragraph found, as well as any non-pod.
290
291							# typical usage
292							my $pieces = $ps->parse_string( $text );
293
294							# to separate pod and non-pod
295							my @pod_pieces = grep { $_->{is_pod} } @$pieces;
296							my @non_pod_pieces = grep { $_->{non_pod} } @$pieces;
297
298							=head2 strip_string
299
300							given a string or string ref, and (optionally) an array of pod pieces,
301							return a copy of the string with all pod stripped out and an AoH
302							containing the pod pieces. If passed a string ref, that string is
303							modified in-place. In any case you can still always get the stripped
304							string and the array of pod parts as return values.
305
306							# most typical usage
307							my $txt_nopod = $ps->strip_string( $text );
308
309							# pass in a ref to change string in-place...
310							$ps->strip_string( \$text ); # $text no longer contains any pod
311
312							# if you need the pieces...
313							my ( $txt_nopod, $pieces ) = $ps->strip_string( $text );
314
315							# if you already have the pod pieces...
316							my $txt_nopod = $ps->strip_string( $text, $pod_pieces );
317
318							=head1 KNOWN LIMITATIONS
319
320							=over 4
321
322							=item *
323
324							Currently only works on files with unix-style line endings.
325
326							=back
327
328							=head1 TODO
329
330							This is only what I've thought of... B
331
332							=over 4
333
334							=item *
335
336							Fix aforementioned limitation
337
338							=item *
339
340							More comprehensive tests
341
342							=item *
343
344							A utility module to do common things with the output
345
346							=back
347
348							=head1 CREDITS
349
350							Uri Guttman for giving me the task that led to my shaving this particular yak
351
352							=head1 SEE ALSO
353
354							=over 4
355
356							=item *
357
358							L
359
360							=item *
361
362							L
363
364							=item *
365
366							L
367
368							=item *
369
370							L
371
372							=item *
373
374							L
375
376							=item *
377
378							L
379
380							=item *
381
382							L
383
384							=item *
385
386							and about a million other things...
387
388							=back
389
390							=head1 POD TERMINOLOGY FOR DUMMIES (aka: me)
391
392							=head2 paragraphs
393
394							In Pod, everything is a paragraph. A paragraph is simply one or more
395							consecutive lines of text. Multiple paragraphs are separated from each other
396							by one or more blank lines.
397
398							Some paragraphs have special meanings, as explained below.
399
400							=head2 command
401
402							A command (aka directive) is a paragraph whose first line begins with a
403							character sequence matching the regex m/^=([a-zA-Z]\S*)/
404
405							I've actually been a bit more generous, matching m/^=(\w+)/ instead.
406							Don't rely on that though. I may have to change to be closer to the spec
407							someday.
408
409							In the above regex, the type of command would be in $1. Different types of
410							commands have different semantics and validation rules yadda yadda.
411
412							Currently, the following command types (directives) are described in the
413							Pod Spec L and technically,
414							a proper Pod parser should consider anything else an error. (I won't though)
415
416							=over 4
417
418							=item *
419
420							head[\d] (\d is a number from 1-4)
421
422							=item *
423
424							pod
425
426							=item *
427
428							cut
429
430							=item *
431
432							over
433
434							=item *
435
436							item
437
438							=item *
439
440							back
441
442							=item *
443
444							begin
445
446							=item *
447
448							end
449
450							=item *
451
452							for
453
454							=item *
455
456							encoding
457
458							=back
459
460							=head2 directive
461
462							Ostensibly a synonym for a command paragraph, I consider it a subset of that,
463							specifically the "command type" as described above.
464
465							=head2 verbatim paragraph
466
467							This is a paragraph where each line begins with whitespace.
468
469							=head2 ordinary paragraph
470
471							This is a prargraph where each line does B begin with whitespace
472
473							=head2 data paragraph
474
475							This is a paragraph that is between a pair of "=begin identifier" ...
476							"=end identifier" directives where "identifier" does not begin with a
477							literal colon (":")
478
479							I do not plan on handling this type of paragraph in any special way.
480
481							=head2 block
482
483							A Pod block is a series of paragraphs beginning with any directive except
484							"=cut" and ending with the first occurence of a "=cut" directive or the
485							end of the input, whichever comes first.
486
487							=head2 piece
488
489							This is a term I'm introducting myself. A piece is just a hash containing info
490							on a parsed piece of the original string. Each piece is either pod or not pod.
491							If it's pod it describes the kind of pod. If it's not, it contains a 'non_pod'
492							entry. All pieces also include the start and end offsets into the original
493							string (starting at 0) as 'start_pos' and 'end_pos', respectively.
494
495							=head1 AUTHOR
496
497							Stephen R. Scaffidi
498
499							=head1 COPYRIGHT AND LICENSE
500
501							This software is copyright (c) 2010 by Stephen R. Scaffidi.
502
503							This is free software; you can redistribute it and/or modify it under
504							the same terms as the Perl 5 programming language system itself.
505
506							=cut
507
508
509							__END__