File Coverage

blib/lib/Pod/Markdown.pm
Criterion Covered Total %
statement 395 403 98.0
branch 124 134 92.5
condition 33 43 76.7
subroutine 99 101 98.0
pod 11 59 18.6
total 662 740 89.4


line stmt bran cond sub pod time code
1             # vim: set ts=2 sts=2 sw=2 expandtab smarttab:
2             #
3             # This file is part of Pod-Markdown
4             #
5             # This software is copyright (c) 2011 by Randy Stauner.
6             #
7             # This is free software; you can redistribute it and/or modify it under
8             # the same terms as the Perl 5 programming language system itself.
9             #
10 16     16   1448355 use 5.008;
  16         240  
11 16     16   113 use strict;
  16         31  
  16         393  
12 16     16   94 use warnings;
  16         36  
  16         1767  
13              
14             package Pod::Markdown;
15             # git description: v3.200-4-gd31d626
16              
17             our $AUTHORITY = 'cpan:RWSTAUNER';
18             # ABSTRACT: Convert POD to Markdown
19             $Pod::Markdown::VERSION = '3.300';
20 16     16   10314 use Pod::Simple 3.27 (); # detected_encoding and keep_encoding bug fix
  16         510429  
  16         596  
21 16     16   8333 use parent qw(Pod::Simple::Methody);
  16         4635  
  16         121  
22 16     16   21769 use Encode ();
  16         160433  
  16         399  
23 16     16   6872 use URI::Escape ();
  16         24173  
  16         2481  
24              
25             our %URL_PREFIXES = (
26             sco => 'http://search.cpan.org/perldoc?',
27             metacpan => 'https://metacpan.org/pod/',
28             man => 'http://man.he.net/man',
29             );
30             $URL_PREFIXES{perldoc} = $URL_PREFIXES{metacpan};
31              
32             our $LOCAL_MODULE_RE = qr/^(Local::|\w*?_\w*)/;
33              
34             ## no critic
35             #{
36             our $HAS_HTML_ENTITIES;
37              
38             # Stolen from Pod::Simple::XHTML 3.28. {{{
39              
40             BEGIN {
41 16     16   972 $HAS_HTML_ENTITIES = eval "require HTML::Entities; 1";
42             }
43              
44             my %entities = (
45             q{>} => 'gt',
46             q{<} => 'lt',
47             q{'} => '#39',
48             q{"} => 'quot',
49             q{&} => 'amp',
50             );
51              
52             sub encode_entities {
53 234     234 0 455 my $self = shift;
54 234         427 my $ents = $self->html_encode_chars;
55 234 100       690 return HTML::Entities::encode_entities( $_[0], $ents ) if $HAS_HTML_ENTITIES;
56 49 100       91 if (defined $ents) {
57 48         110 $ents =~ s,(?
58 48         97 $ents =~ s,(?
59             } else {
60 1         6 $ents = join '', keys %entities;
61             }
62 49         74 my $str = $_[0];
63 49   66     290 $str =~ s/([$ents])/'&' . ($entities{$1} || sprintf '#x%X', ord $1) . ';'/ge;
  23         160  
64 49         174 return $str;
65             }
66              
67             # }}}
68              
69             # Add a few very common ones for consistency and readability
70             # (in case HTML::Entities isn't available).
71             %entities = (
72             # Pod::Markdown has always required 5.8 so unicode_to_native will be available.
73             chr(utf8::unicode_to_native(0xA0)) => 'nbsp',
74             chr(utf8::unicode_to_native(0xA9)) => 'copy',
75             %entities
76             );
77              
78             sub __entity_encode_ord_he {
79 11     11   295 my $chr = chr $_[0];
80             # Skip the encode_entities() logic and go straight for the substitution
81             # since we already have the char we know we want replaced.
82             # Both the hash and the function are documented as exportable (so should be reliable).
83 11   33     57 return $HTML::Entities::char2entity{ $chr } || HTML::Entities::num_entity( $chr );
84             }
85             sub __entity_encode_ord_basic {
86 7   33 7   327 return '&' . ($entities{chr $_[0]} || sprintf '#x%X', $_[0]) . ';';
87             }
88              
89             # From HTML::Entities 3.69
90             my $DEFAULT_ENTITY_CHARS = '^\n\r\t !\#\$%\(-;=?-~';
91              
92             #}
93             ## use critic
94              
95             # Use hash for simple "exists" check in `new` (much more accurate than `->can`).
96             my %attributes = map { ($_ => 1) }
97             qw(
98             html_encode_chars
99             match_encoding
100             output_encoding
101             local_module_re
102             local_module_url_prefix
103             man_url_prefix
104             perldoc_url_prefix
105             perldoc_fragment_format
106             markdown_fragment_format
107             include_meta_tags
108             escape_url
109             );
110              
111              
112             sub new {
113 305     305 1 220551 my $class = shift;
114 305         988 my %args = @_;
115              
116 305         1069 my $self = $class->SUPER::new();
117 305         6286 $self->preserve_whitespace(1);
118 305         2552 $self->nbsp_for_S(1);
119 305         1926 $self->accept_targets(qw( markdown html ));
120 305         5908 $self->escape_url(1);
121              
122             # Default to the global, but allow it to be overwritten in args.
123 305         1968 $self->local_module_re($LOCAL_MODULE_RE);
124              
125 305         1641 for my $type ( qw( perldoc man ) ){
126 610         1156 my $attr = $type . '_url_prefix';
127             # Initialize to the alias.
128 610         1516 $self->$attr($type);
129             }
130              
131 305         1066 while( my ($attr, $val) = each %args ){
132             # NOTE: Checking exists on a private var means we don't allow Pod::Simple
133             # attributes to be set this way. It's not very consistent, but I think
134             # I'm ok with that for now since there probably aren't many Pod::Simple attributes
135             # being changed besides `output_*` which feel like API rather than attributes.
136             # We'll see.
137             # This is currently backward-compatible as we previously just put the attribute
138             # into the private stash so anything unknown was silently ignored.
139             # We could open this up to `$self->can($attr)` in the future if that seems better
140             # but it tricked me when I was testing a misspelled attribute name
141             # which also happened to be a Pod::Simple method.
142              
143 910 100       3903 exists $attributes{ $attr } or
144             # Provide a more descriptive message than "Can't locate object method".
145             warn("Unknown argument to ${class}->new(): '$attr'"), next;
146              
147             # Call setter.
148 908         2037 $self->$attr($val);
149             }
150              
151             # TODO: call from the setters.
152 305         1888 $self->_prepare_fragment_formats;
153              
154 305 100 100     647 if(defined $self->local_module_url_prefix && $self->local_module_url_prefix eq '' && !$self->escape_url) {
      100        
155 1         19 warn("turning escape_url with an empty local_module_url_prefix is not recommended as relative URLs could be confused for IPv6 addresses");
156             }
157              
158 305         1030 return $self;
159             }
160              
161             for my $type ( qw( local_module perldoc man ) ){
162             my $attr = $type . '_url_prefix';
163 16     16   164 no strict 'refs'; ## no critic
  16         43  
  16         100219  
164             *$attr = sub {
165 1325     1325   17660 my $self = shift;
166 1325 100       2504 if (@_) {
167 853   100     3741 $self->{$attr} = $URL_PREFIXES{ $_[0] } || $_[0];
168             }
169             else {
170 472         1496 return $self->{$attr};
171             }
172             }
173             }
174              
175             ## Attribute accessors ##
176              
177              
178             sub html_encode_chars {
179 305     305 1 410 my $self = shift;
180 305         479 my $stash = $self->_private;
181              
182             # Setter.
183 305 100       622 if( @_ ){
184             # If false ('', 0, undef), disable.
185 71 50       157 if( !$_[0] ){
186 0         0 delete $stash->{html_encode_chars};
187 0         0 $stash->{encode_amp} = 1;
188 0         0 $stash->{encode_lt} = 1;
189             }
190             else {
191             # Special case boolean '1' to mean "all".
192             # If we have HTML::Entities, undef will use the default.
193             # Without it, we need to specify so that we use the same list (for consistency).
194 71 100       190 $stash->{html_encode_chars} = $_[0] eq '1' ? ($HAS_HTML_ENTITIES ? undef : $DEFAULT_ENTITY_CHARS) : $_[0];
    100          
195              
196             # If [char] doesn't get encoded, we need to do it ourselves.
197 71         149 $stash->{encode_amp} = ($self->encode_entities('&') eq '&');
198 71         2735 $stash->{encode_lt} = ($self->encode_entities('<') eq '<');
199             }
200 71         1821 return;
201             }
202              
203             # Getter.
204 234         423 return $stash->{html_encode_chars};
205             }
206              
207              
208             # I prefer ro-accessors (immutability!) but it can be confusing
209             # to not support the same API as other Pod::Simple classes.
210              
211             # NOTE: Pod::Simple::_accessorize is not a documented public API.
212             # Skip any that have already been defined.
213             __PACKAGE__->_accessorize(grep { !__PACKAGE__->can($_) } keys %attributes);
214              
215             sub _prepare_fragment_formats {
216 305     305   501 my ($self) = @_;
217              
218 305         969 foreach my $attr ( keys %attributes ){
219 3355 100       8492 next unless $attr =~ /^(\w+)_fragment_format/;
220 610         1407 my $type = $1;
221 610         1437 my $format = $self->$attr;
222              
223             # If one was provided.
224 610 100       3646 if( $format ){
225             # If the attribute is a coderef just use it.
226 510 100       1331 next if ref($format) eq 'CODE';
227             }
228             # Else determine a default.
229             else {
230 100 100       210 if( $type eq 'perldoc' ){
231             # Choose a default that matches the destination url.
232 49         98 my $target = $self->perldoc_url_prefix;
233 49         100 foreach my $alias ( qw( metacpan sco ) ){
234 98 100       251 if( $target eq $URL_PREFIXES{ $alias } ){
235 40         71 $format = $alias;
236             }
237             }
238             # This seems like a reasonable fallback.
239 49   100     139 $format ||= 'pod_simple_xhtml';
240             }
241             else {
242 51         104 $format = $type;
243             }
244             }
245              
246             # The short name should become a method name with the prefix prepended.
247 146         203 my $prefix = 'format_fragment_';
248 146         586 $format =~ s/^$prefix//;
249 146 50       613 die "Unknown fragment format '$format'"
250             unless $self->can($prefix . $format);
251              
252             # Save it.
253 146         378 $self->$attr($format);
254             }
255              
256 305         609 return;
257             }
258              
259             ## Backward compatible API ##
260              
261             # For backward compatibility (previously based on Pod::Parser):
262             # While Pod::Simple provides a parse_from_file() method
263             # it's primarily for Pod::Parser compatibility.
264             # When called without an output handle it will print to STDOUT
265             # but the old Pod::Markdown never printed to a handle
266             # so we don't want to start now.
267             sub parse_from_file {
268 10     10 1 28 my ($self, $file) = @_;
269              
270             # TODO: Check that all dependent cpan modules use the Pod::Simple API
271             # then add a deprecation warning here to avoid confusion.
272              
273 10         44 $self->output_string(\($self->{_as_markdown_}));
274 10         2700 $self->parse_file($file);
275             }
276              
277             # Likewise, though Pod::Simple doesn't define this method at all.
278 9     9 0 460 sub parse_from_filehandle { shift->parse_from_file(@_) }
279              
280              
281             ## Document state ##
282              
283             sub _private {
284 9811     9811   14060 my ($self) = @_;
285             $self->{_Pod_Markdown_} ||= {
286 9811   100     31003 indent => 0,
287             stacks => [],
288             states => [{}],
289             link => [],
290             encode_amp => 1,
291             encode_lt => 1,
292             };
293             }
294              
295             sub _increase_indent {
296 67 50   67   114 ++$_[0]->_private->{indent} >= 1
297             or die 'Invalid state: indent < 0';
298             }
299             sub _decrease_indent {
300 67 50   67   163 --$_[0]->_private->{indent} >= 0
301             or die 'Invalid state: indent < 0';
302             }
303              
304             sub _new_stack {
305 1008     1008   1464 push @{ $_[0]->_private->{stacks} }, [];
  1008         1867  
306 1008         1568 push @{ $_[0]->_private->{states} }, {};
  1008         1681  
307             }
308              
309             sub _last_string {
310 18     18   29 $_[0]->_private->{stacks}->[-1][-1];
311             }
312              
313             sub _pop_stack_text {
314 717     717   1042 $_[0]->_private->{last_state} = pop @{ $_[0]->_private->{states} };
  717         1269  
315 717         1093 join '', @{ pop @{ $_[0]->_private->{stacks} } };
  717         875  
  717         1153  
316             }
317              
318             sub _stack_state {
319 432     432   757 $_[0]->_private->{states}->[-1];
320             }
321              
322             sub _save {
323 1346     1346   2593 my ($self, $text) = @_;
324 1346         1700 push @{ $self->_private->{stacks}->[-1] }, $text;
  1346         2087  
325             # return $text; # DEBUG
326             }
327              
328             sub _save_line {
329 495     495   905 my ($self, $text) = @_;
330              
331 495         945 $text = $self->_process_escapes($text);
332              
333 495         1429 $self->_save($text . $/);
334             }
335              
336             # For paragraphs, etc.
337             sub _save_block {
338 430     430   774 my ($self, $text) = @_;
339              
340 430         805 $self->_stack_state->{blocks}++;
341              
342 430         1021 $self->_save_line($self->_indent($text) . $/);
343             }
344              
345             ## Formatting ##
346              
347             sub _chomp_all {
348 333     333   588 my ($self, $text) = @_;
349 333         1268 1 while chomp $text;
350 333         993 return $text;
351             }
352              
353             sub _indent {
354 473     473   778 my ($self, $text) = @_;
355 473         789 my $level = $self->_private->{indent};
356              
357 473 100       1038 if( $level ){
358 34         85 my $indent = ' ' x ($level * 4);
359              
360             # Capture text on the line so that we don't indent blank lines (/^\x20{4}$/).
361 34         222 $text =~ s/^(.+)/$indent$1/mg;
362             }
363              
364 473         1639 return $text;
365             }
366              
367             # as_markdown() exists solely for backward compatibility
368             # and requires having called parse_from_file() to be useful.
369              
370              
371             sub as_markdown {
372 10     10 0 454 my ($parser, %args) = @_;
373 10         16 my @header;
374             # Don't add meta tags again if we've already done it.
375 10 100 100     36 if( $args{with_meta} && !$parser->include_meta_tags ){
376 3         27 @header = $parser->_build_markdown_head;
377             }
378 10         61 return join("\n" x 2, @header, $parser->{_as_markdown_});
379             }
380              
381             sub _build_markdown_head {
382 9     9   17 my $parser = shift;
383 9         13 my $data = $parser->_private;
384             return join "\n",
385 12         75 map { qq![[meta \l$_="$data->{$_}"]]! }
386 9         17 grep { defined $data->{$_} }
  18         45  
387             qw( Title Author );
388             }
389              
390             ## Escaping ##
391              
392             # http://daringfireball.net/projects/markdown/syntax#backslash
393             # Markdown provides backslash escapes for the following characters:
394             #
395             # \ backslash
396             # ` backtick
397             # * asterisk
398             # _ underscore
399             # {} curly braces
400             # [] square brackets
401             # () parentheses
402             # # hash mark
403             # + plus sign
404             # - minus sign (hyphen)
405             # . dot
406             # ! exclamation mark
407              
408             # However some of those only need to be escaped in certain places:
409             # * Backslashes *do* need to be escaped or they may be swallowed by markdown.
410             # * Word-surrounding characters (/[`*_]/) *do* need to be escaped mid-word
411             # because the markdown spec explicitly allows mid-word em*pha*sis.
412             # * I don't actually see anything that curly braces are used for.
413             # * Escaping square brackets is enough to avoid accidentally
414             # creating links and images (so we don't need to escape plain parentheses
415             # or exclamation points as that would generate a lot of unnecesary noise).
416             # Parentheses will be escaped in urls (&end_L) to avoid premature termination.
417             # * We don't need a backslash for every hash mark or every hyphen found mid-word,
418             # just the ones that start a line (likewise for plus and dot).
419             # (Those will all be handled by _escape_paragraph_markdown).
420              
421              
422             # Backslash escape markdown characters to avoid having them interpreted.
423             sub _escape_inline_markdown {
424 461     461   704 local $_ = $_[1];
425              
426             # s/([\\`*_{}\[\]()#+-.!])/\\$1/g; # See comments above.
427 461         1142 s/([\\`*_\[\]])/\\$1/g;
428              
429 461         928 return $_;
430             }
431              
432             # Escape markdown characters that would be interpreted
433             # at the start of a line.
434             sub _escape_paragraph_markdown {
435 343     343   546 local $_ = $_[1];
436              
437             # Escape headings, horizontal rules, (unordered) lists, and blockquotes.
438 343         857 s/^([-+#>])/\\$1/mg;
439              
440             # Markdown doesn't support backslash escapes for equal signs
441             # even though they can be used to underline a header.
442             # So use html to escape them to avoid having them interpreted.
443 343         635 s/^([=])/sprintf '&#x%x;', ord($1)/mge;
  1         8  
444              
445             # Escape the dots that would wrongfully create numbered lists.
446 343         561 s/^( (?:>\s+)? \d+ ) (\.\x20)/$1\\$2/xgm;
447              
448 343         656 return $_;
449             }
450              
451              
452             # Additionally Markdown allows inline html so we need to escape things that look like it.
453             # While _some_ Markdown processors handle backslash-escaped html,
454             # [Daring Fireball](http://daringfireball.net/projects/markdown/syntax) states distinctly:
455             # > In HTML, there are two characters that demand special treatment: < and &...
456             # > If you want to use them as literal characters, you must escape them as entities, e.g. <, and &.
457              
458             # It goes on to say:
459             # > Markdown allows you to use these characters naturally,
460             # > taking care of all the necessary escaping for you.
461             # > If you use an ampersand as part of an HTML entity,
462             # > it remains unchanged; otherwise it will be translated into &.
463             # > Similarly, because Markdown supports inline HTML,
464             # > if you use angle brackets as delimiters for HTML tags, Markdown will treat them as such.
465              
466             # In order to only encode the occurrences that require it (something that
467             # could be interpreted as an entity) we escape them all so that we can do the
468             # suffix test later after the string is complete (since we don't know what
469             # strings might come after this one).
470              
471             my %_escape =
472             map {
473             my ($k, $v) = split /:/;
474             # Put the "code" marker before the char instead of after so that it doesn't
475             # get confused as the $2 (which is what requires us to entity-encode it).
476             # ( "XsX", "XcsX", "X(c?)sX" )
477             my ($s, $code, $re) = map { "\0$_$v\0" } '', map { ($_, '('.$_.'?)') } 'c';
478              
479             (
480             $k => $s,
481             $k.'_code' => $code,
482             $k.'_re' => qr/$re/,
483             )
484             }
485             qw( amp:& lt:< );
486              
487             # Make the values of this private var available to the tests.
488 1     1   89 sub __escape_sequences { %_escape }
489              
490              
491             # HTML-entity encode any characters configured by the user.
492             # If that doesn't include [&<] then we escape those chars so we can decide
493             # later if we will entity-encode them or put them back verbatim.
494             sub _encode_or_escape_entities {
495 461     461   646 my $self = $_[0];
496 461         756 my $stash = $self->_private;
497 461         751 local $_ = $_[1];
498              
499 461 100       869 if( $stash->{encode_amp} ){
    50          
500 441 100       775 if( exists($stash->{html_encode_chars}) ){
501             # Escape all amps for later processing.
502             # Pass intermediate strings to entity encoder so that it doesn't
503             # process any of the characters of our escape sequences.
504             # Use -1 to get "as many fields as possible" so that we keep leading and
505             # trailing (possibly empty) fields.
506 38         129 $_ = join $_escape{amp}, map { $self->encode_entities($_) } split /&/, $_, -1;
  57         823  
507             }
508             else {
509 403         835 s/&/$_escape{amp}/g;
510             }
511             }
512             elsif( exists($stash->{html_encode_chars}) ){
513 20         46 $_ = $self->encode_entities($_);
514             }
515              
516             s/
517 461 100       2730 if $stash->{encode_lt};
518              
519 461         916 return $_;
520             }
521              
522             # From Markdown.pl version 1.0.1 line 1172 (_DoAutoLinks).
523             my $EMAIL_MARKER = qr{
524             # < # Opening token is in parent regexp.
525             (?:mailto:)?
526             (
527             [-.\w]+
528             \@
529             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
530             )
531             >
532             }x;
533              
534             # Process any escapes we put in the text earlier,
535             # now that the text is complete (end of a block).
536             sub _process_escapes {
537 495     495   692 my $self = $_[0];
538 495         799 my $stash = $self->_private;
539 495         812 local $_ = $_[1];
540              
541             # The patterns below are taken from Markdown.pl 1.0.1 _EncodeAmpsAndAngles().
542             # In this case we only want to encode the ones that Markdown won't.
543             # This is overkill but produces nicer looking text (less escaped entities).
544             # If it proves insufficent then we'll just encode them all.
545              
546             # $1: If the escape was in a code sequence, simply replace the original.
547             # $2: If the unescaped value would be followed by characters
548             # that could be interpreted as html, entity-encode it.
549             # else: The character is safe to leave bare.
550              
551             # Neither currently allows $2 to contain '0' so bool tests are sufficient.
552              
553 495 100       951 if( $stash->{encode_amp} ){
554             # Encode & if succeeded by chars that look like an html entity.
555 462         1617 s,$_escape{amp_re}((?:#?[xX]?(?:[0-9a-fA-F]+|\w+);)?),
556 90 100       438 $1 ? '&'.$2 : $2 ? '&'.$2 : '&',egos;
    100          
557             }
558              
559 495 100       954 if( $stash->{encode_lt} ){
560             # Encode < if succeeded by chars that look like an html tag.
561             # Leave email addresses () for Markdown to process.
562 462         2357 s,$_escape{lt_re}((?=$EMAIL_MARKER)|(?:[a-z/?\$!])?),
563 72 100       374 $1 ? '<'.$2 : $2 ? '<'.$2 : '<',egos;
    100          
564             }
565              
566 495         959 return $_;
567             }
568              
569              
570             ## Parsing ##
571              
572             sub handle_text {
573 544     544 0 6747 my $self = $_[0];
574 544         919 my $stash = $self->_private;
575 544         1011 local $_ = $_[1];
576              
577             # Unless we're in a code span, verbatim block, or formatted region.
578 544 100       1076 unless( $stash->{no_escape} ){
579              
580             # We could, in theory, alter what gets escaped according to context
581             # (for example, escape square brackets (but not parens) inside link text).
582             # The markdown produced might look slightly nicer but either way you're
583             # at the whim of the markdown processor to interpret things correctly.
584             # For now just escape everything.
585              
586             # Don't let literal characters be interpreted as markdown.
587 461         921 $_ = $self->_escape_inline_markdown($_);
588              
589             # Entity-encode (or escape for later processing) necessary/desired chars.
590 461         955 $_ = $self->_encode_or_escape_entities($_);
591              
592             }
593             # If this _is_ a code section, do limited/specific handling.
594             else {
595             # Always escaping these chars ensures that we won't mangle the text
596             # in the unlikely event that a sequence matching our escape occurred in the
597             # input stream (since we're going to escape it and then unescape it).
598 83 100       327 s/&/$_escape{amp_code}/gos if $stash->{encode_amp};
599 83 100       282 s/{encode_lt};
600             }
601              
602 544         1147 $self->_save($_);
603             }
604              
605             sub start_Document {
606 291     291 0 104392 my ($self) = @_;
607 291         696 $self->_new_stack;
608             }
609              
610             sub end_Document {
611 291     291 0 22450 my ($self) = @_;
612 291         722 $self->_check_search_header;
613 291         447 my $end = pop @{ $self->_private->{stacks} };
  291         466  
614              
615 291 50       405 @{ $self->_private->{stacks} } == 0
  291         480  
616             or die 'Document ended with stacks remaining';
617              
618 291         944 my @doc = $self->_chomp_all(join('', @$end)) . $/;
619              
620 291 100       889 if( $self->include_meta_tags ){
621 6         50 unshift @doc, $self->_build_markdown_head, ($/ x 2);
622             }
623              
624 291 100       2136 if( my $encoding = $self->_get_output_encoding ){
625             # Do the check outside the loop(s) for efficiency.
626 55 100       516 my $ents = $HAS_HTML_ENTITIES ? \&__entity_encode_ord_he : \&__entity_encode_ord_basic;
627             # Iterate indices to avoid copying large strings.
628 55         164 for my $i ( 0 .. $#doc ){
629 55         78 print { $self->{output_fh} } Encode::encode($encoding, $doc[$i], $ents);
  55         166  
630             }
631             }
632             else {
633 236         1276 print { $self->{output_fh} } @doc;
  236         989  
634             }
635             }
636              
637             sub _get_output_encoding {
638 291     291   509 my ($self) = @_;
639              
640             # If 'match_encoding' is set we need to return an encoding.
641             # If pod has no =encoding, Pod::Simple will guess if it sees a high-bit char.
642             # If there are no high-bit chars, encoding is undef.
643             # Use detected_encoding() rather than encoding() because if Pod::Simple
644             # can't use whatever encoding was specified, we probably can't either.
645             # Fallback to 'o_e' if no match is found. This gives the user the choice,
646             # since otherwise there would be no reason to specify 'o_e' *and* 'm_e'.
647             # Fallback to UTF-8 since it is a reasonable default these days.
648              
649 291 100 100     601 return $self->detected_encoding || $self->output_encoding || 'UTF-8'
650             if $self->match_encoding;
651              
652             # If output encoding wasn't specified, return false.
653 279         1659 return $self->output_encoding;
654             }
655              
656             ## Blocks ##
657              
658             sub start_Verbatim {
659 17     17 0 5123 my ($self) = @_;
660 17         54 $self->_new_stack;
661 17         37 $self->_private->{no_escape} = 1;
662             }
663              
664             sub end_Verbatim {
665 17     17 0 159 my ($self) = @_;
666              
667 17         39 my $text = $self->_pop_stack_text;
668              
669 17         46 $text = $self->_indent_verbatim($text);
670              
671 17         41 $self->_private->{no_escape} = 0;
672              
673             # Verbatim blocks do not generate a separate "Para" event.
674 17         41 $self->_save_block($text);
675             }
676              
677             sub _indent_verbatim {
678 17     17   37 my ($self, $paragraph) = @_;
679              
680             # NOTE: Pod::Simple expands the tabs for us (as suggested by perlpodspec).
681             # Pod::Simple also has a 'strip_verbatim_indent' attribute
682             # but it doesn't sound like it gains us anything over this method.
683              
684             # POD verbatim can start with any number of spaces (or tabs)
685             # markdown should be 4 spaces (or a tab)
686             # so indent any paragraphs so that all lines start with at least 4 spaces
687 17         63 my @lines = split /\n/, $paragraph;
688 17         31 my $indent = ' ' x 4;
689 17         56 foreach my $line ( @lines ){
690 32 100       116 next unless $line =~ m/^( +)/;
691             # find the smallest indentation
692 31 100       106 $indent = $1 if length($1) < length($indent);
693             }
694 17 100       65 if( (my $smallest = length($indent)) < 4 ){
695             # invert to get what needs to be prepended
696 11         31 $indent = ' ' x (4 - $smallest);
697              
698             # Prepend indent to each line.
699             # We could check /\S/ to only indent non-blank lines,
700             # but it's backward compatible to respect the whitespace.
701             # Additionally, both pod and markdown say they ignore blank lines
702             # so it shouldn't hurt to leave them in.
703 11 100       27 $paragraph = join "\n", map { length($_) ? $indent . $_ : '' } @lines;
  23         88  
704             }
705              
706 17         46 return $paragraph;
707             }
708              
709             sub start_Para {
710 343     343 0 147466 $_[0]->_new_stack;
711             }
712              
713             sub end_Para {
714 343     343 0 3687 my ($self) = @_;
715 343         642 my $text = $self->_pop_stack_text;
716              
717 343         799 $text = $self->_escape_paragraph_markdown($text);
718              
719 343         723 $self->_save_block($text);
720             }
721              
722              
723             ## Headings ##
724              
725 40     40 0 6166 sub start_head1 { $_[0]->_start_head(1) }
726 40     40 0 445 sub end_head1 { $_[0]->_end_head(1) }
727 11     11 0 3425 sub start_head2 { $_[0]->_start_head(2) }
728 11     11 0 151 sub end_head2 { $_[0]->_end_head(2) }
729 1     1 0 413 sub start_head3 { $_[0]->_start_head(3) }
730 1     1 0 12 sub end_head3 { $_[0]->_end_head(3) }
731 0     0 0 0 sub start_head4 { $_[0]->_start_head(4) }
732 0     0 0 0 sub end_head4 { $_[0]->_end_head(4) }
733              
734             sub _check_search_header {
735 343     343   563 my ($self) = @_;
736             # Save the text since the last heading if we want it for metadata.
737 343 100       582 if( my $last = $self->_private->{search_header} ){
738 18         39 for( $self->_private->{$last} = $self->_last_string ){
739 18         44 s/\A\s+//;
740 18         96 s/\s+\z//;
741             }
742             }
743             }
744             sub _start_head {
745 52     52   103 my ($self) = @_;
746 52         137 $self->_check_search_header;
747 52         118 $self->_new_stack;
748             }
749              
750             sub _end_head {
751 52     52   90 my ($self, $num) = @_;
752 52         110 my $h = '#' x $num;
753              
754 52         146 my $text = $self->_pop_stack_text;
755             $self->_private->{search_header} =
756 52 100       240 $text =~ /NAME/ ? 'Title'
    100          
757             : $text =~ /AUTHOR/ ? 'Author'
758             : undef;
759              
760             # TODO: option for $h suffix
761             # TODO: put a name="" if $self->{embed_anchor_tags}; ?
762             # https://rt.cpan.org/Ticket/Display.html?id=57776
763 52         212 $self->_save_block(join(' ', $h, $text));
764             }
765              
766             ## Lists ##
767              
768             # With Pod::Simple->parse_empty_lists(1) there could be an over_empty event,
769             # but what would you do with that?
770              
771             sub _start_list {
772 22     22   44 my ($self) = @_;
773 22         51 $self->_new_stack;
774              
775             # Nest again b/c start_item will pop this to look for preceding content.
776 22         54 $self->_increase_indent;
777 22         43 $self->_new_stack;
778             }
779              
780             sub _end_list {
781 22     22   47 my ($self) = @_;
782 22         55 $self->_handle_between_item_content;
783              
784             # Finish the list.
785              
786             # All the child elements should be blocks,
787             # but don't end with a double newline.
788 22         58 my $text = $self->_chomp_all($self->_pop_stack_text);
789              
790 22         95 $_[0]->_save_line($text . $/);
791             }
792              
793             sub _handle_between_item_content {
794 65     65   104 my ($self) = @_;
795              
796             # This might be empty (if the list item had no additional content).
797 65 100       123 if( my $text = $self->_pop_stack_text ){
798             # Else it's a sub-document.
799             # If there are blocks we need to separate with blank lines.
800 21 100       40 if( $self->_private->{last_state}->{blocks} ){
801 16         55 $text = $/ . $text;
802             }
803             # If not, we can condense the text.
804             # In this module's history there was a patch contributed to specifically
805             # produce "huddled" lists so we'll try to maintain that functionality.
806             else {
807 5         30 $text = $self->_chomp_all($text) . $/;
808             }
809 21         51 $self->_save($text)
810             }
811              
812 65         142 $self->_decrease_indent;
813             }
814              
815             sub _start_item {
816 43     43   84 my ($self) = @_;
817 43         124 $self->_handle_between_item_content;
818 43         84 $self->_new_stack;
819             }
820              
821             sub _end_item {
822 43     43   79 my ($self, $marker) = @_;
823 43         74 my $text = $self->_pop_stack_text;
824 43 100 66     266 $self->_save_line($self->_indent($marker .
825             # Add a space only if there is text after the marker.
826             (defined($text) && length($text) ? ' ' . $text : '')
827             ));
828              
829             # Store any possible contents in a new stack (like a sub-document).
830 43         121 $self->_increase_indent;
831 43         84 $self->_new_stack;
832             }
833              
834 7     7 0 1635 sub start_over_bullet { $_[0]->_start_list }
835 7     7 0 647 sub end_over_bullet { $_[0]->_end_list }
836              
837 18     18 0 6163 sub start_item_bullet { $_[0]->_start_item }
838 18     18 0 178 sub end_item_bullet { $_[0]->_end_item('-') }
839              
840 9     9 0 2355 sub start_over_number { $_[0]->_start_list }
841 9     9 0 1481 sub end_over_number { $_[0]->_end_list }
842              
843             sub start_item_number {
844 18     18 0 5472 $_[0]->_start_item;
845             # It seems like this should be a stack,
846             # but from testing it appears that the corresponding 'end' event
847             # comes right after the text (it doesn't surround any embedded content).
848             # See t/nested.t which shows start-item, text, end-item, para, start-item....
849 18         39 $_[0]->_private->{item_number} = $_[1]->{number};
850             }
851              
852             sub end_item_number {
853 18     18 0 194 my ($self) = @_;
854 18         36 $self->_end_item($self->_private->{item_number} . '.');
855             }
856              
857             # Markdown doesn't support definition lists
858             # so do regular (unordered) lists with indented paragraphs.
859 6     6 0 1570 sub start_over_text { $_[0]->_start_list }
860 6     6 0 685 sub end_over_text { $_[0]->_end_list }
861              
862 7     7 0 2134 sub start_item_text { $_[0]->_start_item }
863 7     7 0 74 sub end_item_text { $_[0]->_end_item('-')}
864              
865              
866             # perlpodspec equates an over/back region with no items to a blockquote.
867             sub start_over_block {
868             # NOTE: We don't actually need to indent for a blockquote.
869 3     3 0 981 $_[0]->_new_stack;
870             }
871              
872             sub end_over_block {
873 3     3 0 540 my ($self) = @_;
874              
875             # Chomp first to avoid prefixing a blank line with a `>`.
876 3         10 my $text = $self->_chomp_all($self->_pop_stack_text);
877              
878             # NOTE: Paragraphs will already be escaped.
879              
880             # I don't really like either of these implementations
881             # but the join/map/split seems a little better and benches a little faster.
882             # You would lose the last newline but we've already chomped.
883             #$text =~ s{^(.)?}{'>' . (defined($1) && length($1) ? (' ' . $1) : '')}mge;
884 3 100       62 $text = join $/, map { length($_) ? '> ' . $_ : '>' } split qr-$/-, $text;
  22         62  
885              
886 3         18 $self->_save_block($text);
887             }
888              
889             ## Custom Formats ##
890              
891             sub start_for {
892 10     10 0 2638 my ($self, $attr) = @_;
893 10         60 $self->_new_stack;
894              
895 10 100       35 if( $attr->{target} eq 'html' ){
896             # Use another stack so we can indent
897             # (not syntactily necessary but seems appropriate).
898 2         6 $self->_new_stack;
899 2         7 $self->_increase_indent;
900 2         5 $self->_private->{no_escape} = 1;
901             # Mark this so we know to undo it.
902 2         7 $self->_stack_state->{for_html} = 1;
903             }
904             }
905              
906             sub end_for {
907 10     10 0 718 my ($self) = @_;
908             # Data gets saved as a block (which will handle indents),
909             # but if there was html we'll alter this, so chomp and save a block again.
910 10         19 my $text = $self->_chomp_all($self->_pop_stack_text);
911              
912 10 100       21 if( $self->_private->{last_state}->{for_html} ){
913 2         4 $self->_private->{no_escape} = 0;
914             # Save it to the next stack up so we can pop it again (we made two stacks).
915 2         5 $self->_save($text);
916 2         7 $self->_decrease_indent;
917 2         6 $text = join "\n", '
', $self->_chomp_all($self->_pop_stack_text), '
';
918             }
919              
920 10         22 $self->_save_block($text);
921             }
922              
923             # Data events will be emitted for any formatted regions that have been enabled
924             # (by default, `markdown` and `html`).
925              
926             sub start_Data {
927 5     5 0 421 my ($self) = @_;
928             # TODO: limit this to what's in attr?
929 5         12 $self->_private->{no_escape}++;
930 5         12 $self->_new_stack;
931             }
932              
933             sub end_Data {
934 5     5 0 48 my ($self) = @_;
935 5         12 my $text = $self->_pop_stack_text;
936 5         11 $self->_private->{no_escape}--;
937 5         12 $self->_save_block($text);
938             }
939              
940             ## Codes ##
941              
942 94     94 0 927 sub start_B { $_[0]->_save('**') }
943 47     47 0 485 sub end_B { $_[0]->start_B() }
944              
945 36     36 0 333 sub start_I { $_[0]->_save('_') }
946 18     18 0 213 sub end_I { $_[0]->start_I() }
947              
948             sub start_C {
949 61     61 0 958 my ($self) = @_;
950 61         186 $self->_new_stack;
951 61         126 $self->_private->{no_escape}++;
952             }
953              
954             sub end_C {
955 61     61 0 560 my ($self) = @_;
956 61         112 $self->_private->{no_escape}--;
957 61         140 $self->_save( $self->_wrap_code_span($self->_pop_stack_text) );
958             }
959              
960             # Use code spans for F<>.
961 4     4 0 69 sub start_F { shift->start_C(@_); }
962 4     4 0 44 sub end_F { shift ->end_C(@_); }
963              
964             sub start_L {
965 93     93 0 1542 my ($self, $flags) = @_;
966 93         206 $self->_new_stack;
967 93         140 push @{ $self->_private->{link} }, $flags;
  93         157  
968             }
969              
970             sub end_L {
971 93     93 0 896 my ($self) = @_;
972 93 50       121 my $flags = pop @{ $self->_private->{link} }
  93         144  
973             or die 'Invalid state: link end with no link start';
974              
975 93         150 my ($type, $to, $section) = @{$flags}{qw( type to section )};
  93         230  
976              
977 93 50       624 my $url = (
    100          
    100          
978             $type eq 'url' ? $to
979             : $type eq 'man' ? $self->format_man_url($to, $section)
980             : $type eq 'pod' ? $self->format_perldoc_url($to, $section)
981             : undef
982             );
983              
984 93         254 my $text = $self->_pop_stack_text;
985              
986             # NOTE: I don't think the perlpodspec says what to do with L<|blah>
987             # but it seems like a blank link text just doesn't make sense
988 93 100       267 if( !length($text) ){
989 4 100       11 $text =
    100          
990             $section ?
991             $to ? sprintf('"%s" in %s', $section, $to)
992             : ('"' . $section . '"')
993             : $to;
994             }
995              
996             # FIXME: What does Pod::Simple::X?HTML do for this?
997             # if we don't know how to handle the url just print the pod back out
998 93 50       368 if (!$url) {
999 0         0 $self->_save(sprintf 'L<%s>', $flags->{raw});
1000 0         0 return;
1001             }
1002              
1003             # In the url we need to escape quotes and parentheses lest markdown
1004             # break the url (cut it short and/or wrongfully interpret a title).
1005              
1006             # Backslash escapes do not work for the space and quotes.
1007             # URL-encoding the space is not sufficient
1008             # (the quotes confuse some parsers and produce invalid html).
1009             # I've arbitratily chosen HTML encoding to hide them from markdown
1010             # while mangling the url as litle as possible.
1011 93         686 $url =~ s/([ '"])/sprintf '&#x%x;', ord($1)/ge;
  11         74  
1012              
1013             # We also need to double any backslashes that may be present
1014             # (lest they be swallowed up) and stop parens from breaking the url.
1015 93         404 $url =~ s/([\\()])/\\$1/g;
1016              
1017             # TODO: put section name in title if not the same as $text
1018 93         525 $self->_save('[' . $text . '](' . $url . ')');
1019             }
1020              
1021             sub start_X {
1022 1     1 0 19 $_[0]->_new_stack;
1023             }
1024              
1025             sub end_X {
1026 1     1 0 11 my ($self) = @_;
1027 1         4 my $text = $self->_pop_stack_text;
1028             # TODO: mangle $text?
1029             # TODO: put if configured
1030             }
1031              
1032             # A code span can be delimited by multiple backticks (and a space)
1033             # similar to pod codes (C<< code >>), so ensure we use a big enough
1034             # delimiter to not have it broken by embedded backticks.
1035             sub _wrap_code_span {
1036 61     61   138 my ($self, $arg) = @_;
1037 61         99 my $longest = 0;
1038 61         202 while( $arg =~ /([`]+)/g ){
1039 5         11 my $len = length($1);
1040 5 100       21 $longest = $len if $longest < $len;
1041             }
1042 61         143 my $delim = '`' x ($longest + 1);
1043 61 100       149 my $pad = $longest > 0 ? ' ' : '';
1044 61         199 return $delim . $pad . $arg . $pad . $delim;
1045             }
1046              
1047             ## Link Formatting (TODO: Move this to another module) ##
1048              
1049              
1050             sub format_man_url {
1051 6     6 1 14 my ($self, $to) = @_;
1052 6         15 my ($page, $part) = ($to =~ /^ ([^(]+) (?: \( (\S+) \) )? /x);
1053 6   50     171 return $self->man_url_prefix . ($part || 1) . '/' . ($page || $to);
      33        
1054             }
1055              
1056              
1057             sub format_perldoc_url {
1058 70     70 1 139 my ($self, $name, $section) = @_;
1059              
1060 70         158 my $url_prefix = $self->perldoc_url_prefix;
1061 70 100 100     240 if (
      100        
1062             defined($name)
1063             && $self->is_local_module($name)
1064             && defined($self->local_module_url_prefix)
1065             ) {
1066 5         12 $url_prefix = $self->local_module_url_prefix;
1067             }
1068              
1069 70         1413 my $url = '';
1070              
1071             # If the link is to another module (external link).
1072 70 100       164 if ($name) {
1073 59 100       1446 $url = $url_prefix . ($self->escape_url ? URI::Escape::uri_escape($name) : $name);
1074             }
1075              
1076             # See https://rt.cpan.org/Ticket/Display.html?id=57776
1077             # for a discussion on the need to mangle the section.
1078 70 100       2892 if ($section){
1079              
1080 42 100       1255 my $method = $url
1081             # If we already have a prefix on the url it's external.
1082             ? $self->perldoc_fragment_format
1083             # Else an internal link points to this markdown doc.
1084             : $self->markdown_fragment_format;
1085              
1086 42 100       285 $method = 'format_fragment_' . $method
1087             unless ref($method);
1088              
1089             {
1090             # Set topic to enable code refs to be simple.
1091 42         63 local $_ = $section;
  42         60  
1092 42         110 $section = $self->$method($section);
1093             }
1094              
1095 42         147 $url .= '#' . $section;
1096             }
1097              
1098 70         449 return $url;
1099             }
1100              
1101              
1102             # TODO: simple, pandoc, etc?
1103              
1104             sub format_fragment_markdown {
1105 3     3 1 10 my ($self, $section) = @_;
1106              
1107             # If this is an internal link (to another section in this doc)
1108             # we can't be sure what the heading id's will look like
1109             # (it depends on what is rendering the markdown to html)
1110             # but we can try to follow popular conventions.
1111              
1112             # http://johnmacfarlane.net/pandoc/demo/example9/pandocs-markdown.html#header-identifiers-in-html-latex-and-context
1113             #$section =~ s/(?![-_.])[[:punct:]]//g;
1114             #$section =~ s/\s+/-/g;
1115 3         8 $section =~ s/\W+/-/g;
1116 3         127 $section =~ s/-+$//;
1117 3         6 $section =~ s/^-+//;
1118 3         9 $section = lc $section;
1119             #$section =~ s/^[^a-z]+//;
1120 3   50     7 $section ||= 'section';
1121              
1122 3         8 return $section;
1123             }
1124              
1125              
1126             {
1127             # From Pod::Simple::XHTML 3.28.
1128             # The strings gets passed through encode_entities() before idify().
1129             # If we don't do it here the substitutions below won't operate consistently.
1130              
1131             sub format_fragment_pod_simple_xhtml {
1132 9     9 1 19 my ($self, $t) = @_;
1133              
1134             # encode_entities {
1135             # We need to use the defaults in case html_encode_chars has been customized
1136             # (since the purpose is to match what external sources are doing).
1137              
1138 9         19 local $self->_private->{html_encode_chars};
1139 9         26 $t = $self->encode_entities($t);
1140             # }
1141              
1142             # idify {
1143 9         400 for ($t) {
1144 9         20 s/<[^>]+>//g; # Strip HTML.
1145 9         101 s/&[^;]+;//g; # Strip entities.
1146 9         606 s/^\s+//; s/\s+$//; # Strip white space.
  9         116  
1147 9         98 s/^([^a-zA-Z]+)$/pod$1/; # Prepend "pod" if no valid chars.
1148 9         115 s/^[^a-zA-Z]+//; # First char must be a letter.
1149 9         90 s/[^-a-zA-Z0-9_:.]+/-/g; # All other chars must be valid.
1150 9         149 s/[-:.]+$//; # Strip trailing punctuation.
1151             }
1152             # }
1153              
1154 9         52 return $t;
1155             }
1156             }
1157              
1158              
1159             sub format_fragment_pod_simple_html {
1160 9     9 1 17 my ($self, $section) = @_;
1161              
1162             # From Pod::Simple::HTML 3.28.
1163              
1164             # section_name_tidy {
1165 9         20 $section =~ s/^\s+//;
1166 9         156 $section =~ s/\s+$//;
1167 9         162 $section =~ tr/ /_/;
1168 9         272 $section =~ tr/\x00-\x1F\x80-\x9F//d if 'A' eq chr(65); # drop crazy characters
1169              
1170             #$section = $self->unicode_escape_url($section);
1171             # unicode_escape_url {
1172 9         20 $section =~ s/([^\x00-\xFF])/'('.ord($1).')'/eg;
  0         0  
1173             # Turn char 1234 into "(1234)"
1174             # }
1175              
1176 9 50       22 $section = '_' unless length $section;
1177 9         20 return $section;
1178             # }
1179             }
1180              
1181              
1182 9     9 1 21 sub format_fragment_metacpan { shift->format_fragment_pod_simple_xhtml(@_); }
1183 9     9 1 22 sub format_fragment_sco { shift->format_fragment_pod_simple_html(@_); }
1184              
1185              
1186             sub is_local_module {
1187 59     59 1 109 my ($self, $name) = @_;
1188              
1189 59         163 return ($name =~ $self->local_module_re);
1190             }
1191              
1192             1;
1193              
1194             __END__