File Coverage

blib/lib/Pod/Markdown.pm
Criterion Covered Total %
statement 387 395 97.9
branch 118 128 92.1
condition 22 34 64.7
subroutine 98 100 98.0
pod 11 59 18.6
total 636 716 88.8


line stmt bran cond sub pod time code
1             # vim: set ts=2 sts=2 sw=2 expandtab smarttab:
2             #
3             # This file is part of Pod-Markdown
4             #
5             # This software is copyright (c) 2011 by Randy Stauner.
6             #
7             # This is free software; you can redistribute it and/or modify it under
8             # the same terms as the Perl 5 programming language system itself.
9             #
10 16     16   1213850 use 5.008;
  16         227  
11 16     16   100 use strict;
  16         28  
  16         359  
12 16     16   102 use warnings;
  16         26  
  16         1290  
13              
14             package Pod::Markdown;
15             # git description: v3.100-2-g1399a61
16              
17             our $AUTHORITY = 'cpan:RWSTAUNER';
18             # ABSTRACT: Convert POD to Markdown
19             $Pod::Markdown::VERSION = '3.101';
20 16     16   8817 use Pod::Simple 3.27 (); # detected_encoding and keep_encoding bug fix
  16         426825  
  16         533  
21 16     16   7067 use parent qw(Pod::Simple::Methody);
  16         4028  
  16         93  
22 16     16   19659 use Encode ();
  16         137742  
  16         1948  
23              
24             our %URL_PREFIXES = (
25             sco => 'http://search.cpan.org/perldoc?',
26             metacpan => 'https://metacpan.org/pod/',
27             man => 'http://man.he.net/man',
28             );
29             $URL_PREFIXES{perldoc} = $URL_PREFIXES{metacpan};
30              
31             our $LOCAL_MODULE_RE = qr/^(Local::|\w*?_\w*)/;
32              
33             ## no critic
34             #{
35             our $HAS_HTML_ENTITIES;
36              
37             # Stolen from Pod::Simple::XHTML 3.28. {{{
38              
39             BEGIN {
40 16     16   894 $HAS_HTML_ENTITIES = eval "require HTML::Entities; 1";
41             }
42              
43             my %entities = (
44             q{>} => 'gt',
45             q{<} => 'lt',
46             q{'} => '#39',
47             q{"} => 'quot',
48             q{&} => 'amp',
49             );
50              
51             sub encode_entities {
52 234     234 0 384 my $self = shift;
53 234         359 my $ents = $self->html_encode_chars;
54 234 100       549 return HTML::Entities::encode_entities( $_[0], $ents ) if $HAS_HTML_ENTITIES;
55 49 100       75 if (defined $ents) {
56 48         84 $ents =~ s,(?
57 48         72 $ents =~ s,(?
58             } else {
59 1         5 $ents = join '', keys %entities;
60             }
61 49         66 my $str = $_[0];
62 49   66     227 $str =~ s/([$ents])/'&' . ($entities{$1} || sprintf '#x%X', ord $1) . ';'/ge;
  23         146  
63 49         152 return $str;
64             }
65              
66             # }}}
67              
68             # Add a few very common ones for consistency and readability
69             # (in case HTML::Entities isn't available).
70             %entities = (
71             # Pod::Markdown has always required 5.8 so unicode_to_native will be available.
72             chr(utf8::unicode_to_native(0xA0)) => 'nbsp',
73             chr(utf8::unicode_to_native(0xA9)) => 'copy',
74             %entities
75             );
76              
77             sub __entity_encode_ord_he {
78 11     11   251 my $chr = chr $_[0];
79             # Skip the encode_entities() logic and go straight for the substitution
80             # since we already have the char we know we want replaced.
81             # Both the hash and the function are documented as exportable (so should be reliable).
82 11   33     54 return $HTML::Entities::char2entity{ $chr } || HTML::Entities::num_entity( $chr );
83             }
84             sub __entity_encode_ord_basic {
85 7   33 7   241 return '&' . ($entities{chr $_[0]} || sprintf '#x%X', $_[0]) . ';';
86             }
87              
88             # From HTML::Entities 3.69
89             my $DEFAULT_ENTITY_CHARS = '^\n\r\t !\#\$%\(-;=?-~';
90              
91             #}
92             ## use critic
93              
94             # Use hash for simple "exists" check in `new` (much more accurate than `->can`).
95             my %attributes = map { ($_ => 1) }
96             qw(
97             html_encode_chars
98             match_encoding
99             output_encoding
100             local_module_re
101             local_module_url_prefix
102             man_url_prefix
103             perldoc_url_prefix
104             perldoc_fragment_format
105             markdown_fragment_format
106             include_meta_tags
107             );
108              
109              
110             sub new {
111 302     302 1 173799 my $class = shift;
112 302         807 my %args = @_;
113              
114 302         946 my $self = $class->SUPER::new();
115 302         5005 $self->preserve_whitespace(1);
116 302         2075 $self->nbsp_for_S(1);
117 302         1613 $self->accept_targets(qw( markdown html ));
118              
119             # Default to the global, but allow it to be overwritten in args.
120 302         4904 $self->local_module_re($LOCAL_MODULE_RE);
121              
122 302         1426 for my $type ( qw( perldoc man ) ){
123 604         975 my $attr = $type . '_url_prefix';
124             # Initialize to the alias.
125 604         1278 $self->$attr($type);
126             }
127              
128 302         877 while( my ($attr, $val) = each %args ){
129             # NOTE: Checking exists on a private var means we don't allow Pod::Simple
130             # attributes to be set this way. It's not very consistent, but I think
131             # I'm ok with that for now since there probably aren't many Pod::Simple attributes
132             # being changed besides `output_*` which feel like API rather than attributes.
133             # We'll see.
134             # This is currently backward-compatible as we previously just put the attribute
135             # into the private stash so anything unknown was silently ignored.
136             # We could open this up to `$self->can($attr)` in the future if that seems better
137             # but it tricked me when I was testing a misspelled attribute name
138             # which also happened to be a Pod::Simple method.
139              
140 905 100       3231 exists $attributes{ $attr } or
141             # Provide a more descriptive message than "Can't locate object method".
142             warn("Unknown argument to ${class}->new(): '$attr'"), next;
143              
144             # Call setter.
145 903         1662 $self->$attr($val);
146             }
147              
148             # TODO: call from the setters.
149 302         1501 $self->_prepare_fragment_formats;
150              
151 302         779 return $self;
152             }
153              
154             for my $type ( qw( local_module perldoc man ) ){
155             my $attr = $type . '_url_prefix';
156 16     16   128 no strict 'refs'; ## no critic
  16         33  
  16         85790  
157             *$attr = sub {
158 987     987   12581 my $self = shift;
159 987 100       1503 if (@_) {
160 844   66     3074 $self->{$attr} = $URL_PREFIXES{ $_[0] } || $_[0];
161             }
162             else {
163 143         411 return $self->{$attr};
164             }
165             }
166             }
167              
168             ## Attribute accessors ##
169              
170              
171             sub html_encode_chars {
172 305     305 1 326 my $self = shift;
173 305         412 my $stash = $self->_private;
174              
175             # Setter.
176 305 100       517 if( @_ ){
177             # If false ('', 0, undef), disable.
178 71 50       124 if( !$_[0] ){
179 0         0 delete $stash->{html_encode_chars};
180 0         0 $stash->{encode_amp} = 1;
181 0         0 $stash->{encode_lt} = 1;
182             }
183             else {
184             # Special case boolean '1' to mean "all".
185             # If we have HTML::Entities, undef will use the default.
186             # Without it, we need to specify so that we use the same list (for consistency).
187 71 100       172 $stash->{html_encode_chars} = $_[0] eq '1' ? ($HAS_HTML_ENTITIES ? undef : $DEFAULT_ENTITY_CHARS) : $_[0];
    100          
188              
189             # If [char] doesn't get encoded, we need to do it ourselves.
190 71         139 $stash->{encode_amp} = ($self->encode_entities('&') eq '&');
191 71         2363 $stash->{encode_lt} = ($self->encode_entities('<') eq '<');
192             }
193 71         1539 return;
194             }
195              
196             # Getter.
197 234         354 return $stash->{html_encode_chars};
198             }
199              
200              
201             # I prefer ro-accessors (immutability!) but it can be confusing
202             # to not support the same API as other Pod::Simple classes.
203              
204             # NOTE: Pod::Simple::_accessorize is not a documented public API.
205             # Skip any that have already been defined.
206             __PACKAGE__->_accessorize(grep { !__PACKAGE__->can($_) } keys %attributes);
207              
208             sub _prepare_fragment_formats {
209 302     302   414 my ($self) = @_;
210              
211 302         856 foreach my $attr ( keys %attributes ){
212 3020 100       6272 next unless $attr =~ /^(\w+)_fragment_format/;
213 604         1105 my $type = $1;
214 604         1200 my $format = $self->$attr;
215              
216             # If one was provided.
217 604 100       2949 if( $format ){
218             # If the attribute is a coderef just use it.
219 510 100       1108 next if ref($format) eq 'CODE';
220             }
221             # Else determine a default.
222             else {
223 94 100       170 if( $type eq 'perldoc' ){
224             # Choose a default that matches the destination url.
225 46         75 my $target = $self->perldoc_url_prefix;
226 46         117 foreach my $alias ( qw( metacpan sco ) ){
227 92 100       197 if( $target eq $URL_PREFIXES{ $alias } ){
228 37         53 $format = $alias;
229             }
230             }
231             # This seems like a reasonable fallback.
232 46   100     94 $format ||= 'pod_simple_xhtml';
233             }
234             else {
235 48         66 $format = $type;
236             }
237             }
238              
239             # The short name should become a method name with the prefix prepended.
240 140         200 my $prefix = 'format_fragment_';
241 140         437 $format =~ s/^$prefix//;
242 140 50       509 die "Unknown fragment format '$format'"
243             unless $self->can($prefix . $format);
244              
245             # Save it.
246 140         329 $self->$attr($format);
247             }
248              
249 302         640 return;
250             }
251              
252             ## Backward compatible API ##
253              
254             # For backward compatibility (previously based on Pod::Parser):
255             # While Pod::Simple provides a parse_from_file() method
256             # it's primarily for Pod::Parser compatibility.
257             # When called without an output handle it will print to STDOUT
258             # but the old Pod::Markdown never printed to a handle
259             # so we don't want to start now.
260             sub parse_from_file {
261 10     10 1 17 my ($self, $file) = @_;
262              
263             # TODO: Check that all dependent cpan modules use the Pod::Simple API
264             # then add a deprecation warning here to avoid confusion.
265              
266 10         38 $self->output_string(\($self->{_as_markdown_}));
267 10         2019 $self->parse_file($file);
268             }
269              
270             # Likewise, though Pod::Simple doesn't define this method at all.
271 9     9 0 344 sub parse_from_filehandle { shift->parse_from_file(@_) }
272              
273              
274             ## Document state ##
275              
276             sub _private {
277 9761     9761   11450 my ($self) = @_;
278             $self->{_Pod_Markdown_} ||= {
279 9761   100     25648 indent => 0,
280             stacks => [],
281             states => [{}],
282             link => [],
283             encode_amp => 1,
284             encode_lt => 1,
285             };
286             }
287              
288             sub _increase_indent {
289 67 50   67   108 ++$_[0]->_private->{indent} >= 1
290             or die 'Invalid state: indent < 0';
291             }
292             sub _decrease_indent {
293 67 50   67   88 --$_[0]->_private->{indent} >= 0
294             or die 'Invalid state: indent < 0';
295             }
296              
297             sub _new_stack {
298 1002     1002   1141 push @{ $_[0]->_private->{stacks} }, [];
  1002         1524  
299 1002         1224 push @{ $_[0]->_private->{states} }, {};
  1002         1374  
300             }
301              
302             sub _last_string {
303 18     18   25 $_[0]->_private->{stacks}->[-1][-1];
304             }
305              
306             sub _pop_stack_text {
307 713     713   789 $_[0]->_private->{last_state} = pop @{ $_[0]->_private->{states} };
  713         982  
308 713         878 join '', @{ pop @{ $_[0]->_private->{stacks} } };
  713         734  
  713         920  
309             }
310              
311             sub _stack_state {
312 430     430   604 $_[0]->_private->{states}->[-1];
313             }
314              
315             sub _save {
316 1340     1340   2176 my ($self, $text) = @_;
317 1340         1423 push @{ $self->_private->{stacks}->[-1] }, $text;
  1340         1737  
318             # return $text; # DEBUG
319             }
320              
321             sub _save_line {
322 493     493   727 my ($self, $text) = @_;
323              
324 493         769 $text = $self->_process_escapes($text);
325              
326 493         1211 $self->_save($text . $/);
327             }
328              
329             # For paragraphs, etc.
330             sub _save_block {
331 428     428   630 my ($self, $text) = @_;
332              
333 428         651 $self->_stack_state->{blocks}++;
334              
335 428         784 $self->_save_line($self->_indent($text) . $/);
336             }
337              
338             ## Formatting ##
339              
340             sub _chomp_all {
341 331     331   518 my ($self, $text) = @_;
342 331         1027 1 while chomp $text;
343 331         791 return $text;
344             }
345              
346             sub _indent {
347 471     471   633 my ($self, $text) = @_;
348 471         627 my $level = $self->_private->{indent};
349              
350 471 100       775 if( $level ){
351 34         70 my $indent = ' ' x ($level * 4);
352              
353             # Capture text on the line so that we don't indent blank lines (/^\x20{4}$/).
354 34         183 $text =~ s/^(.+)/$indent$1/mg;
355             }
356              
357 471         1343 return $text;
358             }
359              
360             # as_markdown() exists solely for backward compatibility
361             # and requires having called parse_from_file() to be useful.
362              
363              
364             sub as_markdown {
365 10     10 0 276 my ($parser, %args) = @_;
366 10         11 my @header;
367             # Don't add meta tags again if we've already done it.
368 10 100 100     30 if( $args{with_meta} && !$parser->include_meta_tags ){
369 3         21 @header = $parser->_build_markdown_head;
370             }
371 10         47 return join("\n" x 2, @header, $parser->{_as_markdown_});
372             }
373              
374             sub _build_markdown_head {
375 9     9   12 my $parser = shift;
376 9         10 my $data = $parser->_private;
377             return join "\n",
378 12         55 map { qq![[meta \l$_="$data->{$_}"]]! }
379 9         17 grep { defined $data->{$_} }
  18         38  
380             qw( Title Author );
381             }
382              
383             ## Escaping ##
384              
385             # http://daringfireball.net/projects/markdown/syntax#backslash
386             # Markdown provides backslash escapes for the following characters:
387             #
388             # \ backslash
389             # ` backtick
390             # * asterisk
391             # _ underscore
392             # {} curly braces
393             # [] square brackets
394             # () parentheses
395             # # hash mark
396             # + plus sign
397             # - minus sign (hyphen)
398             # . dot
399             # ! exclamation mark
400              
401             # However some of those only need to be escaped in certain places:
402             # * Backslashes *do* need to be escaped or they may be swallowed by markdown.
403             # * Word-surrounding characters (/[`*_]/) *do* need to be escaped mid-word
404             # because the markdown spec explicitly allows mid-word em*pha*sis.
405             # * I don't actually see anything that curly braces are used for.
406             # * Escaping square brackets is enough to avoid accidentally
407             # creating links and images (so we don't need to escape plain parentheses
408             # or exclamation points as that would generate a lot of unnecesary noise).
409             # Parentheses will be escaped in urls (&end_L) to avoid premature termination.
410             # * We don't need a backslash for every hash mark or every hyphen found mid-word,
411             # just the ones that start a line (likewise for plus and dot).
412             # (Those will all be handled by _escape_paragraph_markdown).
413              
414              
415             # Backslash escape markdown characters to avoid having them interpreted.
416             sub _escape_inline_markdown {
417 459     459   598 local $_ = $_[1];
418              
419             # s/([\\`*_{}\[\]()#+-.!])/\\$1/g; # See comments above.
420 459         921 s/([\\`*_\[\]])/\\$1/g;
421              
422 459         805 return $_;
423             }
424              
425             # Escape markdown characters that would be interpreted
426             # at the start of a line.
427             sub _escape_paragraph_markdown {
428 341     341   471 local $_ = $_[1];
429              
430             # Escape headings, horizontal rules, (unordered) lists, and blockquotes.
431 341         723 s/^([-+#>])/\\$1/mg;
432              
433             # Markdown doesn't support backslash escapes for equal signs
434             # even though they can be used to underline a header.
435             # So use html to escape them to avoid having them interpreted.
436 341         493 s/^([=])/sprintf '&#x%x;', ord($1)/mge;
  1         7  
437              
438             # Escape the dots that would wrongfully create numbered lists.
439 341         450 s/^( (?:>\s+)? \d+ ) (\.\x20)/$1\\$2/xgm;
440              
441 341         517 return $_;
442             }
443              
444              
445             # Additionally Markdown allows inline html so we need to escape things that look like it.
446             # While _some_ Markdown processors handle backslash-escaped html,
447             # [Daring Fireball](http://daringfireball.net/projects/markdown/syntax) states distinctly:
448             # > In HTML, there are two characters that demand special treatment: < and &...
449             # > If you want to use them as literal characters, you must escape them as entities, e.g. <, and &.
450              
451             # It goes on to say:
452             # > Markdown allows you to use these characters naturally,
453             # > taking care of all the necessary escaping for you.
454             # > If you use an ampersand as part of an HTML entity,
455             # > it remains unchanged; otherwise it will be translated into &.
456             # > Similarly, because Markdown supports inline HTML,
457             # > if you use angle brackets as delimiters for HTML tags, Markdown will treat them as such.
458              
459             # In order to only encode the occurrences that require it (something that
460             # could be interpreted as an entity) we escape them all so that we can do the
461             # suffix test later after the string is complete (since we don't know what
462             # strings might come after this one).
463              
464             my %_escape =
465             map {
466             my ($k, $v) = split /:/;
467             # Put the "code" marker before the char instead of after so that it doesn't
468             # get confused as the $2 (which is what requires us to entity-encode it).
469             # ( "XsX", "XcsX", "X(c?)sX" )
470             my ($s, $code, $re) = map { "\0$_$v\0" } '', map { ($_, '('.$_.'?)') } 'c';
471              
472             (
473             $k => $s,
474             $k.'_code' => $code,
475             $k.'_re' => qr/$re/,
476             )
477             }
478             qw( amp:& lt:< );
479              
480             # Make the values of this private var available to the tests.
481 1     1   77 sub __escape_sequences { %_escape }
482              
483              
484             # HTML-entity encode any characters configured by the user.
485             # If that doesn't include [&<] then we escape those chars so we can decide
486             # later if we will entity-encode them or put them back verbatim.
487             sub _encode_or_escape_entities {
488 459     459   516 my $self = $_[0];
489 459         604 my $stash = $self->_private;
490 459         656 local $_ = $_[1];
491              
492 459 100       738 if( $stash->{encode_amp} ){
    50          
493 439 100       651 if( exists($stash->{html_encode_chars}) ){
494             # Escape all amps for later processing.
495             # Pass intermediate strings to entity encoder so that it doesn't
496             # process any of the characters of our escape sequences.
497             # Use -1 to get "as many fields as possible" so that we keep leading and
498             # trailing (possibly empty) fields.
499 38         126 $_ = join $_escape{amp}, map { $self->encode_entities($_) } split /&/, $_, -1;
  57         677  
500             }
501             else {
502 401         699 s/&/$_escape{amp}/g;
503             }
504             }
505             elsif( exists($stash->{html_encode_chars}) ){
506 20         35 $_ = $self->encode_entities($_);
507             }
508              
509             s/
510 459 100       2315 if $stash->{encode_lt};
511              
512 459         770 return $_;
513             }
514              
515             # From Markdown.pl version 1.0.1 line 1172 (_DoAutoLinks).
516             my $EMAIL_MARKER = qr{
517             # < # Opening token is in parent regexp.
518             (?:mailto:)?
519             (
520             [-.\w]+
521             \@
522             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
523             )
524             >
525             }x;
526              
527             # Process any escapes we put in the text earlier,
528             # now that the text is complete (end of a block).
529             sub _process_escapes {
530 493     493   588 my $self = $_[0];
531 493         628 my $stash = $self->_private;
532 493         726 local $_ = $_[1];
533              
534             # The patterns below are taken from Markdown.pl 1.0.1 _EncodeAmpsAndAngles().
535             # In this case we only want to encode the ones that Markdown won't.
536             # This is overkill but produces nicer looking text (less escaped entities).
537             # If it proves insufficent then we'll just encode them all.
538              
539             # $1: If the escape was in a code sequence, simply replace the original.
540             # $2: If the unescaped value would be followed by characters
541             # that could be interpreted as html, entity-encode it.
542             # else: The character is safe to leave bare.
543              
544             # Neither currently allows $2 to contain '0' so bool tests are sufficient.
545              
546 493 100       784 if( $stash->{encode_amp} ){
547             # Encode & if succeeded by chars that look like an html entity.
548 460         1324 s,$_escape{amp_re}((?:#?[xX]?(?:[0-9a-fA-F]+|\w+);)?),
549 90 100       400 $1 ? '&'.$2 : $2 ? '&'.$2 : '&',egos;
    100          
550             }
551              
552 493 100       845 if( $stash->{encode_lt} ){
553             # Encode < if succeeded by chars that look like an html tag.
554             # Leave email addresses () for Markdown to process.
555 460         1827 s,$_escape{lt_re}((?=$EMAIL_MARKER)|(?:[a-z/?\$!])?),
556 72 100       332 $1 ? '<'.$2 : $2 ? '<'.$2 : '<',egos;
    100          
557             }
558              
559 493         851 return $_;
560             }
561              
562              
563             ## Parsing ##
564              
565             sub handle_text {
566 542     542 0 5467 my $self = $_[0];
567 542         744 my $stash = $self->_private;
568 542         889 local $_ = $_[1];
569              
570             # Unless we're in a code span, verbatim block, or formatted region.
571 542 100       908 unless( $stash->{no_escape} ){
572              
573             # We could, in theory, alter what gets escaped according to context
574             # (for example, escape square brackets (but not parens) inside link text).
575             # The markdown produced might look slightly nicer but either way you're
576             # at the whim of the markdown processor to interpret things correctly.
577             # For now just escape everything.
578              
579             # Don't let literal characters be interpreted as markdown.
580 459         746 $_ = $self->_escape_inline_markdown($_);
581              
582             # Entity-encode (or escape for later processing) necessary/desired chars.
583 459         850 $_ = $self->_encode_or_escape_entities($_);
584              
585             }
586             # If this _is_ a code section, do limited/specific handling.
587             else {
588             # Always escaping these chars ensures that we won't mangle the text
589             # in the unlikely event that a sequence matching our escape occurred in the
590             # input stream (since we're going to escape it and then unescape it).
591 83 100       327 s/&/$_escape{amp_code}/gos if $stash->{encode_amp};
592 83 100       254 s/{encode_lt};
593             }
594              
595 542         911 $self->_save($_);
596             }
597              
598             sub start_Document {
599 289     289 0 86474 my ($self) = @_;
600 289         580 $self->_new_stack;
601             }
602              
603             sub end_Document {
604 289     289 0 18596 my ($self) = @_;
605 289         616 $self->_check_search_header;
606 289         358 my $end = pop @{ $self->_private->{stacks} };
  289         389  
607              
608 289 50       341 @{ $self->_private->{stacks} } == 0
  289         384  
609             or die 'Document ended with stacks remaining';
610              
611 289         776 my @doc = $self->_chomp_all(join('', @$end)) . $/;
612              
613 289 100       713 if( $self->include_meta_tags ){
614 6         38 unshift @doc, $self->_build_markdown_head, ($/ x 2);
615             }
616              
617 289 100       1669 if( my $encoding = $self->_get_output_encoding ){
618             # Do the check outside the loop(s) for efficiency.
619 55 100       396 my $ents = $HAS_HTML_ENTITIES ? \&__entity_encode_ord_he : \&__entity_encode_ord_basic;
620             # Iterate indices to avoid copying large strings.
621 55         130 for my $i ( 0 .. $#doc ){
622 55         61 print { $self->{output_fh} } Encode::encode($encoding, $doc[$i], $ents);
  55         151  
623             }
624             }
625             else {
626 234         1048 print { $self->{output_fh} } @doc;
  234         758  
627             }
628             }
629              
630             sub _get_output_encoding {
631 289     289   406 my ($self) = @_;
632              
633             # If 'match_encoding' is set we need to return an encoding.
634             # If pod has no =encoding, Pod::Simple will guess if it sees a high-bit char.
635             # If there are no high-bit chars, encoding is undef.
636             # Use detected_encoding() rather than encoding() because if Pod::Simple
637             # can't use whatever encoding was specified, we probably can't either.
638             # Fallback to 'o_e' if no match is found. This gives the user the choice,
639             # since otherwise there would be no reason to specify 'o_e' *and* 'm_e'.
640             # Fallback to UTF-8 since it is a reasonable default these days.
641              
642 289 100 100     498 return $self->detected_encoding || $self->output_encoding || 'UTF-8'
643             if $self->match_encoding;
644              
645             # If output encoding wasn't specified, return false.
646 277         1357 return $self->output_encoding;
647             }
648              
649             ## Blocks ##
650              
651             sub start_Verbatim {
652 17     17 0 4536 my ($self) = @_;
653 17         47 $self->_new_stack;
654 17         33 $self->_private->{no_escape} = 1;
655             }
656              
657             sub end_Verbatim {
658 17     17 0 157 my ($self) = @_;
659              
660 17         34 my $text = $self->_pop_stack_text;
661              
662 17         41 $text = $self->_indent_verbatim($text);
663              
664 17         60 $self->_private->{no_escape} = 0;
665              
666             # Verbatim blocks do not generate a separate "Para" event.
667 17         35 $self->_save_block($text);
668             }
669              
670             sub _indent_verbatim {
671 17     17   35 my ($self, $paragraph) = @_;
672              
673             # NOTE: Pod::Simple expands the tabs for us (as suggested by perlpodspec).
674             # Pod::Simple also has a 'strip_verbatim_indent' attribute
675             # but it doesn't sound like it gains us anything over this method.
676              
677             # POD verbatim can start with any number of spaces (or tabs)
678             # markdown should be 4 spaces (or a tab)
679             # so indent any paragraphs so that all lines start with at least 4 spaces
680 17         64 my @lines = split /\n/, $paragraph;
681 17         53 my $indent = ' ' x 4;
682 17         36 foreach my $line ( @lines ){
683 32 100       114 next unless $line =~ m/^( +)/;
684             # find the smallest indentation
685 31 100       104 $indent = $1 if length($1) < length($indent);
686             }
687 17 100       40 if( (my $smallest = length($indent)) < 4 ){
688             # invert to get what needs to be prepended
689 11         27 $indent = ' ' x (4 - $smallest);
690              
691             # Prepend indent to each line.
692             # We could check /\S/ to only indent non-blank lines,
693             # but it's backward compatible to respect the whitespace.
694             # Additionally, both pod and markdown say they ignore blank lines
695             # so it shouldn't hurt to leave them in.
696 11 100       24 $paragraph = join "\n", map { length($_) ? $indent . $_ : '' } @lines;
  23         89  
697             }
698              
699 17         42 return $paragraph;
700             }
701              
702             sub start_Para {
703 341     341 0 120645 $_[0]->_new_stack;
704             }
705              
706             sub end_Para {
707 341     341 0 2999 my ($self) = @_;
708 341         523 my $text = $self->_pop_stack_text;
709              
710 341         662 $text = $self->_escape_paragraph_markdown($text);
711              
712 341         553 $self->_save_block($text);
713             }
714              
715              
716             ## Headings ##
717              
718 40     40 0 5247 sub start_head1 { $_[0]->_start_head(1) }
719 40     40 0 372 sub end_head1 { $_[0]->_end_head(1) }
720 11     11 0 2744 sub start_head2 { $_[0]->_start_head(2) }
721 11     11 0 122 sub end_head2 { $_[0]->_end_head(2) }
722 1     1 0 336 sub start_head3 { $_[0]->_start_head(3) }
723 1     1 0 11 sub end_head3 { $_[0]->_end_head(3) }
724 0     0 0 0 sub start_head4 { $_[0]->_start_head(4) }
725 0     0 0 0 sub end_head4 { $_[0]->_end_head(4) }
726              
727             sub _check_search_header {
728 341     341   473 my ($self) = @_;
729             # Save the text since the last heading if we want it for metadata.
730 341 100       517 if( my $last = $self->_private->{search_header} ){
731 18         32 for( $self->_private->{$last} = $self->_last_string ){
732 18         40 s/\A\s+//;
733 18         84 s/\s+\z//;
734             }
735             }
736             }
737             sub _start_head {
738 52     52   95 my ($self) = @_;
739 52         116 $self->_check_search_header;
740 52         98 $self->_new_stack;
741             }
742              
743             sub _end_head {
744 52     52   82 my ($self, $num) = @_;
745 52         94 my $h = '#' x $num;
746              
747 52         98 my $text = $self->_pop_stack_text;
748             $self->_private->{search_header} =
749 52 100       194 $text =~ /NAME/ ? 'Title'
    100          
750             : $text =~ /AUTHOR/ ? 'Author'
751             : undef;
752              
753             # TODO: option for $h suffix
754             # TODO: put a name="" if $self->{embed_anchor_tags}; ?
755             # https://rt.cpan.org/Ticket/Display.html?id=57776
756 52         155 $self->_save_block(join(' ', $h, $text));
757             }
758              
759             ## Lists ##
760              
761             # With Pod::Simple->parse_empty_lists(1) there could be an over_empty event,
762             # but what would you do with that?
763              
764             sub _start_list {
765 22     22   37 my ($self) = @_;
766 22         55 $self->_new_stack;
767              
768             # Nest again b/c start_item will pop this to look for preceding content.
769 22         49 $self->_increase_indent;
770 22         39 $self->_new_stack;
771             }
772              
773             sub _end_list {
774 22     22   35 my ($self) = @_;
775 22         45 $self->_handle_between_item_content;
776              
777             # Finish the list.
778              
779             # All the child elements should be blocks,
780             # but don't end with a double newline.
781 22         33 my $text = $self->_chomp_all($self->_pop_stack_text);
782              
783 22         76 $_[0]->_save_line($text . $/);
784             }
785              
786             sub _handle_between_item_content {
787 65     65   82 my ($self) = @_;
788              
789             # This might be empty (if the list item had no additional content).
790 65 100       114 if( my $text = $self->_pop_stack_text ){
791             # Else it's a sub-document.
792             # If there are blocks we need to separate with blank lines.
793 21 100       43 if( $self->_private->{last_state}->{blocks} ){
794 16         39 $text = $/ . $text;
795             }
796             # If not, we can condense the text.
797             # In this module's history there was a patch contributed to specifically
798             # produce "huddled" lists so we'll try to maintain that functionality.
799             else {
800 5         9 $text = $self->_chomp_all($text) . $/;
801             }
802 21         47 $self->_save($text)
803             }
804              
805 65         119 $self->_decrease_indent;
806             }
807              
808             sub _start_item {
809 43     43   65 my ($self) = @_;
810 43         85 $self->_handle_between_item_content;
811 43         81 $self->_new_stack;
812             }
813              
814             sub _end_item {
815 43     43   66 my ($self, $marker) = @_;
816 43         61 my $text = $self->_pop_stack_text;
817 43 100 66     215 $self->_save_line($self->_indent($marker .
818             # Add a space only if there is text after the marker.
819             (defined($text) && length($text) ? ' ' . $text : '')
820             ));
821              
822             # Store any possible contents in a new stack (like a sub-document).
823 43         92 $self->_increase_indent;
824 43         54 $self->_new_stack;
825             }
826              
827 7     7 0 1299 sub start_over_bullet { $_[0]->_start_list }
828 7     7 0 527 sub end_over_bullet { $_[0]->_end_list }
829              
830 18     18 0 4712 sub start_item_bullet { $_[0]->_start_item }
831 18     18 0 140 sub end_item_bullet { $_[0]->_end_item('-') }
832              
833 9     9 0 1958 sub start_over_number { $_[0]->_start_list }
834 9     9 0 991 sub end_over_number { $_[0]->_end_list }
835              
836             sub start_item_number {
837 18     18 0 4638 $_[0]->_start_item;
838             # It seems like this should be a stack,
839             # but from testing it appears that the corresponding 'end' event
840             # comes right after the text (it doesn't surround any embedded content).
841             # See t/nested.t which shows start-item, text, end-item, para, start-item....
842 18         35 $_[0]->_private->{item_number} = $_[1]->{number};
843             }
844              
845             sub end_item_number {
846 18     18 0 148 my ($self) = @_;
847 18         27 $self->_end_item($self->_private->{item_number} . '.');
848             }
849              
850             # Markdown doesn't support definition lists
851             # so do regular (unordered) lists with indented paragraphs.
852 6     6 0 1357 sub start_over_text { $_[0]->_start_list }
853 6     6 0 574 sub end_over_text { $_[0]->_end_list }
854              
855 7     7 0 1761 sub start_item_text { $_[0]->_start_item }
856 7     7 0 61 sub end_item_text { $_[0]->_end_item('-')}
857              
858              
859             # perlpodspec equates an over/back region with no items to a blockquote.
860             sub start_over_block {
861             # NOTE: We don't actually need to indent for a blockquote.
862 3     3 0 719 $_[0]->_new_stack;
863             }
864              
865             sub end_over_block {
866 3     3 0 406 my ($self) = @_;
867              
868             # Chomp first to avoid prefixing a blank line with a `>`.
869 3         7 my $text = $self->_chomp_all($self->_pop_stack_text);
870              
871             # NOTE: Paragraphs will already be escaped.
872              
873             # I don't really like either of these implementations
874             # but the join/map/split seems a little better and benches a little faster.
875             # You would lose the last newline but we've already chomped.
876             #$text =~ s{^(.)?}{'>' . (defined($1) && length($1) ? (' ' . $1) : '')}mge;
877 3 100       38 $text = join $/, map { length($_) ? '> ' . $_ : '>' } split qr-$/-, $text;
  22         48  
878              
879 3         12 $self->_save_block($text);
880             }
881              
882             ## Custom Formats ##
883              
884             sub start_for {
885 10     10 0 2354 my ($self, $attr) = @_;
886 10         26 $self->_new_stack;
887              
888 10 100       30 if( $attr->{target} eq 'html' ){
889             # Use another stack so we can indent
890             # (not syntactily necessary but seems appropriate).
891 2         7 $self->_new_stack;
892 2         10 $self->_increase_indent;
893 2         46 $self->_private->{no_escape} = 1;
894             # Mark this so we know to undo it.
895 2         6 $self->_stack_state->{for_html} = 1;
896             }
897             }
898              
899             sub end_for {
900 10     10 0 589 my ($self) = @_;
901             # Data gets saved as a block (which will handle indents),
902             # but if there was html we'll alter this, so chomp and save a block again.
903 10         14 my $text = $self->_chomp_all($self->_pop_stack_text);
904              
905 10 100       14 if( $self->_private->{last_state}->{for_html} ){
906 2         5 $self->_private->{no_escape} = 0;
907             # Save it to the next stack up so we can pop it again (we made two stacks).
908 2         6 $self->_save($text);
909 2         6 $self->_decrease_indent;
910 2         4 $text = join "\n", '
', $self->_chomp_all($self->_pop_stack_text), '
';
911             }
912              
913 10         18 $self->_save_block($text);
914             }
915              
916             # Data events will be emitted for any formatted regions that have been enabled
917             # (by default, `markdown` and `html`).
918              
919             sub start_Data {
920 5     5 0 396 my ($self) = @_;
921             # TODO: limit this to what's in attr?
922 5         11 $self->_private->{no_escape}++;
923 5         11 $self->_new_stack;
924             }
925              
926             sub end_Data {
927 5     5 0 46 my ($self) = @_;
928 5         10 my $text = $self->_pop_stack_text;
929 5         8 $self->_private->{no_escape}--;
930 5         9 $self->_save_block($text);
931             }
932              
933             ## Codes ##
934              
935 94     94 0 735 sub start_B { $_[0]->_save('**') }
936 47     47 0 425 sub end_B { $_[0]->start_B() }
937              
938 36     36 0 281 sub start_I { $_[0]->_save('_') }
939 18     18 0 159 sub end_I { $_[0]->start_I() }
940              
941             sub start_C {
942 61     61 0 864 my ($self) = @_;
943 61         133 $self->_new_stack;
944 61         98 $self->_private->{no_escape}++;
945             }
946              
947             sub end_C {
948 61     61 0 481 my ($self) = @_;
949 61         101 $self->_private->{no_escape}--;
950 61         149 $self->_save( $self->_wrap_code_span($self->_pop_stack_text) );
951             }
952              
953             # Use code spans for F<>.
954 4     4 0 66 sub start_F { shift->start_C(@_); }
955 4     4 0 41 sub end_F { shift ->end_C(@_); }
956              
957             sub start_L {
958 91     91 0 1236 my ($self, $flags) = @_;
959 91         164 $self->_new_stack;
960 91         122 push @{ $self->_private->{link} }, $flags;
  91         134  
961             }
962              
963             sub end_L {
964 91     91 0 710 my ($self) = @_;
965 91 50       96 my $flags = pop @{ $self->_private->{link} }
  91         127  
966             or die 'Invalid state: link end with no link start';
967              
968 91         121 my ($type, $to, $section) = @{$flags}{qw( type to section )};
  91         199  
969              
970 91 50       271 my $url = (
    100          
    100          
971             $type eq 'url' ? $to
972             : $type eq 'man' ? $self->format_man_url($to, $section)
973             : $type eq 'pod' ? $self->format_perldoc_url($to, $section)
974             : undef
975             );
976              
977 91         169 my $text = $self->_pop_stack_text;
978              
979             # NOTE: I don't think the perlpodspec says what to do with L<|blah>
980             # but it seems like a blank link text just doesn't make sense
981 91 100       182 if( !length($text) ){
982 4 100       9 $text =
    100          
983             $section ?
984             $to ? sprintf('"%s" in %s', $section, $to)
985             : ('"' . $section . '"')
986             : $to;
987             }
988              
989             # FIXME: What does Pod::Simple::X?HTML do for this?
990             # if we don't know how to handle the url just print the pod back out
991 91 50       245 if (!$url) {
992 0         0 $self->_save(sprintf 'L<%s>', $flags->{raw});
993 0         0 return;
994             }
995              
996             # In the url we need to escape quotes and parentheses lest markdown
997             # break the url (cut it short and/or wrongfully interpret a title).
998              
999             # Backslash escapes do not work for the space and quotes.
1000             # URL-encoding the space is not sufficient
1001             # (the quotes confuse some parsers and produce invalid html).
1002             # I've arbitratily chosen HTML encoding to hide them from markdown
1003             # while mangling the url as litle as possible.
1004 91         553 $url =~ s/([ '"])/sprintf '&#x%x;', ord($1)/ge;
  11         55  
1005              
1006             # We also need to double any backslashes that may be present
1007             # (lest they be swallowed up) and stop parens from breaking the url.
1008 91         326 $url =~ s/([\\()])/\\$1/g;
1009              
1010             # TODO: put section name in title if not the same as $text
1011 91         455 $self->_save('[' . $text . '](' . $url . ')');
1012             }
1013              
1014             sub start_X {
1015 1     1 0 16 $_[0]->_new_stack;
1016             }
1017              
1018             sub end_X {
1019 1     1 0 10 my ($self) = @_;
1020 1         3 my $text = $self->_pop_stack_text;
1021             # TODO: mangle $text?
1022             # TODO: put if configured
1023             }
1024              
1025             # A code span can be delimited by multiple backticks (and a space)
1026             # similar to pod codes (C<< code >>), so ensure we use a big enough
1027             # delimiter to not have it broken by embedded backticks.
1028             sub _wrap_code_span {
1029 61     61   113 my ($self, $arg) = @_;
1030 61         105 my $longest = 0;
1031 61         191 while( $arg =~ /([`]+)/g ){
1032 5         12 my $len = length($1);
1033 5 100       21 $longest = $len if $longest < $len;
1034             }
1035 61         127 my $delim = '`' x ($longest + 1);
1036 61 100       131 my $pad = $longest > 0 ? ' ' : '';
1037 61         215 return $delim . $pad . $arg . $pad . $delim;
1038             }
1039              
1040             ## Link Formatting (TODO: Move this to another module) ##
1041              
1042              
1043             sub format_man_url {
1044 6     6 1 8 my ($self, $to) = @_;
1045 6         13 my ($page, $part) = ($to =~ /^ ([^(]+) (?: \( (\S+) \) )? /x);
1046 6   50     99 return $self->man_url_prefix . ($part || 1) . '/' . ($page || $to);
      33        
1047             }
1048              
1049              
1050             sub format_perldoc_url {
1051 68     68 1 141 my ($self, $name, $section) = @_;
1052              
1053 68   66     187 my $url_prefix = defined($name)
1054             && $self->is_local_module($name)
1055             && $self->local_module_url_prefix
1056             || $self->perldoc_url_prefix;
1057              
1058 68         94 my $url = '';
1059              
1060             # If the link is to another module (external link).
1061 68 100       125 if ($name) {
1062 57         1155 $url = $url_prefix . $name;
1063             }
1064              
1065             # See https://rt.cpan.org/Ticket/Display.html?id=57776
1066             # for a discussion on the need to mangle the section.
1067 68 100       641 if ($section){
1068              
1069 42 100       987 my $method = $url
1070             # If we already have a prefix on the url it's external.
1071             ? $self->perldoc_fragment_format
1072             # Else an internal link points to this markdown doc.
1073             : $self->markdown_fragment_format;
1074              
1075 42 100       250 $method = 'format_fragment_' . $method
1076             unless ref($method);
1077              
1078             {
1079             # Set topic to enable code refs to be simple.
1080 42         50 local $_ = $section;
  42         54  
1081 42         89 $section = $self->$method($section);
1082             }
1083              
1084 42         110 $url .= '#' . $section;
1085             }
1086              
1087 68         369 return $url;
1088             }
1089              
1090              
1091             # TODO: simple, pandoc, etc?
1092              
1093             sub format_fragment_markdown {
1094 3     3 1 7 my ($self, $section) = @_;
1095              
1096             # If this is an internal link (to another section in this doc)
1097             # we can't be sure what the heading id's will look like
1098             # (it depends on what is rendering the markdown to html)
1099             # but we can try to follow popular conventions.
1100              
1101             # http://johnmacfarlane.net/pandoc/demo/example9/pandocs-markdown.html#header-identifiers-in-html-latex-and-context
1102             #$section =~ s/(?![-_.])[[:punct:]]//g;
1103             #$section =~ s/\s+/-/g;
1104 3         6 $section =~ s/\W+/-/g;
1105 3         104 $section =~ s/-+$//;
1106 3         5 $section =~ s/^-+//;
1107 3         9 $section = lc $section;
1108             #$section =~ s/^[^a-z]+//;
1109 3   50     7 $section ||= 'section';
1110              
1111 3         5 return $section;
1112             }
1113              
1114              
1115             {
1116             # From Pod::Simple::XHTML 3.28.
1117             # The strings gets passed through encode_entities() before idify().
1118             # If we don't do it here the substitutions below won't operate consistently.
1119              
1120             sub format_fragment_pod_simple_xhtml {
1121 9     9 1 11 my ($self, $t) = @_;
1122              
1123             # encode_entities {
1124             # We need to use the defaults in case html_encode_chars has been customized
1125             # (since the purpose is to match what external sources are doing).
1126              
1127 9         18 local $self->_private->{html_encode_chars};
1128 9         21 $t = $self->encode_entities($t);
1129             # }
1130              
1131             # idify {
1132 9         286 for ($t) {
1133 9         15 s/<[^>]+>//g; # Strip HTML.
1134 9         83 s/&[^;]+;//g; # Strip entities.
1135 9         73 s/^\s+//; s/\s+$//; # Strip white space.
  9         91  
1136 9         79 s/^([^a-zA-Z]+)$/pod$1/; # Prepend "pod" if no valid chars.
1137 9         95 s/^[^a-zA-Z]+//; # First char must be a letter.
1138 9         72 s/[^-a-zA-Z0-9_:.]+/-/g; # All other chars must be valid.
1139 9         118 s/[-:.]+$//; # Strip trailing punctuation.
1140             }
1141             # }
1142              
1143 9         43 return $t;
1144             }
1145             }
1146              
1147              
1148             sub format_fragment_pod_simple_html {
1149 9     9 1 13 my ($self, $section) = @_;
1150              
1151             # From Pod::Simple::HTML 3.28.
1152              
1153             # section_name_tidy {
1154 9         16 $section =~ s/^\s+//;
1155 9         128 $section =~ s/\s+$//;
1156 9         130 $section =~ tr/ /_/;
1157 9         220 $section =~ tr/\x00-\x1F\x80-\x9F//d if 'A' eq chr(65); # drop crazy characters
1158              
1159             #$section = $self->unicode_escape_url($section);
1160             # unicode_escape_url {
1161 9         18 $section =~ s/([^\x00-\xFF])/'('.ord($1).')'/eg;
  0         0  
1162             # Turn char 1234 into "(1234)"
1163             # }
1164              
1165 9 50       19 $section = '_' unless length $section;
1166 9         17 return $section;
1167             # }
1168             }
1169              
1170              
1171 9     9 1 19 sub format_fragment_metacpan { shift->format_fragment_pod_simple_xhtml(@_); }
1172 9     9 1 19 sub format_fragment_sco { shift->format_fragment_pod_simple_html(@_); }
1173              
1174              
1175             sub is_local_module {
1176 57     57 1 82 my ($self, $name) = @_;
1177              
1178 57         113 return ($name =~ $self->local_module_re);
1179             }
1180              
1181             1;
1182              
1183             __END__