File Coverage

blib/lib/Pod/Markdown.pm

Criterion	Covered	Total	%
statement	395	403	98.0
branch	124	134	92.5
condition	33	43	76.7
subroutine	99	101	98.0
pod	11	59	18.6
total	662	740	89.4

line	stmt	bran	cond	sub	pod	time	code
1							# vim: set ts=2 sts=2 sw=2 expandtab smarttab:
2							#
3							# This file is part of Pod-Markdown
4							#
5							# This software is copyright (c) 2011 by Randy Stauner.
6							#
7							# This is free software; you can redistribute it and/or modify it under
8							# the same terms as the Perl 5 programming language system itself.
9							#
10	16			16		1448355	use 5.008;
	16					240
11	16			16		113	use strict;
	16					31
	16					393
12	16			16		94	use warnings;
	16					36
	16					1767
13
14							package Pod::Markdown;
15							# git description: v3.200-4-gd31d626
16
17							our $AUTHORITY = 'cpan:RWSTAUNER';
18							# ABSTRACT: Convert POD to Markdown
19							$Pod::Markdown::VERSION = '3.300';
20	16			16		10314	use Pod::Simple 3.27 (); # detected_encoding and keep_encoding bug fix
	16					510429
	16					596
21	16			16		8333	use parent qw(Pod::Simple::Methody);
	16					4635
	16					121
22	16			16		21769	use Encode ();
	16					160433
	16					399
23	16			16		6872	use URI::Escape ();
	16					24173
	16					2481
24
25							our %URL_PREFIXES = (
26							sco => 'http://search.cpan.org/perldoc?',
27							metacpan => 'https://metacpan.org/pod/',
28							man => 'http://man.he.net/man',
29							);
30							$URL_PREFIXES{perldoc} = $URL_PREFIXES{metacpan};
31
32							our $LOCAL_MODULE_RE = qr/^(Local::\|\w?_\w)/;
33
34							## no critic
35							#{
36							our $HAS_HTML_ENTITIES;
37
38							# Stolen from Pod::Simple::XHTML 3.28. {{{
39
40							BEGIN {
41	16			16		972	$HAS_HTML_ENTITIES = eval "require HTML::Entities; 1";
42							}
43
44							my %entities = (
45							q{>} => 'gt',
46							q{<} => 'lt',
47							q{'} => '#39',
48							q{"} => 'quot',
49							q{&} => 'amp',
50							);
51
52							sub encode_entities {
53	234			234	0	455	my $self = shift;
54	234					427	my $ents = $self->html_encode_chars;
55	234	100				690	return HTML::Entities::encode_entities( $_[0], $ents ) if $HAS_HTML_ENTITIES;
56	49	100				91	if (defined $ents) {
57	48					110	$ents =~ s,(?
58	48					97	$ents =~ s,(?
59							} else {
60	1					6	$ents = join '', keys %entities;
61							}
62	49					74	my $str = $_[0];
63	49		66			290	$str =~ s/([$ents])/'&' . ($entities{$1} \|\| sprintf '#x%X', ord $1) . ';'/ge;
	23					160
64	49					174	return $str;
65							}
66
67							# }}}
68
69							# Add a few very common ones for consistency and readability
70							# (in case HTML::Entities isn't available).
71							%entities = (
72							# Pod::Markdown has always required 5.8 so unicode_to_native will be available.
73							chr(utf8::unicode_to_native(0xA0)) => 'nbsp',
74							chr(utf8::unicode_to_native(0xA9)) => 'copy',
75							%entities
76							);
77
78							sub __entity_encode_ord_he {
79	11			11		295	my $chr = chr $_[0];
80							# Skip the encode_entities() logic and go straight for the substitution
81							# since we already have the char we know we want replaced.
82							# Both the hash and the function are documented as exportable (so should be reliable).
83	11		33			57	return $HTML::Entities::char2entity{ $chr } \|\| HTML::Entities::num_entity( $chr );
84							}
85							sub __entity_encode_ord_basic {
86	7		33	7		327	return '&' . ($entities{chr $_[0]} \|\| sprintf '#x%X', $_[0]) . ';';
87							}
88
89							# From HTML::Entities 3.69
90							my $DEFAULT_ENTITY_CHARS = '^\n\r\t !\#\$%\(-;=?-~';
91
92							#}
93							## use critic
94
95							# Use hash for simple "exists" check in `new` (much more accurate than `->can`).
96							my %attributes = map { ($_ => 1) }
97							qw(
98							html_encode_chars
99							match_encoding
100							output_encoding
101							local_module_re
102							local_module_url_prefix
103							man_url_prefix
104							perldoc_url_prefix
105							perldoc_fragment_format
106							markdown_fragment_format
107							include_meta_tags
108							escape_url
109							);
110
111
112							sub new {
113	305			305	1	220551	my $class = shift;
114	305					988	my %args = @_;
115
116	305					1069	my $self = $class->SUPER::new();
117	305					6286	$self->preserve_whitespace(1);
118	305					2552	$self->nbsp_for_S(1);
119	305					1926	$self->accept_targets(qw( markdown html ));
120	305					5908	$self->escape_url(1);
121
122							# Default to the global, but allow it to be overwritten in args.
123	305					1968	$self->local_module_re($LOCAL_MODULE_RE);
124
125	305					1641	for my $type ( qw( perldoc man ) ){
126	610					1156	my $attr = $type . '_url_prefix';
127							# Initialize to the alias.
128	610					1516	$self->$attr($type);
129							}
130
131	305					1066	while( my ($attr, $val) = each %args ){
132							# NOTE: Checking exists on a private var means we don't allow Pod::Simple
133							# attributes to be set this way. It's not very consistent, but I think
134							# I'm ok with that for now since there probably aren't many Pod::Simple attributes
135							# being changed besides `output_*` which feel like API rather than attributes.
136							# We'll see.
137							# This is currently backward-compatible as we previously just put the attribute
138							# into the private stash so anything unknown was silently ignored.
139							# We could open this up to `$self->can($attr)` in the future if that seems better
140							# but it tricked me when I was testing a misspelled attribute name
141							# which also happened to be a Pod::Simple method.
142
143	910	100				3903	exists $attributes{ $attr } or
144							# Provide a more descriptive message than "Can't locate object method".
145							warn("Unknown argument to ${class}->new(): '$attr'"), next;
146
147							# Call setter.
148	908					2037	$self->$attr($val);
149							}
150
151							# TODO: call from the setters.
152	305					1888	$self->_prepare_fragment_formats;
153
154	305	100	100			647	if(defined $self->local_module_url_prefix && $self->local_module_url_prefix eq '' && !$self->escape_url) {
			100
155	1					19	warn("turning escape_url with an empty local_module_url_prefix is not recommended as relative URLs could be confused for IPv6 addresses");
156							}
157
158	305					1030	return $self;
159							}
160
161							for my $type ( qw( local_module perldoc man ) ){
162							my $attr = $type . '_url_prefix';
163	16			16		164	no strict 'refs'; ## no critic
	16					43
	16					100219
164							*$attr = sub {
165	1325			1325		17660	my $self = shift;
166	1325	100				2504	if (@_) {
167	853		100			3741	$self->{$attr} = $URL_PREFIXES{ $_[0] } \|\| $_[0];
168							}
169							else {
170	472					1496	return $self->{$attr};
171							}
172							}
173							}
174
175							## Attribute accessors ##
176
177
178							sub html_encode_chars {
179	305			305	1	410	my $self = shift;
180	305					479	my $stash = $self->_private;
181
182							# Setter.
183	305	100				622	if( @_ ){
184							# If false ('', 0, undef), disable.
185	71	50				157	if( !$_[0] ){
186	0					0	delete $stash->{html_encode_chars};
187	0					0	$stash->{encode_amp} = 1;
188	0					0	$stash->{encode_lt} = 1;
189							}
190							else {
191							# Special case boolean '1' to mean "all".
192							# If we have HTML::Entities, undef will use the default.
193							# Without it, we need to specify so that we use the same list (for consistency).
194	71	100				190	$stash->{html_encode_chars} = $_[0] eq '1' ? ($HAS_HTML_ENTITIES ? undef : $DEFAULT_ENTITY_CHARS) : $_[0];
		100
195
196							# If [char] doesn't get encoded, we need to do it ourselves.
197	71					149	$stash->{encode_amp} = ($self->encode_entities('&') eq '&');
198	71					2735	$stash->{encode_lt} = ($self->encode_entities('<') eq '<');
199							}
200	71					1821	return;
201							}
202
203							# Getter.
204	234					423	return $stash->{html_encode_chars};
205							}
206
207
208							# I prefer ro-accessors (immutability!) but it can be confusing
209							# to not support the same API as other Pod::Simple classes.
210
211							# NOTE: Pod::Simple::_accessorize is not a documented public API.
212							# Skip any that have already been defined.
213							__PACKAGE__->_accessorize(grep { !__PACKAGE__->can($_) } keys %attributes);
214
215							sub _prepare_fragment_formats {
216	305			305		501	my ($self) = @_;
217
218	305					969	foreach my $attr ( keys %attributes ){
219	3355	100				8492	next unless $attr =~ /^(\w+)_fragment_format/;
220	610					1407	my $type = $1;
221	610					1437	my $format = $self->$attr;
222
223							# If one was provided.
224	610	100				3646	if( $format ){
225							# If the attribute is a coderef just use it.
226	510	100				1331	next if ref($format) eq 'CODE';
227							}
228							# Else determine a default.
229							else {
230	100	100				210	if( $type eq 'perldoc' ){
231							# Choose a default that matches the destination url.
232	49					98	my $target = $self->perldoc_url_prefix;
233	49					100	foreach my $alias ( qw( metacpan sco ) ){
234	98	100				251	if( $target eq $URL_PREFIXES{ $alias } ){
235	40					71	$format = $alias;
236							}
237							}
238							# This seems like a reasonable fallback.
239	49		100			139	$format \|\|= 'pod_simple_xhtml';
240							}
241							else {
242	51					104	$format = $type;
243							}
244							}
245
246							# The short name should become a method name with the prefix prepended.
247	146					203	my $prefix = 'format_fragment_';
248	146					586	$format =~ s/^$prefix//;
249	146	50				613	die "Unknown fragment format '$format'"
250							unless $self->can($prefix . $format);
251
252							# Save it.
253	146					378	$self->$attr($format);
254							}
255
256	305					609	return;
257							}
258
259							## Backward compatible API ##
260
261							# For backward compatibility (previously based on Pod::Parser):
262							# While Pod::Simple provides a parse_from_file() method
263							# it's primarily for Pod::Parser compatibility.
264							# When called without an output handle it will print to STDOUT
265							# but the old Pod::Markdown never printed to a handle
266							# so we don't want to start now.
267							sub parse_from_file {
268	10			10	1	28	my ($self, $file) = @_;
269
270							# TODO: Check that all dependent cpan modules use the Pod::Simple API
271							# then add a deprecation warning here to avoid confusion.
272
273	10					44	$self->output_string(\($self->{_as_markdown_}));
274	10					2700	$self->parse_file($file);
275							}
276
277							# Likewise, though Pod::Simple doesn't define this method at all.
278	9			9	0	460	sub parse_from_filehandle { shift->parse_from_file(@_) }
279
280
281							## Document state ##
282
283							sub _private {
284	9811			9811		14060	my ($self) = @_;
285							$self->{_Pod_Markdown_} \|\|= {
286	9811		100			31003	indent => 0,
287							stacks => [],
288							states => [{}],
289							link => [],
290							encode_amp => 1,
291							encode_lt => 1,
292							};
293							}
294
295							sub _increase_indent {
296	67	50		67		114	++$_[0]->_private->{indent} >= 1
297							or die 'Invalid state: indent < 0';
298							}
299							sub _decrease_indent {
300	67	50		67		163	--$_[0]->_private->{indent} >= 0
301							or die 'Invalid state: indent < 0';
302							}
303
304							sub _new_stack {
305	1008			1008		1464	push @{ $_[0]->_private->{stacks} }, [];
	1008					1867
306	1008					1568	push @{ $_[0]->_private->{states} }, {};
	1008					1681
307							}
308
309							sub _last_string {
310	18			18		29	$_[0]->_private->{stacks}->[-1][-1];
311							}
312
313							sub _pop_stack_text {
314	717			717		1042	$_[0]->_private->{last_state} = pop @{ $_[0]->_private->{states} };
	717					1269
315	717					1093	join '', @{ pop @{ $_[0]->_private->{stacks} } };
	717					875
	717					1153
316							}
317
318							sub _stack_state {
319	432			432		757	$_[0]->_private->{states}->[-1];
320							}
321
322							sub _save {
323	1346			1346		2593	my ($self, $text) = @_;
324	1346					1700	push @{ $self->_private->{stacks}->[-1] }, $text;
	1346					2087
325							# return $text; # DEBUG
326							}
327
328							sub _save_line {
329	495			495		905	my ($self, $text) = @_;
330
331	495					945	$text = $self->_process_escapes($text);
332
333	495					1429	$self->_save($text . $/);
334							}
335
336							# For paragraphs, etc.
337							sub _save_block {
338	430			430		774	my ($self, $text) = @_;
339
340	430					805	$self->_stack_state->{blocks}++;
341
342	430					1021	$self->_save_line($self->_indent($text) . $/);
343							}
344
345							## Formatting ##
346
347							sub _chomp_all {
348	333			333		588	my ($self, $text) = @_;
349	333					1268	1 while chomp $text;
350	333					993	return $text;
351							}
352
353							sub _indent {
354	473			473		778	my ($self, $text) = @_;
355	473					789	my $level = $self->_private->{indent};
356
357	473	100				1038	if( $level ){
358	34					85	my $indent = ' ' x ($level * 4);
359
360							# Capture text on the line so that we don't indent blank lines (/^\x20{4}$/).
361	34					222	$text =~ s/^(.+)/$indent$1/mg;
362							}
363
364	473					1639	return $text;
365							}
366
367							# as_markdown() exists solely for backward compatibility
368							# and requires having called parse_from_file() to be useful.
369
370
371							sub as_markdown {
372	10			10	0	454	my ($parser, %args) = @_;
373	10					16	my @header;
374							# Don't add meta tags again if we've already done it.
375	10	100	100			36	if( $args{with_meta} && !$parser->include_meta_tags ){
376	3					27	@header = $parser->_build_markdown_head;
377							}
378	10					61	return join("\n" x 2, @header, $parser->{_as_markdown_});
379							}
380
381							sub _build_markdown_head {
382	9			9		17	my $parser = shift;
383	9					13	my $data = $parser->_private;
384							return join "\n",
385	12					75	map { qq![[meta \l$_="$data->{$_}"]]! }
386	9					17	grep { defined $data->{$_} }
	18					45
387							qw( Title Author );
388							}
389
390							## Escaping ##
391
392							# http://daringfireball.net/projects/markdown/syntax#backslash
393							# Markdown provides backslash escapes for the following characters:
394							#
395							# \ backslash
396							# ` backtick
397							# * asterisk
398							# _ underscore
399							# {} curly braces
400							# [] square brackets
401							# () parentheses
402							# # hash mark
403							# + plus sign
404							# - minus sign (hyphen)
405							# . dot
406							# ! exclamation mark
407
408							# However some of those only need to be escaped in certain places:
409							# * Backslashes do need to be escaped or they may be swallowed by markdown.
410							# * Word-surrounding characters (/[`_]/) do* need to be escaped mid-word
411							# because the markdown spec explicitly allows mid-word emphasis.
412							# * I don't actually see anything that curly braces are used for.
413							# * Escaping square brackets is enough to avoid accidentally
414							# creating links and images (so we don't need to escape plain parentheses
415							# or exclamation points as that would generate a lot of unnecesary noise).
416							# Parentheses will be escaped in urls (&end_L) to avoid premature termination.
417							# * We don't need a backslash for every hash mark or every hyphen found mid-word,
418							# just the ones that start a line (likewise for plus and dot).
419							# (Those will all be handled by _escape_paragraph_markdown).
420
421
422							# Backslash escape markdown characters to avoid having them interpreted.
423							sub _escape_inline_markdown {
424	461			461		704	local $_ = $_[1];
425
426							# s/([\\`*_{}\[\]()#+-.!])/\\$1/g; # See comments above.
427	461					1142	s/([\\`*_\[\]])/\\$1/g;
428
429	461					928	return $_;
430							}
431
432							# Escape markdown characters that would be interpreted
433							# at the start of a line.
434							sub _escape_paragraph_markdown {
435	343			343		546	local $_ = $_[1];
436
437							# Escape headings, horizontal rules, (unordered) lists, and blockquotes.
438	343					857	s/^([-+#>])/\\$1/mg;
439
440							# Markdown doesn't support backslash escapes for equal signs
441							# even though they can be used to underline a header.
442							# So use html to escape them to avoid having them interpreted.
443	343					635	s/^([=])/sprintf '&#x%x;', ord($1)/mge;
	1					8
444
445							# Escape the dots that would wrongfully create numbered lists.
446	343					561	s/^( (?:>\s+)? \d+ ) (\.\x20)/$1\\$2/xgm;
447
448	343					656	return $_;
449							}
450
451
452							# Additionally Markdown allows inline html so we need to escape things that look like it.
453							# While _some_ Markdown processors handle backslash-escaped html,
454							# [Daring Fireball](http://daringfireball.net/projects/markdown/syntax) states distinctly:
455							# > In HTML, there are two characters that demand special treatment: < and &...
456							# > If you want to use them as literal characters, you must escape them as entities, e.g. <, and &.
457
458							# It goes on to say:
459							# > Markdown allows you to use these characters naturally,
460							# > taking care of all the necessary escaping for you.
461							# > If you use an ampersand as part of an HTML entity,
462							# > it remains unchanged; otherwise it will be translated into &.
463							# > Similarly, because Markdown supports inline HTML,
464							# > if you use angle brackets as delimiters for HTML tags, Markdown will treat them as such.
465
466							# In order to only encode the occurrences that require it (something that
467							# could be interpreted as an entity) we escape them all so that we can do the
468							# suffix test later after the string is complete (since we don't know what
469							# strings might come after this one).
470
471							my %_escape =
472							map {
473							my ($k, $v) = split /:/;
474							# Put the "code" marker before the char instead of after so that it doesn't
475							# get confused as the $2 (which is what requires us to entity-encode it).
476							# ( "XsX", "XcsX", "X(c?)sX" )
477							my ($s, $code, $re) = map { "\0$_$v\0" } '', map { ($_, '('.$_.'?)') } 'c';
478
479							(
480							$k => $s,
481							$k.'_code' => $code,
482							$k.'_re' => qr/$re/,
483							)
484							}
485							qw( amp:& lt:< );
486
487							# Make the values of this private var available to the tests.
488	1			1		89	sub __escape_sequences { %_escape }
489
490
491							# HTML-entity encode any characters configured by the user.
492							# If that doesn't include [&<] then we escape those chars so we can decide
493							# later if we will entity-encode them or put them back verbatim.
494							sub _encode_or_escape_entities {
495	461			461		646	my $self = $_[0];
496	461					756	my $stash = $self->_private;
497	461					751	local $_ = $_[1];
498
499	461	100				869	if( $stash->{encode_amp} ){
		50
500	441	100				775	if( exists($stash->{html_encode_chars}) ){
501							# Escape all amps for later processing.
502							# Pass intermediate strings to entity encoder so that it doesn't
503							# process any of the characters of our escape sequences.
504							# Use -1 to get "as many fields as possible" so that we keep leading and
505							# trailing (possibly empty) fields.
506	38					129	$_ = join $_escape{amp}, map { $self->encode_entities($_) } split /&/, $_, -1;
	57					823
507							}
508							else {
509	403					835	s/&/$_escape{amp}/g;
510							}
511							}
512							elsif( exists($stash->{html_encode_chars}) ){
513	20					46	$_ = $self->encode_entities($_);
514							}
515
516							s/
517	461	100				2730	if $stash->{encode_lt};
518
519	461					916	return $_;
520							}
521
522							# From Markdown.pl version 1.0.1 line 1172 (_DoAutoLinks).
523							my $EMAIL_MARKER = qr{
524							# < # Opening token is in parent regexp.
525							(?:mailto:)?
526							(
527							[-.\w]+
528							\@
529							[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
530							)
531							>
532							}x;
533
534							# Process any escapes we put in the text earlier,
535							# now that the text is complete (end of a block).
536							sub _process_escapes {
537	495			495		692	my $self = $_[0];
538	495					799	my $stash = $self->_private;
539	495					812	local $_ = $_[1];
540
541							# The patterns below are taken from Markdown.pl 1.0.1 _EncodeAmpsAndAngles().
542							# In this case we only want to encode the ones that Markdown won't.
543							# This is overkill but produces nicer looking text (less escaped entities).
544							# If it proves insufficent then we'll just encode them all.
545
546							# $1: If the escape was in a code sequence, simply replace the original.
547							# $2: If the unescaped value would be followed by characters
548							# that could be interpreted as html, entity-encode it.
549							# else: The character is safe to leave bare.
550
551							# Neither currently allows $2 to contain '0' so bool tests are sufficient.
552
553	495	100				951	if( $stash->{encode_amp} ){
554							# Encode & if succeeded by chars that look like an html entity.
555	462					1617	s,$_escape{amp_re}((?:#?[xX]?(?:[0-9a-fA-F]+\|\w+);)?),
556	90	100				438	$1 ? '&'.$2 : $2 ? '&'.$2 : '&',egos;
		100
557							}
558
559	495	100				954	if( $stash->{encode_lt} ){
560							# Encode < if succeeded by chars that look like an html tag.
561							# Leave email addresses () for Markdown to process.
562	462					2357	s,$_escape{lt_re}((?=$EMAIL_MARKER)\|(?:[a-z/?\$!])?),
563	72	100				374	$1 ? '<'.$2 : $2 ? '<'.$2 : '<',egos;
		100
564							}
565
566	495					959	return $_;
567							}
568
569
570							## Parsing ##
571
572							sub handle_text {
573	544			544	0	6747	my $self = $_[0];
574	544					919	my $stash = $self->_private;
575	544					1011	local $_ = $_[1];
576
577							# Unless we're in a code span, verbatim block, or formatted region.
578	544	100				1076	unless( $stash->{no_escape} ){
579
580							# We could, in theory, alter what gets escaped according to context
581							# (for example, escape square brackets (but not parens) inside link text).
582							# The markdown produced might look slightly nicer but either way you're
583							# at the whim of the markdown processor to interpret things correctly.
584							# For now just escape everything.
585
586							# Don't let literal characters be interpreted as markdown.
587	461					921	$_ = $self->_escape_inline_markdown($_);
588
589							# Entity-encode (or escape for later processing) necessary/desired chars.
590	461					955	$_ = $self->_encode_or_escape_entities($_);
591
592							}
593							# If this _is_ a code section, do limited/specific handling.
594							else {
595							# Always escaping these chars ensures that we won't mangle the text
596							# in the unlikely event that a sequence matching our escape occurred in the
597							# input stream (since we're going to escape it and then unescape it).
598	83	100				327	s/&/$_escape{amp_code}/gos if $stash->{encode_amp};
599	83	100				282	s/{encode_lt};
600							}
601
602	544					1147	$self->_save($_);
603							}
604
605							sub start_Document {
606	291			291	0	104392	my ($self) = @_;
607	291					696	$self->_new_stack;
608							}
609
610							sub end_Document {
611	291			291	0	22450	my ($self) = @_;
612	291					722	$self->_check_search_header;
613	291					447	my $end = pop @{ $self->_private->{stacks} };
	291					466
614
615	291	50				405	@{ $self->_private->{stacks} } == 0
	291					480
616							or die 'Document ended with stacks remaining';
617
618	291					944	my @doc = $self->_chomp_all(join('', @$end)) . $/;
619
620	291	100				889	if( $self->include_meta_tags ){
621	6					50	unshift @doc, $self->_build_markdown_head, ($/ x 2);
622							}
623
624	291	100				2136	if( my $encoding = $self->_get_output_encoding ){
625							# Do the check outside the loop(s) for efficiency.
626	55	100				516	my $ents = $HAS_HTML_ENTITIES ? \&__entity_encode_ord_he : \&__entity_encode_ord_basic;
627							# Iterate indices to avoid copying large strings.
628	55					164	for my $i ( 0 .. $#doc ){
629	55					78	print { $self->{output_fh} } Encode::encode($encoding, $doc[$i], $ents);
	55					166
630							}
631							}
632							else {
633	236					1276	print { $self->{output_fh} } @doc;
	236					989
634							}
635							}
636
637							sub _get_output_encoding {
638	291			291		509	my ($self) = @_;
639
640							# If 'match_encoding' is set we need to return an encoding.
641							# If pod has no =encoding, Pod::Simple will guess if it sees a high-bit char.
642							# If there are no high-bit chars, encoding is undef.
643							# Use detected_encoding() rather than encoding() because if Pod::Simple
644							# can't use whatever encoding was specified, we probably can't either.
645							# Fallback to 'o_e' if no match is found. This gives the user the choice,
646							# since otherwise there would be no reason to specify 'o_e' and 'm_e'.
647							# Fallback to UTF-8 since it is a reasonable default these days.
648
649	291	100	100			601	return $self->detected_encoding \|\| $self->output_encoding \|\| 'UTF-8'
650							if $self->match_encoding;
651
652							# If output encoding wasn't specified, return false.
653	279					1659	return $self->output_encoding;
654							}
655
656							## Blocks ##
657
658							sub start_Verbatim {
659	17			17	0	5123	my ($self) = @_;
660	17					54	$self->_new_stack;
661	17					37	$self->_private->{no_escape} = 1;
662							}
663
664							sub end_Verbatim {
665	17			17	0	159	my ($self) = @_;
666
667	17					39	my $text = $self->_pop_stack_text;
668
669	17					46	$text = $self->_indent_verbatim($text);
670
671	17					41	$self->_private->{no_escape} = 0;
672
673							# Verbatim blocks do not generate a separate "Para" event.
674	17					41	$self->_save_block($text);
675							}
676
677							sub _indent_verbatim {
678	17			17		37	my ($self, $paragraph) = @_;
679
680							# NOTE: Pod::Simple expands the tabs for us (as suggested by perlpodspec).
681							# Pod::Simple also has a 'strip_verbatim_indent' attribute
682							# but it doesn't sound like it gains us anything over this method.
683
684							# POD verbatim can start with any number of spaces (or tabs)
685							# markdown should be 4 spaces (or a tab)
686							# so indent any paragraphs so that all lines start with at least 4 spaces
687	17					63	my @lines = split /\n/, $paragraph;
688	17					31	my $indent = ' ' x 4;
689	17					56	foreach my $line ( @lines ){
690	32	100				116	next unless $line =~ m/^( +)/;
691							# find the smallest indentation
692	31	100				106	$indent = $1 if length($1) < length($indent);
693							}
694	17	100				65	if( (my $smallest = length($indent)) < 4 ){
695							# invert to get what needs to be prepended
696	11					31	$indent = ' ' x (4 - $smallest);
697
698							# Prepend indent to each line.
699							# We could check /\S/ to only indent non-blank lines,
700							# but it's backward compatible to respect the whitespace.
701							# Additionally, both pod and markdown say they ignore blank lines
702							# so it shouldn't hurt to leave them in.
703	11	100				27	$paragraph = join "\n", map { length($_) ? $indent . $_ : '' } @lines;
	23					88
704							}
705
706	17					46	return $paragraph;
707							}
708
709							sub start_Para {
710	343			343	0	147466	$_[0]->_new_stack;
711							}
712
713							sub end_Para {
714	343			343	0	3687	my ($self) = @_;
715	343					642	my $text = $self->_pop_stack_text;
716
717	343					799	$text = $self->_escape_paragraph_markdown($text);
718
719	343					723	$self->_save_block($text);
720							}
721
722
723							## Headings ##
724
725	40			40	0	6166	sub start_head1 { $_[0]->_start_head(1) }
726	40			40	0	445	sub end_head1 { $_[0]->_end_head(1) }
727	11			11	0	3425	sub start_head2 { $_[0]->_start_head(2) }
728	11			11	0	151	sub end_head2 { $_[0]->_end_head(2) }
729	1			1	0	413	sub start_head3 { $_[0]->_start_head(3) }
730	1			1	0	12	sub end_head3 { $_[0]->_end_head(3) }
731	0			0	0	0	sub start_head4 { $_[0]->_start_head(4) }
732	0			0	0	0	sub end_head4 { $_[0]->_end_head(4) }
733
734							sub _check_search_header {
735	343			343		563	my ($self) = @_;
736							# Save the text since the last heading if we want it for metadata.
737	343	100				582	if( my $last = $self->_private->{search_header} ){
738	18					39	for( $self->_private->{$last} = $self->_last_string ){
739	18					44	s/\A\s+//;
740	18					96	s/\s+\z//;
741							}
742							}
743							}
744							sub _start_head {
745	52			52		103	my ($self) = @_;
746	52					137	$self->_check_search_header;
747	52					118	$self->_new_stack;
748							}
749
750							sub _end_head {
751	52			52		90	my ($self, $num) = @_;
752	52					110	my $h = '#' x $num;
753
754	52					146	my $text = $self->_pop_stack_text;
755							$self->_private->{search_header} =
756	52	100				240	$text =~ /NAME/ ? 'Title'
		100
757							: $text =~ /AUTHOR/ ? 'Author'
758							: undef;
759
760							# TODO: option for $h suffix
761							# TODO: put a name="" if $self->{embed_anchor_tags}; ?
762							# https://rt.cpan.org/Ticket/Display.html?id=57776
763	52					212	$self->_save_block(join(' ', $h, $text));
764							}
765
766							## Lists ##
767
768							# With Pod::Simple->parse_empty_lists(1) there could be an over_empty event,
769							# but what would you do with that?
770
771							sub _start_list {
772	22			22		44	my ($self) = @_;
773	22					51	$self->_new_stack;
774
775							# Nest again b/c start_item will pop this to look for preceding content.
776	22					54	$self->_increase_indent;
777	22					43	$self->_new_stack;
778							}
779
780							sub _end_list {
781	22			22		47	my ($self) = @_;
782	22					55	$self->_handle_between_item_content;
783
784							# Finish the list.
785
786							# All the child elements should be blocks,
787							# but don't end with a double newline.
788	22					58	my $text = $self->_chomp_all($self->_pop_stack_text);
789
790	22					95	$_[0]->_save_line($text . $/);
791							}
792
793							sub _handle_between_item_content {
794	65			65		104	my ($self) = @_;
795
796							# This might be empty (if the list item had no additional content).
797	65	100				123	if( my $text = $self->_pop_stack_text ){
798							# Else it's a sub-document.
799							# If there are blocks we need to separate with blank lines.
800	21	100				40	if( $self->_private->{last_state}->{blocks} ){
801	16					55	$text = $/ . $text;
802							}
803							# If not, we can condense the text.
804							# In this module's history there was a patch contributed to specifically
805							# produce "huddled" lists so we'll try to maintain that functionality.
806							else {
807	5					30	$text = $self->_chomp_all($text) . $/;
808							}
809	21					51	$self->_save($text)
810							}
811
812	65					142	$self->_decrease_indent;
813							}
814
815							sub _start_item {
816	43			43		84	my ($self) = @_;
817	43					124	$self->_handle_between_item_content;
818	43					84	$self->_new_stack;
819							}
820
821							sub _end_item {
822	43			43		79	my ($self, $marker) = @_;
823	43					74	my $text = $self->_pop_stack_text;
824	43	100	66			266	$self->_save_line($self->_indent($marker .
825							# Add a space only if there is text after the marker.
826							(defined($text) && length($text) ? ' ' . $text : '')
827							));
828
829							# Store any possible contents in a new stack (like a sub-document).
830	43					121	$self->_increase_indent;
831	43					84	$self->_new_stack;
832							}
833
834	7			7	0	1635	sub start_over_bullet { $_[0]->_start_list }
835	7			7	0	647	sub end_over_bullet { $_[0]->_end_list }
836
837	18			18	0	6163	sub start_item_bullet { $_[0]->_start_item }
838	18			18	0	178	sub end_item_bullet { $_[0]->_end_item('-') }
839
840	9			9	0	2355	sub start_over_number { $_[0]->_start_list }
841	9			9	0	1481	sub end_over_number { $_[0]->_end_list }
842
843							sub start_item_number {
844	18			18	0	5472	$_[0]->_start_item;
845							# It seems like this should be a stack,
846							# but from testing it appears that the corresponding 'end' event
847							# comes right after the text (it doesn't surround any embedded content).
848							# See t/nested.t which shows start-item, text, end-item, para, start-item....
849	18					39	$_[0]->_private->{item_number} = $_[1]->{number};
850							}
851
852							sub end_item_number {
853	18			18	0	194	my ($self) = @_;
854	18					36	$self->_end_item($self->_private->{item_number} . '.');
855							}
856
857							# Markdown doesn't support definition lists
858							# so do regular (unordered) lists with indented paragraphs.
859	6			6	0	1570	sub start_over_text { $_[0]->_start_list }
860	6			6	0	685	sub end_over_text { $_[0]->_end_list }
861
862	7			7	0	2134	sub start_item_text { $_[0]->_start_item }
863	7			7	0	74	sub end_item_text { $_[0]->_end_item('-')}
864
865
866							# perlpodspec equates an over/back region with no items to a blockquote.
867							sub start_over_block {
868							# NOTE: We don't actually need to indent for a blockquote.
869	3			3	0	981	$_[0]->_new_stack;
870							}
871
872							sub end_over_block {
873	3			3	0	540	my ($self) = @_;
874
875							# Chomp first to avoid prefixing a blank line with a `>`.
876	3					10	my $text = $self->_chomp_all($self->_pop_stack_text);
877
878							# NOTE: Paragraphs will already be escaped.
879
880							# I don't really like either of these implementations
881							# but the join/map/split seems a little better and benches a little faster.
882							# You would lose the last newline but we've already chomped.
883							#$text =~ s{^(.)?}{'>' . (defined($1) && length($1) ? (' ' . $1) : '')}mge;
884	3	100				62	$text = join $/, map { length($_) ? '> ' . $_ : '>' } split qr-$/-, $text;
	22					62
885
886	3					18	$self->_save_block($text);
887							}
888
889							## Custom Formats ##
890
891							sub start_for {
892	10			10	0	2638	my ($self, $attr) = @_;
893	10					60	$self->_new_stack;
894
895	10	100				35	if( $attr->{target} eq 'html' ){
896							# Use another stack so we can indent
897							# (not syntactily necessary but seems appropriate).
898	2					6	$self->_new_stack;
899	2					7	$self->_increase_indent;
900	2					5	$self->_private->{no_escape} = 1;
901							# Mark this so we know to undo it.
902	2					7	$self->_stack_state->{for_html} = 1;
903							}
904							}
905
906							sub end_for {
907	10			10	0	718	my ($self) = @_;
908							# Data gets saved as a block (which will handle indents),
909							# but if there was html we'll alter this, so chomp and save a block again.
910	10					19	my $text = $self->_chomp_all($self->_pop_stack_text);
911
912	10	100				21	if( $self->_private->{last_state}->{for_html} ){
913	2					4	$self->_private->{no_escape} = 0;
914							# Save it to the next stack up so we can pop it again (we made two stacks).
915	2					5	$self->_save($text);
916	2					7	$self->_decrease_indent;
917	2					6	$text = join "\n", ' ', $self->_chomp_all($self->_pop_stack_text), ' ';
918							}
919
920	10					22	$self->_save_block($text);
921							}
922
923							# Data events will be emitted for any formatted regions that have been enabled
924							# (by default, `markdown` and `html`).
925
926							sub start_Data {
927	5			5	0	421	my ($self) = @_;
928							# TODO: limit this to what's in attr?
929	5					12	$self->_private->{no_escape}++;
930	5					12	$self->_new_stack;
931							}
932
933							sub end_Data {
934	5			5	0	48	my ($self) = @_;
935	5					12	my $text = $self->_pop_stack_text;
936	5					11	$self->_private->{no_escape}--;
937	5					12	$self->_save_block($text);
938							}
939
940							## Codes ##
941
942	94			94	0	927	sub start_B { $_[0]->_save('**') }
943	47			47	0	485	sub end_B { $_[0]->start_B() }
944
945	36			36	0	333	sub start_I { $_[0]->_save('_') }
946	18			18	0	213	sub end_I { $_[0]->start_I() }
947
948							sub start_C {
949	61			61	0	958	my ($self) = @_;
950	61					186	$self->_new_stack;
951	61					126	$self->_private->{no_escape}++;
952							}
953
954							sub end_C {
955	61			61	0	560	my ($self) = @_;
956	61					112	$self->_private->{no_escape}--;
957	61					140	$self->_save( $self->_wrap_code_span($self->_pop_stack_text) );
958							}
959
960							# Use code spans for F<>.
961	4			4	0	69	sub start_F { shift->start_C(@_); }
962	4			4	0	44	sub end_F { shift ->end_C(@_); }
963
964							sub start_L {
965	93			93	0	1542	my ($self, $flags) = @_;
966	93					206	$self->_new_stack;
967	93					140	push @{ $self->_private->{link} }, $flags;
	93					157
968							}
969
970							sub end_L {
971	93			93	0	896	my ($self) = @_;
972	93	50				121	my $flags = pop @{ $self->_private->{link} }
	93					144
973							or die 'Invalid state: link end with no link start';
974
975	93					150	my ($type, $to, $section) = @{$flags}{qw( type to section )};
	93					230
976
977	93	50				624	my $url = (
		100
		100
978							$type eq 'url' ? $to
979							: $type eq 'man' ? $self->format_man_url($to, $section)
980							: $type eq 'pod' ? $self->format_perldoc_url($to, $section)
981							: undef
982							);
983
984	93					254	my $text = $self->_pop_stack_text;
985
986							# NOTE: I don't think the perlpodspec says what to do with L<\|blah>
987							# but it seems like a blank link text just doesn't make sense
988	93	100				267	if( !length($text) ){
989	4	100				11	$text =
		100
990							$section ?
991							$to ? sprintf('"%s" in %s', $section, $to)
992							: ('"' . $section . '"')
993							: $to;
994							}
995
996							# FIXME: What does Pod::Simple::X?HTML do for this?
997							# if we don't know how to handle the url just print the pod back out
998	93	50				368	if (!$url) {
999	0					0	$self->_save(sprintf 'L<%s>', $flags->{raw});
1000	0					0	return;
1001							}
1002
1003							# In the url we need to escape quotes and parentheses lest markdown
1004							# break the url (cut it short and/or wrongfully interpret a title).
1005
1006							# Backslash escapes do not work for the space and quotes.
1007							# URL-encoding the space is not sufficient
1008							# (the quotes confuse some parsers and produce invalid html).
1009							# I've arbitratily chosen HTML encoding to hide them from markdown
1010							# while mangling the url as litle as possible.
1011	93					686	$url =~ s/([ '"])/sprintf '&#x%x;', ord($1)/ge;
	11					74
1012
1013							# We also need to double any backslashes that may be present
1014							# (lest they be swallowed up) and stop parens from breaking the url.
1015	93					404	$url =~ s/([\\()])/\\$1/g;
1016
1017							# TODO: put section name in title if not the same as $text
1018	93					525	$self->_save('[' . $text . '](' . $url . ')');
1019							}
1020
1021							sub start_X {
1022	1			1	0	19	$_[0]->_new_stack;
1023							}
1024
1025							sub end_X {
1026	1			1	0	11	my ($self) = @_;
1027	1					4	my $text = $self->_pop_stack_text;
1028							# TODO: mangle $text?
1029							# TODO: put if configured
1030							}
1031
1032							# A code span can be delimited by multiple backticks (and a space)
1033							# similar to pod codes (C<< code >>), so ensure we use a big enough
1034							# delimiter to not have it broken by embedded backticks.
1035							sub _wrap_code_span {
1036	61			61		138	my ($self, $arg) = @_;
1037	61					99	my $longest = 0;
1038	61					202	while( $arg =~ /([`]+)/g ){
1039	5					11	my $len = length($1);
1040	5	100				21	$longest = $len if $longest < $len;
1041							}
1042	61					143	my $delim = '`' x ($longest + 1);
1043	61	100				149	my $pad = $longest > 0 ? ' ' : '';
1044	61					199	return $delim . $pad . $arg . $pad . $delim;
1045							}
1046
1047							## Link Formatting (TODO: Move this to another module) ##
1048
1049
1050							sub format_man_url {
1051	6			6	1	14	my ($self, $to) = @_;
1052	6					15	my ($page, $part) = ($to =~ /^ ([^(]+) (?: $ (\S+) $ )? /x);
1053	6		50			171	return $self->man_url_prefix . ($part \|\| 1) . '/' . ($page \|\| $to);
			33
1054							}
1055
1056
1057							sub format_perldoc_url {
1058	70			70	1	139	my ($self, $name, $section) = @_;
1059
1060	70					158	my $url_prefix = $self->perldoc_url_prefix;
1061	70	100	100			240	if (
			100
1062							defined($name)
1063							&& $self->is_local_module($name)
1064							&& defined($self->local_module_url_prefix)
1065							) {
1066	5					12	$url_prefix = $self->local_module_url_prefix;
1067							}
1068
1069	70					1413	my $url = '';
1070
1071							# If the link is to another module (external link).
1072	70	100				164	if ($name) {
1073	59	100				1446	$url = $url_prefix . ($self->escape_url ? URI::Escape::uri_escape($name) : $name);
1074							}
1075
1076							# See https://rt.cpan.org/Ticket/Display.html?id=57776
1077							# for a discussion on the need to mangle the section.
1078	70	100				2892	if ($section){
1079
1080	42	100				1255	my $method = $url
1081							# If we already have a prefix on the url it's external.
1082							? $self->perldoc_fragment_format
1083							# Else an internal link points to this markdown doc.
1084							: $self->markdown_fragment_format;
1085
1086	42	100				285	$method = 'format_fragment_' . $method
1087							unless ref($method);
1088
1089							{
1090							# Set topic to enable code refs to be simple.
1091	42					63	local $_ = $section;
	42					60
1092	42					110	$section = $self->$method($section);
1093							}
1094
1095	42					147	$url .= '#' . $section;
1096							}
1097
1098	70					449	return $url;
1099							}
1100
1101
1102							# TODO: simple, pandoc, etc?
1103
1104							sub format_fragment_markdown {
1105	3			3	1	10	my ($self, $section) = @_;
1106
1107							# If this is an internal link (to another section in this doc)
1108							# we can't be sure what the heading id's will look like
1109							# (it depends on what is rendering the markdown to html)
1110							# but we can try to follow popular conventions.
1111
1112							# http://johnmacfarlane.net/pandoc/demo/example9/pandocs-markdown.html#header-identifiers-in-html-latex-and-context
1113							#$section =~ s/(?![-_.])[[:punct:]]//g;
1114							#$section =~ s/\s+/-/g;
1115	3					8	$section =~ s/\W+/-/g;
1116	3					127	$section =~ s/-+$//;
1117	3					6	$section =~ s/^-+//;
1118	3					9	$section = lc $section;
1119							#$section =~ s/^[^a-z]+//;
1120	3		50			7	$section \|\|= 'section';
1121
1122	3					8	return $section;
1123							}
1124
1125
1126							{
1127							# From Pod::Simple::XHTML 3.28.
1128							# The strings gets passed through encode_entities() before idify().
1129							# If we don't do it here the substitutions below won't operate consistently.
1130
1131							sub format_fragment_pod_simple_xhtml {
1132	9			9	1	19	my ($self, $t) = @_;
1133
1134							# encode_entities {
1135							# We need to use the defaults in case html_encode_chars has been customized
1136							# (since the purpose is to match what external sources are doing).
1137
1138	9					19	local $self->_private->{html_encode_chars};
1139	9					26	$t = $self->encode_entities($t);
1140							# }
1141
1142							# idify {
1143	9					400	for ($t) {
1144	9					20	s/<[^>]+>//g; # Strip HTML.
1145	9					101	s/&[^;]+;//g; # Strip entities.
1146	9					606	s/^\s+//; s/\s+$//; # Strip white space.
	9					116
1147	9					98	s/^([^a-zA-Z]+)$/pod$1/; # Prepend "pod" if no valid chars.
1148	9					115	s/^[^a-zA-Z]+//; # First char must be a letter.
1149	9					90	s/[^-a-zA-Z0-9_:.]+/-/g; # All other chars must be valid.
1150	9					149	s/[-:.]+$//; # Strip trailing punctuation.
1151							}
1152							# }
1153
1154	9					52	return $t;
1155							}
1156							}
1157
1158
1159							sub format_fragment_pod_simple_html {
1160	9			9	1	17	my ($self, $section) = @_;
1161
1162							# From Pod::Simple::HTML 3.28.
1163
1164							# section_name_tidy {
1165	9					20	$section =~ s/^\s+//;
1166	9					156	$section =~ s/\s+$//;
1167	9					162	$section =~ tr/ /_/;
1168	9					272	$section =~ tr/\x00-\x1F\x80-\x9F//d if 'A' eq chr(65); # drop crazy characters
1169
1170							#$section = $self->unicode_escape_url($section);
1171							# unicode_escape_url {
1172	9					20	$section =~ s/([^\x00-\xFF])/'('.ord($1).')'/eg;
	0					0
1173							# Turn char 1234 into "(1234)"
1174							# }
1175
1176	9	50				22	$section = '_' unless length $section;
1177	9					20	return $section;
1178							# }
1179							}
1180
1181
1182	9			9	1	21	sub format_fragment_metacpan { shift->format_fragment_pod_simple_xhtml(@_); }
1183	9			9	1	22	sub format_fragment_sco { shift->format_fragment_pod_simple_html(@_); }
1184
1185
1186							sub is_local_module {
1187	59			59	1	109	my ($self, $name) = @_;
1188
1189	59					163	return ($name =~ $self->local_module_re);
1190							}
1191
1192							1;
1193
1194							__END__