File Coverage

blib/lib/Markdown/To/POD.pm

Criterion	Covered	Total	%
statement	279	496	56.2
branch	23	98	23.4
condition	9	61	14.7
subroutine	42	55	76.3
pod	3	3	100.0
total	356	713	49.9

line	stmt	bran	cond	sub	pod	time	code
1							package Markdown::To::POD;
2
3							our $DATE = '2014-07-28'; # DATE
4							our $VERSION = '0.04'; # VERSION
5							# ABSTRACT: Convert Markdown syntax to POD
6
7	1			1		1466	use 5.010001;
	1					3
	1					44
8	1			1		5	use strict;
	1					2
	1					30
9	1			1		6	use warnings;
	1					1
	1					28
10	1			1		5	use re 'eval';
	1					2
	1					52
11
12	1			1		6	use Digest::MD5 qw(md5_hex);
	1					9
	1					74
13	1			1		2303	use Encode qw();
	1					19989
	1					28
14	1			1		9	use Carp qw(croak);
	1					3
	1					75
15	1			1		5	use base 'Exporter';
	1					2
	1					1346
16
17							our @EXPORT_OK = qw(markdown_to_pod);
18
19
20							# Regex to match balanced [brackets]. See Friedl's
21							# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
22							our ($g_nested_brackets, $g_nested_parens);
23							$g_nested_brackets = qr{
24							(?> # Atomic matching
25							[^\[\]]+ # Anything other than brackets
26							\|
27							\[
28							(??{ $g_nested_brackets }) # Recursive set of nested brackets
29							\]
30							)*
31							}x;
32							# Doesn't allow for whitespace, because we're using it to match URLs:
33							$g_nested_parens = qr{
34							(?> # Atomic matching
35							[^()\s]+ # Anything other than parens or whitespace
36							\|
37							\(
38							(??{ $g_nested_parens }) # Recursive set of nested brackets
39							\)
40							)*
41							}x;
42
43							# Table of hash values for escaped characters:
44							our %g_escape_table;
45							foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
46							$g_escape_table{$char} = md5_hex($char);
47							}
48
49
50							sub new {
51	3			3	1	6	my ($class, %p) = @_;
52
53	3		50			20	$p{base_url} \|\|= ''; # This is the base URL to be used for WikiLinks
54
55	3	50	33			20	$p{tab_width} = 4 unless (defined $p{tab_width} and $p{tab_width} =~ m/^\d+$/);
56
57	3		50			16	$p{empty_element_suffix} \|\|= ' />'; # Change to ">" for HTML output
58
59	3	50				8	$p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0;
60
61	3					11	my $self = { params => \%p };
62	3		33			15	bless $self, ref($class) \|\| $class;
63	3					7	return $self;
64							}
65
66
67							sub markdown_to_pod {
68	6			6	1	18	my ( $self, $text, $options ) = @_;
69
70							# Detect functional mode, and create an instance for this run
71	6	100				18	unless (ref $self) {
72	3	50				10	if ( $self ne __PACKAGE__ ) {
73	3					13	my $ob = __PACKAGE__->new();
74							# $self is text, $text is options
75	3					12	return $ob->markdown_to_pod($self, $text);
76							}
77							else {
78	0					0	croak('Calling ' . $self . '->markdown (as a class method) is not supported.');
79							}
80							}
81
82	3		50			14	$options \|\|= {};
83
84	3					5	%$self = (%{ $self->{params} }, %$options, params => $self->{params});
	3					33
85
86	3					12	$self->_CleanUpRunData($options);
87
88	3					13	return $self->_Markdown($text);
89							}
90
91							sub _CleanUpRunData {
92	3			3		7	my ($self, $options) = @_;
93							# Clear the global hashes. If we don't clear these, you get conflicts
94							# from other articles when generating a page which contains more than
95							# one article (e.g. an index page that shows the N most recent
96							# articles).
97	3	50				42	$self->{_urls} = $options->{urls} ? $options->{urls} : {}; # FIXME - document passing this option (tested in 05options.t).
98	3					9	$self->{_titles} = {};
99	3					10	$self->{_html_blocks} = {};
100							# Used to track when we're inside an ordered or unordered list
101							# (see _ProcessListItems() for details)
102	3					8	$self->{_list_level} = 0;
103
104							}
105
106							sub _Markdown {
107							#
108							# Main function. The order in which other subs are called here is
109							# essential. Link and image substitutions need to happen before
110							# _EscapeSpecialChars(), so that any *'s or _'s in the
111							# and tags get encoded.
112							#
113	3			3		6	my ($self, $text, $options) = @_;
114
115	3					10	$text = $self->_CleanUpDoc($text);
116
117							# Turn block-level HTML elements into hash entries, and interpret markdown in them if they have a 'markdown="1"' attribute
118	3					18	$text = $self->_HashHTMLBlocks($text, {interpret_markdown_on_attribute => 1});
119
120	3					28	$text = $self->_StripLinkDefinitions($text);
121
122	3					19	$text = $self->_RunBlockGamut($text, {wrap_in_p_tags => 1});
123
124	3					12	$text = $self->_UnescapeSpecialChars($text);
125
126	3					9	$text = $self->_ConvertCopyright($text);
127
128	3					46	return $text . "\n";
129							}
130
131
132							sub urls {
133	0			0	1	0	my ( $self ) = @_;
134
135	0					0	return $self->{_urls};
136							}
137
138							sub _CleanUpDoc {
139	3			3		8	my ($self, $text) = @_;
140
141							# Standardize line endings:
142	3					8	$text =~ s{\r\n}{\n}g; # DOS to Unix
143	3					6	$text =~ s{\r}{\n}g; # Mac to Unix
144
145							# Make sure $text ends with a couple of newlines:
146	3					5	$text .= "\n\n";
147
148							# Convert all tabs to spaces.
149	3					11	$text = $self->_Detab($text);
150
151							# Strip any lines consisting only of spaces and tabs.
152							# This makes subsequent regexen easier to write, because we can
153							# match consecutive blank lines with /\n+/ instead of something
154							# contorted like /[ \t]*\n+/ .
155	3					9	$text =~ s/^[ \t]+$//mg;
156
157	3					7	return $text;
158							}
159
160							sub _StripLinkDefinitions {
161							#
162							# Strips link definitions from text, stores the URLs and titles in
163							# hash references.
164							#
165	3			3		7	my ($self, $text) = @_;
166	3					8	my $less_than_tab = $self->{tab_width} - 1;
167
168							# Link defs are in the form: ^[id]: url "optional title"
169	3					64	while ($text =~ s{
170							^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
171							[ \t]*
172							\n? # maybe one newline
173							[ \t]*
174							? # url = \$2
175							[ \t]*
176							\n? # maybe one newline
177							[ \t]*
178							(?:
179							(?<=\s) # lookbehind for whitespace
180							["(]
181							(.+?) # title = \$3
182							[")]
183							[ \t]*
184							)? # title is optional
185							(?:\n+\|\Z)
186							}{}omx) {
187	0					0	$self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
188	0	0				0	if ($3) {
189	0					0	$self->{_titles}{lc $1} = $3;
190	0					0	$self->{_titles}{lc $1} =~ s/"/"/g;
191							}
192
193							}
194
195	3					8	return $text;
196							}
197
198							sub _md5_utf8 {
199							# Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation.
200	0			0		0	my $input = shift;
201	0	0				0	return unless defined $input;
202	0	0				0	if (Encode::is_utf8 $input) {
203	0					0	return md5_hex(Encode::encode('utf8', $input));
204							}
205							else {
206	0					0	return md5_hex($input);
207							}
208							}
209
210							sub _HashHTMLBlocks {
211	6			6		12	my ($self, $text, $options) = @_;
212	6					11	my $less_than_tab = $self->{tab_width} - 1;
213
214							# Hashify HTML blocks (protect from further interpretation by encoding to an md5):
215							# We only want to do this for block-level HTML tags, such as headers,
216							# lists, and tables. That's because we still want to wrap s around
217							# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
218							# phrase emphasis, and spans. The list of tags we're looking for is
219							# hard-coded:
220	6					21	my $block_tags = qr{
221							(?:
222							p \| div \| h[1-6] \| blockquote \| pre \| table \|
223							dl \| ol \| ul \| script \| noscript \| form \|
224							fieldset \| iframe \| math \| ins \| del
225							)
226							}x;
227
228	6					18	my $tag_attrs = qr{
229							(?: # Match one attr name/value pair
230							\s+ # There needs to be at least some whitespace
231							# before each attribute name.
232							[\w.:_-]+ # Attribute name
233							\s=\s
234							(?:
235							".+?" # "Attribute value"
236							\|
237							'.+?' # 'Attribute value'
238							\|
239							[^\s]+? # AttributeValue (HTML5)
240							)
241							)* # Zero or more
242							}x;
243
244	6					141	my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
245	6					113	my $open_tag = qr{< $block_tags $tag_attrs \s* >}oxms;
246	6					10	my $close_tag = undef; # let Text::Balanced handle this
247	6					7	my $prefix_pattern = undef; # Text::Balanced
248	6					17	my $markdown_attr = qr{ \s* markdown \s* = \s* (['"]) (.*?) \1 }xs;
249
250	1			1		1233	use Text::Balanced qw(gen_extract_tagged);
	1					36049
	1					4229
251	6					45	my $extract_block = gen_extract_tagged($open_tag, $close_tag, $prefix_pattern, { ignore => [$empty_tag] });
252
253	6					728	my @chunks;
254							# parse each line, looking for block-level HTML tags
255	6					87	while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
256	12					31	my $cur_line = $1;
257	12	50				26	if (defined $2) {
258							# current line could be start of code block
259
260	0					0	my ($tag, $remainder, $prefix, $opening_tag, $text_in_tag, $closing_tag) = $extract_block->($cur_line . $text);
261	0	0				0	if ($tag) {
262	0	0	0			0	if ($options->{interpret_markdown_on_attribute} and $opening_tag =~ s/$markdown_attr//i) {
263	0					0	my $markdown = $2;
264	0	0				0	if ($markdown =~ /^(1\|on\|yes)$/) {
265							# interpret markdown and reconstruct $tag to include the interpreted $text_in_tag
266	0					0	my $wrap_in_p_tags = $opening_tag =~ /^<(div\|iframe)/;
267	0					0	$tag = $prefix . $opening_tag . "\n"
268							. $self->_RunBlockGamut($text_in_tag, {wrap_in_p_tags => $wrap_in_p_tags})
269							. "\n" . $closing_tag
270							;
271							} else {
272							# just remove the markdown="0" attribute
273	0					0	$tag = $prefix . $opening_tag . $text_in_tag . $closing_tag;
274							}
275							}
276	0					0	my $key = _md5_utf8($tag);
277	0					0	$self->{_html_blocks}{$key} = $tag;
278	0					0	push @chunks, "\n\n" . $key . "\n\n";
279	0					0	$text = $remainder;
280							}
281							else {
282							# No tag match, so toss $cur_line into @chunks
283	0					0	push @chunks, $cur_line;
284							}
285							}
286							else {
287							# current line could NOT be start of code block
288	12					78	push @chunks, $cur_line;
289							}
290
291							}
292	6					12	push @chunks, $text; # whatever is left
293
294	6					13	$text = join '', @chunks;
295
296	6					81	return $text;
297							}
298
299							sub _HashHR {
300	3			3		4	my ($self, $text) = @_;
301	3					9	my $less_than_tab = $self->{tab_width} - 1;
302
303	3					63	$text =~ s{
304							(?:
305							(?<=\n\n) # Starting after a blank line
306							\| # or
307							\A\n? # the beginning of the doc
308							)
309							( # save in $1
310							[ ]{0,$less_than_tab}
311							<(hr) # start tag = $2
312							\b # word break
313							([^<>])*? #
314							/?> # the matching end tag
315							[ \t]*
316							(?=\n{2,}\|\Z) # followed by a blank line or end of document
317							)
318							}{
319	0					0	my $key = _md5_utf8($1);
320	0					0	$self->{_html_blocks}{$key} = $1;
321	0					0	"\n\n" . $key . "\n\n";
322							}egx;
323
324	3					9	return $text;
325							}
326
327							sub _HashHTMLComments {
328	3			3		6	my ($self, $text) = @_;
329	3					7	my $less_than_tab = $self->{tab_width} - 1;
330
331							# Special case for standalone HTML comments:
332	3					52	$text =~ s{
333							(?:
334							(?<=\n\n) # Starting after a blank line
335							\| # or
336							\A\n? # the beginning of the doc
337							)
338							( # save in $1
339							[ ]{0,$less_than_tab}
340							(?s:
341
342							(--.?--\s)+
343							>
344							)
345							[ \t]*
346							(?=\n{2,}\|\Z) # followed by a blank line or end of document
347							)
348							}{
349	0					0	my $key = _md5_utf8($1);
350	0					0	$self->{_html_blocks}{$key} = $1;
351	0					0	"\n\n" . $key . "\n\n";
352							}egx;
353
354	3					10	return $text;
355							}
356
357							sub _HashPHPASPBlocks {
358	3			3		5	my ($self, $text) = @_;
359	3					5	my $less_than_tab = $self->{tab_width} - 1;
360
361							# PHP and ASP-style processor instructions ( and <%…%>)
362	3					62	$text =~ s{
363							(?:
364							(?<=\n\n) # Starting after a blank line
365							\| # or
366							\A\n? # the beginning of the doc
367							)
368							( # save in $1
369							[ ]{0,$less_than_tab}
370							(?s:
371							<([?%]) # $2
372							.*?
373							\2>
374							)
375							[ \t]*
376							(?=\n{2,}\|\Z) # followed by a blank line or end of document
377							)
378							}{
379	0					0	my $key = _md5_utf8($1);
380	0					0	$self->{_html_blocks}{$key} = $1;
381	0					0	"\n\n" . $key . "\n\n";
382							}egx;
383	3					10	return $text;
384							}
385
386							sub _RunBlockGamut {
387							#
388							# These are all the transformations that form block-level
389							# tags like paragraphs, headers, and list items.
390							#
391	3			3		7	my ($self, $text, $options) = @_;
392
393							# Do headers first, as these populate cross-refs
394	3					10	$text = $self->_DoHeaders($text);
395
396							# Do Horizontal Rules:
397	3					8	my $less_than_tab = $self->{tab_width} - 1;
398							#$text =~ s{^[ ]{0,$less_than_tab}(\[ ]?){3,}[ \t]$}{\n{empty_element_suffix}\n}gmx;
399							#$text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
400							#$text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
401	3					32	$text =~ s{^[ ]{0,$less_than_tab}(\[ ]?){3,}[ \t]$}{"\n" . ("=" x 72) . "\n\n"}egmx;
	0					0
402	3					28	$text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{"\n" . ("=" x 72) . "\n\n"}egmx;
	0					0
403	3					28	$text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{"\n" . ("=" x 72) . "\n\n"}egmx;
	0					0
404
405	3					12	$text = $self->_DoLists($text);
406
407	3					12	$text = $self->_DoCodeBlocks($text);
408
409	3					11	$text = $self->_DoBlockQuotes($text);
410
411							# We already ran _HashHTMLBlocks() before, in Markdown(), but that
412							# was to escape raw HTML in the original Markdown source. This time,
413							# we're escaping the markup we've just created, so that we don't wrap
414							# tags around block-level tags.
415	3					9	$text = $self->_HashHTMLBlocks($text);
416
417							# Special case just for . It was easier to make a special case than
418							# to make the other regex more complicated.
419	3					14	$text = $self->_HashHR($text);
420
421	3					10	$text = $self->_HashHTMLComments($text);
422
423	3					12	$text = $self->_HashPHPASPBlocks($text);
424
425	3					18	$text = $self->_FormParagraphs($text, {wrap_in_p_tags => $options->{wrap_in_p_tags}});
426
427	3					11	return $text;
428							}
429
430							sub _RunSpanGamut {
431							#
432							# These are all the transformations that occur within block-level
433							# tags like paragraphs, headers, and list items.
434							#
435	3			3		6	my ($self, $text) = @_;
436
437	3					10	$text = $self->_DoCodeSpans($text);
438	3					9	$text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
439	3					10	$text = $self->_EscapeSpecialChars($text);
440
441							# Process anchor and image tags. Images must come first,
442							# because ![foo][f] looks like an anchor.
443	3					11	$text = $self->_DoImages($text);
444	3					10	$text = $self->_DoAnchors($text);
445
446							# Make links out of things like ``
447							# Must come after _DoAnchors(), because you can use < and >
448							# delimiters in inline links like [this]().
449	3					11	$text = $self->_DoAutoLinks($text);
450
451	3					8	$text = $self->_EncodeAmpsAndAngles($text);
452
453	3					10	$text = $self->_DoItalicsAndBold($text);
454
455							# FIXME - Is hard coding space here sane, or does this want to be related to tab width?
456							# Do hard breaks:
457	3					9	$text =~ s/ {2,}\n/ {empty_element_suffix}\n/g;
458
459	3					11	return $text;
460							}
461
462							sub _EscapeSpecialChars {
463	3			3		5	my ($self, $text) = @_;
464	3		33			30	my $tokens \|\|= $self->_TokenizeHTML($text);
465
466	3					6	$text = ''; # rebuild $text from the tokens
467							# my $in_pre = 0; # Keep track of when we're inside or `tags.`
468							# my $tags_to_skip = qr!<(/?)(?:pre\|code\|kbd\|script\|math)[\s>]!;
469
470	3					7	foreach my $cur_token (@$tokens) {
471	7	100				17	if ($cur_token->[0] eq "tag") {
472							# Within tags, encode * and _ so they don't conflict
473							# with their use in Markdown for italics and strong.
474							# We're replacing each such character with its
475							# corresponding MD5 checksum value; this is likely
476							# overkill, but it should prevent us from colliding
477							# with the escape values by accident.
478	2					5	$cur_token->[1] =~ s! \* !$g_escape_table{'*'}!ogx;
479	2					3	$cur_token->[1] =~ s! _ !$g_escape_table{'_'}!ogx;
480	2					4	$text .= $cur_token->[1];
481							} else {
482	5					7	my $t = $cur_token->[1];
483	5					13	$t = $self->_EncodeBackslashEscapes($t);
484	5					14	$text .= $t;
485							}
486							}
487	3					11	return $text;
488							}
489
490							sub _EscapeSpecialCharsWithinTagAttributes {
491							#
492							# Within tags -- meaning between < and > -- encode [\ ` * _] so they
493							# don't conflict with their use in Markdown for code, italics and strong.
494							# We're replacing each such character with its corresponding MD5 checksum
495							# value; this is likely overkill, but it should prevent us from colliding
496							# with the escape values by accident.
497							#
498	3			3		6	my ($self, $text) = @_;
499	3		33			16	my $tokens \|\|= $self->_TokenizeHTML($text);
500	3					6	$text = ''; # rebuild $text from the tokens
501
502	3					8	foreach my $cur_token (@$tokens) {
503	7	100				21	if ($cur_token->[0] eq "tag") {
504	2					5	$cur_token->[1] =~ s! \\ !$g_escape_table{'\\'}!gox;
505	2					4	$cur_token->[1] =~ s{ (?<=.)(?=.) }{$g_escape_table{'`'}}gox;
506	2					4	$cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gox;
507	2					3	$cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gox;
508							}
509	7					17	$text .= $cur_token->[1];
510							}
511	3					11	return $text;
512							}
513
514							sub _DoAnchors {
515							#
516							# Turn Markdown link shortcuts into XHTML tags.
517							#
518	3			3		4	my ($self, $text) = @_;
519
520							#
521							# First, handle reference-style links: [link text] [id]
522							#
523	3					127	$text =~ s{
524							( # wrap whole match in $1
525							\[
526							($g_nested_brackets) # link text = $2
527							\]
528
529							[ ]? # one optional space
530							(?:\n[ ]*)? # one optional newline followed by spaces
531
532							\[
533							(.*?) # id = $3
534							\]
535							)
536							}{
537	0					0	my $whole_match = $1;
538	0					0	my $link_text = $2;
539	0					0	my $link_id = lc $3;
540
541	0	0				0	if ($link_id eq "") {
542	0					0	$link_id = lc $link_text; # for shortcut links like [this][].
543							}
544
545	0					0	$link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
546
547	0					0	$self->_GenerateAnchor($whole_match, $link_text, $link_id);
548							}xsge;
549
550							#
551							# Next, inline-style links: [link text](url "optional title")
552							#
553	3					208	$text =~ s{
554							( # wrap whole match in $1
555							\[
556							($g_nested_brackets) # link text = $2
557							\]
558							\( # literal paren
559							[ \t]*
560							($g_nested_parens) # href = $3
561							[ \t]*
562							( # $4
563							(['"]) # quote char = $5
564							(.*?) # Title = $6
565							\5 # matching quote
566							[ \t]* # ignore any spaces/tabs between closing quote and )
567							)? # title is optional
568							\)
569							)
570							}{
571	0					0	my $result;
572	0					0	my $whole_match = $1;
573	0					0	my $link_text = $2;
574	0					0	my $url = $3;
575	0					0	my $title = $6;
576
577	0					0	$self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
578							}xsge;
579
580							#
581							# Last, handle reference-style shortcuts: [link text]
582							# These must come last in case you've also got [link test][1]
583							# or [link test](/foo)
584							#
585	3					8	$text =~ s{
586							( # wrap whole match in $1
587							\[
588							([^\[\]]+) # link text = $2; can't contain '[' or ']'
589							\]
590							)
591							}{
592	0					0	my $result;
593	0					0	my $whole_match = $1;
594	0					0	my $link_text = $2;
595	0					0	(my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
596
597	0					0	$self->_GenerateAnchor($whole_match, $link_text, $link_id);
598							}xsge;
599
600	3					9	return $text;
601							}
602
603							sub _GenerateAnchor {
604							# FIXME - Fugly, change to named params?
605	0			0		0	my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;
606
607	0					0	my $result;
608
609	0	0				0	$attributes = '' unless defined $attributes;
610
611	0	0	0			0	if ( !defined $url && defined $self->{_urls}{$link_id}) {
612	0					0	$url = $self->{_urls}{$link_id};
613							}
614
615	0	0				0	if (!defined $url) {
616	0					0	return $whole_match;
617							}
618
619	0					0	$url =~ s! \* !$g_escape_table{'*'}!gox; # We've got to encode these to avoid
620	0					0	$url =~ s! _ !$g_escape_table{'_'}!gox; # conflicting with italics/bold.
621	0					0	$url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
622
623	0					0	$result = qq{
624
625	0	0	0			0	if ( !defined $title && defined $link_id && defined $self->{_titles}{$link_id} ) {
			0
626	0					0	$title = $self->{_titles}{$link_id};
627							}
628
629	0	0				0	if ( defined $title ) {
630	0					0	$title =~ s/"/"/g;
631	0					0	$title =~ s! \* !$g_escape_table{'*'}!gox;
632	0					0	$title =~ s! _ !$g_escape_table{'_'}!gox;
633	0					0	$result .= qq{ title="$title"};
634							}
635
636							#$result .= "$attributes>$link_text";
637	0	0				0	$result = __podfmt(L => ($url . ($title ? "\|$title" : "")));
638
639	0					0	return $result;
640							}
641
642							sub _DoImages {
643							#
644							# Turn Markdown image shortcuts into tags.
645							#
646	3			3		5	my ($self, $text) = @_;
647
648							#
649							# First, handle reference-style labeled images: ![alt text][id]
650							#
651	3					6	$text =~ s{
652							( # wrap whole match in $1
653							!\[
654							(.*?) # alt text = $2
655							\]
656
657							[ ]? # one optional space
658							(?:\n[ ]*)? # one optional newline followed by spaces
659
660							\[
661							(.*?) # id = $3
662							\]
663
664							)
665							}{
666	0					0	my $result;
667	0					0	my $whole_match = $1;
668	0					0	my $alt_text = $2;
669	0					0	my $link_id = lc $3;
670
671	0	0				0	if ($link_id eq '') {
672	0					0	$link_id = lc $alt_text; # for shortcut links like ![this][].
673							}
674
675	0					0	$self->_GenerateImage($whole_match, $alt_text, $link_id);
676							}xsge;
677
678							#
679							# Next, handle inline images: ![alt text](url "optional title")
680							# Don't forget: encode * and _
681
682	3					184	$text =~ s{
683							( # wrap whole match in $1
684							!\[
685							(.*?) # alt text = $2
686							\]
687							\( # literal paren
688							[ \t]*
689							($g_nested_parens) # src url - href = $3
690							[ \t]*
691							( # $4
692							(['"]) # quote char = $5
693							(.*?) # title = $6
694							\5 # matching quote
695							[ \t]*
696							)? # title is optional
697							\)
698							)
699							}{
700	0					0	my $result;
701	0					0	my $whole_match = $1;
702	0					0	my $alt_text = $2;
703	0					0	my $url = $3;
704	0					0	my $title = '';
705	0	0				0	if (defined($6)) {
706	0					0	$title = $6;
707							}
708
709	0					0	$self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
710							}xsge;
711
712	3					10	return $text;
713							}
714
715							sub _GenerateImage {
716							# FIXME - Fugly, change to named params?
717	0			0		0	my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;
718
719	0					0	my $result;
720
721	0	0				0	$attributes = '' unless defined $attributes;
722
723	0		0			0	$alt_text \|\|= '';
724	0					0	$alt_text =~ s/"/"/g;
725							# FIXME - how about >
726
727	0	0	0			0	if ( !defined $url && defined $self->{_urls}{$link_id}) {
728	0					0	$url = $self->{_urls}{$link_id};
729							}
730
731							# If there's no such link ID, leave intact:
732	0	0				0	return $whole_match unless defined $url;
733
734	0					0	$url =~ s! \* !$g_escape_table{'*'}!ogx; # We've got to encode these to avoid
735	0					0	$url =~ s! _ !$g_escape_table{'_'}!ogx; # conflicting with italics/bold.
736	0					0	$url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
737
738	0	0	0			0	if (!defined $title && length $link_id && defined $self->{_titles}{$link_id} && length $self->{_titles}{$link_id}) {
			0
			0
739	0					0	$title = $self->{_titles}{$link_id};
740							}
741
742	0					0	$result = qq{
743	0	0	0			0	if (defined $title && length $title) {
744	0					0	$title =~ s! \* !$g_escape_table{'*'}!ogx;
745	0					0	$title =~ s! _ !$g_escape_table{'_'}!ogx;
746	0					0	$title =~ s/"/"/g;
747	0					0	$result .= qq{ title="$title"};
748							}
749	0					0	$result .= $attributes . $self->{empty_element_suffix};
750
751	0					0	$result = "\n\n=begin HTML\n\n$result\n\n=end HTML\n\n";
752
753	0					0	return $result;
754							}
755
756							sub _DoHeaders {
757	3			3		4	my ($self, $text) = @_;
758
759							# Setext-style headers:
760							# Header 1
761							# ========
762							#
763							# Header 2
764							# --------
765							#
766	3					7	$text =~ s{ ^(.+)[ \t]\n=+[ \t]\n+ }{
767	0					0	$self->_GenerateHeader('1', $1);
768							}egmx;
769
770	3					5	$text =~ s{ ^(.+)[ \t]\n-+[ \t]\n+ }{
771	0					0	$self->_GenerateHeader('2', $1);
772							}egmx;
773
774
775							# atx-style headers:
776							# # Header 1
777							# ## Header 2
778							# ## Header 2 with closing hashes ##
779							# ...
780							# ###### Header 6
781							#
782	3					4	my $l;
783	3					6	$text =~ s{
784							^(\#{1,6}) # $1 = string of #'s
785							[ \t]*
786							(.+?) # $2 = Header text
787							[ \t]*
788							\#* # optional closing #'s (not counted)
789							\n+
790							}{
791	0					0	my $h_level = length($1);
792	0					0	$self->_GenerateHeader($h_level, $2);
793							}egmx;
794
795	3					7	return $text;
796							}
797
798							sub _GenerateHeader {
799	0			0		0	my ($self, $level, $id) = @_;
800
801							#return "" . $self->_RunSpanGamut($id) . "\n\n";
802	0					0	return "=head$level " . $self->_RunSpanGamut($id) . "\n\n";
803							}
804
805							sub _DoLists {
806							#
807							# Form HTML ordered (numbered) and unordered (bulleted) lists.
808							#
809	3			3		6	my ($self, $text) = @_;
810	3					7	my $less_than_tab = $self->{tab_width} - 1;
811
812							# Re-usable patterns to match list item bullets and number markers:
813	3					10	my $marker_ul = qr/[*+-]/;
814	3					9	my $marker_ol = qr/\d+[.]/;
815	3					34	my $marker_any = qr/(?:$marker_ul\|$marker_ol)/;
816
817							# Re-usable pattern to match any entirel ul or ol list:
818	3					76	my $whole_list = qr{
819							( # $1 = whole list
820							( # $2
821							[ ]{0,$less_than_tab}
822							(${marker_any}) # $3 = first list item marker
823							[ \t]+
824							)
825							(?s:.+?)
826							( # $4
827							\z
828							\|
829							\n{2,}
830							(?=\S)
831							(?! # Negative lookahead for another list item marker
832							[ \t]*
833							${marker_any}[ \t]+
834							)
835							)
836							)
837							}mx;
838
839							# We use a different prefix before nested lists than top-level lists.
840							# See extended comment in _ProcessListItems().
841							#
842							# Note: There's a bit of duplication here. My original implementation
843							# created a scalar regex pattern as the conditional result of the test on
844							# $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
845							# substitution once, using the scalar as the pattern. This worked,
846							# everywhere except when running under MT on my hosting account at Pair
847							# Networks. There, this caused all rebuilds to be killed by the reaper (or
848							# perhaps they crashed, but that seems incredibly unlikely given that the
849							# same script on the same server ran fine except under MT. I've spent
850							# more time trying to figure out why this is happening than I'd like to
851							# admit. My only guess, backed up by the fact that this workaround works,
852							# is that Perl optimizes the substition when it can figure out that the
853							# pattern will never change, and when this optimization isn't on, we run
854							# afoul of the reaper. Thus, the slightly redundant code to that uses two
855							# static s/// patterns rather than one conditional pattern.
856
857	3	50				13	if ($self->{_list_level}) {
858	0					0	$text =~ s{
859							^
860							$whole_list
861							}{
862	0					0	my $list = $1;
863	0					0	my $marker = $3;
864	0	0				0	my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
865							# Turn double returns into triple returns, so that we can make a
866							# paragraph for the last item in a list, if necessary:
867	0					0	$list =~ s/\n{2,}/\n\n\n/g;
868	0	0				0	my $result = ( $list_type eq 'ul' ) ?
869							$self->_ProcessListItemsUL($list, $marker_ul)
870							: $self->_ProcessListItemsOL($list, $marker_ol);
871
872	0					0	$result = $self->_MakeList($list_type, $result, $marker);
873	0					0	$result;
874							}egmx;
875							}
876							else {
877	3					96	$text =~ s{
878							(?:(?<=\n\n)\|\A\n?)
879							$whole_list
880							}{
881	0					0	my $list = $1;
882	0					0	my $marker = $3;
883	0	0				0	my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
884							# Turn double returns into triple returns, so that we can make a
885							# paragraph for the last item in a list, if necessary:
886	0					0	$list =~ s/\n{2,}/\n\n\n/g;
887	0	0				0	my $result = ( $list_type eq 'ul' ) ?
888							$self->_ProcessListItemsUL($list, $marker_ul)
889							: $self->_ProcessListItemsOL($list, $marker_ol);
890	0					0	$result = $self->_MakeList($list_type, $result, $marker);
891	0					0	$result;
892							}egmx;
893							}
894
895
896	3					16	return $text;
897							}
898
899							sub _MakeList {
900	0			0		0	my ($self, $list_type, $content, $marker) = @_;
901
902	0	0	0			0	if ($list_type eq 'ol' and $self->{trust_list_start_value}) {
903	0					0	my ($num) = $marker =~ /^(\d+)[.]/;
904							#return " \n" . $content . " \n";
905	0					0	return "=over\n\n" . $content . "=back\n\n";
906							}
907
908							#return "<$list_type>\n" . $content . "\n";
909	0					0	return "=over\n\n" . $content . "=back\n\n";
910							}
911
912							sub _ProcessListItemsOL {
913							#
914							# Process the contents of a single ordered list, splitting it
915							# into individual list items.
916							#
917
918	0			0		0	my ($self, $list_str, $marker_any) = @_;
919
920
921							# The $self->{_list_level} global keeps track of when we're inside a list.
922							# Each time we enter a list, we increment it; when we leave a list,
923							# we decrement. If it's zero, we're not in a list anymore.
924							#
925							# We do this because when we're not inside a list, we want to treat
926							# something like this:
927							#
928							# I recommend upgrading to version
929							# 8. Oops, now this line is treated
930							# as a sub-list.
931							#
932							# As a single paragraph, despite the fact that the second line starts
933							# with a digit-period-space sequence.
934							#
935							# Whereas when we're inside a list (or sub-list), that line will be
936							# treated as the start of a sub-list. What a kludge, huh? This is
937							# an aspect of Markdown's syntax that's hard to parse perfectly
938							# without resorting to mind-reading. Perhaps the solution is to
939							# change the syntax rules such that sub-lists must start with a
940							# starting cardinal number; e.g. "1." or "a.".
941
942	0					0	$self->{_list_level}++;
943
944							# trim trailing blank lines:
945	0					0	$list_str =~ s/\n{2,}\z/\n/;
946
947
948	0					0	my $i = 0;
949
950	0					0	$list_str =~ s{
951							(\n)? # leading line = $1
952							(^[ \t]*) # leading whitespace = $2
953							($marker_any) [ \t]+ # list marker = $3
954							((?s:.+?) # list item text = $4
955							(\n{1,2}))
956							(?= \n* (\z \| \2 ($marker_any) [ \t]+))
957							}{
958	0					0	my $item = $4;
959	0					0	my $leading_line = $1;
960	0					0	my $leading_space = $2;
961
962	0	0	0			0	if ($leading_line or ($item =~ m/\n{2,}/)) {
963	0					0	$item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
964							}
965							else {
966							# Recursion for sub-lists:
967	0					0	$item = $self->_DoLists($self->_Outdent($item));
968	0					0	chomp $item;
969	0					0	$item = $self->_RunSpanGamut($item);
970							}
971
972							#" " . $item . " \n";
973	0					0	$i++; "=item $i. " . $item . "\n\n";
	0					0
974							}egmxo;
975
976	0					0	$self->{_list_level}--;
977	0					0	return $list_str;
978							}
979
980							sub _ProcessListItemsUL {
981							#
982							# Process the contents of a single unordered list, splitting it
983							# into individual list items.
984							#
985
986	0			0		0	my ($self, $list_str, $marker_any) = @_;
987
988
989							# The $self->{_list_level} global keeps track of when we're inside a list.
990							# Each time we enter a list, we increment it; when we leave a list,
991							# we decrement. If it's zero, we're not in a list anymore.
992							#
993							# We do this because when we're not inside a list, we want to treat
994							# something like this:
995							#
996							# I recommend upgrading to version
997							# 8. Oops, now this line is treated
998							# as a sub-list.
999							#
1000							# As a single paragraph, despite the fact that the second line starts
1001							# with a digit-period-space sequence.
1002							#
1003							# Whereas when we're inside a list (or sub-list), that line will be
1004							# treated as the start of a sub-list. What a kludge, huh? This is
1005							# an aspect of Markdown's syntax that's hard to parse perfectly
1006							# without resorting to mind-reading. Perhaps the solution is to
1007							# change the syntax rules such that sub-lists must start with a
1008							# starting cardinal number; e.g. "1." or "a.".
1009
1010	0					0	$self->{_list_level}++;
1011
1012							# trim trailing blank lines:
1013	0					0	$list_str =~ s/\n{2,}\z/\n/;
1014
1015
1016	0					0	$list_str =~ s{
1017							(\n)? # leading line = $1
1018							(^[ \t]*) # leading whitespace = $2
1019							($marker_any) [ \t]+ # list marker = $3
1020							((?s:.+?) # list item text = $4
1021							(\n{1,2}))
1022							(?= \n* (\z \| \2 ($marker_any) [ \t]+))
1023							}{
1024	0					0	my $item = $4;
1025	0					0	my $leading_line = $1;
1026	0					0	my $leading_space = $2;
1027
1028	0	0	0			0	if ($leading_line or ($item =~ m/\n{2,}/)) {
1029	0					0	$item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
1030							}
1031							else {
1032							# Recursion for sub-lists:
1033	0					0	$item = $self->_DoLists($self->_Outdent($item));
1034	0					0	chomp $item;
1035	0					0	$item = $self->_RunSpanGamut($item);
1036							}
1037
1038							#" " . $item . " \n";
1039	0					0	"=item * " . $item . "\n\n";
1040							}egmxo;
1041
1042	0					0	$self->{_list_level}--;
1043	0					0	return $list_str;
1044							}
1045
1046							sub _DoCodeBlocks {
1047							#
1048							# Process Markdown code blocks (indented with 4 spaces or 1 tab):
1049							# * outdent the spaces/tab
1050							# * encode <, >, & into HTML entities
1051							# * escape Markdown special characters into MD5 hashes
1052							# * trim leading and trailing newlines
1053							#
1054
1055	3			3		7	my ($self, $text) = @_;
1056
1057	3					66	$text =~ s{
1058							(?:\n\n\|\A)
1059							( # $1 = the code block -- one or more lines, starting with a space/tab
1060							(?:
1061							(?:[ ]{$self->{tab_width}} \| \t) # Lines must start with a tab or a tab-width of spaces
1062							.*\n+
1063							)+
1064							)
1065							((?=^[ ]{0,$self->{tab_width}}\S)\|\Z) # Lookahead for non-space at line-start, or end of doc
1066							}{
1067	0					0	my $codeblock = $1;
1068	0					0	my $result; # return value
1069
1070	0					0	$codeblock = $self->_EncodeCode($self->_Outdent($codeblock), 0);
1071	0					0	$codeblock = $self->_Detab($codeblock);
1072	0					0	$codeblock =~ s/\A\n+//; # trim leading newlines
1073	0					0	$codeblock =~ s/\n+\z//; # trim trailing newlines
1074
1075							#$result = "\n\n `" . $codeblock . "\n` \n\n";
1076	0					0	$codeblock =~ s/^/ /mg;
1077
1078	0					0	$result = "\n\n" . $codeblock . "\n\n";
1079
1080	0					0	$result;
1081							}egmx;
1082
1083	3					9	return $text;
1084							}
1085
1086							sub _DoCodeSpans {
1087							#
1088							# * Backtick quotes are used for spans.
1089							#
1090							# * You can use multiple backticks as the delimiters if you want to
1091							# include literal backticks in the code span. So, this input:
1092							#
1093							# Just type ``foo `bar` baz`` at the prompt.
1094							#
1095							# Will translate to:
1096							#
1097							# Just type foo `bar` baz at the prompt.
1098							#
1099							# There's no arbitrary limit to the number of backticks you
1100							# can use as delimters. If you need three consecutive backticks
1101							# in your code, use four for delimiters, etc.
1102							#
1103							# * You can use spaces to get literal backticks at the edges:
1104							#
1105							# ... type `` `bar` `` ...
1106							#
1107							# Turns to:
1108							#
1109							# ... type `bar` ...
1110							#
1111
1112	3			3		5	my ($self, $text) = @_;
1113
1114	3					11	$text =~ s@
1115							(?
1116							(`+) # $1 = Opening run of `
1117							(.+?) # $2 = The code block
1118							(?
1119							\1 # Matching closer
1120							(?!`)
1121							@
1122	1					3	my $c = "$2";
1123	1					12	$c =~ s/^[ \t]*//g; # leading whitespace
1124	1					6	$c =~ s/[ \t]*$//g; # trailing whitespace
1125	1					5	$c = $self->_EncodeCode($c);
1126							#"`$c`";
1127	1					3	__podfmt(C => $c);
1128							@egsx;
1129
1130	3					6	return $text;
1131							}
1132
1133							sub _EncodeCode {
1134							#
1135							# Encode/escape certain characters inside Markdown code runs.
1136							# The point is that in code, these characters are literals,
1137							# and lose their special Markdown meanings.
1138							#
1139	1			1		3	my $self = shift;
1140	1					3	local $_ = shift;
1141	1		50			10	my $do_angle_bracket = shift // 1;
1142
1143							# Encode all ampersands; HTML entities are not
1144							# entities within a Markdown code span.
1145							#s/&/&/g;
1146
1147							# Encode $'s, but only if we're running under Blosxom.
1148							# (Blosxom interpolates Perl variables in article bodies.)
1149							{
1150	1			1		29	no warnings 'once';
	1					2
	1					2297
	1					2
1151	1	50				4	if (defined($blosxom::version)) {
1152							#s/\$/$/g;
1153							}
1154							}
1155
1156
1157							# Do the angle bracket song and dance:
1158							#s! < !<!gx;
1159							#s! > !>!gx;
1160	1	100				7	s! ([<>]) !$1 eq '<' ? 'E' : 'E'!egx if $do_angle_bracket;
	2	50				10
1161
1162							# Now, escape characters that are magic in Markdown:
1163	1					3	s! \* !$g_escape_table{'*'}!ogx;
1164	1					2	s! _ !$g_escape_table{'_'}!ogx;
1165	1					2	s! { !$g_escape_table{'{'}!ogx;
1166	1					2	s! } !$g_escape_table{'}'}!ogx;
1167	1					2	s! \[ !$g_escape_table{'['}!ogx;
1168	1					2	s! \] !$g_escape_table{']'}!ogx;
1169	1					2	s! \\ !$g_escape_table{'\\'}!ogx;
1170
1171	1					3	return $_;
1172							}
1173
1174							sub __podfmt {
1175	2			2		6	my ($fmt, $content) = @_;
1176	2	100				10	if ($content =~ /[<>]/) {
1177	1					5	"$fmt<< $content >>";
1178							} else {
1179	1					8	"$fmt<$content>";
1180							}
1181							}
1182
1183							sub _DoItalicsAndBold {
1184	3			3		5	my ($self, $text) = @_;
1185
1186							# Handle at beginning of lines:
1187	3					8	$text =~ s{ ^(\\\|__) (?=\S) (.+?[_]) (?<=\S) \1 }
1188							#{$2}gsx;
1189	0					0	{__podfmt(B => $2)}gsex;
1190
1191	3					7	$text =~ s{ ^(\*\|_) (?=\S) (.+?) (?<=\S) \1 }
1192							#{$2}gsx;
1193	0					0	{__podfmt(I => $2)}gsex;
1194
1195							# must go first:
1196	3					13	$text =~ s{ (?<=\W) (\\\|__) (?=\S) (.+?[_]) (?<=\S) \1 }
1197							#{$2}gsx;
1198	0					0	{__podfmt(B => $2)}gsex;
1199
1200	3					18	$text =~ s{ (?<=\W) (\*\|_) (?=\S) (.+?) (?<=\S) \1 }
1201							#{$2}gsx;
1202	1					5	{__podfmt(I => $2)}gsex;
1203
1204							# And now, a second pass to catch nested strong and emphasis special cases
1205	3					12	$text =~ s{ (?<=\W) (\\\|__) (?=\S) (.+?[_]) (?<=\S) \1 }
1206							#{$2}gsx;
1207	0					0	{__podfmt(B => $2)}gsex;
1208
1209	3					12	$text =~ s{ (?<=\W) (\*\|_) (?=\S) (.+?) (?<=\S) \1 }
1210							#{$2}gsx;
1211	0					0	{__podfmt(I => $2)}gsex;
1212
1213	3					8	return $text;
1214							}
1215
1216							sub _DoBlockQuotes {
1217	3			3		7	my ($self, $text) = @_;
1218
1219	3					4	$text =~ s{
1220							( # Wrap whole match in $1
1221							(
1222							^[ \t]*>[ \t]? # '>' at the start of a line
1223							.+\n # rest of the first line
1224							(.+\n)* # subsequent consecutive lines
1225							\n* # blanks
1226							)+
1227							)
1228							}{
1229	0					0	my $bq = $1;
1230	0					0	$bq =~ s/^([ \t]*>)/ $1/gm;
1231	0					0	$bq;
1232							#$bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
1233							#$bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
1234							#$bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1}); # recurse
1235
1236							#$bq =~ s/^/ /mg;
1237							## These leading spaces screw with content, so we need to fix that:
1238							#$bq =~ s{
1239							# (\s* .+? )
1240							# }{
1241							# my $pre = $1;
1242							# #$pre =~ s/^ //mg;
1243							# $pre;
1244							# }egsx;
1245							#
1246							#" \n$bq\n \n\n";
1247							}egmx;
1248
1249
1250	3					7	return $text;
1251							}
1252
1253							sub _FormParagraphs {
1254							#
1255							# Params:
1256							# $text - string to process with html tags
1257							#
1258	3			3		6	my ($self, $text, $options) = @_;
1259
1260							# Strip leading and trailing lines:
1261	3					5	$text =~ s/\A\n+//;
1262	3					11	$text =~ s/\n+\z//;
1263
1264	3					14	my @grafs = split(/\n{2,}/, $text);
1265
1266							#
1267							# Wrap tags.
1268							#
1269	3					8	foreach (@grafs) {
1270	3	50				11	unless (defined( $self->{_html_blocks}{$_} )) {
1271	3					10	$_ = $self->_RunSpanGamut($_);
1272							#if ($options->{wrap_in_p_tags}) {
1273							# s/^([ \t]*) //;
1274							# $_ .= " ";
1275							#}
1276							}
1277							}
1278
1279							#
1280							# Unhashify HTML blocks
1281							#
1282	3					9	foreach (@grafs) {
1283	3	50				16	if (defined( $self->{_html_blocks}{$_} )) {
1284	0					0	$_ = $self->{_html_blocks}{$_};
1285							}
1286							}
1287
1288	3					14	return join "\n\n", @grafs;
1289							}
1290
1291							sub _EncodeAmpsAndAngles {
1292							# Smart processing for ampersands and angle brackets that need to be encoded.
1293
1294	3			3		7	my ($self, $text) = @_;
1295	3	50	33			14	return '' if (!defined $text or !length $text);
1296
1297	3					9	return $text;
1298
1299							# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1300							# http://bumppo.net/projects/amputator/
1301	0					0	$text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+\|\w+);)/&/g;
1302
1303							# Encode naked <'s
1304	0					0	$text =~ s{<(?![a-z/?\$!])}{<}gi;
1305
1306							# And >'s - added by Fletcher Penney
1307							# $text =~ s{>(?![a-z/?\$!])}{>}gi;
1308							# Causes problems...
1309
1310							# Remove encoding inside comments
1311	0					0	$text =~ s{
1312							(?<=) # End comments
1315							}{
1316	0					0	my $t = $1;
1317	0					0	$t =~ s/&/&/g;
1318	0					0	$t =~ s/</
1319	0					0	$t;
1320							}egsx;
1321
1322	0					0	return $text;
1323							}
1324
1325							sub _EncodeBackslashEscapes {
1326							#
1327							# Parameter: String.
1328							# Returns: The string, with after processing the following backslash
1329							# escape sequences.
1330							#
1331	5			5		6	my $self = shift;
1332	5					40	local $_ = shift;
1333
1334	5					9	s! \\\\ !$g_escape_table{'\\'}!ogx; # Must process escaped backslashes first.
1335	5					7	s! \\` !$g_escape_table{'`'}!ogx;
1336	5					7	s! \\\* !$g_escape_table{'*'}!ogx;
1337	5					5	s! \\_ !$g_escape_table{'_'}!ogx;
1338	5					8	s! \\\{ !$g_escape_table{'{'}!ogx;
1339	5					7	s! \\\} !$g_escape_table{'}'}!ogx;
1340	5					6	s! \\\[ !$g_escape_table{'['}!ogx;
1341	5					6	s! \\\] !$g_escape_table{']'}!ogx;
1342	5					13	s! \\\( !$g_escape_table{'('}!ogx;
1343	5					6	s! \\\) !$g_escape_table{')'}!ogx;
1344	5					6	s! \\> !$g_escape_table{'>'}!ogx;
1345	5					8	s! \\\# !$g_escape_table{'#'}!ogx;
1346	5					7	s! \\\+ !$g_escape_table{'+'}!ogx;
1347	5					6	s! \\\- !$g_escape_table{'-'}!ogx;
1348	5					6	s! \\\. !$g_escape_table{'.'}!ogx;
1349	5					6	s{ \\! }{$g_escape_table{'!'}}ogx;
1350
1351	5					13	return $_;
1352							}
1353
1354							sub _DoAutoLinks {
1355	3			3		5	my ($self, $text) = @_;
1356
1357							#$text =~ s{<((https?\|ftp):[^'">\s]+)>}{$1}gi;
1358							#$text =~ s{<((https?\|ftp):[^'">\s]+)>}{__podfmt(L => $1)}egi;
1359
1360							# Email addresses:
1361	3					4	$text =~ s{
1362							<
1363							(?:mailto:)?
1364							(
1365							[-.\w\+]+
1366							\@
1367							[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1368							)
1369							>
1370							}{
1371	0					0	$self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) );
1372							}egix;
1373
1374	3					7	return $text;
1375							}
1376
1377							sub _EncodeEmailAddress {
1378							#
1379							# Input: an email address, e.g. "foo@example.com"
1380							#
1381							# Output: the email address as a mailto link, with each character
1382							# of the address encoded as either a decimal or hex entity, in
1383							# the hopes of foiling most address harvesting spam bots. E.g.:
1384							#
1385							# 1386	# xample.com">foo
1387							# @example.com
1388							#
1389							# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1390							# mailing list:
1391							#
1392
1393	0			0		0	my ($self, $addr) = @_;
1394
1395							my @encode = (
1396	0			0		0	sub { '&#' . ord(shift) . ';' },
1397	0			0		0	sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1398	0			0		0	sub { shift },
1399	0					0	);
1400
1401	0					0	$addr = "mailto:" . $addr;
1402
1403	0					0	$addr =~ s{(.)}{
1404	0					0	my $char = $1;
1405	0	0				0	if ( $char eq '@' ) {
		0
1406							# this must be encoded. I insist.
1407	0					0	$char = $encode[int rand 1]->($char);
1408							}
1409							elsif ( $char ne ':' ) {
1410							# leave ':' alone (to spot mailto: later)
1411	0					0	my $r = rand;
1412							# roughly 10% raw, 45% hex, 45% dec
1413	0	0				0	$char = (
		0
1414							$r > .9 ? $encode[2]->($char) :
1415							$r < .45 ? $encode[1]->($char) :
1416							$encode[0]->($char)
1417							);
1418							}
1419	0					0	$char;
1420							}gex;
1421
1422	0					0	$addr = qq{$addr};
1423	0					0	$addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1424
1425	0					0	return $addr;
1426							}
1427
1428							sub _UnescapeSpecialChars {
1429							#
1430							# Swap back in all the special characters we've hidden.
1431							#
1432	3			3		7	my ($self, $text) = @_;
1433
1434	3					15	while( my($char, $hash) = each(%g_escape_table) ) {
1435	48					526	$text =~ s/$hash/$char/g;
1436							}
1437	3					7	return $text;
1438							}
1439
1440							sub _TokenizeHTML {
1441							#
1442							# Parameter: String containing HTML markup.
1443							# Returns: Reference to an array of the tokens comprising the input
1444							# string. Each token is either a tag (possibly with nested,
1445							# tags contained therein, such as , or a
1446							# run of text between tags. Each element of the array is a
1447							# two-element array; the first is either 'tag' or 'text';
1448							# the second is the actual value.
1449							#
1450							#
1451							# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1452							#
1453							#
1454
1455	6			6		8	my ($self, $str) = @_;
1456	6					7	my $pos = 0;
1457	6					8	my $len = length $str;
1458	6					6	my @tokens;
1459
1460	6					7	my $depth = 6;
1461	6					30	my $nested_tags = join('\|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
1462	6					196	my $match = qr/(?s: ) \| # comment
1463							(?s: <\? .*? \?> ) \| # processing instruction
1464							$nested_tags/iox; # nested tags
1465
1466	6					224	while ($str =~ m/($match)/og) {
1467	4					7	my $whole_tag = $1;
1468	4					7	my $sec_start = pos $str;
1469	4					5	my $tag_start = $sec_start - length $whole_tag;
1470	4	50				56	if ($pos < $tag_start) {
1471	4					15	push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1472							}
1473	4					10	push @tokens, ['tag', $whole_tag];
1474	4					21	$pos = pos $str;
1475							}
1476	6	50				29	push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1477	6					38	\@tokens;
1478							}
1479
1480							sub _Outdent {
1481							#
1482							# Remove one level of line-leading tabs or spaces
1483							#
1484	0			0		0	my ($self, $text) = @_;
1485
1486	0					0	$text =~ s/^(\t\|[ ]{1,$self->{tab_width}})//gm;
1487	0					0	return $text;
1488							}
1489
1490							sub _Detab {
1491							#
1492							# Cribbed from a post by Bart Lateur:
1493							#
1494							#
1495	3			3		6	my ($self, $text) = @_;
1496
1497							# FIXME - Better anchor/regex would be quicker.
1498
1499							# Original:
1500							#$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;
1501
1502							# Much swifter, but pretty hateful:
1503	3					14	do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge);
	0					0
1504	3					8	return $text;
1505							}
1506
1507							sub _ConvertCopyright {
1508	3			3		5	my ($self, $text) = @_;
1509							# Convert to an XML compatible form of copyright symbol
1510
1511	3					5	$text =~ s/©/©/gi;
1512
1513	3					6	return $text;
1514							}
1515
1516							1;
1517
1518							__END__