Coverage Summary

lib/Text/Markdown.pm

Criterion	Covered	Total	%
statement	439	487	90.1
branch	71	92	77.1
condition	28	59	47.4
subroutine	51	53	96.2
pod	3	3	100.0
total	592	694	85.3

and as well).

54 =head1 SYNTAX

56 This module implements the 'original' Markdown markdown syntax from:

58 http://daringfireball.net/projects/markdown/

60 Note that L ensures that the output always ends with

61 B newline. The fact that multiple newlines are collapsed into one

62 makes sense, because this is the behavior of HTML towards whispace. The

63 fact that there's always a newline at the end makes sense again, given

64 that the output will always be nested in a B-level element (as

opposed to an inline element). That block element can be a C<<

(most often), or a C<< >>.

67
68							Markdown is B interpreted in HTML block-level elements, in order for
69							chunks of pasted HTML (e.g. JavaScript widgets, web counters) to not be
70							magically (mis)interpreted. For selective processing of Markdown in some,
71							but not other, HTML block elements, add a C attribute to the block
72							element and set its value to C<1>, C or C:
73
74
75							* Home
76							* About
77							* Contact
78
79
80							The extra C attribute will be stripped when generating the output.
81
82							=head1 OPTIONS
83
84							Text::Markdown supports a number of options to its processor which control
85							the behaviour of the output document.
86
87							These options can be supplied to the constructor, or in a hash within
88							individual calls to the L method. See the SYNOPSIS for examples
89							of both styles.
90
91							The options for the processor are:
92
93							=over
94
95							=item empty_element_suffix
96
97							This option controls the end of empty element tags:
98
99							'/>' for XHTML (default)
100							'>' for HTML
101
102							=item tab_width
103
104							Controls indent width in the generated markup. Defaults to 4.
105
106							=item trust_list_start_value
107
108							If true, ordered lists will use the first number as the starting point for
109							numbering. This will let you pick up where you left off by writing:
110
111							1. foo
112							2. bar
113
114							some paragraph
115
116							3. baz
117							6. quux
118
119							(Note that in the above, quux will be numbered 4.)
120
121							=back
122
123							=cut
124
125							# Regex to match balanced [brackets]. See Friedl's
126							# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
127							our ($g_nested_brackets, $g_nested_parens);
128							$g_nested_brackets = qr{
129							(?> # Atomic matching
130							[^\[\]]+ # Anything other than brackets
131							\|
132							\[
133							(??{ $g_nested_brackets }) # Recursive set of nested brackets
134							\]
135							)*
136							}x;
137							# Doesn't allow for whitespace, because we're using it to match URLs:
138							$g_nested_parens = qr{
139							(?> # Atomic matching
140							[^()\s]+ # Anything other than parens or whitespace
141							\|
142							\(
143							(??{ $g_nested_parens }) # Recursive set of nested brackets
144							\)
145							)*
146							}x;
147
148							# Table of hash values for escaped characters:
149							our %g_escape_table;
150							foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
151							$g_escape_table{$char} = md5_hex($char);
152							}
153
154							=head1 METHODS
155
156							=head2 new
157
158							A simple constructor, see the SYNTAX and OPTIONS sections for more information.
159
160							=cut
161
162							sub new {
163	15			15	1	7880	my ($class, %p) = @_;
164
165	15		50			143	$p{base_url} \|\|= ''; # This is the base URL to be used for WikiLinks
166
167	15	100	66			143	$p{tab_width} = 4 unless (defined $p{tab_width} and $p{tab_width} =~ m/^\d+$/);
168
169	15		50			98	$p{empty_element_suffix} \|\|= ' />'; # Change to ">" for HTML output
170
171	15	100				58	$p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0;
172
173	15					59	my $self = { params => \%p };
174	15		33			107	bless $self, ref($class) \|\| $class;
175	15					59	return $self;
176							}
177
178							=head2 markdown
179
180							The main function as far as the outside world is concerned. See the SYNOPSIS
181							for details on use.
182
183							=cut
184
185							sub markdown {
186	53			53	1	26970	my ( $self, $text, $options ) = @_;
187
188							# Detect functional mode, and create an instance for this run
189	53	100				200	unless (ref $self) {
190	5	100				21	if ( $self ne __PACKAGE__ ) {
191	4					23	my $ob = __PACKAGE__->new();
192							# $self is text, $text is options
193	4					24	return $ob->markdown($self, $text);
194							}
195							else {
196	1					23	croak('Calling ' . $self . '->markdown (as a class method) is not supported.');
197							}
198							}
199
200	48		100			260	$options \|\|= {};
201
202	48					79	%$self = (%{ $self->{params} }, %$options, params => $self->{params});
	48					572
203
204	48					497	$self->_CleanUpRunData($options);
205
206	48					298	return $self->_Markdown($text);
207							}
208
209							sub _CleanUpRunData {
210	48			48		91	my ($self, $options) = @_;
211							# Clear the global hashes. If we don't clear these, you get conflicts
212							# from other articles when generating a page which contains more than
213							# one article (e.g. an index page that shows the N most recent
214							# articles).
215	48	50				203	$self->{_urls} = $options->{urls} ? $options->{urls} : {}; # FIXME - document passing this option (tested in 05options.t).
216	48					172	$self->{_titles} = {};
217	48					107	$self->{_html_blocks} = {};
218							# Used to track when we're inside an ordered or unordered list
219							# (see _ProcessListItems() for details)
220	48					110	$self->{_list_level} = 0;
221
222							}
223
224							sub _Markdown {
225							#
226							# Main function. The order in which other subs are called here is
227							# essential. Link and image substitutions need to happen before
228							# _EscapeSpecialChars(), so that any *'s or _'s in the
229							# and tags get encoded.
230							#
231	48			48		96	my ($self, $text, $options) = @_;
232
233	48					141	$text = $self->_CleanUpDoc($text);
234
235							# Turn block-level HTML elements into hash entries, and interpret markdown in them if they have a 'markdown="1"' attribute
236	48					242	$text = $self->_HashHTMLBlocks($text, {interpret_markdown_on_attribute => 1});
237
238	48					209	$text = $self->_StripLinkDefinitions($text);
239
240	48					248	$text = $self->_RunBlockGamut($text, {wrap_in_p_tags => 1});
241
242	48					176	$text = $self->_UnescapeSpecialChars($text);
243
244	48					159	$text = $self->_ConvertCopyright($text);
245
246	48					377	return $text . "\n";
247							}
248
249							=head2 urls
250
251							Returns a reference to a hash with the key being the markdown reference
252							and the value being the URL.
253
254							Useful for building scripts which preprocess a list of links before the
255							main content. See F for an example of this hashref being
256							passed back into the markdown method to create links.
257
258							=cut
259
260							sub urls {
261	0			0	1	0	my ( $self ) = @_;
262
263	0					0	return $self->{_urls};
264							}
265
266							sub _CleanUpDoc {
267	48			48		131	my ($self, $text) = @_;
268
269							# Standardize line endings:
270	48					160	$text =~ s{\r\n}{\n}g; # DOS to Unix
271	48					143	$text =~ s{\r}{\n}g; # Mac to Unix
272
273							# Make sure $text ends with a couple of newlines:
274	48					1147	$text .= "\n\n";
275
276							# Convert all tabs to spaces.
277	48					141	$text = $self->_Detab($text);
278
279							# Strip any lines consisting only of spaces and tabs.
280							# This makes subsequent regexen easier to write, because we can
281							# match consecutive blank lines with /\n+/ instead of something
282							# contorted like /[ \t]*\n+/ .
283	48					529	$text =~ s/^[ \t]+$//mg;
284
285	48					116	return $text;
286							}
287
288							sub _StripLinkDefinitions {
289							#
290							# Strips link definitions from text, stores the URLs and titles in
291							# hash references.
292							#
293	48			48		108	my ($self, $text) = @_;
294	48					104	my $less_than_tab = $self->{tab_width} - 1;
295
296							# Link defs are in the form: ^[id]: url "optional title"
297	48					1111	while ($text =~ s{
298							^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
299							[ \t]*
300							\n? # maybe one newline
301							[ \t]*
302							? # url = \$2
303							[ \t]*
304							\n? # maybe one newline
305							[ \t]*
306							(?:
307							(?<=\s) # lookbehind for whitespace
308							["(]
309							(.+?) # title = \$3
310							[")]
311							[ \t]*
312							)? # title is optional
313							(?:\n+\|\Z)
314							}{}omx) {
315	23					68	$self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
316	23	100				3234	if ($3) {
317	1					4	$self->{_titles}{lc $1} = $3;
318	1					17	$self->{_titles}{lc $1} =~ s/"/"/g;
319							}
320
321							}
322
323	48					122	return $text;
324							}
325
326							sub _md5_utf8 {
327							# Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation.
328	166			166		308	my $input = shift;
329	166	50				483	return unless defined $input;
330	166	50				985	if (Encode::is_utf8 $input) {
331	0					0	return md5_hex(Encode::encode('utf8', $input));
332							}
333							else {
334	166					1971	return md5_hex($input);
335							}
336							}
337
338							sub _HashHTMLBlocks {
339	115			115		193	my ($self, $text, $options) = @_;
340	115					222	my $less_than_tab = $self->{tab_width} - 1;
341
342							# Hashify HTML blocks (protect from further interpretation by encoding to an md5):
343							# We only want to do this for block-level HTML tags, such as headers,
344							# lists, and tables. That's because we still want to wrap s around
345							# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
346							# phrase emphasis, and spans. The list of tags we're looking for is
347							# hard-coded:
348	115					369	my $block_tags = qr{
349							(?:
350							p \| div \| h[1-6] \| blockquote \| pre \| table \|
351							dl \| ol \| ul \| script \| noscript \| form \|
352							fieldset \| iframe \| math \| ins \| del
353							)
354							}x;
355
356	115					284	my $tag_attrs = qr{
357							(?: # Match one attr name/value pair
358							\s+ # There needs to be at least some whitespace
359							# before each attribute name.
360							[\w.:_-]+ # Attribute name
361							\s=\s
362							(?:
363							".+?" # "Attribute value"
364							\|
365							'.+?' # 'Attribute value'
366							\|
367							[^\s]+? # AttributeValue (HTML5)
368							)
369							)* # Zero or more
370							}x;
371
372	115					1508	my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
373	115					1969	my $open_tag = qr{< $block_tags $tag_attrs \s* >}oxms;
374	115					222	my $close_tag = undef; # let Text::Balanced handle this
375	115					139	my $prefix_pattern = undef; # Text::Balanced
376	115					292	my $markdown_attr = qr{ \s* markdown \s* = \s* (['"]) (.*?) \1 }xs;
377
378	13			13		25504	use Text::Balanced qw(gen_extract_tagged);
	13					368852
	13					75010
379	115					601	my $extract_block = gen_extract_tagged($open_tag, $close_tag, $prefix_pattern, { ignore => [$empty_tag] });
380
381	115					17558	my @chunks;
382							# parse each line, looking for block-level HTML tags
383	115					1737	while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
384	2597					5402	my $cur_line = $1;
385	2597	100				5888	if (defined $2) {
386							# current line could be start of code block
387
388	187					1821	my ($tag, $remainder, $prefix, $opening_tag, $text_in_tag, $closing_tag) = $extract_block->($cur_line . $text);
389	187	100				573593	if ($tag) {
390	158	100	100			1642	if ($options->{interpret_markdown_on_attribute} and $opening_tag =~ s/$markdown_attr//i) {
391	15					34	my $markdown = $2;
392	15	100				63	if ($markdown =~ /^(1\|on\|yes)$/) {
393							# interpret markdown and reconstruct $tag to include the interpreted $text_in_tag
394	13					41	my $wrap_in_p_tags = $opening_tag =~ /^<(div\|iframe)/;
395	13					63	$tag = $prefix . $opening_tag . "\n"
396							. $self->_RunBlockGamut($text_in_tag, {wrap_in_p_tags => $wrap_in_p_tags})
397							. "\n" . $closing_tag
398							;
399							} else {
400							# just remove the markdown="0" attribute
401	2					6	$tag = $prefix . $opening_tag . $text_in_tag . $closing_tag;
402							}
403							}
404	158					475	my $key = _md5_utf8($tag);
405	158					955	$self->{_html_blocks}{$key} = $tag;
406	158					800	push @chunks, "\n\n" . $key . "\n\n";
407	158					2855	$text = $remainder;
408							}
409							else {
410							# No tag match, so toss $cur_line into @chunks
411	29					266	push @chunks, $cur_line;
412							}
413							}
414							else {
415							# current line could NOT be start of code block
416	2410					24713	push @chunks, $cur_line;
417							}
418
419							}
420	115					278	push @chunks, $text; # whatever is left
421
422	115					541	$text = join '', @chunks;
423
424	115					1799	return $text;
425							}
426
427							sub _HashHR {
428	67			67		118	my ($self, $text) = @_;
429	67					144	my $less_than_tab = $self->{tab_width} - 1;
430
431	67					1177	$text =~ s{
432							(?:
433							(?<=\n\n) # Starting after a blank line
434							\| # or
435							\A\n? # the beginning of the doc
436							)
437							( # save in $1
438							[ ]{0,$less_than_tab}
439							<(hr) # start tag = $2
440							\b # word break
441							([^<>])*? #
442							/?> # the matching end tag
443							[ \t]*
444							(?=\n{2,}\|\Z) # followed by a blank line or end of document
445							)
446							}{
447	6					20	my $key = _md5_utf8($1);
448	6					51	$self->{_html_blocks}{$key} = $1;
449	6					267	"\n\n" . $key . "\n\n";
450							}egx;
451
452	67					190	return $text;
453							}
454
455							sub _HashHTMLComments {
456	67			67		107	my ($self, $text) = @_;
457	67					124	my $less_than_tab = $self->{tab_width} - 1;
458
459							# Special case for standalone HTML comments:
460	67					1018	$text =~ s{
461							(?:
462							(?<=\n\n) # Starting after a blank line
463							\| # or
464							\A\n? # the beginning of the doc
465							)
466							( # save in $1
467							[ ]{0,$less_than_tab}
468							(?s:
469
470							(--.?--\s)+
471							>
472							)
473							[ \t]*
474							(?=\n{2,}\|\Z) # followed by a blank line or end of document
475							)
476							}{
477	0					0	my $key = _md5_utf8($1);
478	0					0	$self->{_html_blocks}{$key} = $1;
479	0					0	"\n\n" . $key . "\n\n";
480							}egx;
481
482	67					173	return $text;
483							}
484
485							sub _HashPHPASPBlocks {
486	67			67		104	my ($self, $text) = @_;
487	67					116	my $less_than_tab = $self->{tab_width} - 1;
488
489							# PHP and ASP-style processor instructions ( and <%…%>)
490	67					1069	$text =~ s{
491							(?:
492							(?<=\n\n) # Starting after a blank line
493							\| # or
494							\A\n? # the beginning of the doc
495							)
496							( # save in $1
497							[ ]{0,$less_than_tab}
498							(?s:
499							<([?%]) # $2
500							.*?
501							\2>
502							)
503							[ \t]*
504							(?=\n{2,}\|\Z) # followed by a blank line or end of document
505							)
506							}{
507	2					5	my $key = _md5_utf8($1);
508	2					8	$self->{_html_blocks}{$key} = $1;
509	2					13	"\n\n" . $key . "\n\n";
510							}egx;
511	67					174	return $text;
512							}
513
514							sub _RunBlockGamut {
515							#
516							# These are all the transformations that form block-level
517							# tags like paragraphs, headers, and list items.
518							#
519	67			67		124	my ($self, $text, $options) = @_;
520
521							# Do headers first, as these populate cross-refs
522	67					179	$text = $self->_DoHeaders($text);
523
524							# Do Horizontal Rules:
525	67					159	my $less_than_tab = $self->{tab_width} - 1;
526	67					728	$text =~ s{^[ ]{0,$less_than_tab}(\[ ]?){3,}[ \t]$}{\n{empty_element_suffix}\n}gmx;
527	67					893	$text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
528	67					434	$text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
529
530	67					217	$text = $self->_DoLists($text);
531
532	67					216	$text = $self->_DoCodeBlocks($text);
533
534	67					348	$text = $self->_DoBlockQuotes($text);
535
536							# We already ran _HashHTMLBlocks() before, in Markdown(), but that
537							# was to escape raw HTML in the original Markdown source. This time,
538							# we're escaping the markup we've just created, so that we don't wrap
539							# tags around block-level tags.
540	67					169	$text = $self->_HashHTMLBlocks($text);
541
542							# Special case just for . It was easier to make a special case than
543							# to make the other regex more complicated.
544	67					260	$text = $self->_HashHR($text);
545
546	67					360	$text = $self->_HashHTMLComments($text);
547
548	67					199	$text = $self->_HashPHPASPBlocks($text);
549
550	67					320	$text = $self->_FormParagraphs($text, {wrap_in_p_tags => $options->{wrap_in_p_tags}});
551
552	67					284	return $text;
553							}
554
555							sub _RunSpanGamut {
556							#
557							# These are all the transformations that occur within block-level
558							# tags like paragraphs, headers, and list items.
559							#
560	261			261		456	my ($self, $text) = @_;
561
562	261					593	$text = $self->_DoCodeSpans($text);
563	261					1114	$text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
564	261					875	$text = $self->_EscapeSpecialChars($text);
565
566							# Process anchor and image tags. Images must come first,
567							# because ![foo][f] looks like an anchor.
568	261					708	$text = $self->_DoImages($text);
569	261					815	$text = $self->_DoAnchors($text);
570
571							# Make links out of things like ``
572							# Must come after _DoAnchors(), because you can use < and >
573							# delimiters in inline links like [this]().
574	261					872	$text = $self->_DoAutoLinks($text);
575
576	261					642	$text = $self->_EncodeAmpsAndAngles($text);
577
578	261					884	$text = $self->_DoItalicsAndBold($text);
579
580							# FIXME - Is hard coding space here sane, or does this want to be related to tab width?
581							# Do hard breaks:
582	261					588	$text =~ s/ {2,}\n/ {empty_element_suffix}\n/g;
583
584	261					1122	return $text;
585							}
586
587							sub _EscapeSpecialChars {
588	261			261		464	my ($self, $text) = @_;
589	261		33			981	my $tokens \|\|= $self->_TokenizeHTML($text);
590
591	261					392	$text = ''; # rebuild $text from the tokens
592							# my $in_pre = 0; # Keep track of when we're inside or `tags.`
593							# my $tags_to_skip = qr!<(/?)(?:pre\|code\|kbd\|script\|math)[\s>]!;
594
595	261					440	foreach my $cur_token (@$tokens) {
596	613	100				1984	if ($cur_token->[0] eq "tag") {
597							# Within tags, encode * and _ so they don't conflict
598							# with their use in Markdown for italics and strong.
599							# We're replacing each such character with its
600							# corresponding MD5 checksum value; this is likely
601							# overkill, but it should prevent us from colliding
602							# with the escape values by accident.
603	200					354	$cur_token->[1] =~ s! \* !$g_escape_table{'*'}!ogx;
604	200					250	$cur_token->[1] =~ s! _ !$g_escape_table{'_'}!ogx;
605	200					716	$text .= $cur_token->[1];
606							} else {
607	413					745	my $t = $cur_token->[1];
608	413					1203	$t = $self->_EncodeBackslashEscapes($t);
609	413					1257	$text .= $t;
610							}
611							}
612	261					936	return $text;
613							}
614
615							sub _EscapeSpecialCharsWithinTagAttributes {
616							#
617							# Within tags -- meaning between < and > -- encode [\ ` * _] so they
618							# don't conflict with their use in Markdown for code, italics and strong.
619							# We're replacing each such character with its corresponding MD5 checksum
620							# value; this is likely overkill, but it should prevent us from colliding
621							# with the escape values by accident.
622							#
623	261			261		378	my ($self, $text) = @_;
624	261		33			1293	my $tokens \|\|= $self->_TokenizeHTML($text);
625	261					419	$text = ''; # rebuild $text from the tokens
626
627	261					601	foreach my $cur_token (@$tokens) {
628	613	100				1722	if ($cur_token->[0] eq "tag") {
629	200					574	$cur_token->[1] =~ s! \\ !$g_escape_table{'\\'}!gox;
630	200					572	$cur_token->[1] =~ s{ (?<=.)(?=.) }{$g_escape_table{'`'}}gox;
631	200					352	$cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gox;
632	200					255	$cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gox;
633							}
634	613					1547	$text .= $cur_token->[1];
635							}
636	261					1172	return $text;
637							}
638
639							sub _DoAnchors {
640							#
641							# Turn Markdown link shortcuts into XHTML tags.
642							#
643	261			261		407	my ($self, $text) = @_;
644
645							#
646							# First, handle reference-style links: [link text] [id]
647							#
648	261					12702	$text =~ s{
649							( # wrap whole match in $1
650							\[
651							($g_nested_brackets) # link text = $2
652							\]
653
654							[ ]? # one optional space
655							(?:\n[ ]*)? # one optional newline followed by spaces
656
657							\[
658							(.*?) # id = $3
659							\]
660							)
661							}{
662	32					83	my $whole_match = $1;
663	32					129	my $link_text = $2;
664	32					66	my $link_id = lc $3;
665
666	32	100				86	if ($link_id eq "") {
667	12					24	$link_id = lc $link_text; # for shortcut links like [this][].
668							}
669
670	32					71	$link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
671
672	32					83	$self->_GenerateAnchor($whole_match, $link_text, $link_id);
673							}xsge;
674
675							#
676							# Next, inline-style links: [link text](url "optional title")
677							#
678	261					22943	$text =~ s{
679							( # wrap whole match in $1
680							\[
681							($g_nested_brackets) # link text = $2
682							\]
683							\( # literal paren
684							[ \t]*
685							($g_nested_parens) # href = $3
686							[ \t]*
687							( # $4
688							(['"]) # quote char = $5
689							(.*?) # Title = $6
690							\5 # matching quote
691							[ \t]* # ignore any spaces/tabs between closing quote and )
692							)? # title is optional
693							\)
694							)
695							}{
696	22					29	my $result;
697	22					53	my $whole_match = $1;
698	22					34	my $link_text = $2;
699	22					36	my $url = $3;
700	22					33	my $title = $6;
701
702	22					61	$self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
703							}xsge;
704
705							#
706							# Last, handle reference-style shortcuts: [link text]
707							# These must come last in case you've also got [link test][1]
708							# or [link test](/foo)
709							#
710	261					798	$text =~ s{
711							( # wrap whole match in $1
712							\[
713							([^\[\]]+) # link text = $2; can't contain '[' or ']'
714							\]
715							)
716							}{
717	16					33	my $result;
718	16					40	my $whole_match = $1;
719	16					38	my $link_text = $2;
720	16					56	(my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
721
722	16					52	$self->_GenerateAnchor($whole_match, $link_text, $link_id);
723							}xsge;
724
725	261					964	return $text;
726							}
727
728							sub _GenerateAnchor {
729							# FIXME - Fugly, change to named params?
730	70			70		150	my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;
731
732	70					80	my $result;
733
734	70	50				174	$attributes = '' unless defined $attributes;
735
736	70	100	100			350	if ( !defined $url && defined $self->{_urls}{$link_id}) {
737	35					172	$url = $self->{_urls}{$link_id};
738							}
739
740	70	100				152	if (!defined $url) {
741	13					62	return $whole_match;
742							}
743
744	57					83	$url =~ s! \* !$g_escape_table{'*'}!gox; # We've got to encode these to avoid
745	57					83	$url =~ s! _ !$g_escape_table{'_'}!gox; # conflicting with italics/bold.
746	57					73	$url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
747
748	57					116	$result = qq{
749
750	57	100	100			368	if ( !defined $title && defined $link_id && defined $self->{_titles}{$link_id} ) {
			100
751	3					7	$title = $self->{_titles}{$link_id};
752							}
753
754	57	100				113	if ( defined $title ) {
755	4					8	$title =~ s/"/"/g;
756	4					9	$title =~ s! \* !$g_escape_table{'*'}!gox;
757	4					6	$title =~ s! _ !$g_escape_table{'_'}!gox;
758	4					10	$result .= qq{ title="$title"};
759							}
760
761	57					99	$result .= "$attributes>$link_text";
762
763	57					355	return $result;
764							}
765
766							sub _DoImages {
767							#
768							# Turn Markdown image shortcuts into tags.
769							#
770	261			261		399	my ($self, $text) = @_;
771
772							#
773							# First, handle reference-style labeled images: ![alt text][id]
774							#
775	261					482	$text =~ s{
776							( # wrap whole match in $1
777							!\[
778							(.*?) # alt text = $2
779							\]
780
781							[ ]? # one optional space
782							(?:\n[ ]*)? # one optional newline followed by spaces
783
784							\[
785							(.*?) # id = $3
786							\]
787
788							)
789							}{
790	0					0	my $result;
791	0					0	my $whole_match = $1;
792	0					0	my $alt_text = $2;
793	0					0	my $link_id = lc $3;
794
795	0	0				0	if ($link_id eq '') {
796	0					0	$link_id = lc $alt_text; # for shortcut links like ![this][].
797							}
798
799	0					0	$self->_GenerateImage($whole_match, $alt_text, $link_id);
800							}xsge;
801
802							#
803							# Next, handle inline images: ![alt text](url "optional title")
804							# Don't forget: encode * and _
805
806	261					18206	$text =~ s{
807							( # wrap whole match in $1
808							!\[
809							(.*?) # alt text = $2
810							\]
811							\( # literal paren
812							[ \t]*
813							($g_nested_parens) # src url - href = $3
814							[ \t]*
815							( # $4
816							(['"]) # quote char = $5
817							(.*?) # title = $6
818							\5 # matching quote
819							[ \t]*
820							)? # title is optional
821							\)
822							)
823							}{
824	0					0	my $result;
825	0					0	my $whole_match = $1;
826	0					0	my $alt_text = $2;
827	0					0	my $url = $3;
828	0					0	my $title = '';
829	0	0				0	if (defined($6)) {
830	0					0	$title = $6;
831							}
832
833	0					0	$self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
834							}xsge;
835
836	261					1025	return $text;
837							}
838
839							sub _GenerateImage {
840							# FIXME - Fugly, change to named params?
841	0			0		0	my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;
842
843	0					0	my $result;
844
845	0	0				0	$attributes = '' unless defined $attributes;
846
847	0		0			0	$alt_text \|\|= '';
848	0					0	$alt_text =~ s/"/"/g;
849							# FIXME - how about >
850
851	0	0	0			0	if ( !defined $url && defined $self->{_urls}{$link_id}) {
852	0					0	$url = $self->{_urls}{$link_id};
853							}
854
855							# If there's no such link ID, leave intact:
856	0	0				0	return $whole_match unless defined $url;
857
858	0					0	$url =~ s! \* !$g_escape_table{'*'}!ogx; # We've got to encode these to avoid
859	0					0	$url =~ s! _ !$g_escape_table{'_'}!ogx; # conflicting with italics/bold.
860	0					0	$url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
861
862	0	0	0			0	if (!defined $title && length $link_id && defined $self->{_titles}{$link_id} && length $self->{_titles}{$link_id}) {
			0
			0
863	0					0	$title = $self->{_titles}{$link_id};
864							}
865
866	0					0	$result = qq{
867	0	0	0			0	if (defined $title && length $title) {
868	0					0	$title =~ s! \* !$g_escape_table{'*'}!ogx;
869	0					0	$title =~ s! _ !$g_escape_table{'_'}!ogx;
870	0					0	$title =~ s/"/"/g;
871	0					0	$result .= qq{ title="$title"};
872							}
873	0					0	$result .= $attributes . $self->{empty_element_suffix};
874
875	0					0	return $result;
876							}
877
878							sub _DoHeaders {
879	67			67		99	my ($self, $text) = @_;
880
881							# Setext-style headers:
882							# Header 1
883							# ========
884							#
885							# Header 2
886							# --------
887							#
888	67					122	$text =~ s{ ^(.+)[ \t]\n=+[ \t]\n+ }{
889	1					5	$self->_GenerateHeader('1', $1);
890							}egmx;
891
892	67					2919	$text =~ s{ ^(.+)[ \t]\n-+[ \t]\n+ }{
893	0					0	$self->_GenerateHeader('2', $1);
894							}egmx;
895
896
897							# atx-style headers:
898							# # Header 1
899							# ## Header 2
900							# ## Header 2 with closing hashes ##
901							# ...
902							# ###### Header 6
903							#
904	67					74	my $l;
905	67					164	$text =~ s{
906							^(\#{1,6}) # $1 = string of #'s
907							[ \t]*
908							(.+?) # $2 = Header text
909							[ \t]*
910							\#* # optional closing #'s (not counted)
911							\n+
912							}{
913	4					11	my $h_level = length($1);
914	4					11	$self->_GenerateHeader($h_level, $2);
915							}egmx;
916
917	67					146	return $text;
918							}
919
920							sub _GenerateHeader {
921	5			5		15	my ($self, $level, $id) = @_;
922
923	5					19	return "" . $self->_RunSpanGamut($id) . "\n\n";
924							}
925
926							sub _DoLists {
927							#
928							# Form HTML ordered (numbered) and unordered (bulleted) lists.
929							#
930	111			111		203	my ($self, $text) = @_;
931	111					201	my $less_than_tab = $self->{tab_width} - 1;
932
933							# Re-usable patterns to match list item bullets and number markers:
934	111					366	my $marker_ul = qr/[*+-]/;
935	111					307	my $marker_ol = qr/\d+[.]/;
936	111					779	my $marker_any = qr/(?:$marker_ul\|$marker_ol)/;
937
938							# Re-usable pattern to match any entirel ul or ol list:
939	111					1653	my $whole_list = qr{
940							( # $1 = whole list
941							( # $2
942							[ ]{0,$less_than_tab}
943							(${marker_any}) # $3 = first list item marker
944							[ \t]+
945							)
946							(?s:.+?)
947							( # $4
948							\z
949							\|
950							\n{2,}
951							(?=\S)
952							(?! # Negative lookahead for another list item marker
953							[ \t]*
954							${marker_any}[ \t]+
955							)
956							)
957							)
958							}mx;
959
960							# We use a different prefix before nested lists than top-level lists.
961							# See extended comment in _ProcessListItems().
962							#
963							# Note: There's a bit of duplication here. My original implementation
964							# created a scalar regex pattern as the conditional result of the test on
965							# $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
966							# substitution once, using the scalar as the pattern. This worked,
967							# everywhere except when running under MT on my hosting account at Pair
968							# Networks. There, this caused all rebuilds to be killed by the reaper (or
969							# perhaps they crashed, but that seems incredibly unlikely given that the
970							# same script on the same server ran fine except under MT. I've spent
971							# more time trying to figure out why this is happening than I'd like to
972							# admit. My only guess, backed up by the fact that this workaround works,
973							# is that Perl optimizes the substition when it can figure out that the
974							# pattern will never change, and when this optimization isn't on, we run
975							# afoul of the reaper. Thus, the slightly redundant code to that uses two
976							# static s/// patterns rather than one conditional pattern.
977
978	111	100				317	if ($self->{_list_level}) {
979	47					758	$text =~ s{
980							^
981							$whole_list
982							}{
983	9					20	my $list = $1;
984	9					15	my $marker = $3;
985	9	100				43	my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
986							# Turn double returns into triple returns, so that we can make a
987							# paragraph for the last item in a list, if necessary:
988	9					24	$list =~ s/\n{2,}/\n\n\n/g;
989	9	100				40	my $result = ( $list_type eq 'ul' ) ?
990							$self->_ProcessListItemsUL($list, $marker_ul)
991							: $self->_ProcessListItemsOL($list, $marker_ol);
992
993	9					24	$result = $self->_MakeList($list_type, $result, $marker);
994	9					38	$result;
995							}egmx;
996							}
997							else {
998	64					2420	$text =~ s{
999							(?:(?<=\n\n)\|\A\n?)
1000							$whole_list
1001							}{
1002	13					46	my $list = $1;
1003	13					28	my $marker = $3;
1004	13	100				84	my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
1005							# Turn double returns into triple returns, so that we can make a
1006							# paragraph for the last item in a list, if necessary:
1007	13					79	$list =~ s/\n{2,}/\n\n\n/g;
1008	13	100				82	my $result = ( $list_type eq 'ul' ) ?
1009							$self->_ProcessListItemsUL($list, $marker_ul)
1010							: $self->_ProcessListItemsOL($list, $marker_ol);
1011	13					47	$result = $self->_MakeList($list_type, $result, $marker);
1012	13					4737	$result;
1013							}egmx;
1014							}
1015
1016
1017	111					603	return $text;
1018							}
1019
1020							sub _MakeList {
1021	22			22		39	my ($self, $list_type, $content, $marker) = @_;
1022
1023	22	100	100			107	if ($list_type eq 'ol' and $self->{trust_list_start_value}) {
1024	2					11	my ($num) = $marker =~ /^(\d+)[.]/;
1025	2					9	return " \n" . $content . " \n";
1026							}
1027
1028	20					75	return "<$list_type>\n" . $content . "\n";
1029							}
1030
1031							sub _ProcessListItemsOL {
1032							#
1033							# Process the contents of a single ordered list, splitting it
1034							# into individual list items.
1035							#
1036
1037	6			6		14	my ($self, $list_str, $marker_any) = @_;
1038
1039
1040							# The $self->{_list_level} global keeps track of when we're inside a list.
1041							# Each time we enter a list, we increment it; when we leave a list,
1042							# we decrement. If it's zero, we're not in a list anymore.
1043							#
1044							# We do this because when we're not inside a list, we want to treat
1045							# something like this:
1046							#
1047							# I recommend upgrading to version
1048							# 8. Oops, now this line is treated
1049							# as a sub-list.
1050							#
1051							# As a single paragraph, despite the fact that the second line starts
1052							# with a digit-period-space sequence.
1053							#
1054							# Whereas when we're inside a list (or sub-list), that line will be
1055							# treated as the start of a sub-list. What a kludge, huh? This is
1056							# an aspect of Markdown's syntax that's hard to parse perfectly
1057							# without resorting to mind-reading. Perhaps the solution is to
1058							# change the syntax rules such that sub-lists must start with a
1059							# starting cardinal number; e.g. "1." or "a.".
1060
1061	6					12	$self->{_list_level}++;
1062
1063							# trim trailing blank lines:
1064	6					20	$list_str =~ s/\n{2,}\z/\n/;
1065
1066
1067	6					206	$list_str =~ s{
1068							(\n)? # leading line = $1
1069							(^[ \t]*) # leading whitespace = $2
1070							($marker_any) [ \t]+ # list marker = $3
1071							((?s:.+?) # list item text = $4
1072							(\n{1,2}))
1073							(?= \n* (\z \| \2 ($marker_any) [ \t]+))
1074							}{
1075	10					26	my $item = $4;
1076	10					16	my $leading_line = $1;
1077	10					18	my $leading_space = $2;
1078
1079	10	50	33			67	if ($leading_line or ($item =~ m/\n{2,}/)) {
1080	0					0	$item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
1081							}
1082							else {
1083							# Recursion for sub-lists:
1084	10					29	$item = $self->_DoLists($self->_Outdent($item));
1085	10					24	chomp $item;
1086	10					81	$item = $self->_RunSpanGamut($item);
1087							}
1088
1089	10					55	" " . $item . " \n";
1090							}egmxo;
1091
1092	6					19	$self->{_list_level}--;
1093	6					19	return $list_str;
1094							}
1095
1096							sub _ProcessListItemsUL {
1097							#
1098							# Process the contents of a single unordered list, splitting it
1099							# into individual list items.
1100							#
1101
1102	16			16		37	my ($self, $list_str, $marker_any) = @_;
1103
1104
1105							# The $self->{_list_level} global keeps track of when we're inside a list.
1106							# Each time we enter a list, we increment it; when we leave a list,
1107							# we decrement. If it's zero, we're not in a list anymore.
1108							#
1109							# We do this because when we're not inside a list, we want to treat
1110							# something like this:
1111							#
1112							# I recommend upgrading to version
1113							# 8. Oops, now this line is treated
1114							# as a sub-list.
1115							#
1116							# As a single paragraph, despite the fact that the second line starts
1117							# with a digit-period-space sequence.
1118							#
1119							# Whereas when we're inside a list (or sub-list), that line will be
1120							# treated as the start of a sub-list. What a kludge, huh? This is
1121							# an aspect of Markdown's syntax that's hard to parse perfectly
1122							# without resorting to mind-reading. Perhaps the solution is to
1123							# change the syntax rules such that sub-lists must start with a
1124							# starting cardinal number; e.g. "1." or "a.".
1125
1126	16					27	$self->{_list_level}++;
1127
1128							# trim trailing blank lines:
1129	16					57	$list_str =~ s/\n{2,}\z/\n/;
1130
1131
1132	16					245	$list_str =~ s{
1133							(\n)? # leading line = $1
1134							(^[ \t]*) # leading whitespace = $2
1135							($marker_any) [ \t]+ # list marker = $3
1136							((?s:.+?) # list item text = $4
1137							(\n{1,2}))
1138							(?= \n* (\z \| \2 ($marker_any) [ \t]+))
1139							}{
1140	37					89	my $item = $4;
1141	37					50	my $leading_line = $1;
1142	37					49	my $leading_space = $2;
1143
1144	37	100	66			178	if ($leading_line or ($item =~ m/\n{2,}/)) {
1145	3					8	$item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
1146							}
1147							else {
1148							# Recursion for sub-lists:
1149	34					88	$item = $self->_DoLists($self->_Outdent($item));
1150	34					72	chomp $item;
1151	34					89	$item = $self->_RunSpanGamut($item);
1152							}
1153
1154	37					250	" " . $item . " \n";
1155							}egmxo;
1156
1157	16					37	$self->{_list_level}--;
1158	16					39	return $list_str;
1159							}
1160
1161							sub _DoCodeBlocks {
1162							#
1163							# Process Markdown code blocks (indented with 4 spaces or 1 tab):
1164							# * outdent the spaces/tab
1165							# * encode <, >, & into HTML entities
1166							# * escape Markdown special characters into MD5 hashes
1167							# * trim leading and trailing newlines
1168							#
1169
1170	67			67		105	my ($self, $text) = @_;
1171
1172	67					2087	$text =~ s{
1173							(?:\n\n\|\A)
1174							( # $1 = the code block -- one or more lines, starting with a space/tab
1175							(?:
1176							(?:[ ]{$self->{tab_width}} \| \t) # Lines must start with a tab or a tab-width of spaces
1177							.*\n+
1178							)+
1179							)
1180							((?=^[ ]{0,$self->{tab_width}}\S)\|\Z) # Lookahead for non-space at line-start, or end of doc
1181							}{
1182	90					207	my $codeblock = $1;
1183	90					99	my $result; # return value
1184
1185	90					206	$codeblock = $self->_EncodeCode($self->_Outdent($codeblock));
1186	90					241	$codeblock = $self->_Detab($codeblock);
1187	90					140	$codeblock =~ s/\A\n+//; # trim leading newlines
1188	90					388	$codeblock =~ s/\n+\z//; # trim trailing newlines
1189
1190	90					206	$result = "\n\n `" . $codeblock . "\n` \n\n";
1191
1192	90					2959	$result;
1193							}egmx;
1194
1195	67					271	return $text;
1196							}
1197
1198							sub _DoCodeSpans {
1199							#
1200							# * Backtick quotes are used for spans.
1201							#
1202							# * You can use multiple backticks as the delimiters if you want to
1203							# include literal backticks in the code span. So, this input:
1204							#
1205							# Just type ``foo `bar` baz`` at the prompt.
1206							#
1207							# Will translate to:
1208							#
1209							# Just type foo `bar` baz at the prompt.
1210							#
1211							# There's no arbitrary limit to the number of backticks you
1212							# can use as delimters. If you need three consecutive backticks
1213							# in your code, use four for delimiters, etc.
1214							#
1215							# * You can use spaces to get literal backticks at the edges:
1216							#
1217							# ... type `` `bar` `` ...
1218							#
1219							# Turns to:
1220							#
1221							# ... type `bar` ...
1222							#
1223
1224	261			261		517	my ($self, $text) = @_;
1225
1226	261					727	$text =~ s@
1227							(?
1228							(`+) # $1 = Opening run of `
1229							(.+?) # $2 = The code block
1230							(?
1231							\1 # Matching closer
1232							(?!`)
1233							@
1234	55					381	my $c = "$2";
1235	55					353	$c =~ s/^[ \t]*//g; # leading whitespace
1236	55					427	$c =~ s/[ \t]*$//g; # trailing whitespace
1237	55					204	$c = $self->_EncodeCode($c);
1238	55					304	"`$c`";
1239							@egsx;
1240
1241	261					557	return $text;
1242							}
1243
1244							sub _EncodeCode {
1245							#
1246							# Encode/escape certain characters inside Markdown code runs.
1247							# The point is that in code, these characters are literals,
1248							# and lose their special Markdown meanings.
1249							#
1250	145			145		213	my $self = shift;
1251	145					297	local $_ = shift;
1252
1253							# Encode all ampersands; HTML entities are not
1254							# entities within a Markdown code span.
1255	145					293	s/&/&/g;
1256
1257							# Encode $'s, but only if we're running under Blosxom.
1258							# (Blosxom interpolates Perl variables in article bodies.)
1259							{
1260	13			13		208	no warnings 'once';
	13					29
	13					50075
	145					173
1261	145	50				520	if (defined($blosxom::version)) {
1262	0					0	s/\$/$/g;
1263							}
1264							}
1265
1266
1267							# Do the angle bracket song and dance:
1268	145					508	s! < !<!gx;
1269	145					983	s! > !>!gx;
1270
1271							# Now, escape characters that are magic in Markdown:
1272	145					316	s! \* !$g_escape_table{'*'}!ogx;
1273	145					220	s! _ !$g_escape_table{'_'}!ogx;
1274	145					280	s! { !$g_escape_table{'{'}!ogx;
1275	145					190	s! } !$g_escape_table{'}'}!ogx;
1276	145					384	s! \[ !$g_escape_table{'['}!ogx;
1277	145					273	s! \] !$g_escape_table{']'}!ogx;
1278	145					317	s! \\ !$g_escape_table{'\\'}!ogx;
1279
1280	145					640	return $_;
1281							}
1282
1283							sub _DoItalicsAndBold {
1284	261			261		365	my ($self, $text) = @_;
1285
1286							# Handle at beginning of lines:
1287	261					671	$text =~ s{ ^(\\\|__) (?=\S) (.+?[_]) (?<=\S) \1 }
1288							{$2}gsx;
1289
1290	261					585	$text =~ s{ ^(\*\|_) (?=\S) (.+?) (?<=\S) \1 }
1291							{$2}gsx;
1292
1293							# must go first:
1294	261					3622	$text =~ s{ (?<=\W) (\\\|__) (?=\S) (.+?[_]) (?<=\S) \1 }
1295							{$2}gsx;
1296
1297	261					4263	$text =~ s{ (?<=\W) (\*\|_) (?=\S) (.+?) (?<=\S) \1 }
1298							{$2}gsx;
1299
1300							# And now, a second pass to catch nested strong and emphasis special cases
1301	261					3733	$text =~ s{ (?<=\W) (\\\|__) (?=\S) (.+?[_]) (?<=\S) \1 }
1302							{$2}gsx;
1303
1304	261					3734	$text =~ s{ (?<=\W) (\*\|_) (?=\S) (.+?) (?<=\S) \1 }
1305							{$2}gsx;
1306
1307	261					639	return $text;
1308							}
1309
1310							sub _DoBlockQuotes {
1311	67			67		114	my ($self, $text) = @_;
1312
1313	67					305	$text =~ s{
1314							( # Wrap whole match in $1
1315							(
1316							^[ \t]*>[ \t]? # '>' at the start of a line
1317							.+\n # rest of the first line
1318							(.+\n)* # subsequent consecutive lines
1319							\n* # blanks
1320							)+
1321							)
1322							}{
1323	3					8	my $bq = $1;
1324	3					12	$bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
1325	3					7	$bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
1326	3					16	$bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1}); # recurse
1327
1328	3					12	$bq =~ s/^/ /mg;
1329							# These leading spaces screw with content, so we need to fix that:
1330	3					5	$bq =~ s{
1331							(\s* .+? )
1332							}{
1333	0					0	my $pre = $1;
1334	0					0	$pre =~ s/^ //mg;
1335	0					0	$pre;
1336							}egsx;
1337
1338	3					15	" \n$bq\n \n\n";
1339							}egmx;
1340
1341
1342	67					119	return $text;
1343							}
1344
1345							sub _FormParagraphs {
1346							#
1347							# Params:
1348							# $text - string to process with html tags
1349							#
1350	67			67		113	my ($self, $text, $options) = @_;
1351
1352							# Strip leading and trailing lines:
1353	67					182	$text =~ s/\A\n+//;
1354	67					413	$text =~ s/\n+\z//;
1355
1356	67					692	my @grafs = split(/\n{2,}/, $text);
1357
1358							#
1359							# Wrap tags.
1360							#
1361	67					150	foreach (@grafs) {
1362	378	100				2012	unless (defined( $self->{_html_blocks}{$_} )) {
1363	212					787	$_ = $self->_RunSpanGamut($_);
1364	212	100				786	if ($options->{wrap_in_p_tags}) {
1365	211					1217	s/^([ \t]*)/ /;
1366	211					623	$_ .= " ";
1367							}
1368							}
1369							}
1370
1371							#
1372							# Unhashify HTML blocks
1373							#
1374	67					128	foreach (@grafs) {
1375	378	100				1120	if (defined( $self->{_html_blocks}{$_} )) {
1376	166					389	$_ = $self->{_html_blocks}{$_};
1377							}
1378							}
1379
1380	67					349	return join "\n\n", @grafs;
1381							}
1382
1383							sub _EncodeAmpsAndAngles {
1384							# Smart processing for ampersands and angle brackets that need to be encoded.
1385
1386	284			284		536	my ($self, $text) = @_;
1387	284	50	33			1176	return '' if (!defined $text or !length $text);
1388
1389							# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1390							# http://bumppo.net/projects/amputator/
1391	284					718	$text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+\|\w+);)/&/g;
1392
1393							# Encode naked <'s
1394	284					538	$text =~ s{<(?![a-z/?\$!])}{<}gi;
1395
1396							# And >'s - added by Fletcher Penney
1397							# $text =~ s{>(?![a-z/?\$!])}{>}gi;
1398							# Causes problems...
1399
1400							# Remove encoding inside comments
1401	284					4029	$text =~ s{
1402							(?<=) # End comments
1405							}{
1406	1					3	my $t = $1;
1407	1					4	$t =~ s/&/&/g;
1408	1					3	$t =~ s/</
1409	1					4	$t;
1410							}egsx;
1411
1412	284					776	return $text;
1413							}
1414
1415							sub _EncodeBackslashEscapes {
1416							#
1417							# Parameter: String.
1418							# Returns: The string, with after processing the following backslash
1419							# escape sequences.
1420							#
1421	413			413		536	my $self = shift;
1422	413					606	local $_ = shift;
1423
1424	413					826	s! \\\\ !$g_escape_table{'\\'}!ogx; # Must process escaped backslashes first.
1425	413					1008	s! \\` !$g_escape_table{'`'}!ogx;
1426	413					602	s! \\\* !$g_escape_table{'*'}!ogx;
1427	413					644	s! \\_ !$g_escape_table{'_'}!ogx;
1428	413					863	s! \\\{ !$g_escape_table{'{'}!ogx;
1429	413					598	s! \\\} !$g_escape_table{'}'}!ogx;
1430	413					559	s! \\\[ !$g_escape_table{'['}!ogx;
1431	413					482	s! \\\] !$g_escape_table{']'}!ogx;
1432	413					603	s! \\\( !$g_escape_table{'('}!ogx;
1433	413					713	s! \\\) !$g_escape_table{')'}!ogx;
1434	413					579	s! \\> !$g_escape_table{'>'}!ogx;
1435	413					456	s! \\\# !$g_escape_table{'#'}!ogx;
1436	413					565	s! \\\+ !$g_escape_table{'+'}!ogx;
1437	413					486	s! \\\- !$g_escape_table{'-'}!ogx;
1438	413					757	s! \\\. !$g_escape_table{'.'}!ogx;
1439	413					496	s{ \\! }{$g_escape_table{'!'}}ogx;
1440
1441	413					1095	return $_;
1442							}
1443
1444							sub _DoAutoLinks {
1445	261			261		386	my ($self, $text) = @_;
1446
1447	261					450	$text =~ s{<((https?\|ftp):[^'">\s]+)>}{$1}gi;
1448
1449							# Email addresses:
1450	261					362	$text =~ s{
1451							<
1452							(?:mailto:)?
1453							(
1454							[-.\w\+]+
1455							\@
1456							[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1457							)
1458							>
1459							}{
1460	1					4	$self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) );
1461							}egix;
1462
1463	261					471	return $text;
1464							}
1465
1466							sub _EncodeEmailAddress {
1467							#
1468							# Input: an email address, e.g. "foo@example.com"
1469							#
1470							# Output: the email address as a mailto link, with each character
1471							# of the address encoded as either a decimal or hex entity, in
1472							# the hopes of foiling most address harvesting spam bots. E.g.:
1473							#
1474							# 1475	# xample.com">foo
1476							# @example.com
1477							#
1478							# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1479							# mailing list:
1480							#
1481
1482	1			1		2	my ($self, $addr) = @_;
1483
1484							my @encode = (
1485	7			7		16	sub { '&#' . ord(shift) . ';' },
1486	5			5		17	sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1487	3			3		6	sub { shift },
1488	1					17	);
1489
1490	1					3	$addr = "mailto:" . $addr;
1491
1492	1					4	$addr =~ s{(.)}{
1493	16					35	my $char = $1;
1494	16	100				41	if ( $char eq '@' ) {
		100
1495							# this must be encoded. I insist.
1496	1					3	$char = $encode[int rand 1]->($char);
1497							}
1498							elsif ( $char ne ':' ) {
1499							# leave ':' alone (to spot mailto: later)
1500	14					79	my $r = rand;
1501							# roughly 10% raw, 45% hex, 45% dec
1502	14	100				50	$char = (
		100
1503							$r > .9 ? $encode[2]->($char) :
1504							$r < .45 ? $encode[1]->($char) :
1505							$encode[0]->($char)
1506							);
1507							}
1508	16					43	$char;
1509							}gex;
1510
1511	1					6	$addr = qq{$addr};
1512	1					7	$addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1513
1514	1					8	return $addr;
1515							}
1516
1517							sub _UnescapeSpecialChars {
1518							#
1519							# Swap back in all the special characters we've hidden.
1520							#
1521	49			49		80	my ($self, $text) = @_;
1522
1523	49					226	while( my($char, $hash) = each(%g_escape_table) ) {
1524	784					9145	$text =~ s/$hash/$char/g;
1525							}
1526	49					121	return $text;
1527							}
1528
1529							sub _TokenizeHTML {
1530							#
1531							# Parameter: String containing HTML markup.
1532							# Returns: Reference to an array of the tokens comprising the input
1533							# string. Each token is either a tag (possibly with nested,
1534							# tags contained therein, such as , or a
1535							# run of text between tags. Each element of the array is a
1536							# two-element array; the first is either 'tag' or 'text';
1537							# the second is the actual value.
1538							#
1539							#
1540							# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1541							#
1542							#
1543
1544	522			522		707	my ($self, $str) = @_;
1545	522					601	my $pos = 0;
1546	522					886	my $len = length $str;
1547	522					702	my @tokens;
1548
1549	522					617	my $depth = 6;
1550	522					2425	my $nested_tags = join('\|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
1551	522					4246	my $match = qr/(?s: ) \| # comment
1552							(?s: <\? .*? \?> ) \| # processing instruction
1553							$nested_tags/iox; # nested tags
1554
1555	522					5082	while ($str =~ m/($match)/og) {
1556	400					717	my $whole_tag = $1;
1557	400					459	my $sec_start = pos $str;
1558	400					496	my $tag_start = $sec_start - length $whole_tag;
1559	400	100				741	if ($pos < $tag_start) {
1560	332					1716	push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1561							}
1562	400					956	push @tokens, ['tag', $whole_tag];
1563	400					2596	$pos = pos $str;
1564							}
1565	522	100				2534	push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1566	522					3059	\@tokens;
1567							}
1568
1569							sub _Outdent {
1570							#
1571							# Remove one level of line-leading tabs or spaces
1572							#
1573	137			137		227	my ($self, $text) = @_;
1574
1575	137					1146	$text =~ s/^(\t\|[ ]{1,$self->{tab_width}})//gm;
1576	137					519	return $text;
1577							}
1578
1579							sub _Detab {
1580							#
1581							# Cribbed from a post by Bart Lateur:
1582							#
1583							#
1584	138			138		275	my ($self, $text) = @_;
1585
1586							# FIXME - Better anchor/regex would be quicker.
1587
1588							# Original:
1589							#$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;
1590
1591							# Much swifter, but pretty hateful:
1592	138					4135	do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge);
	27					43115
1593	138					406	return $text;
1594							}
1595
1596							sub _ConvertCopyright {
1597	48			48		85	my ($self, $text) = @_;
1598							# Convert to an XML compatible form of copyright symbol
1599
1600	48					270	$text =~ s/©/©/gi;
1601
1602	48					107	return $text;
1603							}
1604
1605							1;
1606
1607							__END__

File Coverage