File Coverage

lib/Text/Markdown.pm
Criterion Covered Total %
statement 439 487 90.1
branch 71 92 77.1
condition 28 59 47.4
subroutine 51 53 96.2
pod 3 3 100.0
total 592 694 85.3


line stmt bran cond sub pod time code
1             package Text::Markdown;
2             require 5.008_000;
3 13     13   25034 use strict;
  13         26  
  13         532  
4 13     13   74 use warnings;
  13         25  
  13         457  
5 13     13   77 use re 'eval';
  13         21  
  13         692  
6              
7 13     13   68 use Digest::MD5 qw(md5_hex);
  13         28  
  13         1052  
8 13     13   14823 use Encode qw();
  13         213441  
  13         384  
9 13     13   111 use Carp qw(croak);
  13         24  
  13         859  
10 13     13   69 use base 'Exporter';
  13         28  
  13         22663  
11              
12             our $VERSION = '1.000031'; # 1.0.31
13             $VERSION = eval $VERSION;
14             our @EXPORT_OK = qw(markdown);
15              
16             =head1 NAME
17              
18             Text::Markdown - Convert Markdown syntax to (X)HTML
19              
20             =head1 SYNOPSIS
21              
22             use Text::Markdown 'markdown';
23             my $html = markdown($text);
24              
25             use Text::Markdown 'markdown';
26             my $html = markdown( $text, {
27             empty_element_suffix => '>',
28             tab_width => 2,
29             } );
30              
31             use Text::Markdown;
32             my $m = Text::Markdown->new;
33             my $html = $m->markdown($text);
34              
35             use Text::Markdown;
36             my $m = Text::MultiMarkdown->new(
37             empty_element_suffix => '>',
38             tab_width => 2,
39             );
40             my $html = $m->markdown( $text );
41              
42             =head1 DESCRIPTION
43              
44             Markdown is a text-to-HTML filter; it translates an easy-to-read /
45             easy-to-write structured text format into HTML. Markdown's text format
46             is most similar to that of plain text email, and supports features such
47             as headers, *emphasis*, code blocks, blockquotes, and links.
48              
49             Markdown's syntax is designed not as a generic markup language, but
50             specifically to serve as a front-end to (X)HTML. You can use span-level
51             HTML tags anywhere in a Markdown document, and you can use block level
52             HTML tags (like
and as well).
53              
54             =head1 SYNTAX
55              
56             This module implements the 'original' Markdown markdown syntax from:
57              
58             http://daringfireball.net/projects/markdown/
59              
60             Note that L ensures that the output always ends with
61             B newline. The fact that multiple newlines are collapsed into one
62             makes sense, because this is the behavior of HTML towards whispace. The
63             fact that there's always a newline at the end makes sense again, given
64             that the output will always be nested in a B-level element (as
65             opposed to an inline element). That block element can be a C<<

>>

66             (most often), or a C<< >>.
67              
68             Markdown is B interpreted in HTML block-level elements, in order for
69             chunks of pasted HTML (e.g. JavaScript widgets, web counters) to not be
70             magically (mis)interpreted. For selective processing of Markdown in some,
71             but not other, HTML block elements, add a C attribute to the block
72             element and set its value to C<1>, C or C:
73              
74            
75             * Home
76             * About
77             * Contact
78            
79              
80             The extra C attribute will be stripped when generating the output.
81              
82             =head1 OPTIONS
83              
84             Text::Markdown supports a number of options to its processor which control
85             the behaviour of the output document.
86              
87             These options can be supplied to the constructor, or in a hash within
88             individual calls to the L method. See the SYNOPSIS for examples
89             of both styles.
90              
91             The options for the processor are:
92              
93             =over
94              
95             =item empty_element_suffix
96              
97             This option controls the end of empty element tags:
98              
99             '/>' for XHTML (default)
100             '>' for HTML
101              
102             =item tab_width
103              
104             Controls indent width in the generated markup. Defaults to 4.
105              
106             =item trust_list_start_value
107              
108             If true, ordered lists will use the first number as the starting point for
109             numbering. This will let you pick up where you left off by writing:
110              
111             1. foo
112             2. bar
113              
114             some paragraph
115              
116             3. baz
117             6. quux
118              
119             (Note that in the above, quux will be numbered 4.)
120              
121             =back
122              
123             =cut
124              
125             # Regex to match balanced [brackets]. See Friedl's
126             # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
127             our ($g_nested_brackets, $g_nested_parens);
128             $g_nested_brackets = qr{
129             (?> # Atomic matching
130             [^\[\]]+ # Anything other than brackets
131             |
132             \[
133             (??{ $g_nested_brackets }) # Recursive set of nested brackets
134             \]
135             )*
136             }x;
137             # Doesn't allow for whitespace, because we're using it to match URLs:
138             $g_nested_parens = qr{
139             (?> # Atomic matching
140             [^()\s]+ # Anything other than parens or whitespace
141             |
142             \(
143             (??{ $g_nested_parens }) # Recursive set of nested brackets
144             \)
145             )*
146             }x;
147              
148             # Table of hash values for escaped characters:
149             our %g_escape_table;
150             foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
151             $g_escape_table{$char} = md5_hex($char);
152             }
153              
154             =head1 METHODS
155              
156             =head2 new
157              
158             A simple constructor, see the SYNTAX and OPTIONS sections for more information.
159              
160             =cut
161              
162             sub new {
163 15     15 1 7880 my ($class, %p) = @_;
164              
165 15   50     143 $p{base_url} ||= ''; # This is the base URL to be used for WikiLinks
166              
167 15 100 66     143 $p{tab_width} = 4 unless (defined $p{tab_width} and $p{tab_width} =~ m/^\d+$/);
168              
169 15   50     98 $p{empty_element_suffix} ||= ' />'; # Change to ">" for HTML output
170              
171 15 100       58 $p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0;
172              
173 15         59 my $self = { params => \%p };
174 15   33     107 bless $self, ref($class) || $class;
175 15         59 return $self;
176             }
177              
178             =head2 markdown
179              
180             The main function as far as the outside world is concerned. See the SYNOPSIS
181             for details on use.
182              
183             =cut
184              
185             sub markdown {
186 53     53 1 26970 my ( $self, $text, $options ) = @_;
187              
188             # Detect functional mode, and create an instance for this run
189 53 100       200 unless (ref $self) {
190 5 100       21 if ( $self ne __PACKAGE__ ) {
191 4         23 my $ob = __PACKAGE__->new();
192             # $self is text, $text is options
193 4         24 return $ob->markdown($self, $text);
194             }
195             else {
196 1         23 croak('Calling ' . $self . '->markdown (as a class method) is not supported.');
197             }
198             }
199              
200 48   100     260 $options ||= {};
201              
202 48         79 %$self = (%{ $self->{params} }, %$options, params => $self->{params});
  48         572  
203              
204 48         497 $self->_CleanUpRunData($options);
205              
206 48         298 return $self->_Markdown($text);
207             }
208              
209             sub _CleanUpRunData {
210 48     48   91 my ($self, $options) = @_;
211             # Clear the global hashes. If we don't clear these, you get conflicts
212             # from other articles when generating a page which contains more than
213             # one article (e.g. an index page that shows the N most recent
214             # articles).
215 48 50       203 $self->{_urls} = $options->{urls} ? $options->{urls} : {}; # FIXME - document passing this option (tested in 05options.t).
216 48         172 $self->{_titles} = {};
217 48         107 $self->{_html_blocks} = {};
218             # Used to track when we're inside an ordered or unordered list
219             # (see _ProcessListItems() for details)
220 48         110 $self->{_list_level} = 0;
221              
222             }
223              
224             sub _Markdown {
225             #
226             # Main function. The order in which other subs are called here is
227             # essential. Link and image substitutions need to happen before
228             # _EscapeSpecialChars(), so that any *'s or _'s in the
229             # and tags get encoded.
230             #
231 48     48   96 my ($self, $text, $options) = @_;
232              
233 48         141 $text = $self->_CleanUpDoc($text);
234              
235             # Turn block-level HTML elements into hash entries, and interpret markdown in them if they have a 'markdown="1"' attribute
236 48         242 $text = $self->_HashHTMLBlocks($text, {interpret_markdown_on_attribute => 1});
237              
238 48         209 $text = $self->_StripLinkDefinitions($text);
239              
240 48         248 $text = $self->_RunBlockGamut($text, {wrap_in_p_tags => 1});
241              
242 48         176 $text = $self->_UnescapeSpecialChars($text);
243              
244 48         159 $text = $self->_ConvertCopyright($text);
245              
246 48         377 return $text . "\n";
247             }
248              
249             =head2 urls
250              
251             Returns a reference to a hash with the key being the markdown reference
252             and the value being the URL.
253              
254             Useful for building scripts which preprocess a list of links before the
255             main content. See F for an example of this hashref being
256             passed back into the markdown method to create links.
257              
258             =cut
259              
260             sub urls {
261 0     0 1 0 my ( $self ) = @_;
262              
263 0         0 return $self->{_urls};
264             }
265              
266             sub _CleanUpDoc {
267 48     48   131 my ($self, $text) = @_;
268              
269             # Standardize line endings:
270 48         160 $text =~ s{\r\n}{\n}g; # DOS to Unix
271 48         143 $text =~ s{\r}{\n}g; # Mac to Unix
272              
273             # Make sure $text ends with a couple of newlines:
274 48         1147 $text .= "\n\n";
275              
276             # Convert all tabs to spaces.
277 48         141 $text = $self->_Detab($text);
278              
279             # Strip any lines consisting only of spaces and tabs.
280             # This makes subsequent regexen easier to write, because we can
281             # match consecutive blank lines with /\n+/ instead of something
282             # contorted like /[ \t]*\n+/ .
283 48         529 $text =~ s/^[ \t]+$//mg;
284              
285 48         116 return $text;
286             }
287              
288             sub _StripLinkDefinitions {
289             #
290             # Strips link definitions from text, stores the URLs and titles in
291             # hash references.
292             #
293 48     48   108 my ($self, $text) = @_;
294 48         104 my $less_than_tab = $self->{tab_width} - 1;
295              
296             # Link defs are in the form: ^[id]: url "optional title"
297 48         1111 while ($text =~ s{
298             ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
299             [ \t]*
300             \n? # maybe *one* newline
301             [ \t]*
302             ? # url = \$2
303             [ \t]*
304             \n? # maybe one newline
305             [ \t]*
306             (?:
307             (?<=\s) # lookbehind for whitespace
308             ["(]
309             (.+?) # title = \$3
310             [")]
311             [ \t]*
312             )? # title is optional
313             (?:\n+|\Z)
314             }{}omx) {
315 23         68 $self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
316 23 100       3234 if ($3) {
317 1         4 $self->{_titles}{lc $1} = $3;
318 1         17 $self->{_titles}{lc $1} =~ s/"/"/g;
319             }
320              
321             }
322              
323 48         122 return $text;
324             }
325              
326             sub _md5_utf8 {
327             # Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation.
328 166     166   308 my $input = shift;
329 166 50       483 return unless defined $input;
330 166 50       985 if (Encode::is_utf8 $input) {
331 0         0 return md5_hex(Encode::encode('utf8', $input));
332             }
333             else {
334 166         1971 return md5_hex($input);
335             }
336             }
337              
338             sub _HashHTMLBlocks {
339 115     115   193 my ($self, $text, $options) = @_;
340 115         222 my $less_than_tab = $self->{tab_width} - 1;
341              
342             # Hashify HTML blocks (protect from further interpretation by encoding to an md5):
343             # We only want to do this for block-level HTML tags, such as headers,
344             # lists, and tables. That's because we still want to wrap

s around

345             # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
346             # phrase emphasis, and spans. The list of tags we're looking for is
347             # hard-coded:
348 115         369 my $block_tags = qr{
349             (?:
350             p | div | h[1-6] | blockquote | pre | table |
351             dl | ol | ul | script | noscript | form |
352             fieldset | iframe | math | ins | del
353             )
354             }x;
355              
356 115         284 my $tag_attrs = qr{
357             (?: # Match one attr name/value pair
358             \s+ # There needs to be at least some whitespace
359             # before each attribute name.
360             [\w.:_-]+ # Attribute name
361             \s*=\s*
362             (?:
363             ".+?" # "Attribute value"
364             |
365             '.+?' # 'Attribute value'
366             |
367             [^\s]+? # AttributeValue (HTML5)
368             )
369             )* # Zero or more
370             }x;
371              
372 115         1508 my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
373 115         1969 my $open_tag = qr{< $block_tags $tag_attrs \s* >}oxms;
374 115         222 my $close_tag = undef; # let Text::Balanced handle this
375 115         139 my $prefix_pattern = undef; # Text::Balanced
376 115         292 my $markdown_attr = qr{ \s* markdown \s* = \s* (['"]) (.*?) \1 }xs;
377              
378 13     13   25504 use Text::Balanced qw(gen_extract_tagged);
  13         368852  
  13         75010  
379 115         601 my $extract_block = gen_extract_tagged($open_tag, $close_tag, $prefix_pattern, { ignore => [$empty_tag] });
380              
381 115         17558 my @chunks;
382             # parse each line, looking for block-level HTML tags
383 115         1737 while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
384 2597         5402 my $cur_line = $1;
385 2597 100       5888 if (defined $2) {
386             # current line could be start of code block
387              
388 187         1821 my ($tag, $remainder, $prefix, $opening_tag, $text_in_tag, $closing_tag) = $extract_block->($cur_line . $text);
389 187 100       573593 if ($tag) {
390 158 100 100     1642 if ($options->{interpret_markdown_on_attribute} and $opening_tag =~ s/$markdown_attr//i) {
391 15         34 my $markdown = $2;
392 15 100       63 if ($markdown =~ /^(1|on|yes)$/) {
393             # interpret markdown and reconstruct $tag to include the interpreted $text_in_tag
394 13         41 my $wrap_in_p_tags = $opening_tag =~ /^<(div|iframe)/;
395 13         63 $tag = $prefix . $opening_tag . "\n"
396             . $self->_RunBlockGamut($text_in_tag, {wrap_in_p_tags => $wrap_in_p_tags})
397             . "\n" . $closing_tag
398             ;
399             } else {
400             # just remove the markdown="0" attribute
401 2         6 $tag = $prefix . $opening_tag . $text_in_tag . $closing_tag;
402             }
403             }
404 158         475 my $key = _md5_utf8($tag);
405 158         955 $self->{_html_blocks}{$key} = $tag;
406 158         800 push @chunks, "\n\n" . $key . "\n\n";
407 158         2855 $text = $remainder;
408             }
409             else {
410             # No tag match, so toss $cur_line into @chunks
411 29         266 push @chunks, $cur_line;
412             }
413             }
414             else {
415             # current line could NOT be start of code block
416 2410         24713 push @chunks, $cur_line;
417             }
418              
419             }
420 115         278 push @chunks, $text; # whatever is left
421              
422 115         541 $text = join '', @chunks;
423              
424 115         1799 return $text;
425             }
426              
427             sub _HashHR {
428 67     67   118 my ($self, $text) = @_;
429 67         144 my $less_than_tab = $self->{tab_width} - 1;
430              
431 67         1177 $text =~ s{
432             (?:
433             (?<=\n\n) # Starting after a blank line
434             | # or
435             \A\n? # the beginning of the doc
436             )
437             ( # save in $1
438             [ ]{0,$less_than_tab}
439             <(hr) # start tag = $2
440             \b # word break
441             ([^<>])*? #
442             /?> # the matching end tag
443             [ \t]*
444             (?=\n{2,}|\Z) # followed by a blank line or end of document
445             )
446             }{
447 6         20 my $key = _md5_utf8($1);
448 6         51 $self->{_html_blocks}{$key} = $1;
449 6         267 "\n\n" . $key . "\n\n";
450             }egx;
451              
452 67         190 return $text;
453             }
454              
455             sub _HashHTMLComments {
456 67     67   107 my ($self, $text) = @_;
457 67         124 my $less_than_tab = $self->{tab_width} - 1;
458              
459             # Special case for standalone HTML comments:
460 67         1018 $text =~ s{
461             (?:
462             (?<=\n\n) # Starting after a blank line
463             | # or
464             \A\n? # the beginning of the doc
465             )
466             ( # save in $1
467             [ ]{0,$less_than_tab}
468             (?s:
469            
470             (--.*?--\s*)+
471             >
472             )
473             [ \t]*
474             (?=\n{2,}|\Z) # followed by a blank line or end of document
475             )
476             }{
477 0         0 my $key = _md5_utf8($1);
478 0         0 $self->{_html_blocks}{$key} = $1;
479 0         0 "\n\n" . $key . "\n\n";
480             }egx;
481              
482 67         173 return $text;
483             }
484              
485             sub _HashPHPASPBlocks {
486 67     67   104 my ($self, $text) = @_;
487 67         116 my $less_than_tab = $self->{tab_width} - 1;
488              
489             # PHP and ASP-style processor instructions ( and <%…%>)
490 67         1069 $text =~ s{
491             (?:
492             (?<=\n\n) # Starting after a blank line
493             | # or
494             \A\n? # the beginning of the doc
495             )
496             ( # save in $1
497             [ ]{0,$less_than_tab}
498             (?s:
499             <([?%]) # $2
500             .*?
501             \2>
502             )
503             [ \t]*
504             (?=\n{2,}|\Z) # followed by a blank line or end of document
505             )
506             }{
507 2         5 my $key = _md5_utf8($1);
508 2         8 $self->{_html_blocks}{$key} = $1;
509 2         13 "\n\n" . $key . "\n\n";
510             }egx;
511 67         174 return $text;
512             }
513              
514             sub _RunBlockGamut {
515             #
516             # These are all the transformations that form block-level
517             # tags like paragraphs, headers, and list items.
518             #
519 67     67   124 my ($self, $text, $options) = @_;
520              
521             # Do headers first, as these populate cross-refs
522 67         179 $text = $self->_DoHeaders($text);
523              
524             # Do Horizontal Rules:
525 67         159 my $less_than_tab = $self->{tab_width} - 1;
526 67         728 $text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
527 67         893 $text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
528 67         434 $text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
529              
530 67         217 $text = $self->_DoLists($text);
531              
532 67         216 $text = $self->_DoCodeBlocks($text);
533              
534 67         348 $text = $self->_DoBlockQuotes($text);
535              
536             # We already ran _HashHTMLBlocks() before, in Markdown(), but that
537             # was to escape raw HTML in the original Markdown source. This time,
538             # we're escaping the markup we've just created, so that we don't wrap
539             #

tags around block-level tags.

540 67         169 $text = $self->_HashHTMLBlocks($text);
541              
542             # Special case just for
. It was easier to make a special case than
543             # to make the other regex more complicated.
544 67         260 $text = $self->_HashHR($text);
545              
546 67         360 $text = $self->_HashHTMLComments($text);
547              
548 67         199 $text = $self->_HashPHPASPBlocks($text);
549              
550 67         320 $text = $self->_FormParagraphs($text, {wrap_in_p_tags => $options->{wrap_in_p_tags}});
551              
552 67         284 return $text;
553             }
554              
555             sub _RunSpanGamut {
556             #
557             # These are all the transformations that occur *within* block-level
558             # tags like paragraphs, headers, and list items.
559             #
560 261     261   456 my ($self, $text) = @_;
561              
562 261         593 $text = $self->_DoCodeSpans($text);
563 261         1114 $text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
564 261         875 $text = $self->_EscapeSpecialChars($text);
565              
566             # Process anchor and image tags. Images must come first,
567             # because ![foo][f] looks like an anchor.
568 261         708 $text = $self->_DoImages($text);
569 261         815 $text = $self->_DoAnchors($text);
570              
571             # Make links out of things like ``
572             # Must come after _DoAnchors(), because you can use < and >
573             # delimiters in inline links like [this]().
574 261         872 $text = $self->_DoAutoLinks($text);
575              
576 261         642 $text = $self->_EncodeAmpsAndAngles($text);
577              
578 261         884 $text = $self->_DoItalicsAndBold($text);
579              
580             # FIXME - Is hard coding space here sane, or does this want to be related to tab width?
581             # Do hard breaks:
582 261         588 $text =~ s/ {2,}\n/ {empty_element_suffix}\n/g;
583              
584 261         1122 return $text;
585             }
586              
587             sub _EscapeSpecialChars {
588 261     261   464 my ($self, $text) = @_;
589 261   33     981 my $tokens ||= $self->_TokenizeHTML($text);
590              
591 261         392 $text = ''; # rebuild $text from the tokens
592             # my $in_pre = 0; # Keep track of when we're inside
 or  tags. 
593             # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
594              
595 261         440 foreach my $cur_token (@$tokens) {
596 613 100       1984 if ($cur_token->[0] eq "tag") {
597             # Within tags, encode * and _ so they don't conflict
598             # with their use in Markdown for italics and strong.
599             # We're replacing each such character with its
600             # corresponding MD5 checksum value; this is likely
601             # overkill, but it should prevent us from colliding
602             # with the escape values by accident.
603 200         354 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!ogx;
604 200         250 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!ogx;
605 200         716 $text .= $cur_token->[1];
606             } else {
607 413         745 my $t = $cur_token->[1];
608 413         1203 $t = $self->_EncodeBackslashEscapes($t);
609 413         1257 $text .= $t;
610             }
611             }
612 261         936 return $text;
613             }
614              
615             sub _EscapeSpecialCharsWithinTagAttributes {
616             #
617             # Within tags -- meaning between < and > -- encode [\ ` * _] so they
618             # don't conflict with their use in Markdown for code, italics and strong.
619             # We're replacing each such character with its corresponding MD5 checksum
620             # value; this is likely overkill, but it should prevent us from colliding
621             # with the escape values by accident.
622             #
623 261     261   378 my ($self, $text) = @_;
624 261   33     1293 my $tokens ||= $self->_TokenizeHTML($text);
625 261         419 $text = ''; # rebuild $text from the tokens
626              
627 261         601 foreach my $cur_token (@$tokens) {
628 613 100       1722 if ($cur_token->[0] eq "tag") {
629 200         574 $cur_token->[1] =~ s! \\ !$g_escape_table{'\\'}!gox;
630 200         572 $cur_token->[1] =~ s{ (?<=.)(?=.) }{$g_escape_table{'`'}}gox;
631 200         352 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gox;
632 200         255 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gox;
633             }
634 613         1547 $text .= $cur_token->[1];
635             }
636 261         1172 return $text;
637             }
638              
639             sub _DoAnchors {
640             #
641             # Turn Markdown link shortcuts into XHTML tags.
642             #
643 261     261   407 my ($self, $text) = @_;
644              
645             #
646             # First, handle reference-style links: [link text] [id]
647             #
648 261         12702 $text =~ s{
649             ( # wrap whole match in $1
650             \[
651             ($g_nested_brackets) # link text = $2
652             \]
653              
654             [ ]? # one optional space
655             (?:\n[ ]*)? # one optional newline followed by spaces
656              
657             \[
658             (.*?) # id = $3
659             \]
660             )
661             }{
662 32         83 my $whole_match = $1;
663 32         129 my $link_text = $2;
664 32         66 my $link_id = lc $3;
665              
666 32 100       86 if ($link_id eq "") {
667 12         24 $link_id = lc $link_text; # for shortcut links like [this][].
668             }
669              
670 32         71 $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
671              
672 32         83 $self->_GenerateAnchor($whole_match, $link_text, $link_id);
673             }xsge;
674              
675             #
676             # Next, inline-style links: [link text](url "optional title")
677             #
678 261         22943 $text =~ s{
679             ( # wrap whole match in $1
680             \[
681             ($g_nested_brackets) # link text = $2
682             \]
683             \( # literal paren
684             [ \t]*
685             ($g_nested_parens) # href = $3
686             [ \t]*
687             ( # $4
688             (['"]) # quote char = $5
689             (.*?) # Title = $6
690             \5 # matching quote
691             [ \t]* # ignore any spaces/tabs between closing quote and )
692             )? # title is optional
693             \)
694             )
695             }{
696 22         29 my $result;
697 22         53 my $whole_match = $1;
698 22         34 my $link_text = $2;
699 22         36 my $url = $3;
700 22         33 my $title = $6;
701              
702 22         61 $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
703             }xsge;
704              
705             #
706             # Last, handle reference-style shortcuts: [link text]
707             # These must come last in case you've also got [link test][1]
708             # or [link test](/foo)
709             #
710 261         798 $text =~ s{
711             ( # wrap whole match in $1
712             \[
713             ([^\[\]]+) # link text = $2; can't contain '[' or ']'
714             \]
715             )
716             }{
717 16         33 my $result;
718 16         40 my $whole_match = $1;
719 16         38 my $link_text = $2;
720 16         56 (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
721              
722 16         52 $self->_GenerateAnchor($whole_match, $link_text, $link_id);
723             }xsge;
724              
725 261         964 return $text;
726             }
727              
728             sub _GenerateAnchor {
729             # FIXME - Fugly, change to named params?
730 70     70   150 my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;
731              
732 70         80 my $result;
733              
734 70 50       174 $attributes = '' unless defined $attributes;
735              
736 70 100 100     350 if ( !defined $url && defined $self->{_urls}{$link_id}) {
737 35         172 $url = $self->{_urls}{$link_id};
738             }
739              
740 70 100       152 if (!defined $url) {
741 13         62 return $whole_match;
742             }
743              
744 57         83 $url =~ s! \* !$g_escape_table{'*'}!gox; # We've got to encode these to avoid
745 57         83 $url =~ s! _ !$g_escape_table{'_'}!gox; # conflicting with italics/bold.
746 57         73 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
747              
748 57         116 $result = qq{
749              
750 57 100 100     368 if ( !defined $title && defined $link_id && defined $self->{_titles}{$link_id} ) {
      100        
751 3         7 $title = $self->{_titles}{$link_id};
752             }
753              
754 57 100       113 if ( defined $title ) {
755 4         8 $title =~ s/"/"/g;
756 4         9 $title =~ s! \* !$g_escape_table{'*'}!gox;
757 4         6 $title =~ s! _ !$g_escape_table{'_'}!gox;
758 4         10 $result .= qq{ title="$title"};
759             }
760              
761 57         99 $result .= "$attributes>$link_text";
762              
763 57         355 return $result;
764             }
765              
766             sub _DoImages {
767             #
768             # Turn Markdown image shortcuts into tags.
769             #
770 261     261   399 my ($self, $text) = @_;
771              
772             #
773             # First, handle reference-style labeled images: ![alt text][id]
774             #
775 261         482 $text =~ s{
776             ( # wrap whole match in $1
777             !\[
778             (.*?) # alt text = $2
779             \]
780              
781             [ ]? # one optional space
782             (?:\n[ ]*)? # one optional newline followed by spaces
783              
784             \[
785             (.*?) # id = $3
786             \]
787              
788             )
789             }{
790 0         0 my $result;
791 0         0 my $whole_match = $1;
792 0         0 my $alt_text = $2;
793 0         0 my $link_id = lc $3;
794              
795 0 0       0 if ($link_id eq '') {
796 0         0 $link_id = lc $alt_text; # for shortcut links like ![this][].
797             }
798              
799 0         0 $self->_GenerateImage($whole_match, $alt_text, $link_id);
800             }xsge;
801              
802             #
803             # Next, handle inline images: ![alt text](url "optional title")
804             # Don't forget: encode * and _
805              
806 261         18206 $text =~ s{
807             ( # wrap whole match in $1
808             !\[
809             (.*?) # alt text = $2
810             \]
811             \( # literal paren
812             [ \t]*
813             ($g_nested_parens) # src url - href = $3
814             [ \t]*
815             ( # $4
816             (['"]) # quote char = $5
817             (.*?) # title = $6
818             \5 # matching quote
819             [ \t]*
820             )? # title is optional
821             \)
822             )
823             }{
824 0         0 my $result;
825 0         0 my $whole_match = $1;
826 0         0 my $alt_text = $2;
827 0         0 my $url = $3;
828 0         0 my $title = '';
829 0 0       0 if (defined($6)) {
830 0         0 $title = $6;
831             }
832              
833 0         0 $self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
834             }xsge;
835              
836 261         1025 return $text;
837             }
838              
839             sub _GenerateImage {
840             # FIXME - Fugly, change to named params?
841 0     0   0 my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;
842              
843 0         0 my $result;
844              
845 0 0       0 $attributes = '' unless defined $attributes;
846              
847 0   0     0 $alt_text ||= '';
848 0         0 $alt_text =~ s/"/"/g;
849             # FIXME - how about >
850              
851 0 0 0     0 if ( !defined $url && defined $self->{_urls}{$link_id}) {
852 0         0 $url = $self->{_urls}{$link_id};
853             }
854              
855             # If there's no such link ID, leave intact:
856 0 0       0 return $whole_match unless defined $url;
857              
858 0         0 $url =~ s! \* !$g_escape_table{'*'}!ogx; # We've got to encode these to avoid
859 0         0 $url =~ s! _ !$g_escape_table{'_'}!ogx; # conflicting with italics/bold.
860 0         0 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
861              
862 0 0 0     0 if (!defined $title && length $link_id && defined $self->{_titles}{$link_id} && length $self->{_titles}{$link_id}) {
      0        
      0        
863 0         0 $title = $self->{_titles}{$link_id};
864             }
865              
866 0         0 $result = qq{$alt_text
867 0 0 0     0 if (defined $title && length $title) {
868 0         0 $title =~ s! \* !$g_escape_table{'*'}!ogx;
869 0         0 $title =~ s! _ !$g_escape_table{'_'}!ogx;
870 0         0 $title =~ s/"/"/g;
871 0         0 $result .= qq{ title="$title"};
872             }
873 0         0 $result .= $attributes . $self->{empty_element_suffix};
874              
875 0         0 return $result;
876             }
877              
878             sub _DoHeaders {
879 67     67   99 my ($self, $text) = @_;
880              
881             # Setext-style headers:
882             # Header 1
883             # ========
884             #
885             # Header 2
886             # --------
887             #
888 67         122 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
889 1         5 $self->_GenerateHeader('1', $1);
890             }egmx;
891              
892 67         2919 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
893 0         0 $self->_GenerateHeader('2', $1);
894             }egmx;
895              
896              
897             # atx-style headers:
898             # # Header 1
899             # ## Header 2
900             # ## Header 2 with closing hashes ##
901             # ...
902             # ###### Header 6
903             #
904 67         74 my $l;
905 67         164 $text =~ s{
906             ^(\#{1,6}) # $1 = string of #'s
907             [ \t]*
908             (.+?) # $2 = Header text
909             [ \t]*
910             \#* # optional closing #'s (not counted)
911             \n+
912             }{
913 4         11 my $h_level = length($1);
914 4         11 $self->_GenerateHeader($h_level, $2);
915             }egmx;
916              
917 67         146 return $text;
918             }
919              
920             sub _GenerateHeader {
921 5     5   15 my ($self, $level, $id) = @_;
922              
923 5         19 return "" . $self->_RunSpanGamut($id) . "\n\n";
924             }
925              
926             sub _DoLists {
927             #
928             # Form HTML ordered (numbered) and unordered (bulleted) lists.
929             #
930 111     111   203 my ($self, $text) = @_;
931 111         201 my $less_than_tab = $self->{tab_width} - 1;
932              
933             # Re-usable patterns to match list item bullets and number markers:
934 111         366 my $marker_ul = qr/[*+-]/;
935 111         307 my $marker_ol = qr/\d+[.]/;
936 111         779 my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
937              
938             # Re-usable pattern to match any entirel ul or ol list:
939 111         1653 my $whole_list = qr{
940             ( # $1 = whole list
941             ( # $2
942             [ ]{0,$less_than_tab}
943             (${marker_any}) # $3 = first list item marker
944             [ \t]+
945             )
946             (?s:.+?)
947             ( # $4
948             \z
949             |
950             \n{2,}
951             (?=\S)
952             (?! # Negative lookahead for another list item marker
953             [ \t]*
954             ${marker_any}[ \t]+
955             )
956             )
957             )
958             }mx;
959              
960             # We use a different prefix before nested lists than top-level lists.
961             # See extended comment in _ProcessListItems().
962             #
963             # Note: There's a bit of duplication here. My original implementation
964             # created a scalar regex pattern as the conditional result of the test on
965             # $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
966             # substitution once, using the scalar as the pattern. This worked,
967             # everywhere except when running under MT on my hosting account at Pair
968             # Networks. There, this caused all rebuilds to be killed by the reaper (or
969             # perhaps they crashed, but that seems incredibly unlikely given that the
970             # same script on the same server ran fine *except* under MT. I've spent
971             # more time trying to figure out why this is happening than I'd like to
972             # admit. My only guess, backed up by the fact that this workaround works,
973             # is that Perl optimizes the substition when it can figure out that the
974             # pattern will never change, and when this optimization isn't on, we run
975             # afoul of the reaper. Thus, the slightly redundant code to that uses two
976             # static s/// patterns rather than one conditional pattern.
977              
978 111 100       317 if ($self->{_list_level}) {
979 47         758 $text =~ s{
980             ^
981             $whole_list
982             }{
983 9         20 my $list = $1;
984 9         15 my $marker = $3;
985 9 100       43 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
986             # Turn double returns into triple returns, so that we can make a
987             # paragraph for the last item in a list, if necessary:
988 9         24 $list =~ s/\n{2,}/\n\n\n/g;
989 9 100       40 my $result = ( $list_type eq 'ul' ) ?
990             $self->_ProcessListItemsUL($list, $marker_ul)
991             : $self->_ProcessListItemsOL($list, $marker_ol);
992              
993 9         24 $result = $self->_MakeList($list_type, $result, $marker);
994 9         38 $result;
995             }egmx;
996             }
997             else {
998 64         2420 $text =~ s{
999             (?:(?<=\n\n)|\A\n?)
1000             $whole_list
1001             }{
1002 13         46 my $list = $1;
1003 13         28 my $marker = $3;
1004 13 100       84 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
1005             # Turn double returns into triple returns, so that we can make a
1006             # paragraph for the last item in a list, if necessary:
1007 13         79 $list =~ s/\n{2,}/\n\n\n/g;
1008 13 100       82 my $result = ( $list_type eq 'ul' ) ?
1009             $self->_ProcessListItemsUL($list, $marker_ul)
1010             : $self->_ProcessListItemsOL($list, $marker_ol);
1011 13         47 $result = $self->_MakeList($list_type, $result, $marker);
1012 13         4737 $result;
1013             }egmx;
1014             }
1015              
1016              
1017 111         603 return $text;
1018             }
1019              
1020             sub _MakeList {
1021 22     22   39 my ($self, $list_type, $content, $marker) = @_;
1022              
1023 22 100 100     107 if ($list_type eq 'ol' and $self->{trust_list_start_value}) {
1024 2         11 my ($num) = $marker =~ /^(\d+)[.]/;
1025 2         9 return "
    \n" . $content . "
\n";
1026             }
1027              
1028 20         75 return "<$list_type>\n" . $content . "\n";
1029             }
1030              
1031             sub _ProcessListItemsOL {
1032             #
1033             # Process the contents of a single ordered list, splitting it
1034             # into individual list items.
1035             #
1036              
1037 6     6   14 my ($self, $list_str, $marker_any) = @_;
1038              
1039              
1040             # The $self->{_list_level} global keeps track of when we're inside a list.
1041             # Each time we enter a list, we increment it; when we leave a list,
1042             # we decrement. If it's zero, we're not in a list anymore.
1043             #
1044             # We do this because when we're not inside a list, we want to treat
1045             # something like this:
1046             #
1047             # I recommend upgrading to version
1048             # 8. Oops, now this line is treated
1049             # as a sub-list.
1050             #
1051             # As a single paragraph, despite the fact that the second line starts
1052             # with a digit-period-space sequence.
1053             #
1054             # Whereas when we're inside a list (or sub-list), that line will be
1055             # treated as the start of a sub-list. What a kludge, huh? This is
1056             # an aspect of Markdown's syntax that's hard to parse perfectly
1057             # without resorting to mind-reading. Perhaps the solution is to
1058             # change the syntax rules such that sub-lists must start with a
1059             # starting cardinal number; e.g. "1." or "a.".
1060              
1061 6         12 $self->{_list_level}++;
1062              
1063             # trim trailing blank lines:
1064 6         20 $list_str =~ s/\n{2,}\z/\n/;
1065              
1066              
1067 6         206 $list_str =~ s{
1068             (\n)? # leading line = $1
1069             (^[ \t]*) # leading whitespace = $2
1070             ($marker_any) [ \t]+ # list marker = $3
1071             ((?s:.+?) # list item text = $4
1072             (\n{1,2}))
1073             (?= \n* (\z | \2 ($marker_any) [ \t]+))
1074             }{
1075 10         26 my $item = $4;
1076 10         16 my $leading_line = $1;
1077 10         18 my $leading_space = $2;
1078              
1079 10 50 33     67 if ($leading_line or ($item =~ m/\n{2,}/)) {
1080 0         0 $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
1081             }
1082             else {
1083             # Recursion for sub-lists:
1084 10         29 $item = $self->_DoLists($self->_Outdent($item));
1085 10         24 chomp $item;
1086 10         81 $item = $self->_RunSpanGamut($item);
1087             }
1088              
1089 10         55 "
  • " . $item . "
  • \n";
    1090             }egmxo;
    1091              
    1092 6         19 $self->{_list_level}--;
    1093 6         19 return $list_str;
    1094             }
    1095              
    1096             sub _ProcessListItemsUL {
    1097             #
    1098             # Process the contents of a single unordered list, splitting it
    1099             # into individual list items.
    1100             #
    1101              
    1102 16     16   37 my ($self, $list_str, $marker_any) = @_;
    1103              
    1104              
    1105             # The $self->{_list_level} global keeps track of when we're inside a list.
    1106             # Each time we enter a list, we increment it; when we leave a list,
    1107             # we decrement. If it's zero, we're not in a list anymore.
    1108             #
    1109             # We do this because when we're not inside a list, we want to treat
    1110             # something like this:
    1111             #
    1112             # I recommend upgrading to version
    1113             # 8. Oops, now this line is treated
    1114             # as a sub-list.
    1115             #
    1116             # As a single paragraph, despite the fact that the second line starts
    1117             # with a digit-period-space sequence.
    1118             #
    1119             # Whereas when we're inside a list (or sub-list), that line will be
    1120             # treated as the start of a sub-list. What a kludge, huh? This is
    1121             # an aspect of Markdown's syntax that's hard to parse perfectly
    1122             # without resorting to mind-reading. Perhaps the solution is to
    1123             # change the syntax rules such that sub-lists must start with a
    1124             # starting cardinal number; e.g. "1." or "a.".
    1125              
    1126 16         27 $self->{_list_level}++;
    1127              
    1128             # trim trailing blank lines:
    1129 16         57 $list_str =~ s/\n{2,}\z/\n/;
    1130              
    1131              
    1132 16         245 $list_str =~ s{
    1133             (\n)? # leading line = $1
    1134             (^[ \t]*) # leading whitespace = $2
    1135             ($marker_any) [ \t]+ # list marker = $3
    1136             ((?s:.+?) # list item text = $4
    1137             (\n{1,2}))
    1138             (?= \n* (\z | \2 ($marker_any) [ \t]+))
    1139             }{
    1140 37         89 my $item = $4;
    1141 37         50 my $leading_line = $1;
    1142 37         49 my $leading_space = $2;
    1143              
    1144 37 100 66     178 if ($leading_line or ($item =~ m/\n{2,}/)) {
    1145 3         8 $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
    1146             }
    1147             else {
    1148             # Recursion for sub-lists:
    1149 34         88 $item = $self->_DoLists($self->_Outdent($item));
    1150 34         72 chomp $item;
    1151 34         89 $item = $self->_RunSpanGamut($item);
    1152             }
    1153              
    1154 37         250 "
  • " . $item . "
  • \n";
    1155             }egmxo;
    1156              
    1157 16         37 $self->{_list_level}--;
    1158 16         39 return $list_str;
    1159             }
    1160              
    1161             sub _DoCodeBlocks {
    1162             #
    1163             # Process Markdown code blocks (indented with 4 spaces or 1 tab):
    1164             # * outdent the spaces/tab
    1165             # * encode <, >, & into HTML entities
    1166             # * escape Markdown special characters into MD5 hashes
    1167             # * trim leading and trailing newlines
    1168             #
    1169              
    1170 67     67   105 my ($self, $text) = @_;
    1171              
    1172 67         2087 $text =~ s{
    1173             (?:\n\n|\A)
    1174             ( # $1 = the code block -- one or more lines, starting with a space/tab
    1175             (?:
    1176             (?:[ ]{$self->{tab_width}} | \t) # Lines must start with a tab or a tab-width of spaces
    1177             .*\n+
    1178             )+
    1179             )
    1180             ((?=^[ ]{0,$self->{tab_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
    1181             }{
    1182 90         207 my $codeblock = $1;
    1183 90         99 my $result; # return value
    1184              
    1185 90         206 $codeblock = $self->_EncodeCode($self->_Outdent($codeblock));
    1186 90         241 $codeblock = $self->_Detab($codeblock);
    1187 90         140 $codeblock =~ s/\A\n+//; # trim leading newlines
    1188 90         388 $codeblock =~ s/\n+\z//; # trim trailing newlines
    1189              
    1190 90         206 $result = "\n\n
    " . $codeblock . "\n
    \n\n";
    1191              
    1192 90         2959 $result;
    1193             }egmx;
    1194              
    1195 67         271 return $text;
    1196             }
    1197              
    1198             sub _DoCodeSpans {
    1199             #
    1200             # * Backtick quotes are used for spans.
    1201             #
    1202             # * You can use multiple backticks as the delimiters if you want to
    1203             # include literal backticks in the code span. So, this input:
    1204             #
    1205             # Just type ``foo `bar` baz`` at the prompt.
    1206             #
    1207             # Will translate to:
    1208             #
    1209             #

    Just type foo `bar` baz at the prompt.

    1210             #
    1211             # There's no arbitrary limit to the number of backticks you
    1212             # can use as delimters. If you need three consecutive backticks
    1213             # in your code, use four for delimiters, etc.
    1214             #
    1215             # * You can use spaces to get literal backticks at the edges:
    1216             #
    1217             # ... type `` `bar` `` ...
    1218             #
    1219             # Turns to:
    1220             #
    1221             # ... type `bar` ...
    1222             #
    1223              
    1224 261     261   517 my ($self, $text) = @_;
    1225              
    1226 261         727 $text =~ s@
    1227             (?
    1228             (`+) # $1 = Opening run of `
    1229             (.+?) # $2 = The code block
    1230             (?
    1231             \1 # Matching closer
    1232             (?!`)
    1233             @
    1234 55         381 my $c = "$2";
    1235 55         353 $c =~ s/^[ \t]*//g; # leading whitespace
    1236 55         427 $c =~ s/[ \t]*$//g; # trailing whitespace
    1237 55         204 $c = $self->_EncodeCode($c);
    1238 55         304 "$c";
    1239             @egsx;
    1240              
    1241 261         557 return $text;
    1242             }
    1243              
    1244             sub _EncodeCode {
    1245             #
    1246             # Encode/escape certain characters inside Markdown code runs.
    1247             # The point is that in code, these characters are literals,
    1248             # and lose their special Markdown meanings.
    1249             #
    1250 145     145   213 my $self = shift;
    1251 145         297 local $_ = shift;
    1252              
    1253             # Encode all ampersands; HTML entities are not
    1254             # entities within a Markdown code span.
    1255 145         293 s/&/&/g;
    1256              
    1257             # Encode $'s, but only if we're running under Blosxom.
    1258             # (Blosxom interpolates Perl variables in article bodies.)
    1259             {
    1260 13     13   208 no warnings 'once';
      13         29  
      13         50075  
      145         173  
    1261 145 50       520 if (defined($blosxom::version)) {
    1262 0         0 s/\$/$/g;
    1263             }
    1264             }
    1265              
    1266              
    1267             # Do the angle bracket song and dance:
    1268 145         508 s! < !<!gx;
    1269 145         983 s! > !>!gx;
    1270              
    1271             # Now, escape characters that are magic in Markdown:
    1272 145         316 s! \* !$g_escape_table{'*'}!ogx;
    1273 145         220 s! _ !$g_escape_table{'_'}!ogx;
    1274 145         280 s! { !$g_escape_table{'{'}!ogx;
    1275 145         190 s! } !$g_escape_table{'}'}!ogx;
    1276 145         384 s! \[ !$g_escape_table{'['}!ogx;
    1277 145         273 s! \] !$g_escape_table{']'}!ogx;
    1278 145         317 s! \\ !$g_escape_table{'\\'}!ogx;
    1279              
    1280 145         640 return $_;
    1281             }
    1282              
    1283             sub _DoItalicsAndBold {
    1284 261     261   365 my ($self, $text) = @_;
    1285              
    1286             # Handle at beginning of lines:
    1287 261         671 $text =~ s{ ^(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
    1288             {$2}gsx;
    1289              
    1290 261         585 $text =~ s{ ^(\*|_) (?=\S) (.+?) (?<=\S) \1 }
    1291             {$2}gsx;
    1292              
    1293             # must go first:
    1294 261         3622 $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
    1295             {$2}gsx;
    1296              
    1297 261         4263 $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }
    1298             {$2}gsx;
    1299              
    1300             # And now, a second pass to catch nested strong and emphasis special cases
    1301 261         3733 $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
    1302             {$2}gsx;
    1303              
    1304 261         3734 $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }
    1305             {$2}gsx;
    1306              
    1307 261         639 return $text;
    1308             }
    1309              
    1310             sub _DoBlockQuotes {
    1311 67     67   114 my ($self, $text) = @_;
    1312              
    1313 67         305 $text =~ s{
    1314             ( # Wrap whole match in $1
    1315             (
    1316             ^[ \t]*>[ \t]? # '>' at the start of a line
    1317             .+\n # rest of the first line
    1318             (.+\n)* # subsequent consecutive lines
    1319             \n* # blanks
    1320             )+
    1321             )
    1322             }{
    1323 3         8 my $bq = $1;
    1324 3         12 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
    1325 3         7 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
    1326 3         16 $bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1}); # recurse
    1327              
    1328 3         12 $bq =~ s/^/ /mg;
    1329             # These leading spaces screw with
     content, so we need to fix that: 
    1330 3         5 $bq =~ s{
    1331             (\s*
    .+?
    )
    1332             }{
    1333 0         0 my $pre = $1;
    1334 0         0 $pre =~ s/^ //mg;
    1335 0         0 $pre;
    1336             }egsx;
    1337              
    1338 3         15 "
    \n$bq\n
    \n\n";
    1339             }egmx;
    1340              
    1341              
    1342 67         119 return $text;
    1343             }
    1344              
    1345             sub _FormParagraphs {
    1346             #
    1347             # Params:
    1348             # $text - string to process with html

    tags

    1349             #
    1350 67     67   113 my ($self, $text, $options) = @_;
    1351              
    1352             # Strip leading and trailing lines:
    1353 67         182 $text =~ s/\A\n+//;
    1354 67         413 $text =~ s/\n+\z//;
    1355              
    1356 67         692 my @grafs = split(/\n{2,}/, $text);
    1357              
    1358             #
    1359             # Wrap

    tags.

    1360             #
    1361 67         150 foreach (@grafs) {
    1362 378 100       2012 unless (defined( $self->{_html_blocks}{$_} )) {
    1363 212         787 $_ = $self->_RunSpanGamut($_);
    1364 212 100       786 if ($options->{wrap_in_p_tags}) {
    1365 211         1217 s/^([ \t]*)/

    /;

    1366 211         623 $_ .= "

    ";
    1367             }
    1368             }
    1369             }
    1370              
    1371             #
    1372             # Unhashify HTML blocks
    1373             #
    1374 67         128 foreach (@grafs) {
    1375 378 100       1120 if (defined( $self->{_html_blocks}{$_} )) {
    1376 166         389 $_ = $self->{_html_blocks}{$_};
    1377             }
    1378             }
    1379              
    1380 67         349 return join "\n\n", @grafs;
    1381             }
    1382              
    1383             sub _EncodeAmpsAndAngles {
    1384             # Smart processing for ampersands and angle brackets that need to be encoded.
    1385              
    1386 284     284   536 my ($self, $text) = @_;
    1387 284 50 33     1176 return '' if (!defined $text or !length $text);
    1388              
    1389             # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    1390             # http://bumppo.net/projects/amputator/
    1391 284         718 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
    1392              
    1393             # Encode naked <'s
    1394 284         538 $text =~ s{<(?![a-z/?\$!])}{<}gi;
    1395              
    1396             # And >'s - added by Fletcher Penney
    1397             # $text =~ s{>(?![a-z/?\$!])}{>}gi;
    1398             # Causes problems...
    1399              
    1400             # Remove encoding inside comments
    1401 284         4029 $text =~ s{
    1402             (?<=) # End comments
    1405             }{
    1406 1         3 my $t = $1;
    1407 1         4 $t =~ s/&/&/g;
    1408 1         3 $t =~ s/</
    1409 1         4 $t;
    1410             }egsx;
    1411              
    1412 284         776 return $text;
    1413             }
    1414              
    1415             sub _EncodeBackslashEscapes {
    1416             #
    1417             # Parameter: String.
    1418             # Returns: The string, with after processing the following backslash
    1419             # escape sequences.
    1420             #
    1421 413     413   536 my $self = shift;
    1422 413         606 local $_ = shift;
    1423              
    1424 413         826 s! \\\\ !$g_escape_table{'\\'}!ogx; # Must process escaped backslashes first.
    1425 413         1008 s! \\` !$g_escape_table{'`'}!ogx;
    1426 413         602 s! \\\* !$g_escape_table{'*'}!ogx;
    1427 413         644 s! \\_ !$g_escape_table{'_'}!ogx;
    1428 413         863 s! \\\{ !$g_escape_table{'{'}!ogx;
    1429 413         598 s! \\\} !$g_escape_table{'}'}!ogx;
    1430 413         559 s! \\\[ !$g_escape_table{'['}!ogx;
    1431 413         482 s! \\\] !$g_escape_table{']'}!ogx;
    1432 413         603 s! \\\( !$g_escape_table{'('}!ogx;
    1433 413         713 s! \\\) !$g_escape_table{')'}!ogx;
    1434 413         579 s! \\> !$g_escape_table{'>'}!ogx;
    1435 413         456 s! \\\# !$g_escape_table{'#'}!ogx;
    1436 413         565 s! \\\+ !$g_escape_table{'+'}!ogx;
    1437 413         486 s! \\\- !$g_escape_table{'-'}!ogx;
    1438 413         757 s! \\\. !$g_escape_table{'.'}!ogx;
    1439 413         496 s{ \\! }{$g_escape_table{'!'}}ogx;
    1440              
    1441 413         1095 return $_;
    1442             }
    1443              
    1444             sub _DoAutoLinks {
    1445 261     261   386 my ($self, $text) = @_;
    1446              
    1447 261         450 $text =~ s{<((https?|ftp):[^'">\s]+)>}{$1}gi;
    1448              
    1449             # Email addresses:
    1450 261         362 $text =~ s{
    1451             <
    1452             (?:mailto:)?
    1453             (
    1454             [-.\w\+]+
    1455             \@
    1456             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
    1457             )
    1458             >
    1459             }{
    1460 1         4 $self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) );
    1461             }egix;
    1462              
    1463 261         471 return $text;
    1464             }
    1465              
    1466             sub _EncodeEmailAddress {
    1467             #
    1468             # Input: an email address, e.g. "foo@example.com"
    1469             #
    1470             # Output: the email address as a mailto link, with each character
    1471             # of the address encoded as either a decimal or hex entity, in
    1472             # the hopes of foiling most address harvesting spam bots. E.g.:
    1473             #
    1474             # 1475             # xample.com">foo
    1476             # @example.com
    1477             #
    1478             # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
    1479             # mailing list:
    1480             #
    1481              
    1482 1     1   2 my ($self, $addr) = @_;
    1483              
    1484             my @encode = (
    1485 7     7   16 sub { '&#' . ord(shift) . ';' },
    1486 5     5   17 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
    1487 3     3   6 sub { shift },
    1488 1         17 );
    1489              
    1490 1         3 $addr = "mailto:" . $addr;
    1491              
    1492 1         4 $addr =~ s{(.)}{
    1493 16         35 my $char = $1;
    1494 16 100       41 if ( $char eq '@' ) {
        100          
    1495             # this *must* be encoded. I insist.
    1496 1         3 $char = $encode[int rand 1]->($char);
    1497             }
    1498             elsif ( $char ne ':' ) {
    1499             # leave ':' alone (to spot mailto: later)
    1500 14         79 my $r = rand;
    1501             # roughly 10% raw, 45% hex, 45% dec
    1502 14 100       50 $char = (
        100          
    1503             $r > .9 ? $encode[2]->($char) :
    1504             $r < .45 ? $encode[1]->($char) :
    1505             $encode[0]->($char)
    1506             );
    1507             }
    1508 16         43 $char;
    1509             }gex;
    1510              
    1511 1         6 $addr = qq{$addr};
    1512 1         7 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
    1513              
    1514 1         8 return $addr;
    1515             }
    1516              
    1517             sub _UnescapeSpecialChars {
    1518             #
    1519             # Swap back in all the special characters we've hidden.
    1520             #
    1521 49     49   80 my ($self, $text) = @_;
    1522              
    1523 49         226 while( my($char, $hash) = each(%g_escape_table) ) {
    1524 784         9145 $text =~ s/$hash/$char/g;
    1525             }
    1526 49         121 return $text;
    1527             }
    1528              
    1529             sub _TokenizeHTML {
    1530             #
    1531             # Parameter: String containing HTML markup.
    1532             # Returns: Reference to an array of the tokens comprising the input
    1533             # string. Each token is either a tag (possibly with nested,
    1534             # tags contained therein, such as , or a
    1535             # run of text between tags. Each element of the array is a
    1536             # two-element array; the first is either 'tag' or 'text';
    1537             # the second is the actual value.
    1538             #
    1539             #
    1540             # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    1541             #
    1542             #
    1543              
    1544 522     522   707 my ($self, $str) = @_;
    1545 522         601 my $pos = 0;
    1546 522         886 my $len = length $str;
    1547 522         702 my @tokens;
    1548              
    1549 522         617 my $depth = 6;
    1550 522         2425 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
    1551 522         4246 my $match = qr/(?s: ) | # comment
    1552             (?s: <\? .*? \?> ) | # processing instruction
    1553             $nested_tags/iox; # nested tags
    1554              
    1555 522         5082 while ($str =~ m/($match)/og) {
    1556 400         717 my $whole_tag = $1;
    1557 400         459 my $sec_start = pos $str;
    1558 400         496 my $tag_start = $sec_start - length $whole_tag;
    1559 400 100       741 if ($pos < $tag_start) {
    1560 332         1716 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
    1561             }
    1562 400         956 push @tokens, ['tag', $whole_tag];
    1563 400         2596 $pos = pos $str;
    1564             }
    1565 522 100       2534 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
    1566 522         3059 \@tokens;
    1567             }
    1568              
    1569             sub _Outdent {
    1570             #
    1571             # Remove one level of line-leading tabs or spaces
    1572             #
    1573 137     137   227 my ($self, $text) = @_;
    1574              
    1575 137         1146 $text =~ s/^(\t|[ ]{1,$self->{tab_width}})//gm;
    1576 137         519 return $text;
    1577             }
    1578              
    1579             sub _Detab {
    1580             #
    1581             # Cribbed from a post by Bart Lateur:
    1582             #
    1583             #
    1584 138     138   275 my ($self, $text) = @_;
    1585              
    1586             # FIXME - Better anchor/regex would be quicker.
    1587              
    1588             # Original:
    1589             #$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;
    1590              
    1591             # Much swifter, but pretty hateful:
    1592 138         4135 do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge);
      27         43115  
    1593 138         406 return $text;
    1594             }
    1595              
    1596             sub _ConvertCopyright {
    1597 48     48   85 my ($self, $text) = @_;
    1598             # Convert to an XML compatible form of copyright symbol
    1599              
    1600 48         270 $text =~ s/©/©/gi;
    1601              
    1602 48         107 return $text;
    1603             }
    1604              
    1605             1;
    1606              
    1607             __END__