File Coverage

blib/lib/Markdown/To/POD.pm
Criterion Covered Total %
statement 279 496 56.2
branch 23 98 23.4
condition 9 61 14.7
subroutine 42 55 76.3
pod 3 3 100.0
total 356 713 49.9


line stmt bran cond sub pod time code
1             package Markdown::To::POD;
2              
3             our $DATE = '2014-07-28'; # DATE
4             our $VERSION = '0.04'; # VERSION
5             # ABSTRACT: Convert Markdown syntax to POD
6              
7 1     1   1466 use 5.010001;
  1         3  
  1         44  
8 1     1   5 use strict;
  1         2  
  1         30  
9 1     1   6 use warnings;
  1         1  
  1         28  
10 1     1   5 use re 'eval';
  1         2  
  1         52  
11              
12 1     1   6 use Digest::MD5 qw(md5_hex);
  1         9  
  1         74  
13 1     1   2303 use Encode qw();
  1         19989  
  1         28  
14 1     1   9 use Carp qw(croak);
  1         3  
  1         75  
15 1     1   5 use base 'Exporter';
  1         2  
  1         1346  
16              
17             our @EXPORT_OK = qw(markdown_to_pod);
18              
19              
20             # Regex to match balanced [brackets]. See Friedl's
21             # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
22             our ($g_nested_brackets, $g_nested_parens);
23             $g_nested_brackets = qr{
24             (?> # Atomic matching
25             [^\[\]]+ # Anything other than brackets
26             |
27             \[
28             (??{ $g_nested_brackets }) # Recursive set of nested brackets
29             \]
30             )*
31             }x;
32             # Doesn't allow for whitespace, because we're using it to match URLs:
33             $g_nested_parens = qr{
34             (?> # Atomic matching
35             [^()\s]+ # Anything other than parens or whitespace
36             |
37             \(
38             (??{ $g_nested_parens }) # Recursive set of nested brackets
39             \)
40             )*
41             }x;
42              
43             # Table of hash values for escaped characters:
44             our %g_escape_table;
45             foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
46             $g_escape_table{$char} = md5_hex($char);
47             }
48              
49              
50             sub new {
51 3     3 1 6 my ($class, %p) = @_;
52              
53 3   50     20 $p{base_url} ||= ''; # This is the base URL to be used for WikiLinks
54              
55 3 50 33     20 $p{tab_width} = 4 unless (defined $p{tab_width} and $p{tab_width} =~ m/^\d+$/);
56              
57 3   50     16 $p{empty_element_suffix} ||= ' />'; # Change to ">" for HTML output
58              
59 3 50       8 $p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0;
60              
61 3         11 my $self = { params => \%p };
62 3   33     15 bless $self, ref($class) || $class;
63 3         7 return $self;
64             }
65              
66              
67             sub markdown_to_pod {
68 6     6 1 18 my ( $self, $text, $options ) = @_;
69              
70             # Detect functional mode, and create an instance for this run
71 6 100       18 unless (ref $self) {
72 3 50       10 if ( $self ne __PACKAGE__ ) {
73 3         13 my $ob = __PACKAGE__->new();
74             # $self is text, $text is options
75 3         12 return $ob->markdown_to_pod($self, $text);
76             }
77             else {
78 0         0 croak('Calling ' . $self . '->markdown (as a class method) is not supported.');
79             }
80             }
81              
82 3   50     14 $options ||= {};
83              
84 3         5 %$self = (%{ $self->{params} }, %$options, params => $self->{params});
  3         33  
85              
86 3         12 $self->_CleanUpRunData($options);
87              
88 3         13 return $self->_Markdown($text);
89             }
90              
91             sub _CleanUpRunData {
92 3     3   7 my ($self, $options) = @_;
93             # Clear the global hashes. If we don't clear these, you get conflicts
94             # from other articles when generating a page which contains more than
95             # one article (e.g. an index page that shows the N most recent
96             # articles).
97 3 50       42 $self->{_urls} = $options->{urls} ? $options->{urls} : {}; # FIXME - document passing this option (tested in 05options.t).
98 3         9 $self->{_titles} = {};
99 3         10 $self->{_html_blocks} = {};
100             # Used to track when we're inside an ordered or unordered list
101             # (see _ProcessListItems() for details)
102 3         8 $self->{_list_level} = 0;
103              
104             }
105              
106             sub _Markdown {
107             #
108             # Main function. The order in which other subs are called here is
109             # essential. Link and image substitutions need to happen before
110             # _EscapeSpecialChars(), so that any *'s or _'s in the
111             # and tags get encoded.
112             #
113 3     3   6 my ($self, $text, $options) = @_;
114              
115 3         10 $text = $self->_CleanUpDoc($text);
116              
117             # Turn block-level HTML elements into hash entries, and interpret markdown in them if they have a 'markdown="1"' attribute
118 3         18 $text = $self->_HashHTMLBlocks($text, {interpret_markdown_on_attribute => 1});
119              
120 3         28 $text = $self->_StripLinkDefinitions($text);
121              
122 3         19 $text = $self->_RunBlockGamut($text, {wrap_in_p_tags => 1});
123              
124 3         12 $text = $self->_UnescapeSpecialChars($text);
125              
126 3         9 $text = $self->_ConvertCopyright($text);
127              
128 3         46 return $text . "\n";
129             }
130              
131              
132             sub urls {
133 0     0 1 0 my ( $self ) = @_;
134              
135 0         0 return $self->{_urls};
136             }
137              
138             sub _CleanUpDoc {
139 3     3   8 my ($self, $text) = @_;
140              
141             # Standardize line endings:
142 3         8 $text =~ s{\r\n}{\n}g; # DOS to Unix
143 3         6 $text =~ s{\r}{\n}g; # Mac to Unix
144              
145             # Make sure $text ends with a couple of newlines:
146 3         5 $text .= "\n\n";
147              
148             # Convert all tabs to spaces.
149 3         11 $text = $self->_Detab($text);
150              
151             # Strip any lines consisting only of spaces and tabs.
152             # This makes subsequent regexen easier to write, because we can
153             # match consecutive blank lines with /\n+/ instead of something
154             # contorted like /[ \t]*\n+/ .
155 3         9 $text =~ s/^[ \t]+$//mg;
156              
157 3         7 return $text;
158             }
159              
160             sub _StripLinkDefinitions {
161             #
162             # Strips link definitions from text, stores the URLs and titles in
163             # hash references.
164             #
165 3     3   7 my ($self, $text) = @_;
166 3         8 my $less_than_tab = $self->{tab_width} - 1;
167              
168             # Link defs are in the form: ^[id]: url "optional title"
169 3         64 while ($text =~ s{
170             ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
171             [ \t]*
172             \n? # maybe *one* newline
173             [ \t]*
174             ? # url = \$2
175             [ \t]*
176             \n? # maybe one newline
177             [ \t]*
178             (?:
179             (?<=\s) # lookbehind for whitespace
180             ["(]
181             (.+?) # title = \$3
182             [")]
183             [ \t]*
184             )? # title is optional
185             (?:\n+|\Z)
186             }{}omx) {
187 0         0 $self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
188 0 0       0 if ($3) {
189 0         0 $self->{_titles}{lc $1} = $3;
190 0         0 $self->{_titles}{lc $1} =~ s/"/"/g;
191             }
192              
193             }
194              
195 3         8 return $text;
196             }
197              
198             sub _md5_utf8 {
199             # Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation.
200 0     0   0 my $input = shift;
201 0 0       0 return unless defined $input;
202 0 0       0 if (Encode::is_utf8 $input) {
203 0         0 return md5_hex(Encode::encode('utf8', $input));
204             }
205             else {
206 0         0 return md5_hex($input);
207             }
208             }
209              
210             sub _HashHTMLBlocks {
211 6     6   12 my ($self, $text, $options) = @_;
212 6         11 my $less_than_tab = $self->{tab_width} - 1;
213              
214             # Hashify HTML blocks (protect from further interpretation by encoding to an md5):
215             # We only want to do this for block-level HTML tags, such as headers,
216             # lists, and tables. That's because we still want to wrap

s around

217             # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
218             # phrase emphasis, and spans. The list of tags we're looking for is
219             # hard-coded:
220 6         21 my $block_tags = qr{
221             (?:
222             p | div | h[1-6] | blockquote | pre | table |
223             dl | ol | ul | script | noscript | form |
224             fieldset | iframe | math | ins | del
225             )
226             }x;
227              
228 6         18 my $tag_attrs = qr{
229             (?: # Match one attr name/value pair
230             \s+ # There needs to be at least some whitespace
231             # before each attribute name.
232             [\w.:_-]+ # Attribute name
233             \s*=\s*
234             (?:
235             ".+?" # "Attribute value"
236             |
237             '.+?' # 'Attribute value'
238             |
239             [^\s]+? # AttributeValue (HTML5)
240             )
241             )* # Zero or more
242             }x;
243              
244 6         141 my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
245 6         113 my $open_tag = qr{< $block_tags $tag_attrs \s* >}oxms;
246 6         10 my $close_tag = undef; # let Text::Balanced handle this
247 6         7 my $prefix_pattern = undef; # Text::Balanced
248 6         17 my $markdown_attr = qr{ \s* markdown \s* = \s* (['"]) (.*?) \1 }xs;
249              
250 1     1   1233 use Text::Balanced qw(gen_extract_tagged);
  1         36049  
  1         4229  
251 6         45 my $extract_block = gen_extract_tagged($open_tag, $close_tag, $prefix_pattern, { ignore => [$empty_tag] });
252              
253 6         728 my @chunks;
254             # parse each line, looking for block-level HTML tags
255 6         87 while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
256 12         31 my $cur_line = $1;
257 12 50       26 if (defined $2) {
258             # current line could be start of code block
259              
260 0         0 my ($tag, $remainder, $prefix, $opening_tag, $text_in_tag, $closing_tag) = $extract_block->($cur_line . $text);
261 0 0       0 if ($tag) {
262 0 0 0     0 if ($options->{interpret_markdown_on_attribute} and $opening_tag =~ s/$markdown_attr//i) {
263 0         0 my $markdown = $2;
264 0 0       0 if ($markdown =~ /^(1|on|yes)$/) {
265             # interpret markdown and reconstruct $tag to include the interpreted $text_in_tag
266 0         0 my $wrap_in_p_tags = $opening_tag =~ /^<(div|iframe)/;
267 0         0 $tag = $prefix . $opening_tag . "\n"
268             . $self->_RunBlockGamut($text_in_tag, {wrap_in_p_tags => $wrap_in_p_tags})
269             . "\n" . $closing_tag
270             ;
271             } else {
272             # just remove the markdown="0" attribute
273 0         0 $tag = $prefix . $opening_tag . $text_in_tag . $closing_tag;
274             }
275             }
276 0         0 my $key = _md5_utf8($tag);
277 0         0 $self->{_html_blocks}{$key} = $tag;
278 0         0 push @chunks, "\n\n" . $key . "\n\n";
279 0         0 $text = $remainder;
280             }
281             else {
282             # No tag match, so toss $cur_line into @chunks
283 0         0 push @chunks, $cur_line;
284             }
285             }
286             else {
287             # current line could NOT be start of code block
288 12         78 push @chunks, $cur_line;
289             }
290              
291             }
292 6         12 push @chunks, $text; # whatever is left
293              
294 6         13 $text = join '', @chunks;
295              
296 6         81 return $text;
297             }
298              
299             sub _HashHR {
300 3     3   4 my ($self, $text) = @_;
301 3         9 my $less_than_tab = $self->{tab_width} - 1;
302              
303 3         63 $text =~ s{
304             (?:
305             (?<=\n\n) # Starting after a blank line
306             | # or
307             \A\n? # the beginning of the doc
308             )
309             ( # save in $1
310             [ ]{0,$less_than_tab}
311             <(hr) # start tag = $2
312             \b # word break
313             ([^<>])*? #
314             /?> # the matching end tag
315             [ \t]*
316             (?=\n{2,}|\Z) # followed by a blank line or end of document
317             )
318             }{
319 0         0 my $key = _md5_utf8($1);
320 0         0 $self->{_html_blocks}{$key} = $1;
321 0         0 "\n\n" . $key . "\n\n";
322             }egx;
323              
324 3         9 return $text;
325             }
326              
327             sub _HashHTMLComments {
328 3     3   6 my ($self, $text) = @_;
329 3         7 my $less_than_tab = $self->{tab_width} - 1;
330              
331             # Special case for standalone HTML comments:
332 3         52 $text =~ s{
333             (?:
334             (?<=\n\n) # Starting after a blank line
335             | # or
336             \A\n? # the beginning of the doc
337             )
338             ( # save in $1
339             [ ]{0,$less_than_tab}
340             (?s:
341            
342             (--.*?--\s*)+
343             >
344             )
345             [ \t]*
346             (?=\n{2,}|\Z) # followed by a blank line or end of document
347             )
348             }{
349 0         0 my $key = _md5_utf8($1);
350 0         0 $self->{_html_blocks}{$key} = $1;
351 0         0 "\n\n" . $key . "\n\n";
352             }egx;
353              
354 3         10 return $text;
355             }
356              
357             sub _HashPHPASPBlocks {
358 3     3   5 my ($self, $text) = @_;
359 3         5 my $less_than_tab = $self->{tab_width} - 1;
360              
361             # PHP and ASP-style processor instructions ( and <%…%>)
362 3         62 $text =~ s{
363             (?:
364             (?<=\n\n) # Starting after a blank line
365             | # or
366             \A\n? # the beginning of the doc
367             )
368             ( # save in $1
369             [ ]{0,$less_than_tab}
370             (?s:
371             <([?%]) # $2
372             .*?
373             \2>
374             )
375             [ \t]*
376             (?=\n{2,}|\Z) # followed by a blank line or end of document
377             )
378             }{
379 0         0 my $key = _md5_utf8($1);
380 0         0 $self->{_html_blocks}{$key} = $1;
381 0         0 "\n\n" . $key . "\n\n";
382             }egx;
383 3         10 return $text;
384             }
385              
386             sub _RunBlockGamut {
387             #
388             # These are all the transformations that form block-level
389             # tags like paragraphs, headers, and list items.
390             #
391 3     3   7 my ($self, $text, $options) = @_;
392              
393             # Do headers first, as these populate cross-refs
394 3         10 $text = $self->_DoHeaders($text);
395              
396             # Do Horizontal Rules:
397 3         8 my $less_than_tab = $self->{tab_width} - 1;
398             #$text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
399             #$text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
400             #$text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n{empty_element_suffix}\n}gmx;
401 3         32 $text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{"\n" . ("=" x 72) . "\n\n"}egmx;
  0         0  
402 3         28 $text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{"\n" . ("=" x 72) . "\n\n"}egmx;
  0         0  
403 3         28 $text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{"\n" . ("=" x 72) . "\n\n"}egmx;
  0         0  
404              
405 3         12 $text = $self->_DoLists($text);
406              
407 3         12 $text = $self->_DoCodeBlocks($text);
408              
409 3         11 $text = $self->_DoBlockQuotes($text);
410              
411             # We already ran _HashHTMLBlocks() before, in Markdown(), but that
412             # was to escape raw HTML in the original Markdown source. This time,
413             # we're escaping the markup we've just created, so that we don't wrap
414             #

tags around block-level tags.

415 3         9 $text = $self->_HashHTMLBlocks($text);
416              
417             # Special case just for
. It was easier to make a special case than
418             # to make the other regex more complicated.
419 3         14 $text = $self->_HashHR($text);
420              
421 3         10 $text = $self->_HashHTMLComments($text);
422              
423 3         12 $text = $self->_HashPHPASPBlocks($text);
424              
425 3         18 $text = $self->_FormParagraphs($text, {wrap_in_p_tags => $options->{wrap_in_p_tags}});
426              
427 3         11 return $text;
428             }
429              
430             sub _RunSpanGamut {
431             #
432             # These are all the transformations that occur *within* block-level
433             # tags like paragraphs, headers, and list items.
434             #
435 3     3   6 my ($self, $text) = @_;
436              
437 3         10 $text = $self->_DoCodeSpans($text);
438 3         9 $text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
439 3         10 $text = $self->_EscapeSpecialChars($text);
440              
441             # Process anchor and image tags. Images must come first,
442             # because ![foo][f] looks like an anchor.
443 3         11 $text = $self->_DoImages($text);
444 3         10 $text = $self->_DoAnchors($text);
445              
446             # Make links out of things like ``
447             # Must come after _DoAnchors(), because you can use < and >
448             # delimiters in inline links like [this]().
449 3         11 $text = $self->_DoAutoLinks($text);
450              
451 3         8 $text = $self->_EncodeAmpsAndAngles($text);
452              
453 3         10 $text = $self->_DoItalicsAndBold($text);
454              
455             # FIXME - Is hard coding space here sane, or does this want to be related to tab width?
456             # Do hard breaks:
457 3         9 $text =~ s/ {2,}\n/ {empty_element_suffix}\n/g;
458              
459 3         11 return $text;
460             }
461              
462             sub _EscapeSpecialChars {
463 3     3   5 my ($self, $text) = @_;
464 3   33     30 my $tokens ||= $self->_TokenizeHTML($text);
465              
466 3         6 $text = ''; # rebuild $text from the tokens
467             # my $in_pre = 0; # Keep track of when we're inside
 or  tags. 
468             # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
469              
470 3         7 foreach my $cur_token (@$tokens) {
471 7 100       17 if ($cur_token->[0] eq "tag") {
472             # Within tags, encode * and _ so they don't conflict
473             # with their use in Markdown for italics and strong.
474             # We're replacing each such character with its
475             # corresponding MD5 checksum value; this is likely
476             # overkill, but it should prevent us from colliding
477             # with the escape values by accident.
478 2         5 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!ogx;
479 2         3 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!ogx;
480 2         4 $text .= $cur_token->[1];
481             } else {
482 5         7 my $t = $cur_token->[1];
483 5         13 $t = $self->_EncodeBackslashEscapes($t);
484 5         14 $text .= $t;
485             }
486             }
487 3         11 return $text;
488             }
489              
490             sub _EscapeSpecialCharsWithinTagAttributes {
491             #
492             # Within tags -- meaning between < and > -- encode [\ ` * _] so they
493             # don't conflict with their use in Markdown for code, italics and strong.
494             # We're replacing each such character with its corresponding MD5 checksum
495             # value; this is likely overkill, but it should prevent us from colliding
496             # with the escape values by accident.
497             #
498 3     3   6 my ($self, $text) = @_;
499 3   33     16 my $tokens ||= $self->_TokenizeHTML($text);
500 3         6 $text = ''; # rebuild $text from the tokens
501              
502 3         8 foreach my $cur_token (@$tokens) {
503 7 100       21 if ($cur_token->[0] eq "tag") {
504 2         5 $cur_token->[1] =~ s! \\ !$g_escape_table{'\\'}!gox;
505 2         4 $cur_token->[1] =~ s{ (?<=.)(?=.) }{$g_escape_table{'`'}}gox;
506 2         4 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gox;
507 2         3 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gox;
508             }
509 7         17 $text .= $cur_token->[1];
510             }
511 3         11 return $text;
512             }
513              
514             sub _DoAnchors {
515             #
516             # Turn Markdown link shortcuts into XHTML tags.
517             #
518 3     3   4 my ($self, $text) = @_;
519              
520             #
521             # First, handle reference-style links: [link text] [id]
522             #
523 3         127 $text =~ s{
524             ( # wrap whole match in $1
525             \[
526             ($g_nested_brackets) # link text = $2
527             \]
528              
529             [ ]? # one optional space
530             (?:\n[ ]*)? # one optional newline followed by spaces
531              
532             \[
533             (.*?) # id = $3
534             \]
535             )
536             }{
537 0         0 my $whole_match = $1;
538 0         0 my $link_text = $2;
539 0         0 my $link_id = lc $3;
540              
541 0 0       0 if ($link_id eq "") {
542 0         0 $link_id = lc $link_text; # for shortcut links like [this][].
543             }
544              
545 0         0 $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
546              
547 0         0 $self->_GenerateAnchor($whole_match, $link_text, $link_id);
548             }xsge;
549              
550             #
551             # Next, inline-style links: [link text](url "optional title")
552             #
553 3         208 $text =~ s{
554             ( # wrap whole match in $1
555             \[
556             ($g_nested_brackets) # link text = $2
557             \]
558             \( # literal paren
559             [ \t]*
560             ($g_nested_parens) # href = $3
561             [ \t]*
562             ( # $4
563             (['"]) # quote char = $5
564             (.*?) # Title = $6
565             \5 # matching quote
566             [ \t]* # ignore any spaces/tabs between closing quote and )
567             )? # title is optional
568             \)
569             )
570             }{
571 0         0 my $result;
572 0         0 my $whole_match = $1;
573 0         0 my $link_text = $2;
574 0         0 my $url = $3;
575 0         0 my $title = $6;
576              
577 0         0 $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
578             }xsge;
579              
580             #
581             # Last, handle reference-style shortcuts: [link text]
582             # These must come last in case you've also got [link test][1]
583             # or [link test](/foo)
584             #
585 3         8 $text =~ s{
586             ( # wrap whole match in $1
587             \[
588             ([^\[\]]+) # link text = $2; can't contain '[' or ']'
589             \]
590             )
591             }{
592 0         0 my $result;
593 0         0 my $whole_match = $1;
594 0         0 my $link_text = $2;
595 0         0 (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
596              
597 0         0 $self->_GenerateAnchor($whole_match, $link_text, $link_id);
598             }xsge;
599              
600 3         9 return $text;
601             }
602              
603             sub _GenerateAnchor {
604             # FIXME - Fugly, change to named params?
605 0     0   0 my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;
606              
607 0         0 my $result;
608              
609 0 0       0 $attributes = '' unless defined $attributes;
610              
611 0 0 0     0 if ( !defined $url && defined $self->{_urls}{$link_id}) {
612 0         0 $url = $self->{_urls}{$link_id};
613             }
614              
615 0 0       0 if (!defined $url) {
616 0         0 return $whole_match;
617             }
618              
619 0         0 $url =~ s! \* !$g_escape_table{'*'}!gox; # We've got to encode these to avoid
620 0         0 $url =~ s! _ !$g_escape_table{'_'}!gox; # conflicting with italics/bold.
621 0         0 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
622              
623 0         0 $result = qq{
624              
625 0 0 0     0 if ( !defined $title && defined $link_id && defined $self->{_titles}{$link_id} ) {
      0        
626 0         0 $title = $self->{_titles}{$link_id};
627             }
628              
629 0 0       0 if ( defined $title ) {
630 0         0 $title =~ s/"/"/g;
631 0         0 $title =~ s! \* !$g_escape_table{'*'}!gox;
632 0         0 $title =~ s! _ !$g_escape_table{'_'}!gox;
633 0         0 $result .= qq{ title="$title"};
634             }
635              
636             #$result .= "$attributes>$link_text";
637 0 0       0 $result = __podfmt(L => ($url . ($title ? "|$title" : "")));
638              
639 0         0 return $result;
640             }
641              
642             sub _DoImages {
643             #
644             # Turn Markdown image shortcuts into tags.
645             #
646 3     3   5 my ($self, $text) = @_;
647              
648             #
649             # First, handle reference-style labeled images: ![alt text][id]
650             #
651 3         6 $text =~ s{
652             ( # wrap whole match in $1
653             !\[
654             (.*?) # alt text = $2
655             \]
656              
657             [ ]? # one optional space
658             (?:\n[ ]*)? # one optional newline followed by spaces
659              
660             \[
661             (.*?) # id = $3
662             \]
663              
664             )
665             }{
666 0         0 my $result;
667 0         0 my $whole_match = $1;
668 0         0 my $alt_text = $2;
669 0         0 my $link_id = lc $3;
670              
671 0 0       0 if ($link_id eq '') {
672 0         0 $link_id = lc $alt_text; # for shortcut links like ![this][].
673             }
674              
675 0         0 $self->_GenerateImage($whole_match, $alt_text, $link_id);
676             }xsge;
677              
678             #
679             # Next, handle inline images: ![alt text](url "optional title")
680             # Don't forget: encode * and _
681              
682 3         184 $text =~ s{
683             ( # wrap whole match in $1
684             !\[
685             (.*?) # alt text = $2
686             \]
687             \( # literal paren
688             [ \t]*
689             ($g_nested_parens) # src url - href = $3
690             [ \t]*
691             ( # $4
692             (['"]) # quote char = $5
693             (.*?) # title = $6
694             \5 # matching quote
695             [ \t]*
696             )? # title is optional
697             \)
698             )
699             }{
700 0         0 my $result;
701 0         0 my $whole_match = $1;
702 0         0 my $alt_text = $2;
703 0         0 my $url = $3;
704 0         0 my $title = '';
705 0 0       0 if (defined($6)) {
706 0         0 $title = $6;
707             }
708              
709 0         0 $self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
710             }xsge;
711              
712 3         10 return $text;
713             }
714              
715             sub _GenerateImage {
716             # FIXME - Fugly, change to named params?
717 0     0   0 my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;
718              
719 0         0 my $result;
720              
721 0 0       0 $attributes = '' unless defined $attributes;
722              
723 0   0     0 $alt_text ||= '';
724 0         0 $alt_text =~ s/"/"/g;
725             # FIXME - how about >
726              
727 0 0 0     0 if ( !defined $url && defined $self->{_urls}{$link_id}) {
728 0         0 $url = $self->{_urls}{$link_id};
729             }
730              
731             # If there's no such link ID, leave intact:
732 0 0       0 return $whole_match unless defined $url;
733              
734 0         0 $url =~ s! \* !$g_escape_table{'*'}!ogx; # We've got to encode these to avoid
735 0         0 $url =~ s! _ !$g_escape_table{'_'}!ogx; # conflicting with italics/bold.
736 0         0 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
737              
738 0 0 0     0 if (!defined $title && length $link_id && defined $self->{_titles}{$link_id} && length $self->{_titles}{$link_id}) {
      0        
      0        
739 0         0 $title = $self->{_titles}{$link_id};
740             }
741              
742 0         0 $result = qq{$alt_text
743 0 0 0     0 if (defined $title && length $title) {
744 0         0 $title =~ s! \* !$g_escape_table{'*'}!ogx;
745 0         0 $title =~ s! _ !$g_escape_table{'_'}!ogx;
746 0         0 $title =~ s/"/"/g;
747 0         0 $result .= qq{ title="$title"};
748             }
749 0         0 $result .= $attributes . $self->{empty_element_suffix};
750              
751 0         0 $result = "\n\n=begin HTML\n\n$result\n\n=end HTML\n\n";
752              
753 0         0 return $result;
754             }
755              
756             sub _DoHeaders {
757 3     3   4 my ($self, $text) = @_;
758              
759             # Setext-style headers:
760             # Header 1
761             # ========
762             #
763             # Header 2
764             # --------
765             #
766 3         7 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
767 0         0 $self->_GenerateHeader('1', $1);
768             }egmx;
769              
770 3         5 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
771 0         0 $self->_GenerateHeader('2', $1);
772             }egmx;
773              
774              
775             # atx-style headers:
776             # # Header 1
777             # ## Header 2
778             # ## Header 2 with closing hashes ##
779             # ...
780             # ###### Header 6
781             #
782 3         4 my $l;
783 3         6 $text =~ s{
784             ^(\#{1,6}) # $1 = string of #'s
785             [ \t]*
786             (.+?) # $2 = Header text
787             [ \t]*
788             \#* # optional closing #'s (not counted)
789             \n+
790             }{
791 0         0 my $h_level = length($1);
792 0         0 $self->_GenerateHeader($h_level, $2);
793             }egmx;
794              
795 3         7 return $text;
796             }
797              
798             sub _GenerateHeader {
799 0     0   0 my ($self, $level, $id) = @_;
800              
801             #return "" . $self->_RunSpanGamut($id) . "\n\n";
802 0         0 return "=head$level " . $self->_RunSpanGamut($id) . "\n\n";
803             }
804              
805             sub _DoLists {
806             #
807             # Form HTML ordered (numbered) and unordered (bulleted) lists.
808             #
809 3     3   6 my ($self, $text) = @_;
810 3         7 my $less_than_tab = $self->{tab_width} - 1;
811              
812             # Re-usable patterns to match list item bullets and number markers:
813 3         10 my $marker_ul = qr/[*+-]/;
814 3         9 my $marker_ol = qr/\d+[.]/;
815 3         34 my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
816              
817             # Re-usable pattern to match any entirel ul or ol list:
818 3         76 my $whole_list = qr{
819             ( # $1 = whole list
820             ( # $2
821             [ ]{0,$less_than_tab}
822             (${marker_any}) # $3 = first list item marker
823             [ \t]+
824             )
825             (?s:.+?)
826             ( # $4
827             \z
828             |
829             \n{2,}
830             (?=\S)
831             (?! # Negative lookahead for another list item marker
832             [ \t]*
833             ${marker_any}[ \t]+
834             )
835             )
836             )
837             }mx;
838              
839             # We use a different prefix before nested lists than top-level lists.
840             # See extended comment in _ProcessListItems().
841             #
842             # Note: There's a bit of duplication here. My original implementation
843             # created a scalar regex pattern as the conditional result of the test on
844             # $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
845             # substitution once, using the scalar as the pattern. This worked,
846             # everywhere except when running under MT on my hosting account at Pair
847             # Networks. There, this caused all rebuilds to be killed by the reaper (or
848             # perhaps they crashed, but that seems incredibly unlikely given that the
849             # same script on the same server ran fine *except* under MT. I've spent
850             # more time trying to figure out why this is happening than I'd like to
851             # admit. My only guess, backed up by the fact that this workaround works,
852             # is that Perl optimizes the substition when it can figure out that the
853             # pattern will never change, and when this optimization isn't on, we run
854             # afoul of the reaper. Thus, the slightly redundant code to that uses two
855             # static s/// patterns rather than one conditional pattern.
856              
857 3 50       13 if ($self->{_list_level}) {
858 0         0 $text =~ s{
859             ^
860             $whole_list
861             }{
862 0         0 my $list = $1;
863 0         0 my $marker = $3;
864 0 0       0 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
865             # Turn double returns into triple returns, so that we can make a
866             # paragraph for the last item in a list, if necessary:
867 0         0 $list =~ s/\n{2,}/\n\n\n/g;
868 0 0       0 my $result = ( $list_type eq 'ul' ) ?
869             $self->_ProcessListItemsUL($list, $marker_ul)
870             : $self->_ProcessListItemsOL($list, $marker_ol);
871              
872 0         0 $result = $self->_MakeList($list_type, $result, $marker);
873 0         0 $result;
874             }egmx;
875             }
876             else {
877 3         96 $text =~ s{
878             (?:(?<=\n\n)|\A\n?)
879             $whole_list
880             }{
881 0         0 my $list = $1;
882 0         0 my $marker = $3;
883 0 0       0 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
884             # Turn double returns into triple returns, so that we can make a
885             # paragraph for the last item in a list, if necessary:
886 0         0 $list =~ s/\n{2,}/\n\n\n/g;
887 0 0       0 my $result = ( $list_type eq 'ul' ) ?
888             $self->_ProcessListItemsUL($list, $marker_ul)
889             : $self->_ProcessListItemsOL($list, $marker_ol);
890 0         0 $result = $self->_MakeList($list_type, $result, $marker);
891 0         0 $result;
892             }egmx;
893             }
894              
895              
896 3         16 return $text;
897             }
898              
899             sub _MakeList {
900 0     0   0 my ($self, $list_type, $content, $marker) = @_;
901              
902 0 0 0     0 if ($list_type eq 'ol' and $self->{trust_list_start_value}) {
903 0         0 my ($num) = $marker =~ /^(\d+)[.]/;
904             #return "
    \n" . $content . "
\n";
905 0         0 return "=over\n\n" . $content . "=back\n\n";
906             }
907              
908             #return "<$list_type>\n" . $content . "\n";
909 0         0 return "=over\n\n" . $content . "=back\n\n";
910             }
911              
912             sub _ProcessListItemsOL {
913             #
914             # Process the contents of a single ordered list, splitting it
915             # into individual list items.
916             #
917              
918 0     0   0 my ($self, $list_str, $marker_any) = @_;
919              
920              
921             # The $self->{_list_level} global keeps track of when we're inside a list.
922             # Each time we enter a list, we increment it; when we leave a list,
923             # we decrement. If it's zero, we're not in a list anymore.
924             #
925             # We do this because when we're not inside a list, we want to treat
926             # something like this:
927             #
928             # I recommend upgrading to version
929             # 8. Oops, now this line is treated
930             # as a sub-list.
931             #
932             # As a single paragraph, despite the fact that the second line starts
933             # with a digit-period-space sequence.
934             #
935             # Whereas when we're inside a list (or sub-list), that line will be
936             # treated as the start of a sub-list. What a kludge, huh? This is
937             # an aspect of Markdown's syntax that's hard to parse perfectly
938             # without resorting to mind-reading. Perhaps the solution is to
939             # change the syntax rules such that sub-lists must start with a
940             # starting cardinal number; e.g. "1." or "a.".
941              
942 0         0 $self->{_list_level}++;
943              
944             # trim trailing blank lines:
945 0         0 $list_str =~ s/\n{2,}\z/\n/;
946              
947              
948 0         0 my $i = 0;
949              
950 0         0 $list_str =~ s{
951             (\n)? # leading line = $1
952             (^[ \t]*) # leading whitespace = $2
953             ($marker_any) [ \t]+ # list marker = $3
954             ((?s:.+?) # list item text = $4
955             (\n{1,2}))
956             (?= \n* (\z | \2 ($marker_any) [ \t]+))
957             }{
958 0         0 my $item = $4;
959 0         0 my $leading_line = $1;
960 0         0 my $leading_space = $2;
961              
962 0 0 0     0 if ($leading_line or ($item =~ m/\n{2,}/)) {
963 0         0 $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
964             }
965             else {
966             # Recursion for sub-lists:
967 0         0 $item = $self->_DoLists($self->_Outdent($item));
968 0         0 chomp $item;
969 0         0 $item = $self->_RunSpanGamut($item);
970             }
971              
972             #"
  • " . $item . "
  • \n";
    973 0         0 $i++; "=item $i. " . $item . "\n\n";
      0         0  
    974             }egmxo;
    975              
    976 0         0 $self->{_list_level}--;
    977 0         0 return $list_str;
    978             }
    979              
    980             sub _ProcessListItemsUL {
    981             #
    982             # Process the contents of a single unordered list, splitting it
    983             # into individual list items.
    984             #
    985              
    986 0     0   0 my ($self, $list_str, $marker_any) = @_;
    987              
    988              
    989             # The $self->{_list_level} global keeps track of when we're inside a list.
    990             # Each time we enter a list, we increment it; when we leave a list,
    991             # we decrement. If it's zero, we're not in a list anymore.
    992             #
    993             # We do this because when we're not inside a list, we want to treat
    994             # something like this:
    995             #
    996             # I recommend upgrading to version
    997             # 8. Oops, now this line is treated
    998             # as a sub-list.
    999             #
    1000             # As a single paragraph, despite the fact that the second line starts
    1001             # with a digit-period-space sequence.
    1002             #
    1003             # Whereas when we're inside a list (or sub-list), that line will be
    1004             # treated as the start of a sub-list. What a kludge, huh? This is
    1005             # an aspect of Markdown's syntax that's hard to parse perfectly
    1006             # without resorting to mind-reading. Perhaps the solution is to
    1007             # change the syntax rules such that sub-lists must start with a
    1008             # starting cardinal number; e.g. "1." or "a.".
    1009              
    1010 0         0 $self->{_list_level}++;
    1011              
    1012             # trim trailing blank lines:
    1013 0         0 $list_str =~ s/\n{2,}\z/\n/;
    1014              
    1015              
    1016 0         0 $list_str =~ s{
    1017             (\n)? # leading line = $1
    1018             (^[ \t]*) # leading whitespace = $2
    1019             ($marker_any) [ \t]+ # list marker = $3
    1020             ((?s:.+?) # list item text = $4
    1021             (\n{1,2}))
    1022             (?= \n* (\z | \2 ($marker_any) [ \t]+))
    1023             }{
    1024 0         0 my $item = $4;
    1025 0         0 my $leading_line = $1;
    1026 0         0 my $leading_space = $2;
    1027              
    1028 0 0 0     0 if ($leading_line or ($item =~ m/\n{2,}/)) {
    1029 0         0 $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
    1030             }
    1031             else {
    1032             # Recursion for sub-lists:
    1033 0         0 $item = $self->_DoLists($self->_Outdent($item));
    1034 0         0 chomp $item;
    1035 0         0 $item = $self->_RunSpanGamut($item);
    1036             }
    1037              
    1038             #"
  • " . $item . "
  • \n";
    1039 0         0 "=item * " . $item . "\n\n";
    1040             }egmxo;
    1041              
    1042 0         0 $self->{_list_level}--;
    1043 0         0 return $list_str;
    1044             }
    1045              
    1046             sub _DoCodeBlocks {
    1047             #
    1048             # Process Markdown code blocks (indented with 4 spaces or 1 tab):
    1049             # * outdent the spaces/tab
    1050             # * encode <, >, & into HTML entities
    1051             # * escape Markdown special characters into MD5 hashes
    1052             # * trim leading and trailing newlines
    1053             #
    1054              
    1055 3     3   7 my ($self, $text) = @_;
    1056              
    1057 3         66 $text =~ s{
    1058             (?:\n\n|\A)
    1059             ( # $1 = the code block -- one or more lines, starting with a space/tab
    1060             (?:
    1061             (?:[ ]{$self->{tab_width}} | \t) # Lines must start with a tab or a tab-width of spaces
    1062             .*\n+
    1063             )+
    1064             )
    1065             ((?=^[ ]{0,$self->{tab_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
    1066             }{
    1067 0         0 my $codeblock = $1;
    1068 0         0 my $result; # return value
    1069              
    1070 0         0 $codeblock = $self->_EncodeCode($self->_Outdent($codeblock), 0);
    1071 0         0 $codeblock = $self->_Detab($codeblock);
    1072 0         0 $codeblock =~ s/\A\n+//; # trim leading newlines
    1073 0         0 $codeblock =~ s/\n+\z//; # trim trailing newlines
    1074              
    1075             #$result = "\n\n
    " . $codeblock . "\n
    \n\n";
    1076 0         0 $codeblock =~ s/^/ /mg;
    1077              
    1078 0         0 $result = "\n\n" . $codeblock . "\n\n";
    1079              
    1080 0         0 $result;
    1081             }egmx;
    1082              
    1083 3         9 return $text;
    1084             }
    1085              
    1086             sub _DoCodeSpans {
    1087             #
    1088             # * Backtick quotes are used for spans.
    1089             #
    1090             # * You can use multiple backticks as the delimiters if you want to
    1091             # include literal backticks in the code span. So, this input:
    1092             #
    1093             # Just type ``foo `bar` baz`` at the prompt.
    1094             #
    1095             # Will translate to:
    1096             #
    1097             #

    Just type foo `bar` baz at the prompt.

    1098             #
    1099             # There's no arbitrary limit to the number of backticks you
    1100             # can use as delimters. If you need three consecutive backticks
    1101             # in your code, use four for delimiters, etc.
    1102             #
    1103             # * You can use spaces to get literal backticks at the edges:
    1104             #
    1105             # ... type `` `bar` `` ...
    1106             #
    1107             # Turns to:
    1108             #
    1109             # ... type `bar` ...
    1110             #
    1111              
    1112 3     3   5 my ($self, $text) = @_;
    1113              
    1114 3         11 $text =~ s@
    1115             (?
    1116             (`+) # $1 = Opening run of `
    1117             (.+?) # $2 = The code block
    1118             (?
    1119             \1 # Matching closer
    1120             (?!`)
    1121             @
    1122 1         3 my $c = "$2";
    1123 1         12 $c =~ s/^[ \t]*//g; # leading whitespace
    1124 1         6 $c =~ s/[ \t]*$//g; # trailing whitespace
    1125 1         5 $c = $self->_EncodeCode($c);
    1126             #"$c";
    1127 1         3 __podfmt(C => $c);
    1128             @egsx;
    1129              
    1130 3         6 return $text;
    1131             }
    1132              
    1133             sub _EncodeCode {
    1134             #
    1135             # Encode/escape certain characters inside Markdown code runs.
    1136             # The point is that in code, these characters are literals,
    1137             # and lose their special Markdown meanings.
    1138             #
    1139 1     1   3 my $self = shift;
    1140 1         3 local $_ = shift;
    1141 1   50     10 my $do_angle_bracket = shift // 1;
    1142              
    1143             # Encode all ampersands; HTML entities are not
    1144             # entities within a Markdown code span.
    1145             #s/&/&/g;
    1146              
    1147             # Encode $'s, but only if we're running under Blosxom.
    1148             # (Blosxom interpolates Perl variables in article bodies.)
    1149             {
    1150 1     1   29 no warnings 'once';
      1         2  
      1         2297  
      1         2  
    1151 1 50       4 if (defined($blosxom::version)) {
    1152             #s/\$/$/g;
    1153             }
    1154             }
    1155              
    1156              
    1157             # Do the angle bracket song and dance:
    1158             #s! < !<!gx;
    1159             #s! > !>!gx;
    1160 1 100       7 s! ([<>]) !$1 eq '<' ? 'E' : 'E'!egx if $do_angle_bracket;
      2 50       10  
    1161              
    1162             # Now, escape characters that are magic in Markdown:
    1163 1         3 s! \* !$g_escape_table{'*'}!ogx;
    1164 1         2 s! _ !$g_escape_table{'_'}!ogx;
    1165 1         2 s! { !$g_escape_table{'{'}!ogx;
    1166 1         2 s! } !$g_escape_table{'}'}!ogx;
    1167 1         2 s! \[ !$g_escape_table{'['}!ogx;
    1168 1         2 s! \] !$g_escape_table{']'}!ogx;
    1169 1         2 s! \\ !$g_escape_table{'\\'}!ogx;
    1170              
    1171 1         3 return $_;
    1172             }
    1173              
    1174             sub __podfmt {
    1175 2     2   6 my ($fmt, $content) = @_;
    1176 2 100       10 if ($content =~ /[<>]/) {
    1177 1         5 "$fmt<< $content >>";
    1178             } else {
    1179 1         8 "$fmt<$content>";
    1180             }
    1181             }
    1182              
    1183             sub _DoItalicsAndBold {
    1184 3     3   5 my ($self, $text) = @_;
    1185              
    1186             # Handle at beginning of lines:
    1187 3         8 $text =~ s{ ^(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
    1188             #{$2}gsx;
    1189 0         0 {__podfmt(B => $2)}gsex;
    1190              
    1191 3         7 $text =~ s{ ^(\*|_) (?=\S) (.+?) (?<=\S) \1 }
    1192             #{$2}gsx;
    1193 0         0 {__podfmt(I => $2)}gsex;
    1194              
    1195             # must go first:
    1196 3         13 $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
    1197             #{$2}gsx;
    1198 0         0 {__podfmt(B => $2)}gsex;
    1199              
    1200 3         18 $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }
    1201             #{$2}gsx;
    1202 1         5 {__podfmt(I => $2)}gsex;
    1203              
    1204             # And now, a second pass to catch nested strong and emphasis special cases
    1205 3         12 $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
    1206             #{$2}gsx;
    1207 0         0 {__podfmt(B => $2)}gsex;
    1208              
    1209 3         12 $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }
    1210             #{$2}gsx;
    1211 0         0 {__podfmt(I => $2)}gsex;
    1212              
    1213 3         8 return $text;
    1214             }
    1215              
    1216             sub _DoBlockQuotes {
    1217 3     3   7 my ($self, $text) = @_;
    1218              
    1219 3         4 $text =~ s{
    1220             ( # Wrap whole match in $1
    1221             (
    1222             ^[ \t]*>[ \t]? # '>' at the start of a line
    1223             .+\n # rest of the first line
    1224             (.+\n)* # subsequent consecutive lines
    1225             \n* # blanks
    1226             )+
    1227             )
    1228             }{
    1229 0         0 my $bq = $1;
    1230 0         0 $bq =~ s/^([ \t]*>)/ $1/gm;
    1231 0         0 $bq;
    1232             #$bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
    1233             #$bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
    1234             #$bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1}); # recurse
    1235              
    1236             #$bq =~ s/^/ /mg;
    1237             ## These leading spaces screw with
     content, so we need to fix that: 
    1238             #$bq =~ s{
    1239             # (\s*
    .+?
    )
    1240             # }{
    1241             # my $pre = $1;
    1242             # #$pre =~ s/^ //mg;
    1243             # $pre;
    1244             # }egsx;
    1245             #
    1246             #"
    \n$bq\n
    \n\n";
    1247             }egmx;
    1248              
    1249              
    1250 3         7 return $text;
    1251             }
    1252              
    1253             sub _FormParagraphs {
    1254             #
    1255             # Params:
    1256             # $text - string to process with html

    tags

    1257             #
    1258 3     3   6 my ($self, $text, $options) = @_;
    1259              
    1260             # Strip leading and trailing lines:
    1261 3         5 $text =~ s/\A\n+//;
    1262 3         11 $text =~ s/\n+\z//;
    1263              
    1264 3         14 my @grafs = split(/\n{2,}/, $text);
    1265              
    1266             #
    1267             # Wrap

    tags.

    1268             #
    1269 3         8 foreach (@grafs) {
    1270 3 50       11 unless (defined( $self->{_html_blocks}{$_} )) {
    1271 3         10 $_ = $self->_RunSpanGamut($_);
    1272             #if ($options->{wrap_in_p_tags}) {
    1273             # s/^([ \t]*)

    //;

    1274             # $_ .= "

    ";
    1275             #}
    1276             }
    1277             }
    1278              
    1279             #
    1280             # Unhashify HTML blocks
    1281             #
    1282 3         9 foreach (@grafs) {
    1283 3 50       16 if (defined( $self->{_html_blocks}{$_} )) {
    1284 0         0 $_ = $self->{_html_blocks}{$_};
    1285             }
    1286             }
    1287              
    1288 3         14 return join "\n\n", @grafs;
    1289             }
    1290              
    1291             sub _EncodeAmpsAndAngles {
    1292             # Smart processing for ampersands and angle brackets that need to be encoded.
    1293              
    1294 3     3   7 my ($self, $text) = @_;
    1295 3 50 33     14 return '' if (!defined $text or !length $text);
    1296              
    1297 3         9 return $text;
    1298              
    1299             # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    1300             # http://bumppo.net/projects/amputator/
    1301 0         0 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
    1302              
    1303             # Encode naked <'s
    1304 0         0 $text =~ s{<(?![a-z/?\$!])}{<}gi;
    1305              
    1306             # And >'s - added by Fletcher Penney
    1307             # $text =~ s{>(?![a-z/?\$!])}{>}gi;
    1308             # Causes problems...
    1309              
    1310             # Remove encoding inside comments
    1311 0         0 $text =~ s{
    1312             (?<=) # End comments
    1315             }{
    1316 0         0 my $t = $1;
    1317 0         0 $t =~ s/&/&/g;
    1318 0         0 $t =~ s/</
    1319 0         0 $t;
    1320             }egsx;
    1321              
    1322 0         0 return $text;
    1323             }
    1324              
    1325             sub _EncodeBackslashEscapes {
    1326             #
    1327             # Parameter: String.
    1328             # Returns: The string, with after processing the following backslash
    1329             # escape sequences.
    1330             #
    1331 5     5   6 my $self = shift;
    1332 5         40 local $_ = shift;
    1333              
    1334 5         9 s! \\\\ !$g_escape_table{'\\'}!ogx; # Must process escaped backslashes first.
    1335 5         7 s! \\` !$g_escape_table{'`'}!ogx;
    1336 5         7 s! \\\* !$g_escape_table{'*'}!ogx;
    1337 5         5 s! \\_ !$g_escape_table{'_'}!ogx;
    1338 5         8 s! \\\{ !$g_escape_table{'{'}!ogx;
    1339 5         7 s! \\\} !$g_escape_table{'}'}!ogx;
    1340 5         6 s! \\\[ !$g_escape_table{'['}!ogx;
    1341 5         6 s! \\\] !$g_escape_table{']'}!ogx;
    1342 5         13 s! \\\( !$g_escape_table{'('}!ogx;
    1343 5         6 s! \\\) !$g_escape_table{')'}!ogx;
    1344 5         6 s! \\> !$g_escape_table{'>'}!ogx;
    1345 5         8 s! \\\# !$g_escape_table{'#'}!ogx;
    1346 5         7 s! \\\+ !$g_escape_table{'+'}!ogx;
    1347 5         6 s! \\\- !$g_escape_table{'-'}!ogx;
    1348 5         6 s! \\\. !$g_escape_table{'.'}!ogx;
    1349 5         6 s{ \\! }{$g_escape_table{'!'}}ogx;
    1350              
    1351 5         13 return $_;
    1352             }
    1353              
    1354             sub _DoAutoLinks {
    1355 3     3   5 my ($self, $text) = @_;
    1356              
    1357             #$text =~ s{<((https?|ftp):[^'">\s]+)>}{$1}gi;
    1358             #$text =~ s{<((https?|ftp):[^'">\s]+)>}{__podfmt(L => $1)}egi;
    1359              
    1360             # Email addresses:
    1361 3         4 $text =~ s{
    1362             <
    1363             (?:mailto:)?
    1364             (
    1365             [-.\w\+]+
    1366             \@
    1367             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
    1368             )
    1369             >
    1370             }{
    1371 0         0 $self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) );
    1372             }egix;
    1373              
    1374 3         7 return $text;
    1375             }
    1376              
    1377             sub _EncodeEmailAddress {
    1378             #
    1379             # Input: an email address, e.g. "foo@example.com"
    1380             #
    1381             # Output: the email address as a mailto link, with each character
    1382             # of the address encoded as either a decimal or hex entity, in
    1383             # the hopes of foiling most address harvesting spam bots. E.g.:
    1384             #
    1385             # 1386             # xample.com">foo
    1387             # @example.com
    1388             #
    1389             # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
    1390             # mailing list:
    1391             #
    1392              
    1393 0     0   0 my ($self, $addr) = @_;
    1394              
    1395             my @encode = (
    1396 0     0   0 sub { '&#' . ord(shift) . ';' },
    1397 0     0   0 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
    1398 0     0   0 sub { shift },
    1399 0         0 );
    1400              
    1401 0         0 $addr = "mailto:" . $addr;
    1402              
    1403 0         0 $addr =~ s{(.)}{
    1404 0         0 my $char = $1;
    1405 0 0       0 if ( $char eq '@' ) {
        0          
    1406             # this *must* be encoded. I insist.
    1407 0         0 $char = $encode[int rand 1]->($char);
    1408             }
    1409             elsif ( $char ne ':' ) {
    1410             # leave ':' alone (to spot mailto: later)
    1411 0         0 my $r = rand;
    1412             # roughly 10% raw, 45% hex, 45% dec
    1413 0 0       0 $char = (
        0          
    1414             $r > .9 ? $encode[2]->($char) :
    1415             $r < .45 ? $encode[1]->($char) :
    1416             $encode[0]->($char)
    1417             );
    1418             }
    1419 0         0 $char;
    1420             }gex;
    1421              
    1422 0         0 $addr = qq{$addr};
    1423 0         0 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
    1424              
    1425 0         0 return $addr;
    1426             }
    1427              
    1428             sub _UnescapeSpecialChars {
    1429             #
    1430             # Swap back in all the special characters we've hidden.
    1431             #
    1432 3     3   7 my ($self, $text) = @_;
    1433              
    1434 3         15 while( my($char, $hash) = each(%g_escape_table) ) {
    1435 48         526 $text =~ s/$hash/$char/g;
    1436             }
    1437 3         7 return $text;
    1438             }
    1439              
    1440             sub _TokenizeHTML {
    1441             #
    1442             # Parameter: String containing HTML markup.
    1443             # Returns: Reference to an array of the tokens comprising the input
    1444             # string. Each token is either a tag (possibly with nested,
    1445             # tags contained therein, such as , or a
    1446             # run of text between tags. Each element of the array is a
    1447             # two-element array; the first is either 'tag' or 'text';
    1448             # the second is the actual value.
    1449             #
    1450             #
    1451             # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    1452             #
    1453             #
    1454              
    1455 6     6   8 my ($self, $str) = @_;
    1456 6         7 my $pos = 0;
    1457 6         8 my $len = length $str;
    1458 6         6 my @tokens;
    1459              
    1460 6         7 my $depth = 6;
    1461 6         30 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
    1462 6         196 my $match = qr/(?s: ) | # comment
    1463             (?s: <\? .*? \?> ) | # processing instruction
    1464             $nested_tags/iox; # nested tags
    1465              
    1466 6         224 while ($str =~ m/($match)/og) {
    1467 4         7 my $whole_tag = $1;
    1468 4         7 my $sec_start = pos $str;
    1469 4         5 my $tag_start = $sec_start - length $whole_tag;
    1470 4 50       56 if ($pos < $tag_start) {
    1471 4         15 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
    1472             }
    1473 4         10 push @tokens, ['tag', $whole_tag];
    1474 4         21 $pos = pos $str;
    1475             }
    1476 6 50       29 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
    1477 6         38 \@tokens;
    1478             }
    1479              
    1480             sub _Outdent {
    1481             #
    1482             # Remove one level of line-leading tabs or spaces
    1483             #
    1484 0     0   0 my ($self, $text) = @_;
    1485              
    1486 0         0 $text =~ s/^(\t|[ ]{1,$self->{tab_width}})//gm;
    1487 0         0 return $text;
    1488             }
    1489              
    1490             sub _Detab {
    1491             #
    1492             # Cribbed from a post by Bart Lateur:
    1493             #
    1494             #
    1495 3     3   6 my ($self, $text) = @_;
    1496              
    1497             # FIXME - Better anchor/regex would be quicker.
    1498              
    1499             # Original:
    1500             #$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;
    1501              
    1502             # Much swifter, but pretty hateful:
    1503 3         14 do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge);
      0         0  
    1504 3         8 return $text;
    1505             }
    1506              
    1507             sub _ConvertCopyright {
    1508 3     3   5 my ($self, $text) = @_;
    1509             # Convert to an XML compatible form of copyright symbol
    1510              
    1511 3         5 $text =~ s/©/©/gi;
    1512              
    1513 3         6 return $text;
    1514             }
    1515              
    1516             1;
    1517              
    1518             __END__