File Coverage

blib/lib/HTML/WikiConverter/MediaWiki.pm
Criterion Covered Total %
statement 18 141 12.7
branch 0 82 0.0
condition 0 18 0.0
subroutine 6 24 25.0
pod 0 4 0.0
total 24 269 8.9


line stmt bran cond sub pod time code
1             package HTML::WikiConverter::MediaWiki;
2 1     1   25723 use base 'HTML::WikiConverter';
  1         2  
  1         695  
3              
4 1     1   8 use warnings;
  1         2  
  1         59  
5 1     1   6 use strict;
  1         7  
  1         37  
6              
7 1     1   116832 use URI;
  1         9371  
  1         48  
8 1     1   14 use File::Basename;
  1         2  
  1         116  
9 1     1   1145 use HTML::Tagset;
  1         1699  
  1         2883  
10             our $VERSION = '0.59';
11              
12             =head1 NAME
13              
14             HTML::WikiConverter::MediaWiki - Convert HTML to MediaWiki markup
15              
16             =head1 SYNOPSIS
17              
18             use HTML::WikiConverter;
19             my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' );
20             print $wc->html2wiki( $html );
21              
22             =head1 DESCRIPTION
23              
24             This module contains rules for converting HTML into MediaWiki
25             markup. See L for additional usage details.
26              
27             =head1 ATTRIBUTES
28              
29             In addition to the regular set of attributes recognized by the
30             L constructor, this dialect also accepts the
31             following attributes:
32              
33             =head2 preserve_bold
34              
35             Boolean indicating whether bold HTML elements should be preserved as
36             HTML in the wiki output rather than being converted into MediaWiki
37             markup.
38              
39             By default, EbE and EstrongE elements are converted to
40             wiki markup identically. But sometimes you may wish EbE tags
41             in the HTML to be preserved in the resulting MediaWiki markup. This
42             attribute allows this.
43              
44             For example, if C is enabled, HTML like
45              
46            
47            
  • Bold
  • 48            
  • Strong
  • 49            
    50              
    51             will be converted to
    52              
    53             * Bold
    54             * '''Strong'''
    55              
    56             When disabled (the default), the preceding HTML markup would be
    57             converted into
    58              
    59             * '''Bold'''
    60             * '''Strong'''
    61              
    62             =head2 preserve_italic
    63              
    64             Boolean indicating whether italic HTML elements should be preserved as
    65             HTML in the wiki output rather than being converted into MediaWiki
    66             markup.
    67              
    68             For example, if C is enabled, HTML like
    69              
    70            
    71            
  • Italic
  • 72            
  • Emphasized
  • 73            
    74              
    75             will be converted to
    76              
    77             * Italic
    78             * ''Emphasized''
    79              
    80             When disabled (the default), the preceding HTML markup would be
    81             converted into
    82              
    83             * ''Italic''
    84             * ''Emphasized''
    85              
    86             =head2 preserve_templates
    87              
    88             Boolean indicating whether C<{{template}}> calls found in HTML should
    89             be preserved in the wiki markup. If disabled (the default), templates
    90             calls will be wrapped in CnowikiE> tags.
    91              
    92             =head2 preserve_nowiki
    93              
    94             Boolean indicating whether CnowikiE> tags found in HTML
    95             should be preserved in the wiki markup. If disabled (the default),
    96             nowiki tags will be replaced with their content.
    97              
    98             =head2 pad_headings
    99              
    100             Boolean indicating whether section headings should be padded with
    101             spaces (eg, "== Section ==" instead of "==Section=="). Default is
    102             false (ie, not to pad).
    103              
    104             =cut
    105              
    106             my @common_attrs = qw/ id class lang dir title style /;
    107             my @block_attrs = ( @common_attrs, 'align' );
    108             my @tablealign_attrs = qw/ align char charoff valign /;
    109             my @tablecell_attrs = qw(
    110             abbr axis headers scope rowspan
    111             colspan nowrap width height bgcolor
    112             );
    113              
    114             # Fix for bug 14527
    115             my $pre_prefix = '[jsmckaoqkjgbhazkfpwijhkixh]';
    116              
    117             sub rules {
    118 0     0 0   my $self = shift;
    119              
    120 0           my %rules = (
    121             hr => { replace => "\n----\n" },
    122             br => { preserve => 1, empty => 1, attributes => [ qw/id class title style clear/ ] },
    123             p => { block => 1, trim => 'both', line_format => 'single' },
    124             em => { start => "''", end => "''", line_format => 'single' },
    125             strong => { start => "'''", end => "'''", line_format => 'single' },
    126              
    127             i => { alias => 'em' },
    128             b => { alias => 'strong' },
    129              
    130             pre => { line_prefix => $pre_prefix, block => 1 },
    131              
    132             table => { start => \&_table_start, end => "|}", block => 1, line_format => 'blocks' },
    133             tr => { start => \&_tr_start },
    134             td => { start => \&_td_start, end => "\n", trim => 'both', line_format => 'blocks' },
    135             th => { start => \&_td_start, end => "\n", trim => 'both', line_format => 'single' },
    136             caption => { start => \&_caption_start, end => "\n", line_format => 'single' },
    137              
    138             img => { replace => \&_image },
    139             a => { replace => \&_link },
    140              
    141             ul => { line_format => 'multi', block => 1 },
    142             ol => { alias => 'ul' },
    143             dl => { alias => 'ul' },
    144              
    145             li => { start => \&_li_start, trim => 'leading' },
    146             dt => { alias => 'li' },
    147             dd => { alias => 'li' },
    148              
    149             # Preserved elements, from MediaWiki's Sanitizer.php (http://tinyurl.com/dzj6o)
    150             div => { preserve => 1, attributes => \@block_attrs },
    151             span => { preserve => 1, attributes => \@block_attrs },
    152             blockquote => { preserve => 1, attributes => [ @common_attrs, qw/ cite / ] },
    153             del => { preserve => 1, attributes => [ @common_attrs, qw/ cite datetime / ] },
    154             ins => { preserve => 1, attributes => [ @common_attrs, qw/ cite datetime / ] },
    155             font => { preserve => 1, attributes => [ @common_attrs, qw/ size color face / ] },
    156              
    157             # Headings (h1-h6)
    158             h1 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    159             h2 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    160             h3 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    161             h4 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    162             h5 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    163             h6 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    164             );
    165              
    166 0           my @preserved = qw/ center cite code var sup sub tt big small strike s u ruby rb rt rp /;
    167 0 0         push @preserved, 'i' if $self->preserve_italic;
    168 0 0         push @preserved, 'b' if $self->preserve_bold;
    169 0 0         push @preserved, 'nowiki' if $self->preserve_nowiki;
    170 0           $rules{$_} = { preserve => 1, attributes => \@common_attrs } foreach @preserved;
    171              
    172 0           return \%rules;
    173             }
    174              
    175             sub attributes { {
    176 0     0 0   preserve_italic => { default => 0 },
    177             preserve_bold => { default => 0 },
    178             strip_tags => { default => [ qw/ head style script ~comment title meta link object / ] },
    179             pad_headings => { default => 0 },
    180             preserve_templates => { default => 0 },
    181             preserve_nowiki => { default => 0 },
    182              
    183             # see bug #28402
    184             # xxx passthrough_naked_tags => { default => [ qw/ tbody thead font / ] },
    185             passthrough_naked_tags => { default => [ qw/ tbody thead font span / ] },
    186             } }
    187              
    188             sub _hr_start {
    189 0     0     my( $wc, $node, $subrules ) = @_;
    190 0           ( my $level = $node->tag ) =~ s/\D//g;
    191 0           my $affix = ('=') x $level;
    192 0 0         return $wc->pad_headings ? "$affix " : $affix;
    193             }
    194              
    195             sub _hr_end {
    196 0     0     my( $wc, $node, $subrules ) = @_;
    197 0           ( my $level = $node->tag ) =~ s/\D//g;
    198 0           my $affix = ('=') x $level;
    199 0 0         return $wc->pad_headings ? " $affix" : $affix;
    200             }
    201              
    202             sub postprocess_output {
    203 0     0 0   my( $self, $outref ) = @_;
    204 0           $$outref =~ s/\Q$pre_prefix\E/ /g;
    205             }
    206              
    207             # Calculates the prefix that will be placed before each list item.
    208             # Handles ordered, unordered, and definition list items.
    209             sub _li_start {
    210 0     0     my( $self, $node, $rules ) = @_;
    211 0           my @parent_lists = $node->look_up( _tag => qr/ul|ol|dl/ );
    212              
    213 0           my $prefix = '';
    214 0           foreach my $parent ( @parent_lists ) {
    215 0           my $bullet = '';
    216 0 0         $bullet = '*' if $parent->tag eq 'ul';
    217 0 0         $bullet = '#' if $parent->tag eq 'ol';
    218 0 0         $bullet = ':' if $parent->tag eq 'dl';
    219 0 0 0       $bullet = ';' if $parent->tag eq 'dl' and $node->tag eq 'dt';
    220 0           $prefix = $bullet.$prefix;
    221             }
    222              
    223 0           return "\n$prefix ";
    224             }
    225              
    226             sub _link {
    227 0     0     my( $self, $node, $rules ) = @_;
    228 0 0         my $url = defined $node->attr('href') ? $node->attr('href') : '';
    229 0           my $text = $self->get_elem_contents($node);
    230              
    231             # Handle internal links
    232 0 0         if( my $title = $self->get_wiki_page( $url ) ) {
    233 0           $title =~ s/_/ /g;
    234 0 0         return "[[$title]]" if $text eq $title; # no difference between link text and page title
    235 0 0         return "[[$text]]" if $text eq lcfirst $title; # differ by 1st char. capitalization
    236 0           return "[[$title|$text]]"; # completely different
    237             }
    238              
    239             # Treat them as external links
    240 0 0         return $url if $url eq $text;
    241 0           return "[$url $text]";
    242             }
    243              
    244             sub _image {
    245 0     0     my( $self, $node, $rules ) = @_;
    246 0 0         return '' unless $node->attr('src');
    247              
    248 0   0       my $alt = $node->attr('alt') || '';
    249 0           my $img = basename( URI->new($node->attr('src'))->path );
    250 0   0       my $width = $node->attr('width') || '';
    251              
    252 0 0 0       return sprintf '[[Image:%s|%spx|%s]]', $img, $width, $alt if $alt and $width;
    253 0 0         return sprintf '[[Image:%s|%s]]', $img, $alt if $alt;
    254 0           return sprintf '[[Image:%s]]', $img;
    255             }
    256              
    257             sub _table_start {
    258 0     0     my( $self, $node, $rules ) = @_;
    259 0           my $prefix = '{|';
    260              
    261 0           my @table_attrs = (
    262             @common_attrs,
    263             qw/ summary width border frame rules cellspacing
    264             cellpadding align bgcolor frame rules /
    265             );
    266              
    267 0           my $attrs = $self->get_attr_str( $node, @table_attrs );
    268 0 0         $prefix .= ' '.$attrs if $attrs;
    269              
    270 0           return $prefix."\n";
    271             }
    272              
    273             sub _tr_start {
    274 0     0     my( $self, $node, $rules ) = @_;
    275 0           my $prefix = '|-';
    276            
    277 0           my @tr_attrs = ( @common_attrs, 'bgcolor', @tablealign_attrs );
    278 0           my $attrs = $self->get_attr_str( $node, @tr_attrs );
    279 0 0         $prefix .= ' '.$attrs if $attrs;
    280              
    281 0 0 0       return '' unless $node->left or $attrs;
    282 0           return $prefix."\n";
    283             }
    284              
    285             # List of tags (and pseudo-tags, in the case of '~text') that are
    286             # considered phrasal elements. Any table cells that contain only these
    287             # elements will be placed on a single line.
    288             my @td_phrasals = qw/ i em b strong u tt code span font sup sub br ~text s strike del ins /;
    289             my %td_phrasals = map { $_ => 1 } @td_phrasals;
    290              
    291             sub _td_start {
    292 0     0     my( $self, $node, $rules ) = @_;
    293 0 0         my $prefix = $node->tag eq 'th' ? '!' : '|';
    294              
    295 0           my @td_attrs = ( @common_attrs, @tablecell_attrs, @tablealign_attrs );
    296 0           my $attrs = $self->get_attr_str( $node, @td_attrs );
    297 0 0         $prefix .= ' '.$attrs.' |' if $attrs;
    298              
    299             # If there are any non-text elements inside the cell, then the
    300             # cell's content should start on its own line
    301 0           my @non_text = grep !$td_phrasals{$_->tag}, $node->content_list;
    302 0 0         my $space = @non_text ? "\n" : ' ';
    303              
    304 0           return $prefix.$space;
    305             }
    306              
    307             sub _caption_start {
    308 0     0     my( $self, $node, $rules ) = @_;
    309 0           my $prefix = '|+ ';
    310              
    311 0           my @caption_attrs = ( @common_attrs, 'align' );
    312 0           my $attrs = $self->get_attr_str( $node, @caption_attrs );
    313 0 0         $prefix .= $attrs.' |' if $attrs;
    314              
    315 0           return $prefix;
    316             }
    317              
    318             sub preprocess_node {
    319 0     0 0   my( $self, $node ) = @_;
    320 0 0         my $tag = defined $node->tag ? $node->tag : '';
    321 0 0         $self->strip_aname($node) if $tag eq 'a';
    322 0           $self->_strip_extra($node);
    323 0 0         $self->_nowiki_text($node) if $tag eq '~text';
    324            
    325             # # XXX font-to-span convers
    326             # $node->tag('span') if $tag eq 'font';
    327             }
    328              
    329             my $URL_PROTOCOLS = 'http|https|ftp|irc|gopher|news|mailto';
    330             my $EXT_LINK_URL_CLASS = '[^]<>"\\x00-\\x20\\x7F]';
    331             my $EXT_LINK_TEXT_CLASS = '[^\]\\x00-\\x1F\\x7F]';
    332              
    333             # Text nodes matching one or more of these patterns will be enveloped
    334             # in and
    335              
    336             sub _wikitext_patterns {
    337 0     0     my $self = shift;
    338              
    339             # the caret in "qr/^/" seems redundant with "start_of_line" but both
    340             # are necessary
    341 0           my %wikitext_patterns = (
    342             misc => { pattern => qr/^(?:\*|\#|\;|\:|\=|\!|\|)/m, location => 'start_of_line' },
    343             italic => { pattern => qr/''/, location => 'anywhere' },
    344             rule => { pattern => qr/^----/m, location => 'start_of_line' },
    345             table => { pattern => qr/^\{\|/m, location => 'start_of_line' },
    346             link => { pattern => qr/\[\[/m, location => 'anywhere' },
    347             template => { pattern => qr/{{/m, location => 'anywhere' },
    348             );
    349              
    350 0 0         delete $wikitext_patterns{template} if $self->preserve_templates;
    351 0           return \%wikitext_patterns;
    352             }
    353              
    354             sub _nowiki_text {
    355 0     0     my( $self, $node ) = @_;
    356              
    357 0 0         my $text = defined $node->attr('text') ? $node->attr('text') : '';
    358 0 0         return unless $text;
    359              
    360 0           my $wikitext_patterns = $self->_wikitext_patterns;
    361 0           my $found_nowiki_text = 0;
    362              
    363 0           ANYWHERE: {
    364 0           my @anywhere_patterns =
    365 0           map { $_->{pattern} } grep { $_->{location} eq 'anywhere' } values %$wikitext_patterns;
      0            
    366              
    367 0 0         $found_nowiki_text++ if $self->_match( $text, \@anywhere_patterns );
    368             };
    369              
    370 0 0         START_OF_LINE: {
    371 0           last if $found_nowiki_text;
    372              
    373 0           my @sol_patterns =
    374 0           map { $_->{pattern} } grep { $_->{location} eq 'start_of_line' } values %$wikitext_patterns;
      0            
    375              
    376             # find closest parent that is a block-level node
    377 0           my $nearest_parent_block = $self->elem_search_lineage( $node, { block => 1 } );
    378              
    379 0 0         if( $nearest_parent_block ) {
    380 0           my $leftmostish_text_node = $self->_get_leftmostish_text_node( $nearest_parent_block );
    381 0 0 0       if( $leftmostish_text_node and $node == $leftmostish_text_node ) {
    382             # I'm the first child in this block element, so let's apply start_of_line nowiki fixes
    383 0 0         $found_nowiki_text++ if $self->_match( $text, \@sol_patterns );
    384             }
    385             }
    386             };
    387              
    388 0 0         if( $found_nowiki_text ) {
    389 0           $text = "$text";
    390             } else {
    391 0           $text =~ s~(\[\b(?:$URL_PROTOCOLS):$EXT_LINK_URL_CLASS+ *$EXT_LINK_TEXT_CLASS*?\])~$1~go;
    392             }
    393              
    394 0           $node->attr( text => $text );
    395             }
    396              
    397             sub _get_leftmostish_text_node {
    398 0     0     my( $self, $node ) = @_;
    399 0 0         return unless $node;
    400 0 0         return $node if $node->tag eq '~text';
    401 0           return $self->_get_leftmostish_text_node( ($node->content_list)[0] )
    402             }
    403              
    404             sub _match {
    405 0     0     my( $self, $text, $patterns ) = @_;
    406 0   0       $text =~ $_ && return 1 for @$patterns;
    407 0           return 0;
    408             }
    409              
    410             my %extra = (
    411             id => qr/catlinks/,
    412             class => qr/urlexpansion|printfooter|editsection/
    413             );
    414              
    415             # Delete ... et al
    416             sub _strip_extra {
    417 0     0     my( $self, $node ) = @_;
    418 0 0         my $tag = defined $node->tag ? $node->tag : '';
    419              
    420 0           foreach my $att_name ( keys %extra ) {
    421 0 0         my $att_value = defined $node->attr($att_name) ? $node->attr($att_name) : '';
    422 0 0         if( $att_value =~ $extra{$att_name} ) {
    423 0           $node->detach();
    424 0           $node->delete();
    425 0           return;
    426             }
    427             }
    428             }
    429              
    430             =head1 AUTHOR
    431              
    432             David J. Iberri, C<< >>
    433              
    434             =head1 BUGS
    435              
    436             Please report any bugs or feature requests to
    437             C, or through the web
    438             interface at
    439             L.
    440             I will be notified, and then you'll automatically be notified of
    441             progress on your bug as I make changes.
    442              
    443             =head1 SUPPORT
    444              
    445             You can find documentation for this module with the perldoc command.
    446              
    447             perldoc HTML::WikiConverter::MediaWiki
    448              
    449             You can also look for information at:
    450              
    451             =over 4
    452              
    453             =item * AnnoCPAN: Annotated CPAN documentation
    454              
    455             L
    456              
    457             =item * CPAN Ratings
    458              
    459             L
    460              
    461             =item * RT: CPAN's request tracker
    462              
    463             L
    464              
    465             =item * Search CPAN
    466              
    467             L
    468              
    469             =back
    470              
    471             =head1 COPYRIGHT & LICENSE
    472              
    473             Copyright 2006 David J. Iberri, all rights reserved.
    474              
    475             This program is free software; you can redistribute it and/or modify
    476             it under the same terms as Perl itself.
    477              
    478             =cut
    479              
    480             1;