| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HTML::WikiConverter; |
|
2
|
2
|
|
|
2
|
|
47486
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
82
|
|
|
3
|
2
|
|
|
2
|
|
13
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
72
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
2677
|
use Params::Validate ':all'; |
|
|
2
|
|
|
|
|
26997
|
|
|
|
2
|
|
|
|
|
465
|
|
|
6
|
2
|
|
|
2
|
|
2135
|
use HTML::WikiConverter::Normalizer; |
|
|
2
|
|
|
|
|
8
|
|
|
|
2
|
|
|
|
|
86
|
|
|
7
|
2
|
|
|
2
|
|
6554
|
use HTML::TreeBuilder; |
|
|
2
|
|
|
|
|
28557
|
|
|
|
2
|
|
|
|
|
33
|
|
|
8
|
2
|
|
|
2
|
|
100
|
use HTML::Entities; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
172
|
|
|
9
|
2
|
|
|
2
|
|
13
|
use HTML::Tagset; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
53
|
|
|
10
|
2
|
|
|
2
|
|
13
|
use File::Spec; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
56
|
|
|
11
|
2
|
|
|
2
|
|
2348
|
use DirHandle; |
|
|
2
|
|
|
|
|
5404
|
|
|
|
2
|
|
|
|
|
62
|
|
|
12
|
2
|
|
|
2
|
|
5626
|
use Encode; |
|
|
2
|
|
|
|
|
41938
|
|
|
|
2
|
|
|
|
|
254
|
|
|
13
|
2
|
|
|
2
|
|
25
|
use Carp; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
136
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
2
|
|
|
2
|
|
1828
|
use URI::Escape; |
|
|
2
|
|
|
|
|
3038
|
|
|
|
2
|
|
|
|
|
159
|
|
|
16
|
2
|
|
|
2
|
|
2106
|
use URI; |
|
|
2
|
|
|
|
|
6260
|
|
|
|
2
|
|
|
|
|
14475
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.68'; |
|
19
|
|
|
|
|
|
|
our $AUTOLOAD; |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 NAME |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
HTML::WikiConverter - Convert HTML to wiki markup |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
use HTML::WikiConverter; |
|
28
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' ); |
|
29
|
|
|
|
|
|
|
print $wc->html2wiki( html => '<b>text</b>' ), "\n\n"; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# A more complete example |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $html = qq( |
|
34
|
|
|
|
|
|
|
<p><i>Italic</i>, <b>bold</b>, <span style="font-weight:bold">also bold</span>, etc.</p> |
|
35
|
|
|
|
|
|
|
); |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my @dialects = HTML::WikiConverter->available_dialects; |
|
38
|
|
|
|
|
|
|
foreach my $dialect ( @dialects ) { |
|
39
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( dialect => $dialect ); |
|
40
|
|
|
|
|
|
|
my $wiki = $wc->html2wiki( html => $html ); |
|
41
|
|
|
|
|
|
|
printf "The %s dialect gives:\n\n%s\n\n", $dialect, $wiki; |
|
42
|
|
|
|
|
|
|
} |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
C<HTML::WikiConverter> is an HTML to wiki converter. It can convert |
|
47
|
|
|
|
|
|
|
HTML source into a variety of wiki markups, called wiki |
|
48
|
|
|
|
|
|
|
"dialects". The following dialects are supported: |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
DokuWiki |
|
51
|
|
|
|
|
|
|
Kwiki |
|
52
|
|
|
|
|
|
|
MediaWiki |
|
53
|
|
|
|
|
|
|
MoinMoin |
|
54
|
|
|
|
|
|
|
Oddmuse |
|
55
|
|
|
|
|
|
|
PbWiki |
|
56
|
|
|
|
|
|
|
PhpWiki |
|
57
|
|
|
|
|
|
|
PmWiki |
|
58
|
|
|
|
|
|
|
SlipSlap |
|
59
|
|
|
|
|
|
|
TikiWiki |
|
60
|
|
|
|
|
|
|
UseMod |
|
61
|
|
|
|
|
|
|
WakkaWiki |
|
62
|
|
|
|
|
|
|
WikkaWiki |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Note that while dialects usually produce satisfactory wiki markup, not |
|
65
|
|
|
|
|
|
|
all features of all dialects are supported. Consult individual |
|
66
|
|
|
|
|
|
|
dialects' documentation for details of supported features. Suggestions |
|
67
|
|
|
|
|
|
|
for improvements, especially in the form of patches, are very much |
|
68
|
|
|
|
|
|
|
appreciated. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 METHODS |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 new |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( dialect => $dialect, %attrs ); |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Returns a converter for the specified wiki dialect. Croaks if |
|
77
|
|
|
|
|
|
|
C<$dialect> is not provided or its dialect module is not installed on |
|
78
|
|
|
|
|
|
|
your system. Additional attributes may be specified in C<%attrs>; see |
|
79
|
|
|
|
|
|
|
L</"ATTRIBUTES"> for a complete list. |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=cut |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Constructor.
#
# Invoked on HTML::WikiConverter itself, this is a factory: it hands its
# arguments to __new_dialect(), which picks the class to instantiate from the
# 'dialect' option. Invoked on any other class, it blesses an empty hash into
# that class, calls __load_attribute_specs(), and then passes the remaining
# arguments to __setup().
#
# An existing object may also be used as the invocant; its class is used.
#
# Returns the new converter object.
sub new {
  my $invocant = shift;
  my $class = ref($invocant) || $invocant;

  return $class->__new_dialect(@_) if $class eq __PACKAGE__;

  my $self = bless { }, $class;
  $self->__load_attribute_specs();
  $self->__setup(@_);
  return $self;
}
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Factory used by new() when it is invoked on the base class.
#
# Requires a true 'dialect' option. Two candidate class names are tried in
# order: the dialect name prefixed with this package's name, then the bare
# dialect name. The first candidate that can be loaded with 'use', or that
# already inherits from $pkg, is instantiated with all of %opts and returned.
#
# Croaks if 'dialect' is missing, or if no candidate could be used; in the
# latter case the message lists every candidate and every load error.
sub __new_dialect {
  my( $pkg, %opts ) = @_;
  croak "Required 'dialect' parameter is missing" unless $opts{dialect};

  my @dialect_classes = ( __PACKAGE__.'::'.$opts{dialect}, $opts{dialect} );
  my @errors;

  foreach my $dialect_class ( @dialect_classes ) {
    # The name is interpolated into a string eval below, so anything that is
    # not a plain package name must never get that far.
    unless( $dialect_class =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/ ) {
      push @errors, "'$dialect_class' is not a valid module name\n";
      next;
    }

    return $dialect_class->new( %opts ) if eval "use $dialect_class; 1" or $dialect_class->isa($pkg);

    push @errors, $@ if $@;
  }

  my $dc_list = join ', ', @dialect_classes;
  my $error = join '', @errors;
  croak "Dialect '$opts{dialect}' could not be loaded (tried $dc_list). Error: $error";
}
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# Second construction step, called by new(): forwards the constructor
# arguments to __setup_attributes() and then calls __setup_rules().
# Returns whatever __setup_rules() returns.
sub __setup {
  my( $self, @args ) = @_;
  $self->__setup_attributes(@args);
  $self->__setup_rules();
}
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Resets the attribute store to an empty hash via __attrs(), then passes the
# given arguments to __load_and_validate_attributes().
sub __setup_attributes {
  my( $self, @args ) = @_;
  $self->__attrs( {} );
  $self->__load_and_validate_attributes(@args);
}
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# (Re)builds the conversion rules: calls __load_rules() followed by
# __validate_rules(). Takes no arguments besides the invocant.
sub __setup_rules {
  my $self = shift;
  $self->__load_rules();
  $self->__validate_rules();
}
|
121
|
|
|
|
|
|
|
|
|
122
|
33
|
|
|
33
|
|
126
|
# Accessors for the converter's internal state. Each one delegates to _attr()
# with { internal => 1 }, a key equal to its own name, and any value passed by
# the caller.
sub __original_attrs { shift->_attr( { internal => 1 }, __original_attrs => @_ ) }
sub __attrs { shift->_attr( { internal => 1 }, __attrs => @_ ) }
sub __attrs_changed { shift->_attr( { internal => 1 }, __attrs_changed => @_ ) }
sub __root { shift->_attr( { internal => 1 }, __root => @_ ) }
sub __rules { shift->_attr( { internal => 1 }, __rules => @_ ) }
sub __attribute_specs { shift->_attr( { internal => 1 }, __attribute_specs => @_ ) }
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Unsupported attributes. Calling either method reports, through __no_such(),
# that the "_url" spelling is not a valid attribute and suggests the "_uri"
# spelling instead.
sub base_url { shift->__no_such( attribute => qw/ base_url base_uri / ) }
sub wiki_url { shift->__no_such( attribute => qw/ wiki_url wiki_uri / ) }
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# Croaks with a "did you mean" message.
#   $thing - what kind of thing was misnamed (e.g. 'attribute')
#   $that  - the invalid name that was used
#   $this  - the name to suggest instead
sub __no_such {
  my( $self, $thing, $that, $this ) = @_;
  croak "'$that' is not a valid $thing. Perhaps you meant '$this'?";
}
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Combined getter/setter behind every attribute accessor.
#
#   $self->_attr( $param )                          # get
#   $self->_attr( $param => $value )                # set, then get
#   $self->_attr( { internal => 1 }, $param, ... )  # same, internal storage
#
# Pass '{internal=>1}' as first arg for params that aren't attributes: such
# values are stored directly on the object and are not validated. All other
# params live in the hash returned by __attrs(); setting one validates the
# value against the param's entry in __attribute_specs() (a failure is
# handed to __attribute_error()) and sets the __attrs_changed flag.
#
# Returns the stored value, or '' when the stored value is undefined.
sub _attr {
  my( $self, $opts, $param, @value ) = ref $_[1] eq 'HASH' ? @_ : ( +shift, {}, @_ );
  my $store = $opts->{internal} ? $self : $self->__attrs;

  if( @value ) {
    eval { validate_pos( @value, $self->__attribute_specs->{$param} ) unless $opts->{internal} };
    $self->__attribute_error($@) if $@;
    $store->{$param} = $value[0];
    $self->__attrs_changed(1) if !$opts->{internal};
  }

  return defined $store->{$param} ? $store->{$param} : '';
}
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# Attribute accessors and mutators.
#
# Any method call that resolves here is treated as an attribute access: if
# the method name is a key of the hash returned by __attribute_specs(), the
# call is forwarded to _attr() with the name and any arguments. Otherwise it
# croaks in the style of Perl's own "Can't locate method" error.
sub AUTOLOAD {
  my $self = shift;
  our $AUTOLOAD;

  # Strip the package qualifier, leaving the bare method name
  ( my $attr = $AUTOLOAD ) =~ s/.*://;

  return $self->_attr( $attr => @_ ) if exists $self->__attribute_specs->{$attr};
  croak "Can't locate method '$attr' in package ".ref($self);
}
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# So AUTOLOAD doesn't intercept calls to destruction method
sub DESTROY { }
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
# Returns the entire contents of $file as a single string.
#
# Uses File::Slurp::read_file() when File::Slurp can be loaded and falls back
# to __simple_slurp() otherwise. The module is loaded with a block eval and
# 'require', so nothing is imported into this package.
sub __slurp {
  my( $self, $file ) = @_;
  return $self->__simple_slurp($file) unless eval { require File::Slurp; 1 };
  return scalar File::Slurp::read_file($file);
}
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Dependency-free fallback for __slurp(): reads $file in one go and returns
# its contents. Croaks if the file cannot be opened for reading.
sub __simple_slurp {
  my( $self, $file ) = @_;

  # Three-argument open: $file is always taken literally as a path, never as
  # a mode prefix or a pipe.
  open my $fh, '<', $file or croak "can't open file $file for reading: $!";
  my $text = do { local $/; <$fh> };
  close $fh;

  return $text;
}
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 html2wiki |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( $html, %attrs ); |
|
182
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( html => $html, %attrs ); |
|
183
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( file => $file, %attrs ); |
|
184
|
|
|
|
|
|
|
$wiki = $wc->html2wiki( uri => $uri, %attrs ); |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Converts HTML source to wiki markup for the current dialect. Accepts |
|
187
|
|
|
|
|
|
|
either an HTML string C<$html>, a file C<$file>, or a URI C<$uri> to |
|
188
|
|
|
|
|
|
|
read from. |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Attributes assigned in C<%attrs> (see L</"ATTRIBUTES">) will augment |
|
191
|
|
|
|
|
|
|
or override previously assigned attributes for the duration of the |
|
192
|
|
|
|
|
|
|
C<html2wiki()> call. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# Converts HTML to wiki markup; see the POD above for the calling forms.
#
# Exactly one input source ('html', 'file' or 'uri') must be given; a lone
# leading argument is taken to be the HTML. All remaining arguments are
# treated as attribute overrides that apply to this call only: the current
# attributes are snapshotted first and put back afterwards, whether the
# conversion succeeds or dies.
#
# Returns the wiki markup, encoded with the 'encoding' attribute. Croaks on
# missing, conflicting or commonly misspelled arguments; any error raised
# during conversion is rethrown after cleanup.
sub html2wiki {
  my $self = shift;

  # Assumes that if @_ is odd-numbered, its first element is html
  my %args = @_ % 2 ? ( html => +shift, @_ ) : @_;

  # Catch the usual "url" vs. "uri" slips with a helpful message
  my %common_arg_errors = ( url => 'uri', base_url => 'base_uri', wiki_url => 'wiki_uri' );
  foreach my $bad ( sort keys %common_arg_errors ) {
    $self->__no_such( 'argument to html2wiki()', $bad, $common_arg_errors{$bad} ) if exists $args{$bad};
  }

  my @input_sources = grep { exists $args{$_} } qw/ html file uri /;
  croak "missing 'html', 'file', or 'uri' argument to html2wiki" unless @input_sources;
  croak "more than one of 'html', 'file', or 'uri' provided, but only one input source allowed" if @input_sources > 1;

  # Undefined sources become ''. From here on sources are tested by length
  # rather than truth, so that HTML consisting solely of "0" is converted
  # and empty HTML is not mistaken for "read from file".
  my( $html, $file, $uri ) = map { my $v = delete $args{$_}; defined $v ? $v : '' } qw/ html file uri /;

  $html = $self->__slurp($file) if length($file) && $self->slurp;
  $html = $self->__fetch_html_from_uri($uri) if length($uri); # may set 'user_agent' attrib, so call before storing attribs
  $html = '' unless defined $html;
  $html = "<html>$html</html>" if length($html) and $self->wrap_in_html;

  # Snapshot the attributes so per-call overrides can be undone
  $self->__original_attrs( { %{ $self->__attrs } } );

  my( $tree, $output );
  my $ok = eval {
    $self->$_( $args{$_} ) foreach keys %args;
    $self->__setup_rules() if $self->__attrs_changed;

    # Decode into Perl's internal form
    $html = decode( $self->encoding, $html );

    $tree = HTML::TreeBuilder->new();
    $tree->store_comments(1);
    $tree->p_strict( $self->p_strict );
    $tree->implicit_body_p_tag(1);
    $tree->ignore_unknown(0); # <ruby> et al

    # Parse the HTML string or file
    if( length($file) and !length($html) ) {
      $self->given_html( $self->__slurp($file) );
      $tree->parse_file($file);
    } else {
      $self->given_html( $html );
      $tree->parse($html);
      $tree->eof();
    }

    # Preprocess, save tree and parsed HTML
    $self->__root( $tree );
    $self->__preprocess_tree();

    $self->__root->deobjectify_text();
    $self->parsed_html( $tree->as_HTML(undef, ' ', {}) );
    $self->__root->objectify_text();

    # Convert and postprocess
    $output = $self->__wikify($tree);
    $self->__postprocess_output(\$output);

    # Return to original encoding
    $output = encode( $self->encoding, $output );

    1;
  };
  my $error = $@;

  # Avoid leaks, including when conversion died part-way through
  $tree->delete() if $tree;

  # Undo per-call attribute overrides
  my $restore = sub {
    return unless $self->__attrs_changed;
    $self->__attrs( { %{ $self->__original_attrs } } );
    $self->__setup_rules();
    $self->__attrs_changed(0);
  };

  unless( $ok ) {
    # Restoring must not mask the error that aborted the conversion
    eval { $restore->() };
    die $error;
  }

  $restore->();
  return $output;
}
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Recursively converts $node (an element of the parsed tree) to wiki markup
# and returns the resulting string.
#
# Text nodes yield their text, comment nodes are re-wrapped in <!-- -->.
# For every other node the rules returned by rules_for_tag() decide the
# outcome: a 'replace' rule short-circuits everything else; otherwise the
# node's converted contents are trimmed, line-formatted, prefixed and wrapped
# according to the 'trim', 'line_format', 'line_prefix', 'start'/'end',
# 'preserve', 'passthrough' and 'block' rules.
sub __wikify {
  my( $self, $node ) = @_;

  # Concatenate adjacent text nodes
  $node->normalize_content();

  my $tag = $node->tag;
  return $node->attr('text') if $tag eq '~text';
  return '<!--' . $node->attr('text') . '-->' if $tag eq '~comment';

  my $rules = $self->rules_for_tag( $tag );

  return $self->__subst($rules->{replace}, $node, $rules) if exists $rules->{replace};

  # Set private preserve rules
  if( $rules->{preserve} ) {
    $rules->{__start} = \&__preserve_start;
    $rules->{__end} = $rules->{empty} ? undef : '</'.$tag.'>';
  } elsif( $rules->{passthrough} ) {
    $rules->{__start} = '';
    $rules->{__end} = '';
  }

  # Recurse
  my $output = $self->get_elem_contents($node);

  # Unspecified tags have their whitespace preserved (this allows
  # 'html' and 'body' tags [among others] to keep formatting when
  # inner tags like 'pre' need to preserve whitespace).
  my $trim = $rules->{trim} || 'none';
  $output =~ s/^\s+// if $trim eq 'both' or $trim eq 'leading';
  $output =~ s/\s+$// if $trim eq 'both' or $trim eq 'trailing';

  my $lf = $rules->{line_format} || 'none';
  $output =~ s/^\s*\n/\n/gm if $lf ne 'none';
  if( $lf eq 'blocks' ) {
    $output =~ s/\n{3,}/\n\n/g;
  } elsif( $lf eq 'multi' ) {
    $output =~ s/\n{2,}/\n/g;
  } elsif( $lf eq 'single' ) {
    $output =~ s/\n+/ /g;
  }

  # Substitutions
  $output =~ s/^/$self->__subst($rules->{line_prefix}, $node, $rules)/gem if $rules->{line_prefix};
  $output = $self->__subst($rules->{__start}, $node, $rules).$output if $rules->{__start};
  $output = $output.$self->__subst($rules->{__end}, $node, $rules) if $rules->{__end};
  $output = $self->__subst($rules->{start}, $node, $rules).$output if $rules->{start};
  $output = $output.$self->__subst($rules->{end}, $node, $rules) if $rules->{end};

  # If the current element is a block and is contained within
  # another block element, then we will not block the current
  # element by default. However, if the current element is a block
  # and is contained within another block element that specifies a
  # line_format of 'blocks', then we will block the current element.
  $output = "\n\n$output\n\n" if $rules->{block} &&
    ( ! $self->elem_search_lineage( $node, { block => 1 } ) or
      $self->elem_search_lineage( $node, { line_format => 'blocks' } ) );

  # ...but they are put on their own line. A node without a parent cannot be
  # nested inside another element of the same tag, so it is skipped here.
  $output = "\n$output" if $rules->{block} and $node->parent and $node->parent->look_up( _tag => $tag ) and $trim ne 'none';

  return $output;
}
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# Deprecated. Instead use elem_search_lineage( $node, { block => 1 } ).
#
# Returns the first node in $node's lineage whose tag has a true 'block'
# rule, or 0 if there is none.
sub elem_within_block {
  my( $self, $node ) = @_;

  foreach my $ancestor ( $node->lineage ) {
    return $ancestor if $self->rules_for_tag($ancestor->tag || '')->{block};
  }

  return 0;
}
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=head2 elem_search_lineage |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
my $ancestor = $wc->elem_search_lineage( $node, \%rules ); |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
Searches the lineage of C<$node> and returns the first ancestor node |
|
352
|
|
|
|
|
|
|
that has rules matching those specified in C<%rules>, or C<undef> if |
|
353
|
|
|
|
|
|
|
no matching node is found. |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
For example, to find out whether C<$node> has an ancestor with rules |
|
356
|
|
|
|
|
|
|
matching C<{ block =E<gt>1 }>, one could use: |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
if( $wc->elem_search_lineage( $node, { block => 1 } ) ) { |
|
359
|
|
|
|
|
|
|
# do something |
|
360
|
|
|
|
|
|
|
} |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=cut |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# Walks $node's lineage and returns the first ancestor whose rules (from
# rules_for_tag()) match every key/value pair in %$search_rules, comparing
# as strings with a missing or false rule value treated as ''. Returns undef
# if no ancestor matches.
sub elem_search_lineage {
  my( $self, $node, $search_rules ) = @_;

  ANCESTOR: foreach my $n ( $node->lineage ) {
    my $rules = $self->rules_for_tag( $n->tag );

    # keys() rather than each(): the result must not depend on where the
    # caller's hash iterator happens to be positioned.
    foreach my $k ( keys %$search_rules ) {
      my $rule_value = $rules->{$k} || '';
      next ANCESTOR unless $search_rules->{$k} eq $rule_value;
    }

    return $n;
  }

  return undef;
}
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# Resolves a rule value to its replacement text: a code reference is called
# with ( $self, $node, $rules ) and its result returned; anything else is
# returned unchanged.
sub __subst {
  my( $self, $subst, $node, $rules ) = @_;
  return $subst unless ref($subst) eq 'CODE';
  return $subst->( $self, $node, $rules );
}
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# Builds the opening HTML tag for a node whose rules say 'preserve'.
#
# The attributes listed in $rules->{attributes} (if any) are rendered by
# get_attr_str(); a ' /' is placed before the closing '>' when
# $rules->{empty} is true.
sub __preserve_start {
  my( $self, $node, $rules ) = @_;

  my $tag = $node->tag;
  my @attrs = exists $rules->{attributes} ? @{$rules->{attributes}} : ( );
  my $attr_str = $self->get_attr_str( $node, @attrs );
  my $slash = $rules->{empty} ? ' /' : '';

  return '<'.$tag.' '.$attr_str.$slash.'>' if $attr_str;
  return '<'.$tag.$slash.'>';
}
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# Maps a tag name to its URI attribute
my %rel2abs = ( a => 'href', img => 'src' );

# Tags that may legitimately have no content: HTML::Tagset's empty elements
# plus the pseudo-tags used for comment and text nodes
my %allowedEmptyTag = ( %HTML::Tagset::emptyElement, '~comment' => 1, '~text' => 1 );

# Copy of HTML::Tagset's table of known tags
my %isKnownTag = %HTML::Tagset::isKnown;
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
# Prepares the parsed tree (held in __root) for conversion.
#
# Text is objectified, the preprocess_tree() hook is called, and the tree is
# normalized if the 'normalize' attribute is true. Then every descendant is
# visited once: nodes whose tag is listed in 'strip_tags' are deleted, naked
# passthrough tags are replaced by their content, invalid text is removed,
# empty elements are deleted when 'strip_empty_tags' is on, entities are
# encoded, relative URIs are made absolute when 'base_uri' is set, and the
# preprocess_node() hook is called. Finally text is objectified again and
# the user-supplied 'preprocess' callback, if any, is called with the root.
sub __preprocess_tree {
  my $self = shift;

  $self->__root->objectify_text();

  $self->preprocess_tree($self->__root);

  HTML::WikiConverter::Normalizer->new->normalize($self->__root) if $self->normalize;

  my %strip_tag = map { $_ => 1 } @{ $self->strip_tags || [] };
  my %passthrough_naked_tags = map { $_ => 1 } $self->__passthrough_naked_tags;

  foreach my $node ( $self->__root->descendents ) {
    $node->tag('') unless $node->tag;

    if( $strip_tag{$node->tag} ) {
      $node->delete;
      next;
    }

    # A passthrough tag carrying no attributes adds nothing: keep its content only
    if( $passthrough_naked_tags{$node->tag} and !$node->all_external_attr_names ) {
      $node->replace_with_content->delete;
      next;
    }

    $self->__rm_invalid_text($node);

    if( $self->strip_empty_tags and !$allowedEmptyTag{$node->tag} and $self->__elem_is_empty($node) ) {
      $node->delete;
      next;
    }

    $self->__encode_entities($node) if $node->tag eq '~text' and $self->escape_entities;
    $self->__rel2abs($node) if $self->base_uri and exists $rel2abs{$node->tag};
    $self->preprocess_node($node);
  }

  # Reobjectify in case preprocessing added new text
  $self->__root->objectify_text();

  $self->preprocess->( $self->__root ) if ref $self->preprocess;
}
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
# Returns the list of tag names selected by the 'passthrough_naked_tags'
# attribute: the attribute's own elements when it is an array reference, the
# list from __default_passthrough_naked_tags() when it is any other true
# value, and an empty list otherwise.
sub __passthrough_naked_tags {
  my $self = shift;
  my $setting = $self->passthrough_naked_tags;

  my @tags = ref($setting) eq 'ARRAY' ? @{ $setting }
           : $setting                 ? $self->__default_passthrough_naked_tags
           :                            ( );

  return @tags;
}
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# (bug #28402) |
|
450
|
0
|
|
|
0
|
|
0
|
sub __default_passthrough_naked_tags { qw/ tbody thead span div font / } |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
sub __elem_is_empty { |
|
453
|
26
|
|
|
26
|
|
293
|
my( $self, $node ) = @_; |
|
454
|
26
|
|
|
|
|
64
|
my $content = $self->get_elem_contents($node); |
|
455
|
26
|
100
|
66
|
|
|
145
|
my $has_nonwhitespace = $content && length $content ? $content =~ /\S/ : 0; |
|
456
|
26
|
|
|
|
|
148
|
return !$has_nonwhitespace; |
|
457
|
|
|
|
|
|
|
} |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub __fetch_html_from_uri { |
|
460
|
1
|
|
|
1
|
|
3
|
my( $self, $uri ) = @_; |
|
461
|
1
|
|
|
|
|
9
|
my $ua = $self->__user_agent; |
|
462
|
1
|
|
|
|
|
7
|
my $res = $ua->get($uri); |
|
463
|
1
|
50
|
|
|
|
628118
|
croak "request for <$uri> failed" unless $res->is_success; |
|
464
|
1
|
|
50
|
|
|
30
|
my $encoding = $self->encoding || $self->__guess_encoding($res) || 'utf-8'; |
|
465
|
1
|
|
|
|
|
8
|
my $html = encode( $self->encoding, decode( $encoding, $res->content ) ); |
|
466
|
1
|
|
|
|
|
2403
|
return $html; |
|
467
|
|
|
|
|
|
|
} |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
sub __guess_encoding { |
|
470
|
0
|
|
|
0
|
|
0
|
my( $self, $res ) = @_; |
|
471
|
0
|
0
|
|
|
|
0
|
carp "LWP::Charset is not installed but is required for determining the charset claimed by the content at the requested URI", return |
|
472
|
|
|
|
|
|
|
unless eval "use LWP::Charset; 1"; |
|
473
|
0
|
|
|
|
|
0
|
return LWP::Charset::getCharset($res); |
|
474
|
|
|
|
|
|
|
} |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub __user_agent { |
|
477
|
1
|
|
|
1
|
|
4
|
my $self = shift; |
|
478
|
1
|
50
|
|
|
|
12
|
$self->user_agent( $self->__default_user_agent ) unless $self->user_agent; |
|
479
|
1
|
|
|
|
|
9
|
return $self->user_agent; |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
sub __default_user_agent { |
|
483
|
1
|
50
|
|
1
|
|
289
|
croak "LWP is not installed but is required for fetching URIs" unless eval "use LWP::UserAgent; 1"; |
|
|
1
|
|
|
1
|
|
13
|
|
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
19
|
|
|
484
|
1
|
|
|
|
|
10
|
return LWP::UserAgent->new( agent => shift->__default_ua_string ); |
|
485
|
|
|
|
|
|
|
} |
|
486
|
|
|
|
|
|
|
|
|
487
|
1
|
|
|
1
|
|
11
|
sub __default_ua_string { "html2wiki/$VERSION" } |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
# Encodes high-bit and control chars in node's text to HTML entities. |
|
490
|
|
|
|
|
|
|
sub __encode_entities { |
|
491
|
19
|
|
|
19
|
|
76
|
my( $self, $node ) = @_; |
|
492
|
19
|
50
|
|
|
|
66
|
my $text = defined $node->attr('text') ? $node->attr('text') : ''; |
|
493
|
19
|
|
|
|
|
441
|
encode_entities( $text, '<>&' ); |
|
494
|
19
|
|
|
|
|
1254
|
$node->attr( text => $text ); |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# Convert relative to absolute URIs |
|
498
|
|
|
|
|
|
|
sub __rel2abs { |
|
499
|
5
|
|
|
5
|
|
67
|
my( $self, $node ) = @_; |
|
500
|
5
|
|
|
|
|
16
|
my $attr = $rel2abs{$node->tag}; |
|
501
|
5
|
50
|
|
|
|
43
|
return unless $node->attr($attr); # don't add attribute if it's not already there |
|
502
|
5
|
|
|
|
|
77
|
$node->attr( $attr => uri_unescape( URI->new_abs( $node->attr($attr), $self->base_uri )->as_string ) ); |
|
503
|
|
|
|
|
|
|
} |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# Removes text nodes directly inside container elements. |
|
506
|
|
|
|
|
|
|
my %containers = map { $_ => 1 } qw/ table tbody tr ul ol dl menu /; |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
sub __rm_invalid_text { |
|
509
|
94
|
|
|
94
|
|
140
|
my( $self, $node ) = @_; |
|
510
|
94
|
50
|
|
|
|
221
|
my $tag = defined $node->tag ? $node->tag : ''; |
|
511
|
94
|
50
|
|
|
|
1467
|
if( $containers{$tag} ) { |
|
512
|
0
|
|
|
|
|
0
|
$_->delete for grep { $_->tag eq '~text' } $node->content_list; |
|
|
0
|
|
|
|
|
0
|
|
|
513
|
|
|
|
|
|
|
} |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub strip_aname { |
|
517
|
0
|
|
|
0
|
0
|
0
|
my( $self, $node ) = @_; |
|
518
|
0
|
0
|
|
|
|
0
|
return if $node->attr('href'); |
|
519
|
0
|
|
|
|
|
0
|
$node->replace_with_content->delete(); |
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
sub caption2para { |
|
523
|
0
|
|
|
0
|
0
|
0
|
my( $self, $node ) = @_; |
|
524
|
0
|
|
|
|
|
0
|
my $table = $node->parent; |
|
525
|
0
|
|
|
|
|
0
|
$node->detach(); |
|
526
|
0
|
|
|
|
|
0
|
$table->preinsert($node); |
|
527
|
0
|
|
|
|
|
0
|
$node->tag('p'); |
|
528
|
|
|
|
|
|
|
} |
|
529
|
|
|
|
|
|
|
|
|
530
|
24
|
|
|
24
|
0
|
46
|
sub preprocess_tree { } |
|
531
|
84
|
|
|
84
|
0
|
178
|
sub preprocess_node { } |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
sub __postprocess_output { |
|
534
|
23
|
|
|
23
|
|
38
|
my( $self, $outref ) = @_; |
|
535
|
23
|
|
|
|
|
143
|
$$outref =~ s/\n[\s^\n]+\n/\n\n/g; # XXX this is causing bug 14527 |
|
536
|
23
|
|
|
|
|
122
|
$$outref =~ s/\n{2,}/\n\n/g; |
|
537
|
23
|
|
|
|
|
75
|
$$outref =~ s/^\n+//; |
|
538
|
23
|
|
|
|
|
96
|
$$outref =~ s/\s+$//; |
|
539
|
23
|
|
|
|
|
56
|
$$outref =~ s/[ \t]+$//gm; |
|
540
|
23
|
|
|
|
|
69
|
$self->postprocess_output($outref); |
|
541
|
|
|
|
|
|
|
} |
|
542
|
|
|
|
|
|
|
|
|
543
|
23
|
|
|
23
|
0
|
42
|
sub postprocess_output { } |
|
544
|
|
|
|
|
|
|
|
|
545
|
0
|
|
|
0
|
0
|
0
|
sub attributes { {} } |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
sub __load_attribute_specs { |
|
548
|
5
|
|
|
5
|
|
19
|
my $self = shift; |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
# Get default attribute specs |
|
551
|
5
|
|
|
|
|
29
|
my $default_specs = $self->__default_attribute_specs; |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# Get dialect attribute specs |
|
554
|
5
|
|
|
|
|
24
|
my @dialect_specs = $self->attributes; |
|
555
|
5
|
50
|
33
|
|
|
64
|
my $dialect_specs = @dialect_specs == 1 && ref $dialect_specs[0] eq 'HASH' ? $dialect_specs[0] : {@dialect_specs}; |
|
556
|
|
|
|
|
|
|
|
|
557
|
5
|
|
|
|
|
53
|
my %attr_specs = %$default_specs; |
|
558
|
5
|
|
|
|
|
29
|
while( my( $attr, $spec ) = each %$dialect_specs ) { |
|
559
|
13
|
|
|
|
|
47
|
$attr_specs{$attr} = $spec; |
|
560
|
|
|
|
|
|
|
} |
|
561
|
|
|
|
|
|
|
|
|
562
|
5
|
|
|
|
|
31
|
$self->__attribute_specs( \%attr_specs ); |
|
563
|
|
|
|
|
|
|
} |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
sub __load_and_validate_attributes { |
|
566
|
5
|
|
|
5
|
|
8
|
my $self = shift; |
|
567
|
|
|
|
|
|
|
|
|
568
|
5
|
|
|
|
|
8
|
my %attrs = eval { validate( @_, $self->__attribute_specs ) }; |
|
|
5
|
|
|
|
|
13
|
|
|
569
|
5
|
100
|
|
|
|
317
|
$self->__attribute_error($@) if $@; |
|
570
|
|
|
|
|
|
|
|
|
571
|
4
|
|
|
|
|
17
|
while( my( $attr, $value ) = each %attrs ) { |
|
572
|
64
|
|
|
|
|
434
|
$self->$attr($value); |
|
573
|
|
|
|
|
|
|
} |
|
574
|
|
|
|
|
|
|
} |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
sub __attribute_error { |
|
577
|
1
|
|
|
1
|
|
2
|
my( $self, $error ) = @_; |
|
578
|
|
|
|
|
|
|
# Validating attributes failed, so we don't have access to the |
|
579
|
|
|
|
|
|
|
# 'dialect' attribute; obtain it from the package name instead |
|
580
|
1
|
|
|
|
|
3
|
( my $dialect = ref $self ) =~ s/.*://; |
|
581
|
|
|
|
|
|
|
|
|
582
|
1
|
50
|
|
|
|
10
|
$error = sprintf "The attribute '%s' does not exist in the dialect '%s'.", $1, $dialect |
|
583
|
|
|
|
|
|
|
if $error =~ /not listed in the validation options\: (\w+)/; |
|
584
|
|
|
|
|
|
|
|
|
585
|
1
|
|
|
|
|
172
|
croak $error; |
|
586
|
|
|
|
|
|
|
} |
|
587
|
|
|
|
|
|
|
|
|
588
|
0
|
|
|
0
|
0
|
0
|
sub rules { {} } |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
sub __load_rules { |
|
591
|
23
|
|
|
23
|
|
38
|
my $self = shift; |
|
592
|
23
|
|
|
|
|
80
|
$self->__rules( $self->rules ); |
|
593
|
|
|
|
|
|
|
} |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# Rules for validating rules |
|
596
|
|
|
|
|
|
|
my %meta_rules = ( |
|
597
|
|
|
|
|
|
|
trim => { range => [ qw/ none both leading trailing / ] }, |
|
598
|
|
|
|
|
|
|
line_format => { range => [ qw/ none single multi blocks / ] }, |
|
599
|
|
|
|
|
|
|
replace => { singleton => 1 }, |
|
600
|
|
|
|
|
|
|
alias => { singleton => 1 }, |
|
601
|
|
|
|
|
|
|
attributes => { depends => [ qw/ preserve / ] }, |
|
602
|
|
|
|
|
|
|
empty => { depends => [ qw/ preserve / ] }, |
|
603
|
|
|
|
|
|
|
passthrough => { singleton => 1 }, |
|
604
|
|
|
|
|
|
|
); |
|
605
|
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
sub __validate_rules { |
|
607
|
23
|
|
|
23
|
|
33
|
my $self = shift; |
|
608
|
|
|
|
|
|
|
|
|
609
|
23
|
|
|
|
|
27
|
foreach my $tag ( keys %{ $self->__rules } ) { |
|
|
23
|
|
|
|
|
40
|
|
|
610
|
185
|
|
|
|
|
351
|
my $rules = $self->__rules->{$tag}; |
|
611
|
|
|
|
|
|
|
|
|
612
|
185
|
|
|
|
|
501
|
foreach my $opt ( keys %$rules ) { |
|
613
|
280
|
100
|
|
|
|
793
|
my $spec = $meta_rules{$opt} or next; |
|
614
|
|
|
|
|
|
|
|
|
615
|
96
|
|
100
|
|
|
242
|
my $singleton = $spec->{singleton} || 0; |
|
616
|
96
|
50
|
|
|
|
246
|
my @disallows = ref $spec->{disallows} eq 'ARRAY' ? @{ $spec->{disallows} } : ( ); |
|
|
0
|
|
|
|
|
0
|
|
|
617
|
96
|
50
|
|
|
|
187
|
my @depends = ref $spec->{depends} eq 'ARRAY' ? @{ $spec->{depends} } : ( ); |
|
|
0
|
|
|
|
|
0
|
|
|
618
|
96
|
100
|
|
|
|
193
|
my @range = ref $spec->{range} eq 'ARRAY' ? @{ $spec->{range} } : ( ); |
|
|
30
|
|
|
|
|
76
|
|
|
619
|
96
|
|
|
|
|
144
|
my %range = map { $_ => 1 } @range; |
|
|
120
|
|
|
|
|
247
|
|
|
620
|
|
|
|
|
|
|
|
|
621
|
96
|
50
|
66
|
|
|
423
|
$self->__rule_error( $tag, "'$opt' cannot be combined with any other option" ) |
|
622
|
|
|
|
|
|
|
if $singleton and keys %$rules != 1; |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
$rules->{$_} && $self->__rule_error( $tag, "'$opt' cannot be combined with '$_'" ) |
|
625
|
96
|
|
0
|
|
|
177
|
foreach @disallows; |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
! $rules->{$_} && $self->__rule_error( $tag, "'$opt' must be combined with '$_'" ) |
|
628
|
96
|
|
0
|
|
|
137
|
foreach @depends; |
|
629
|
|
|
|
|
|
|
|
|
630
|
96
|
50
|
66
|
|
|
520
|
$self->__rule_error( $tag, "Unknown '$opt' value '$rules->{$opt}'. '$opt' must be one of ", join(', ', map { "'$_'" } @range) ) |
|
|
0
|
|
|
|
|
0
|
|
|
631
|
|
|
|
|
|
|
if @range and ! exists $range{$rules->{$opt}}; |
|
632
|
|
|
|
|
|
|
} |
|
633
|
|
|
|
|
|
|
} |
|
634
|
|
|
|
|
|
|
} |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
sub __rule_error { |
|
637
|
0
|
|
|
0
|
|
0
|
my( $self, $tag, @msg ) = @_; |
|
638
|
0
|
|
|
|
|
0
|
my $dialect = ref $self; |
|
639
|
0
|
|
|
|
|
0
|
croak @msg, " in tag '$tag', dialect '$dialect'.\n"; |
|
640
|
|
|
|
|
|
|
} |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub get_elem_contents { |
|
643
|
128
|
|
|
128
|
0
|
187
|
my( $self, $node ) = @_; |
|
644
|
128
|
|
|
|
|
344
|
my $str = join '', map { $self->__wikify($_) } $node->content_list; |
|
|
117
|
|
|
|
|
817
|
|
|
645
|
128
|
50
|
|
|
|
959
|
return defined $str ? $str : ''; |
|
646
|
|
|
|
|
|
|
} |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
sub get_wiki_page { |
|
649
|
5
|
|
|
5
|
0
|
111
|
my( $self, $uri ) = @_; |
|
650
|
5
|
50
|
|
|
|
30
|
my @wiki_uris = ref $self->wiki_uri eq 'ARRAY' ? @{$self->wiki_uri} : $self->wiki_uri; |
|
|
5
|
|
|
|
|
23
|
|
|
651
|
5
|
|
|
|
|
20
|
foreach my $wiki_uri ( @wiki_uris ) { |
|
652
|
12
|
|
|
|
|
34
|
my $page = $self->__extract_wiki_page( $uri, $wiki_uri ); |
|
653
|
12
|
100
|
|
|
|
451
|
return $page if $page; |
|
654
|
|
|
|
|
|
|
} |
|
655
|
|
|
|
|
|
|
|
|
656
|
2
|
|
|
|
|
7
|
return undef; |
|
657
|
|
|
|
|
|
|
} |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
sub __extract_wiki_page { |
|
660
|
12
|
|
|
12
|
|
20
|
my( $self, $uri, $wiki_uri ) = @_; |
|
661
|
12
|
50
|
|
|
|
24
|
return undef unless $wiki_uri; |
|
662
|
|
|
|
|
|
|
|
|
663
|
12
|
50
|
|
|
|
39
|
if( ref $wiki_uri eq 'Regexp' ) { |
|
|
|
100
|
|
|
|
|
|
|
664
|
0
|
0
|
|
|
|
0
|
return $uri =~ $wiki_uri ? $1 : undef; |
|
665
|
|
|
|
|
|
|
} elsif( ref $wiki_uri eq 'CODE' ) { |
|
666
|
3
|
|
|
|
|
11
|
return $wiki_uri->( $self, URI->new($uri) ); |
|
667
|
|
|
|
|
|
|
} else { |
|
668
|
|
|
|
|
|
|
# Ensure $wiki_uri is absolute |
|
669
|
9
|
|
|
|
|
42
|
$wiki_uri = URI->new_abs( $wiki_uri, $self->base_uri )->as_string; |
|
670
|
|
|
|
|
|
|
|
|
671
|
9
|
100
|
|
|
|
871
|
return undef unless index( $uri, $wiki_uri ) == 0; |
|
672
|
2
|
50
|
|
|
|
10
|
return undef unless length $uri > length $wiki_uri; |
|
673
|
2
|
|
|
|
|
10
|
return substr( $uri, length $wiki_uri ); |
|
674
|
|
|
|
|
|
|
} |
|
675
|
|
|
|
|
|
|
} |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
# Adapted from Kwiki source |
|
678
|
|
|
|
|
|
|
my $UPPER = '\p{UppercaseLetter}'; |
|
679
|
|
|
|
|
|
|
my $LOWER = '\p{LowercaseLetter}'; |
|
680
|
|
|
|
|
|
|
my $WIKIWORD = "$UPPER$LOWER\\p{Number}\\p{ConnectorPunctuation}"; |
|
681
|
|
|
|
|
|
|
|
|
682
|
0
|
|
|
0
|
0
|
0
|
sub is_camel_case { return $_[1] =~ /(?:[$UPPER](?=[$WIKIWORD]*[$UPPER])(?=[$WIKIWORD]*[$LOWER])[$WIKIWORD]+)/ } |
|
683
|
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
sub get_attr_str { |
|
685
|
4
|
|
|
4
|
0
|
8
|
my( $self, $node, @attrs ) = @_; |
|
686
|
4
|
|
|
|
|
9
|
my %attrs = map { $_ => $node->attr($_) } @attrs; |
|
|
0
|
|
|
|
|
0
|
|
|
687
|
4
|
|
|
|
|
9
|
my $str = join ' ', map { $_.'="'.encode_entities($attrs{$_}).'"' } grep { $attrs{$_} } @attrs; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
# (bug #19046) partial fix: attributes must be contained on a single line |
|
690
|
4
|
50
|
|
|
|
9
|
$str =~ s/[\n\r]/ /g if $str; |
|
691
|
|
|
|
|
|
|
|
|
692
|
4
|
50
|
|
|
|
16
|
return defined $str ? $str : ''; |
|
693
|
|
|
|
|
|
|
} |
|
694
|
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=head2 given_html |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
my $html = $wc->given_html; |
|
698
|
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
Returns the HTML passed to or fetched (ie, from a file or URI) by the |
|
700
|
|
|
|
|
|
|
last C<html2wiki()> method call. Useful for debugging. |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=cut |
|
703
|
|
|
|
|
|
|
|
|
704
|
24
|
|
|
24
|
1
|
93
|
sub given_html { shift->_attr( { internal => 1 }, __given_html => @_ ) } |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=head2 parsed_html |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
my $parsed_html = $wc->parsed_html; |
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
Returns a string containing the post-processed HTML from the last |
|
711
|
|
|
|
|
|
|
C<html2wiki> call. Post-processing includes parsing by |
|
712
|
|
|
|
|
|
|
L<HTML::TreeBuilder>, CSS normalization by |
|
713
|
|
|
|
|
|
|
L<HTML::WikiConverter::Normalizer>, and calls to the C<preprocess> and |
|
714
|
|
|
|
|
|
|
C<preprocess_tree> dialect methods. |
|
715
|
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
=cut |
|
717
|
|
|
|
|
|
|
|
|
718
|
23
|
|
|
23
|
1
|
18412
|
sub parsed_html { shift->_attr( { internal => 1 }, __parsed_html => @_ ) } |
|
719
|
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=head2 available_dialects |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
my @dialects = HTML::WikiConverter->available_dialects; |
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
Returns a list of all available dialects by searching the directories |
|
725
|
|
|
|
|
|
|
in C<@INC> for C<HTML::WikiConverter::> modules. |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=cut |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
sub available_dialects { |
|
730
|
0
|
|
|
0
|
1
|
0
|
my @dialects; |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
my %seen; |
|
733
|
0
|
|
|
|
|
0
|
for my $inc ( @INC ) { |
|
734
|
0
|
|
|
|
|
0
|
my $dir = File::Spec->catfile( $inc, 'HTML', 'WikiConverter' ); |
|
735
|
0
|
0
|
|
|
|
0
|
my $dh = DirHandle->new( $dir ) or next; |
|
736
|
0
|
|
|
|
|
0
|
while ( my $f = $dh->read ) { |
|
737
|
0
|
0
|
|
|
|
0
|
next unless $f =~ /^(\w+)\.pm$/; |
|
738
|
0
|
|
|
|
|
0
|
my $dialect = $1; |
|
739
|
|
|
|
|
|
|
|
|
740
|
0
|
0
|
|
|
|
0
|
next if $seen{$dialect}++; |
|
741
|
0
|
0
|
0
|
|
|
0
|
next if $dialect eq 'Normalizer' or $dialect eq 'WebApp'; |
|
742
|
|
|
|
|
|
|
|
|
743
|
0
|
|
|
|
|
0
|
push @dialects, $dialect; |
|
744
|
|
|
|
|
|
|
} |
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
|
|
747
|
0
|
0
|
|
|
|
0
|
return wantarray ? sort @dialects : @dialects; |
|
748
|
|
|
|
|
|
|
} |
|
749
|
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=head2 rules_for_tag |
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
my $rules = $wc->rules_for_tag( $tag ); |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
Returns the rules that will be used for converting elements of the |
|
755
|
|
|
|
|
|
|
given tag. Follows C<alias> references. Note that the rules used for a |
|
756
|
|
|
|
|
|
|
particular tag may depend on the current set of attributes being used. |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=cut |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
sub rules_for_tag { |
|
761
|
133
|
|
|
133
|
1
|
895
|
my( $self, $tag ) = @_; |
|
762
|
133
|
|
|
|
|
294
|
my $rules = $self->__rules_for_tag($tag); |
|
763
|
133
|
100
|
|
|
|
422
|
return $rules->{alias} ? $self->__rules_for_tag( $rules->{alias} ) : $rules; |
|
764
|
|
|
|
|
|
|
} |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
sub __rules_for_tag { |
|
767
|
142
|
|
|
142
|
|
195
|
my( $self, $tag ) = @_; |
|
768
|
142
|
100
|
|
|
|
276
|
return $self->__rules->{$tag} if $self->__rules->{$tag}; |
|
769
|
91
|
50
|
66
|
|
|
255
|
return $self->__rules->{UNKNOWN} if $self->__rules->{UNKNOWN} and !$isKnownTag{$tag}; |
|
770
|
91
|
|
|
|
|
337
|
return { }; |
|
771
|
|
|
|
|
|
|
} |
|
772
|
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
|
774
|
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
You may configure C<HTML::WikiConverter> using a number of |
|
776
|
|
|
|
|
|
|
attributes. These may be passed as arguments to the C<new> |
|
777
|
|
|
|
|
|
|
constructor, or can be called as object methods on an H::WC object. |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Some dialects allow other attributes in addition to those below, and |
|
780
|
|
|
|
|
|
|
may override the attributes' default values. Consult the dialect's |
|
781
|
|
|
|
|
|
|
documentation for details. |
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=head2 base_uri |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
URI to use for converting relative URIs to absolute ones. This |
|
786
|
|
|
|
|
|
|
effectively ensures that the C<src> and C<href> attributes of image |
|
787
|
|
|
|
|
|
|
and anchor tags, respectively, are absolute before converting the HTML |
|
788
|
|
|
|
|
|
|
to wiki markup, which is necessary for wiki dialects that handle |
|
789
|
|
|
|
|
|
|
internal and external links separately. Relative URIs are only |
|
790
|
|
|
|
|
|
|
converted to absolute ones if the C<base_uri> argument is |
|
791
|
|
|
|
|
|
|
present. Defaults to C<undef>. |
|
792
|
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
=head2 dialect |
|
794
|
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
(Required) Dialect to use for converting HTML into wiki markup. See |
|
796
|
|
|
|
|
|
|
the L</"DESCRIPTION"> section above for a list of dialects. C<new()> |
|
797
|
|
|
|
|
|
|
will fail if the dialect given is not installed on your system. Use |
|
798
|
|
|
|
|
|
|
C<available_dialects()> to list installed dialects. |
|
799
|
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
=head2 encoding |
|
801
|
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Specifies the encoding used by the HTML to be converted. Also |
|
803
|
|
|
|
|
|
|
determines the encoding of the wiki markup returned by the |
|
804
|
|
|
|
|
|
|
C<html2wiki> method. Defaults to C<"utf8">. |
|
805
|
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=head2 escape_entities |
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
Passing C<escape_entities> a true value uses L<HTML::Entities> to |
|
809
|
|
|
|
|
|
|
encode potentially unsafe 'E<lt>', 'E<gt>', and 'E<amp>' characters. |
|
810
|
|
|
|
|
|
|
Defaults to true. |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=head2 p_strict |
|
813
|
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
Boolean indicating whether L<HTML::TreeBuilder> will use strict |
|
815
|
|
|
|
|
|
|
handling of paragraph tags when parsing HTML input. (This corresponds |
|
816
|
|
|
|
|
|
|
to the C<p_strict> method in the L<HTML::TreeBuilder> module.) Enabled |
|
817
|
|
|
|
|
|
|
by default. |
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 passthrough_naked_tags |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Boolean indicating whether tags with no attributes ("naked" tags) |
|
822
|
|
|
|
|
|
|
should be removed and replaced with their content. By default, this |
|
823
|
|
|
|
|
|
|
only applies to non-semantic tags such as E<lt>spanE<gt>, |
|
824
|
|
|
|
|
|
|
E<lt>divE<gt>, etc., but does not apply to semantic tags such as |
|
825
|
|
|
|
|
|
|
E<lt>strongE<gt>, E<lt>addressE<gt>, etc. To override this behavior |
|
826
|
|
|
|
|
|
|
and specify the tags that should be considered for passthrough, |
|
827
|
|
|
|
|
|
|
provide this attribute with a reference to an array of tag names. |
|
828
|
|
|
|
|
|
|
Defaults to false, but you'll probably want to enable it. |
|
829
|
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=head2 preprocess |
|
831
|
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
Code reference that gets invoked after HTML is parsed but before it is |
|
833
|
|
|
|
|
|
|
converted into wiki markup. The callback is passed two arguments: the |
|
834
|
|
|
|
|
|
|
C<HTML::WikiConverter> object and a L<HTML::Element> pointing to the |
|
835
|
|
|
|
|
|
|
root node of the HTML tree created by L<HTML::TreeBuilder>. |
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=head2 slurp |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
Boolean that, if enabled, bypasses C<HTML::Parser>'s incremental |
|
840
|
|
|
|
|
|
|
parsing (thus I<slurping> the file in all at once) of files when |
|
841
|
|
|
|
|
|
|
reading HTML files. If L<File::Slurp> is installed, its C<read_file()> |
|
842
|
|
|
|
|
|
|
function will be used to perform slurping; otherwise, a common Perl |
|
843
|
|
|
|
|
|
|
idiom will be used for slurping instead. This option is only used if |
|
844
|
|
|
|
|
|
|
you call C<html2wiki()> with the C<file> argument. |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=head2 strip_empty_tags |
|
847
|
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Strips elements containing no content (unless those elements |
|
849
|
|
|
|
|
|
|
legitimately contain no content, such as is the case for C<br> and |
|
850
|
|
|
|
|
|
|
C<img> tags, for example). Defaults to false. |
|
851
|
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=head2 strip_tags |
|
853
|
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
A reference to an array of tags to be removed from the HTML input |
|
855
|
|
|
|
|
|
|
prior to conversion to wiki markup. Tag names are the same as those |
|
856
|
|
|
|
|
|
|
used in L<HTML::Element>. Defaults to C<[ '~comment', 'head', |
|
857
|
|
|
|
|
|
|
'script', 'style' ]>. |
|
858
|
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
=head2 user_agent |
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
Specifies the L<LWP::UserAgent> object to be used when fetching the |
|
862
|
|
|
|
|
|
|
URI passed to C<html2wiki()>. If unspecified and C<html2wiki()> is |
|
863
|
|
|
|
|
|
|
passed a URI, a default user agent will be created. |
|
864
|
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head2 wiki_uri |
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
Takes a URI, regular expression, or coderef (or a reference to an |
|
868
|
|
|
|
|
|
|
array of elements of these types) used to determine which links are to |
|
869
|
|
|
|
|
|
|
wiki pages: a link whose C<href> parameter matches C<wiki_uri> will be |
|
870
|
|
|
|
|
|
|
treated as a link to a wiki page. In addition, C<wiki_uri> will be |
|
871
|
|
|
|
|
|
|
used to extract the title of the wiki page. The way this is done |
|
872
|
|
|
|
|
|
|
depends on whether the C<wiki_uri> has been set to a string, regexp, |
|
873
|
|
|
|
|
|
|
or coderef. The default is C<undef>, meaning that all links will be |
|
874
|
|
|
|
|
|
|
treated as external links by default. |
|
875
|
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
If C<wiki_uri> is a string, it is interpreted as a URI template, and |
|
877
|
|
|
|
|
|
|
it will be assumed that URIs to wiki pages are created by joining |
|
878
|
|
|
|
|
|
|
C<wiki_uri> with the wiki page title. For example, the English |
|
879
|
|
|
|
|
|
|
Wikipedia might use C<"http://en.wikipedia.org/wiki/"> as the value of |
|
880
|
|
|
|
|
|
|
C<wiki_uri>. Ward's wiki might use C<"http://c2.com/cgi/wiki?">. These |
|
881
|
|
|
|
|
|
|
examples use an absolute C<wiki_uri>, but a relative URI can be used |
|
882
|
|
|
|
|
|
|
as well; an absolute URI will be created based on the value of |
|
883
|
|
|
|
|
|
|
C<base_uri>. For example, the Wikipedia example above can be rewritten |
|
884
|
|
|
|
|
|
|
using C<base_uri> of C<"http://en.wikipedia.org"> and a C<wiki_uri> of |
|
885
|
|
|
|
|
|
|
C<"/wiki/">. |
|
886
|
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
C<wiki_uri> can also be a regexp that matches URIs to wiki pages and |
|
888
|
|
|
|
|
|
|
also extracts the page title from them. For example, the English |
|
889
|
|
|
|
|
|
|
Wikipedia might use |
|
890
|
|
|
|
|
|
|
C<qr~http://en\.wikipedia\.org/w/index\.php\?title\=([^&]+)~>. |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
C<wiki_uri> can also be a coderef that takes the current |
|
893
|
|
|
|
|
|
|
C<HTML::WikiConverter> object and a L<URI> object. It should return |
|
894
|
|
|
|
|
|
|
the title of the wiki page extracted from the URI, or C<undef> if the |
|
895
|
|
|
|
|
|
|
URI doesn't represent a link to a wiki page. |
|
896
|
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
As mentioned above, the C<wiki_uri> attribute can either take a single |
|
898
|
|
|
|
|
|
|
URI/regexp/coderef element or it may be assigned a reference to an |
|
899
|
|
|
|
|
|
|
array of any number of these elements. This is useful for wikis that |
|
900
|
|
|
|
|
|
|
have different ways of creating links to wiki pages. For example, the |
|
901
|
|
|
|
|
|
|
English Wikipedia might use: |
|
902
|
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
my $wc = new HTML::WikiConverter( |
|
904
|
|
|
|
|
|
|
dialect => 'MediaWiki', |
|
905
|
|
|
|
|
|
|
wiki_uri => [ |
|
906
|
|
|
|
|
|
|
'http://en.wikipiedia.org/wiki/', |
|
907
|
|
|
|
|
|
|
sub { pop->query_param('title') } # requires URI::QueryParam |
|
908
|
|
|
|
|
|
|
] |
|
909
|
|
|
|
|
|
|
); |
|
910
|
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
=head2 wrap_in_html |
|
912
|
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
Helps L<HTML::TreeBuilder> parse HTML fragments by wrapping HTML in |
|
914
|
|
|
|
|
|
|
C<E<lt>htmlE<gt>> and C<E<lt>/htmlE<gt>> before passing it through |
|
915
|
|
|
|
|
|
|
C<html2wiki>. Boolean, enabled by default. |
|
916
|
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
=cut |
|
918
|
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
sub __default_attribute_specs { { |
|
920
|
5
|
|
|
5
|
|
130
|
base_uri => { type => SCALAR, default => '' }, |
|
921
|
|
|
|
|
|
|
dialect => { type => SCALAR, optional => 0 }, |
|
922
|
|
|
|
|
|
|
encoding => { type => SCALAR, default => 'utf-8' }, |
|
923
|
|
|
|
|
|
|
escape_entities => { type => BOOLEAN, default => 1 }, |
|
924
|
|
|
|
|
|
|
normalize => { type => BOOLEAN, default => 1 }, |
|
925
|
|
|
|
|
|
|
p_strict => { type => BOOLEAN, default => 1 }, |
|
926
|
|
|
|
|
|
|
preprocess => { type => CODEREF | UNDEF, default => undef }, |
|
927
|
|
|
|
|
|
|
strip_empty_tags => { type => BOOLEAN, default => 0 }, |
|
928
|
|
|
|
|
|
|
slurp => { type => BOOLEAN, default => 0 }, |
|
929
|
|
|
|
|
|
|
strip_tags => { type => ARRAYREF, default => [ qw/ ~comment head script style / ] }, |
|
930
|
|
|
|
|
|
|
passthrough_naked_tags => { type => ARRAYREF | BOOLEAN, default => 0 }, |
|
931
|
|
|
|
|
|
|
user_agent => { type => OBJECT | UNDEF, default => undef }, |
|
932
|
|
|
|
|
|
|
wiki_uri => { type => SCALAR | ARRAYREF, default => '' }, |
|
933
|
|
|
|
|
|
|
wrap_in_html => { type => BOOLEAN, default => 1 }, |
|
934
|
|
|
|
|
|
|
} } |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=head1 ADDING A DIALECT |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
Consult L<HTML::WikiConverter::Dialects> for documentation on how to |
|
939
|
|
|
|
|
|
|
write your own dialect module for C<HTML::WikiConverter>. Or if you're |
|
940
|
|
|
|
|
|
|
not up to the task, drop me an email and I'll have a go at it when I |
|
941
|
|
|
|
|
|
|
get a spare moment. |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
L<HTML::Tree>, L<Convert::Wiki> |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=head1 AUTHOR |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
David J. Iberri, C<< <diberri@cpan.org> >> |
|
950
|
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
=head1 BUGS |
|
952
|
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
|
954
|
|
|
|
|
|
|
C<bug-html-wikiconverter at rt.cpan.org>, or through the web interface at |
|
955
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter>. |
|
956
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
|
957
|
|
|
|
|
|
|
your bug as I make changes. |
|
958
|
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
=head1 SUPPORT |
|
960
|
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
|
962
|
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
perldoc HTML::WikiConverter |
|
964
|
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
You can also look for information at: |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
=over 4 |
|
968
|
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
|
970
|
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
L<http://annocpan.org/dist/HTML-WikiConverter> |
|
972
|
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=item * CPAN Ratings |
|
974
|
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/HTML-WikiConverter> |
|
976
|
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
|
978
|
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-WikiConverter> |
|
980
|
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
=item * Search CPAN |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
L<http://search.cpan.org/dist/HTML-WikiConverter> |
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=back |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
Thanks to Tatsuhiko Miyagawa for suggesting |
|
990
|
|
|
|
|
|
|
L<Bundle::HTMLWikiConverter> as well as providing code for the |
|
991
|
|
|
|
|
|
|
C<available_dialects()> class method. |
|
992
|
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
My thanks also goes to Martin Kudlvasr for catching (and fixing!) a |
|
994
|
|
|
|
|
|
|
bug in the logic of how HTML files were processed. |
|
995
|
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
Big thanks to Dave Schaefer for the PbWiki dialect and for the idea |
|
997
|
|
|
|
|
|
|
behind the new C<attributes()> implementation. |
|
998
|
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
Copyright (c) David J. Iberri, all rights reserved. |
|
1002
|
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
1004
|
|
|
|
|
|
|
under the same terms as Perl itself. |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
=cut |
|
1007
|
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
1; |