File Coverage

blib/lib/HTML/WikiConverter.pm

Criterion	Covered	Total	%
statement	312	403	77.4
branch	134	198	67.6
condition	53	84	63.1
subroutine	70	86	81.4
pod	7	21	33.3
total	576	792	72.7

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::WikiConverter;
2	2			2		47486	use warnings;
	2					4
	2					82
3	2			2		13	use strict;
	2					3
	2					72
4
5	2			2		2677	use Params::Validate ':all';
	2					26997
	2					465
6	2			2		2135	use HTML::WikiConverter::Normalizer;
	2					8
	2					86
7	2			2		6554	use HTML::TreeBuilder;
	2					28557
	2					33
8	2			2		100	use HTML::Entities;
	2					4
	2					172
9	2			2		13	use HTML::Tagset;
	2					3
	2					53
10	2			2		13	use File::Spec;
	2					3
	2					56
11	2			2		2348	use DirHandle;
	2					5404
	2					62
12	2			2		5626	use Encode;
	2					41938
	2					254
13	2			2		25	use Carp;
	2					4
	2					136
14
15	2			2		1828	use URI::Escape;
	2					3038
	2					159
16	2			2		2106	use URI;
	2					6260
	2					14475
17
18							our $VERSION = '0.68';
19							our $AUTOLOAD;
20
21							=head1 NAME
22
23							HTML::WikiConverter - Convert HTML to wiki markup
24
25							=head1 SYNOPSIS
26
27							use HTML::WikiConverter;
28							my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' );
29							print $wc->html2wiki( html => '<b>text</b>' ), "\n\n";
30
31							# A more complete example
32
33							my $html = qq(
34							<p><i>Italic</i>, <b>bold</b>, <span style="font-weight:bold">also bold</span>, etc.</p>
35							);
36
37							my @dialects = HTML::WikiConverter->available_dialects;
38							foreach my $dialect ( @dialects ) {
39							my $wc = new HTML::WikiConverter( dialect => $dialect );
40							my $wiki = $wc->html2wiki( html => $html );
41							printf "The %s dialect gives:\n\n%s\n\n", $dialect, $wiki;
42							}
43
44							=head1 DESCRIPTION
45
46							C<HTML::WikiConverter> is an HTML to wiki converter. It can convert
47							HTML source into a variety of wiki markups, called wiki
48							"dialects". The following dialects are supported:
49
50							DokuWiki
51							Kwiki
52							MediaWiki
53							MoinMoin
54							Oddmuse
55							PbWiki
56							PhpWiki
57							PmWiki
58							SlipSlap
59							TikiWiki
60							UseMod
61							WakkaWiki
62							WikkaWiki
63
64							Note that while dialects usually produce satisfactory wiki markup, not
65							all features of all dialects are supported. Consult individual
66							dialects' documentation for details of supported features. Suggestions
67							for improvements, especially in the form of patches, are very much
68							appreciated.
69
70							=head1 METHODS
71
72							=head2 new
73
74							my $wc = new HTML::WikiConverter( dialect => $dialect, %attrs );
75
76							Returns a converter for the specified wiki dialect. Croaks if
77							C<$dialect> is not provided or its dialect module is not installed on
78							your system. Additional attributes may be specified in C<%attrs>; see
79							L</"ATTRIBUTES"> for a complete list.
80
81							=cut
82
83							sub new {
84	10			10	1	486937	my $pkg = shift;
85	10	100				53	return $pkg->__new_dialect(@_) if $pkg eq __PACKAGE__;
86
87	5					16	my $self = bless { }, $pkg;
88	5					27	$self->__load_attribute_specs();
89	5					32	$self->__setup(@_);
90	4					28	return $self;
91							}
92
93							sub __new_dialect {
94	5			5		26	my( $pkg, %opts ) = @_;
95	5	50				18	croak "Required 'dialect' parameter is missing" unless $opts{dialect};
96	5					24	my @dialect_classes = ( __PACKAGE__.'::'.$opts{dialect}, $opts{dialect} );
97	5					9	foreach my $dialect_class ( @dialect_classes ) {
98	10	100	66	1		701	return $dialect_class->new( %opts ) if eval "use $dialect_class; 1" or $dialect_class->isa($pkg);
	1			1		484
	0			1		0
	0			1		0
	1			1		662
	0			1		0
	0			1		0
	1			1		732
	0			1		0
	0			1		0
	1					313
	0					0
	0					0
	1					374
	0					0
	0					0
	1					337
	0					0
	0					0
	1					465
	0					0
	0					0
	1					549
	0					0
	0					0
	1					460
	0					0
	0					0
	1					352
	0					0
	0					0
99							}
100	0					0	my $dc_list = join ', ', @dialect_classes;
101	0					0	croak "Dialect '$opts{dialect}' could not be loaded (tried $dc_list). Error: $@";
102							}
103
104							sub __setup {
105	5			5		8	my $self = shift;
106	5					25	$self->__setup_attributes(@_);
107	4					23	$self->__setup_rules();
108							}
109
110							sub __setup_attributes {
111	5			5		7	my $self = shift;
112	5					23	$self->__attrs( {} );
113	5					28	$self->__load_and_validate_attributes(@_);
114							}
115
116							sub __setup_rules {
117	23			23		35	my $self = shift;
118	23					115	$self->__load_rules();
119	23					88	$self->__validate_rules();
120							}
121
122	33			33		126	sub __original_attrs { shift->_attr( { internal => 1 }, __original_attrs => @_ ) }
123	583			583		1732	sub __attrs { shift->_attr( { internal => 1 }, __attrs => @_ ) }
124	131			131		413	sub __attrs_changed { shift->_attr( { internal => 1 }, __attrs_changed => @_ ) }
125	203			203		649	sub __root { shift->_attr( { internal => 1 }, __root => @_ ) }
126	515			515		1787	sub __rules { shift->_attr( { internal => 1 }, __rules => @_ ) }
127	631			631		2306	sub __attribute_specs { shift->_attr( { internal => 1 }, __attribute_specs => @_ ) }
128
129							# Unsupported attributes
130	0			0	0	0	sub base_url { shift->__no_such( attribute => qw/ base_url base_uri / ) }
131	0			0	0	0	sub wiki_url { shift->__no_such( attribute => qw/ wiki_url wiki_uri / ) }
132
133							sub __no_such {
134	0			0		0	my( $self, $thing, $that, $this ) = @_;
135	0					0	croak "'$that' is not a valid $thing. Perhaps you meant '$this'?";
136							}
137
138							# Pass '{internal=>1}' as first arg for params that aren't attributes
139							sub _attr {
140	2688	100		2688		8193	my( $self, $opts, $param, @value ) = ref $_[1] eq 'HASH' ? @_ : ( +shift, {}, @_ );
141	2688	100				5720	my $store = $opts->{internal} ? $self : $self->__attrs;
142
143	2688	100				6069	if( @value ) {
144	296	100				376	eval { validate_pos( @value, $self->__attribute_specs->{$param} ) unless $opts->{internal} };
	296					826
145	296	50				647	$self->__attribute_error($@) if $@;
146	296					558	$store->{$param} = $value[0];
147	296	100				1585	$self->__attrs_changed(1) if !$opts->{internal};
148							}
149
150	2688	100				16339	return defined $store->{$param} ? $store->{$param} : '';
151							}
152
153							# Attribute accessors and mutators
154							sub AUTOLOAD {
155	546			546		1712	my $self = shift;
156	546					2433	( my $attr = $AUTOLOAD ) =~ s/.*://;
157	546	100				1291	return $self->_attr( $attr => @_ ) if exists $self->__attribute_specs->{$attr};
158	1					195	croak "Can't locate method '$attr' in package ".ref($self);
159							}
160
161							# So AUTOLOAD doesn't intercept calls to destruction method
162	0			0		0	sub DESTROY { }
163
164							sub __slurp {
165	0			0		0	my( $self, $file ) = @_;
166	0					0	eval "use File::Slurp;";
167	0	0				0	return $self->__simple_slurp($file) if $@;
168	0					0	return scalar File::Slurp::read_file($file);
169							}
170
171							sub __simple_slurp {
172	0			0		0	my( $self, $file ) = @_;
173	0	0				0	open my $fh, $file or croak "can't open file $file for reading: $!";
174	0					0	my $text = do { local $/; <$fh> };
	0					0
	0					0
175	0					0	close $fh;
176	0					0	return $text;
177							}
178
179							=head2 html2wiki
180
181							$wiki = $wc->html2wiki( $html, %attrs );
182							$wiki = $wc->html2wiki( html => $html, %attrs );
183							$wiki = $wc->html2wiki( file => $file, %attrs );
184							$wiki = $wc->html2wiki( uri => $uri, %attrs );
185
186							Converts HTML source to wiki markup for the current dialect. Accepts
187							either an HTML string C<$html>, an file C<$file>, or a URI <$uri> to
188							read from.
189
190							Attributes assigned in C<%attrs> (see L</"ATTRIBUTES">) will augment
191							or override previously assigned attributes for the duration of the
192							C<html2wiki()> call.
193
194							=cut
195
196							sub html2wiki {
197	24			24	1	278295	my $self = shift;
198
199							# Assumes that if @_ is odd-numbered, its first element is html
200	24	100				152	my %args = @_ % 2 ? ( html => +shift, @_ ) : @_;
201
202	24					101	my %common_arg_errors = ( url => 'uri', base_url => 'base_uri', wiki_url => 'wiki_uri' );
203	24					190	while( my( $bad, $good ) = each %common_arg_errors ) {
204	72	50				276	$self->__no_such( 'argument to html2wiki()', $bad, $good ) if exists $args{$bad};
205							}
206
207	24					59	my @input_sources = grep { exists $args{$_} } qw/ html file uri /;
	72					169
208	24	50				69	croak "missing 'html', 'file', or 'uri' argument to html2wiki" unless @input_sources;
209	24	50				68	croak "more than one of 'html', 'file', or 'uri' provided, but only one input source allowed" if @input_sources > 1;
210
211	24		100			95	my $html = delete $args{html} \|\| '';
212	24		50			132	my $file = delete $args{file} \|\| '';
213	24		100			92	my $uri = delete $args{uri} \|\| '';
214
215	24	50	33			66	$html = $self->__slurp($file) if $file && $self->slurp;
216	24	100				64	$html = $self->__fetch_html_from_uri($uri) if $uri; # may set 'user_agent' attrib, so call before storing attribs
217	24	50	33			172	$html = "<html>$html</html>" if $html and $self->wrap_in_html;
218
219	24					62	$self->__original_attrs( { %{ $self->__attrs } } );
	24					55
220	24					165	$self->$_( $args{$_} ) foreach keys %args;
221	24	100				66	$self->__setup_rules() if $self->__attrs_changed;
222
223							# Decode into Perl's internal form
224	24					138	$html = decode( $self->encoding, $html );
225
226	24					1640	my $tree = new HTML::TreeBuilder();
227	24					6047	$tree->store_comments(1);
228	24					324	$tree->p_strict( $self->p_strict );
229	24					273	$tree->implicit_body_p_tag(1);
230	24					1038	$tree->ignore_unknown(0); # <ruby> et al
231
232							# Parse the HTML string or file
233	24	50				207	if( $html ) {
234	24					80	$self->given_html( $html );
235	24					324	$tree->parse($html);
236	24					41060	$tree->eof();
237							} else {
238	0					0	$self->given_html( $self->__slurp($file) );
239	0					0	$tree->parse_file($file);
240							}
241
242							# Preprocess, save tree and parsed HTML
243	24					3017	$self->__root( $tree );
244	24					98	$self->__preprocess_tree();
245
246	23					607	$self->__root->deobjectify_text();
247	23					991	$self->parsed_html( $tree->as_HTML(undef, ' ', {}) );
248	23					83	$self->__root->objectify_text();
249
250							# Convert and preprocess
251	23					1943	my $output = $self->__wikify($tree);
252	23					97	$self->__postprocess_output(\$output);
253
254							# Avoid leaks
255	23					104	$tree->delete();
256
257							# Return to original encoding
258	23					1743	$output = encode( $self->encoding, $output );
259
260	23	100				1039	if( $self->__attrs_changed ) {
261	9					11	$self->__attrs( { %{ $self->__original_attrs } } );
	9					25
262	9					37	$self->__setup_rules();
263	9					44	$self->__attrs_changed(0);
264							}
265
266	23					285	return $output;
267							}
268
269							sub __wikify {
270	140			140		212	my( $self, $node ) = @_;
271
272							# Concatenate adjacent text nodes
273	140					358	$node->normalize_content();
274
275	140	100				2054	if( $node->tag eq '~text' ) {
		100
276	36					279	return $node->attr('text');
277							} elsif( $node->tag eq '~comment' ) {
278	1					12	return '<!--' . $node->attr('text') . '-->';
279							} else {
280	103					1510	my $rules = $self->rules_for_tag( $node->tag );
281
282	103	100				290	return $self->__subst($rules->{replace}, $node, $rules) if exists $rules->{replace};
283
284							# Set private preserve rules
285	98	100				284	if( $rules->{preserve} ) {
		50
286	4	50				24	$rules->{__start} = \&__preserve_start,
287							$rules->{__end} = $rules->{empty} ? undef : '</'.$node->tag.'>';
288							} elsif( $rules->{passthrough} ) {
289	0					0	$rules->{__start} = '';
290	0					0	$rules->{__end} = '';
291							}
292
293							# Recurse
294	98					307	my $output = $self->get_elem_contents($node);
295
296							# Unspecified tags have their whitespace preserved (this allows
297							# 'html' and 'body' tags [among others] to keep formatting when
298							# inner tags like 'pre' need to preserve whitespace).
299	98		100			389	my $trim = $rules->{trim} \|\| 'none';
300	98	100	66			425	$output =~ s/^\s+// if $trim eq 'both' or $trim eq 'leading';
301	98	100	66			463	$output =~ s/\s+$// if $trim eq 'both' or $trim eq 'trailing';
302
303	98		100			331	my $lf = $rules->{line_format} \|\| 'none';
304	98	100				213	$output =~ s/^\s*\n/\n/gm if $lf ne 'none';
305	98	50				389	if( $lf eq 'blocks' ) {
		100
		50
		50
306	0					0	$output =~ s/\n{3,}/\n\n/g;
307							} elsif( $lf eq 'multi' ) {
308	3					9	$output =~ s/\n{2,}/\n/g;
309							} elsif( $lf eq 'single' ) {
310	0					0	$output =~ s/\n+/ /g;
311							} elsif( $lf eq 'none' ) {
312							# Do nothing
313							}
314
315							# Substitutions
316	98	100				219	$output =~ s/^/$self->__subst($rules->{line_prefix}, $node, $rules)/gem if $rules->{line_prefix};
	4					12
317	98	100				198	$output = $self->__subst($rules->{__start}, $node, $rules).$output if $rules->{__start};
318	98	100				196	$output = $output.$self->__subst($rules->{__end}, $node, $rules) if $rules->{__end};
319	98	100				238	$output = $self->__subst($rules->{start}, $node, $rules).$output if $rules->{start};
320	98	100				240	$output = $output.$self->__subst($rules->{end}, $node, $rules) if $rules->{end};
321
322							# If the current element is a block and is contained within
323							# another block element, then we will not block the current
324							# element by default. However, if the current element is a block
325							# and is contained within another block element that specifies a
326							# line_format of 'blocks', then we will block the current element.
327	98	100	66			265	$output = "\n\n$output\n\n" if $rules->{block} &&
			66
328							( ! $self->elem_search_lineage( $node, { block => 1 } ) or
329							$self->elem_search_lineage( $node, { line_format => 'blocks' } ) );
330
331							# ...but they are put on their own line
332	98	100	100			328	$output = "\n$output" if $rules->{block} and $node->parent->look_up( _tag => $node->tag ) and $trim ne 'none';
			66
333
334	98					1411	return $output;
335							}
336							}
337
338							# Deprecated. Instead use elem_search_lineage( $node, { block => 1 } ).
339							sub elem_within_block {
340	0			0	0	0	my( $self, $node ) = @_;
341	0					0	foreach my $n ( $node->lineage ) {
342	0	0	0			0	return $n if $self->rules_for_tag($n->tag \|\| '')->{block};
343							}
344	0					0	return 0;
345							}
346
347							=head2 elem_search_lineage
348
349							my $ancestor = $wc->elem_search_lineage( $node, \%rules );
350
351							Searches the lineage of C<$node> and returns the first ancestor node
352							that has rules matching those specified in C<%rules>, or C<undef> if
353							no matching node is found.
354
355							For example, to find out whether C<$node> has an ancestor with rules
356							matching C<{ block =E<gt>1 }>, one could use:
357
358							if( $wc->elem_search_lineage( $node, { block => 1 } ) ) {
359							# do something
360							}
361
362							=cut
363
364							sub elem_search_lineage {
365	15			15	1	27	my( $self, $node, $search_rules ) = @_;
366
367	15					60	foreach my $n ( $node->lineage ) {
368	30					323	my $rules = $self->rules_for_tag( $n->tag );
369
370	30					50	my $matched = 1;
371	30					106	while( my($k,$v) = each %$search_rules ) {
372	30		100			116	my $rule_value = $rules->{$k} \|\| '';
373	30	100				146	$matched = 0 unless $v eq $rule_value;
374							}
375
376	30	100				131	return $n if $matched;
377							}
378
379	14					76	return undef;
380							}
381
382							sub __subst {
383	51			51		93	my( $self, $subst, $node, $rules ) = @_;
384	51	100				185	return ref $subst eq 'CODE' ? $subst->( $self, $node, $rules ) : $subst;
385							}
386
387							sub __preserve_start {
388	4			4		9	my( $self, $node, $rules ) = @_;
389
390	4					10	my $tag = $node->tag;
391	4	50				35	my @attrs = exists $rules->{attributes} ? @{$rules->{attributes}} : ( );
	0					0
392	4					18	my $attr_str = $self->get_attr_str( $node, @attrs );
393	4	50				14	my $slash = $rules->{empty} ? ' /' : '';
394
395	4	50				7	return '<'.$tag.' '.$attr_str.$slash.'>' if $attr_str;
396	4					19	return '<'.$tag.$slash.'>';
397							}
398
399							# Maps a tag name to its URI attribute
400							my %rel2abs = ( a => 'href', img => 'src' );
401
402							my %allowedEmptyTag = ( %HTML::Tagset::emptyElement, '~comment' => 1, '~text' => 1 );
403							my %isKnownTag = %HTML::Tagset::isKnown;
404
405							sub __preprocess_tree {
406	24			24		37	my $self = shift;
407
408	24					59	$self->__root->objectify_text();
409
410	24					9209	$self->preprocess_tree($self->__root);
411
412	24	50				129	HTML::WikiConverter::Normalizer->new->normalize($self->__root) if $self->normalize;
413
414	23	50				213	my %strip_tag = map { $_ => 1 } @{ $self->strip_tags \|\| [] };
	71					184
	23					126
415	23					276	my %passthrough_naked_tags = map { $_ => 1 } $self->__passthrough_naked_tags;
	0					0
416
417	23					59	foreach my $node ( $self->__root->descendents ) {
418	110	100				5498	$node->tag('') unless $node->tag;
419	110	100				915	$node->delete, next if $strip_tag{$node->tag};
420	94	50	33			776	$node->replace_with_content->delete, next if $passthrough_naked_tags{$node->tag} and !$node->all_external_attr_names;
421	94					721	$self->__rm_invalid_text($node);
422	94	100	100			421	$node->delete, next if $self->strip_empty_tags and !$allowedEmptyTag{$node->tag} and $self->__elem_is_empty($node);
			100
423	84	100	100			345	$self->__encode_entities($node) if $node->tag eq '~text' and $self->escape_entities;
424	84	100	100			1095	$self->__rel2abs($node) if $self->base_uri and exists $rel2abs{$node->tag};
425	84					11651	$self->preprocess_node($node);
426							}
427
428							# Reobjectify in case preprocessing added new text
429	23					169	$self->__root->objectify_text();
430
431	23	100				764	$self->preprocess->( $self->__root ) if ref $self->preprocess;
432							}
433
434							sub __passthrough_naked_tags {
435	23			23		42	my $self = shift;
436
437	23					28	my @tags;
438	23	50				105	if( ref $self->passthrough_naked_tags eq 'ARRAY' ) {
		50
439	0					0	@tags = @{ $self->passthrough_naked_tags };
	0					0
440							} elsif( $self->passthrough_naked_tags ) {
441	0					0	@tags = $self->__default_passthrough_naked_tags;
442							} else {
443	23					36	@tags = ( );
444							}
445
446	23					106	return @tags;
447							}
448
449							# (bug #28402)
450	0			0		0	sub __default_passthrough_naked_tags { qw/ tbody thead span div font / }
451
452							sub __elem_is_empty {
453	26			26		293	my( $self, $node ) = @_;
454	26					64	my $content = $self->get_elem_contents($node);
455	26	100	66			145	my $has_nonwhitespace = $content && length $content ? $content =~ /\S/ : 0;
456	26					148	return !$has_nonwhitespace;
457							}
458
459							sub __fetch_html_from_uri {
460	1			1		3	my( $self, $uri ) = @_;
461	1					9	my $ua = $self->__user_agent;
462	1					7	my $res = $ua->get($uri);
463	1	50				628118	croak "request for <$uri> failed" unless $res->is_success;
464	1		50			30	my $encoding = $self->encoding \|\| $self->__guess_encoding($res) \|\| 'utf-8';
465	1					8	my $html = encode( $self->encoding, decode( $encoding, $res->content ) );
466	1					2403	return $html;
467							}
468
469							sub __guess_encoding {
470	0			0		0	my( $self, $res ) = @_;
471	0	0				0	carp "LWP::Charset is not installed but is required for determining the charset claimed by the content at the requested URI", return
472							unless eval "use LWP::Charset; 1";
473	0					0	return LWP::Charset::getCharset($res);
474							}
475
476							sub __user_agent {
477	1			1		4	my $self = shift;
478	1	50				12	$self->user_agent( $self->__default_user_agent ) unless $self->user_agent;
479	1					9	return $self->user_agent;
480							}
481
482							sub __default_user_agent {
483	1	50		1		289	croak "LWP is not installed but is required for fetching URIs" unless eval "use LWP::UserAgent; 1";
	1			1		13
	1					5
	1					19
484	1					10	return LWP::UserAgent->new( agent => shift->__default_ua_string );
485							}
486
487	1			1		11	sub __default_ua_string { "html2wiki/$VERSION" }
488
489							# Encodes high-bit and control chars in node's text to HTML entities.
490							sub __encode_entities {
491	19			19		76	my( $self, $node ) = @_;
492	19	50				66	my $text = defined $node->attr('text') ? $node->attr('text') : '';
493	19					441	encode_entities( $text, '<>&' );
494	19					1254	$node->attr( text => $text );
495							}
496
497							# Convert relative to absolute URIs
498							sub __rel2abs {
499	5			5		67	my( $self, $node ) = @_;
500	5					16	my $attr = $rel2abs{$node->tag};
501	5	50				43	return unless $node->attr($attr); # don't add attribute if it's not already there
502	5					77	$node->attr( $attr => uri_unescape( URI->new_abs( $node->attr($attr), $self->base_uri )->as_string ) );
503							}
504
505							# Removes text nodes directly inside container elements.
506							my %containers = map { $_ => 1 } qw/ table tbody tr ul ol dl menu /;
507
508							sub __rm_invalid_text {
509	94			94		140	my( $self, $node ) = @_;
510	94	50				221	my $tag = defined $node->tag ? $node->tag : '';
511	94	50				1467	if( $containers{$tag} ) {
512	0					0	$_->delete for grep { $_->tag eq '~text' } $node->content_list;
	0					0
513							}
514							}
515
516							sub strip_aname {
517	0			0	0	0	my( $self, $node ) = @_;
518	0	0				0	return if $node->attr('href');
519	0					0	$node->replace_with_content->delete();
520							}
521
522							sub caption2para {
523	0			0	0	0	my( $self, $node ) = @_;
524	0					0	my $table = $node->parent;
525	0					0	$node->detach();
526	0					0	$table->preinsert($node);
527	0					0	$node->tag('p');
528							}
529
530	24			24	0	46	sub preprocess_tree { }
531	84			84	0	178	sub preprocess_node { }
532
533							sub __postprocess_output {
534	23			23		38	my( $self, $outref ) = @_;
535	23					143	$$outref =~ s/\n[\s^\n]+\n/\n\n/g; # XXX this is causing bug 14527
536	23					122	$$outref =~ s/\n{2,}/\n\n/g;
537	23					75	$$outref =~ s/^\n+//;
538	23					96	$$outref =~ s/\s+$//;
539	23					56	$$outref =~ s/[ \t]+$//gm;
540	23					69	$self->postprocess_output($outref);
541							}
542
543	23			23	0	42	sub postprocess_output { }
544
545	0			0	0	0	sub attributes { {} }
546
547							sub __load_attribute_specs {
548	5			5		19	my $self = shift;
549
550							# Get default attribute specs
551	5					29	my $default_specs = $self->__default_attribute_specs;
552
553							# Get dialect attribute specs
554	5					24	my @dialect_specs = $self->attributes;
555	5	50	33			64	my $dialect_specs = @dialect_specs == 1 && ref $dialect_specs[0] eq 'HASH' ? $dialect_specs[0] : {@dialect_specs};
556
557	5					53	my %attr_specs = %$default_specs;
558	5					29	while( my( $attr, $spec ) = each %$dialect_specs ) {
559	13					47	$attr_specs{$attr} = $spec;
560							}
561
562	5					31	$self->__attribute_specs( \%attr_specs );
563							}
564
565							sub __load_and_validate_attributes {
566	5			5		8	my $self = shift;
567
568	5					8	my %attrs = eval { validate( @_, $self->__attribute_specs ) };
	5					13
569	5	100				317	$self->__attribute_error($@) if $@;
570
571	4					17	while( my( $attr, $value ) = each %attrs ) {
572	64					434	$self->$attr($value);
573							}
574							}
575
576							sub __attribute_error {
577	1			1		2	my( $self, $error ) = @_;
578							# Validating attributes failed, so we don't have access to the
579							# 'dialect' attribute; obtain it from the package name instead
580	1					3	( my $dialect = ref $self ) =~ s/.*://;
581
582	1	50				10	$error = sprintf "The attribute '%s' does not exist in the dialect '%s'.", $1, $dialect
583							if $error =~ /not listed in the validation options\: (\w+)/;
584
585	1					172	croak $error;
586							}
587
588	0			0	0	0	sub rules { {} }
589
590							sub __load_rules {
591	23			23		38	my $self = shift;
592	23					80	$self->__rules( $self->rules );
593							}
594
595							# Rules for validating rules
596							my %meta_rules = (
597							trim => { range => [ qw/ none both leading trailing / ] },
598							line_format => { range => [ qw/ none single multi blocks / ] },
599							replace => { singleton => 1 },
600							alias => { singleton => 1 },
601							attributes => { depends => [ qw/ preserve / ] },
602							empty => { depends => [ qw/ preserve / ] },
603							passthrough => { singleton => 1 },
604							);
605
606							sub __validate_rules {
607	23			23		33	my $self = shift;
608
609	23					27	foreach my $tag ( keys %{ $self->__rules } ) {
	23					40
610	185					351	my $rules = $self->__rules->{$tag};
611
612	185					501	foreach my $opt ( keys %$rules ) {
613	280	100				793	my $spec = $meta_rules{$opt} or next;
614
615	96		100			242	my $singleton = $spec->{singleton} \|\| 0;
616	96	50				246	my @disallows = ref $spec->{disallows} eq 'ARRAY' ? @{ $spec->{disallows} } : ( );
	0					0
617	96	50				187	my @depends = ref $spec->{depends} eq 'ARRAY' ? @{ $spec->{depends} } : ( );
	0					0
618	96	100				193	my @range = ref $spec->{range} eq 'ARRAY' ? @{ $spec->{range} } : ( );
	30					76
619	96					144	my %range = map { $_ => 1 } @range;
	120					247
620
621	96	50	66			423	$self->__rule_error( $tag, "'$opt' cannot be combined with any other option" )
622							if $singleton and keys %$rules != 1;
623
624							$rules->{$_} && $self->__rule_error( $tag, "'$opt' cannot be combined with '$_'" )
625	96		0			177	foreach @disallows;
626
627							! $rules->{$_} && $self->__rule_error( $tag, "'$opt' must be combined with '$_'" )
628	96		0			137	foreach @depends;
629
630	96	50	66			520	$self->__rule_error( $tag, "Unknown '$opt' value '$rules->{$opt}'. '$opt' must be one of ", join(', ', map { "'$_'" } @range) )
	0					0
631							if @range and ! exists $range{$rules->{$opt}};
632							}
633							}
634							}
635
636							sub __rule_error {
637	0			0		0	my( $self, $tag, @msg ) = @_;
638	0					0	my $dialect = ref $self;
639	0					0	croak @msg, " in tag '$tag', dialect '$dialect'.\n";
640							}
641
642							sub get_elem_contents {
643	128			128	0	187	my( $self, $node ) = @_;
644	128					344	my $str = join '', map { $self->__wikify($_) } $node->content_list;
	117					817
645	128	50				959	return defined $str ? $str : '';
646							}
647
648							sub get_wiki_page {
649	5			5	0	111	my( $self, $uri ) = @_;
650	5	50				30	my @wiki_uris = ref $self->wiki_uri eq 'ARRAY' ? @{$self->wiki_uri} : $self->wiki_uri;
	5					23
651	5					20	foreach my $wiki_uri ( @wiki_uris ) {
652	12					34	my $page = $self->__extract_wiki_page( $uri, $wiki_uri );
653	12	100				451	return $page if $page;
654							}
655
656	2					7	return undef;
657							}
658
659							sub __extract_wiki_page {
660	12			12		20	my( $self, $uri, $wiki_uri ) = @_;
661	12	50				24	return undef unless $wiki_uri;
662
663	12	50				39	if( ref $wiki_uri eq 'Regexp' ) {
		100
664	0	0				0	return $uri =~ $wiki_uri ? $1 : undef;
665							} elsif( ref $wiki_uri eq 'CODE' ) {
666	3					11	return $wiki_uri->( $self, URI->new($uri) );
667							} else {
668							# Ensure $wiki_uri is absolute
669	9					42	$wiki_uri = URI->new_abs( $wiki_uri, $self->base_uri )->as_string;
670
671	9	100				871	return undef unless index( $uri, $wiki_uri ) == 0;
672	2	50				10	return undef unless length $uri > length $wiki_uri;
673	2					10	return substr( $uri, length $wiki_uri );
674							}
675							}
676
677							# Adapted from Kwiki source
678							my $UPPER = '\p{UppercaseLetter}';
679							my $LOWER = '\p{LowercaseLetter}';
680							my $WIKIWORD = "$UPPER$LOWER\\p{Number}\\p{ConnectorPunctuation}";
681
682	0			0	0	0	sub is_camel_case { return $_[1] =~ /(?:[$UPPER](?=[$WIKIWORD][$UPPER])(?=[$WIKIWORD][$LOWER])[$WIKIWORD]+)/ }
683
684							sub get_attr_str {
685	4			4	0	8	my( $self, $node, @attrs ) = @_;
686	4					9	my %attrs = map { $_ => $node->attr($_) } @attrs;
	0					0
687	4					9	my $str = join ' ', map { $_.'="'.encode_entities($attrs{$_}).'"' } grep { $attrs{$_} } @attrs;
	0					0
	0					0
688
689							# (bug #19046) partial fix: attributes must be contained on a single line
690	4	50				9	$str =~ s/[\n\r]/ /g if $str;
691
692	4	50				16	return defined $str ? $str : '';
693							}
694
695							=head2 given_html
696
697							my $html = $wc->given_html;
698
699							Returns the HTML passed to or fetched (ie, from a file or URI) by the
700							last C<html2wiki()> method call. Useful for debugging.
701
702							=cut
703
704	24			24	1	93	sub given_html { shift->_attr( { internal => 1 }, __given_html => @_ ) }
705
706							=head2 parsed_html
707
708							my $parsed_html = $wc->parsed_html;
709
710							Returns a string containing the post-processed HTML from the last
711							C<html2wiki> call. Post-processing includes parsing by
712							L<HTML::TreeBuilder>, CSS normalization by
713							L<HTML::WikiConverter::Normalizer>, and calls to the C<preprocess> and
714							C<preprocess_tree> dialect methods.
715
716							=cut
717
718	23			23	1	18412	sub parsed_html { shift->_attr( { internal => 1 }, __parsed_html => @_ ) }
719
720							=head2 available_dialects
721
722							my @dialects = HTML::WikiConverter->available_dialects;
723
724							Returns a list of all available dialects by searching the directories
725							in C<@INC> for C<HTML::WikiConverter::> modules.
726
727							=cut
728
729							sub available_dialects {
730	0			0	1	0	my @dialects;
731
732							my %seen;
733	0					0	for my $inc ( @INC ) {
734	0					0	my $dir = File::Spec->catfile( $inc, 'HTML', 'WikiConverter' );
735	0	0				0	my $dh = DirHandle->new( $dir ) or next;
736	0					0	while ( my $f = $dh->read ) {
737	0	0				0	next unless $f =~ /^(\w+)\.pm$/;
738	0					0	my $dialect = $1;
739
740	0	0				0	next if $seen{$dialect}++;
741	0	0	0			0	next if $dialect eq 'Normalizer' or $dialect eq 'WebApp';
742
743	0					0	push @dialects, $dialect;
744							}
745							}
746
747	0	0				0	return wantarray ? sort @dialects : @dialects;
748							}
749
750							=head2 rules_for_tag
751
752							my $rules = $wc->rules_for_tag( $tag );
753
754							Returns the rules that will be used for converting elements of the
755							given tag. Follows C<alias> references. Note that the rules used for a
756							particular tag may depend on the current set of attributes being used.
757
758							=cut
759
760							sub rules_for_tag {
761	133			133	1	895	my( $self, $tag ) = @_;
762	133					294	my $rules = $self->__rules_for_tag($tag);
763	133	100				422	return $rules->{alias} ? $self->__rules_for_tag( $rules->{alias} ) : $rules;
764							}
765
766							sub __rules_for_tag {
767	142			142		195	my( $self, $tag ) = @_;
768	142	100				276	return $self->__rules->{$tag} if $self->__rules->{$tag};
769	91	50	66			255	return $self->__rules->{UNKNOWN} if $self->__rules->{UNKNOWN} and !$isKnownTag{$tag};
770	91					337	return { };
771							}
772
773							=head1 ATTRIBUTES
774
775							You may configure C<HTML::WikiConverter> using a number of
776							attributes. These may be passed as arguments to the C<new>
777							constructor, or can be called as object methods on an H::WC object.
778
779							Some dialects allow other attributes in addition to those below, and
780							may override the attributes' default values. Consult the dialect's
781							documentation for details.
782
783							=head2 base_uri
784
785							URI to use for converting relative URIs to absolute ones. This
786							effectively ensures that the C<src> and C<href> attributes of image
787							and anchor tags, respectively, are absolute before converting the HTML
788							to wiki markup, which is necessary for wiki dialects that handle
789							internal and external links separately. Relative URIs are only
790							converted to absolute ones if the C<base_uri> argument is
791							present. Defaults to C<undef>.
792
793							=head2 dialect
794
795							(Required) Dialect to use for converting HTML into wiki markup. See
796							the L</"DESCRIPTION"> section above for a list of dialects. C<new()>
797							will fail if the dialect given is not installed on your system. Use
798							C<available_dialects()> to list installed dialects.
799
800							=head2 encoding
801
802							Specifies the encoding used by the HTML to be converted. Also
803							determines the encoding of the wiki markup returned by the
804							C<html2wiki> method. Defaults to C<"utf8">.
805
806							=head2 escape_entities
807
808							Passing C<escape_entities> a true value uses L<HTML::Entities> to
809							encode potentially unsafe 'E<lt>', 'E<gt>', and 'E<amp>' characters.
810							Defaults to true.
811
812							=head2 p_strict
813
814							Boolean indicating whether L<HTML::TreeBuilder> will use strict
815							handling of paragraph tags when parsing HTML input. (This corresponds
816							to the C<p_strict> method in the L<HTML::TreeBuilder> module.) Enabled
817							by default.
818
819							=head2 passthrough_naked_tags
820
821							Boolean indicating whether tags with no attributes ("naked" tags)
822							should be removed and replaced with their content. By default, this
823							only applies to non-semantic tags such as E<lt>spanE<gt>,
824							E<lt>divE<gt>, etc., but does not apply to semantic tags such as
825							E<lt>strongE<gt>, E<lt>addressE<gt>, etc. To override this behavior
826							and specify the tags that should be considered for passthrough,
827							provide this attribute with a reference to an array of tag names.
828							Defaults to false, but you'll probably want to enable it.
829
830							=head2 preprocess
831
832							Code reference that gets invoked after HTML is parsed but before it is
833							converted into wiki markup. The callback is passed two arguments: the
834							C<HTML::WikiConverter> object and a L<HTML::Element> pointing to the
835							root node of the HTML tree created by L<HTML::TreeBuilder>.
836
837							=head2 slurp
838
839							Boolean that, if enabled, bypasses C<HTML::Parser>'s incremental
840							parsing (thus I<slurping> the file in all at once) of files when
841							reading HTML files. If L<File::Slurp> is installed, its C<read_file()>
842							function will be used to perform slurping; otherwise, a common Perl
843							idiom will be used for slurping instead. This option is only used if
844							you call C<html2wiki()> with the C<file> argument.
845
846							=head2 strip_empty_tags
847
848							Strips elements containing no content (unless those elements
849							legitimately contain no content, such as is the case for C<br> and
850							C<img> tags, for example). Defaults to false.
851
852							=head2 strip_tags
853
854							A reference to an array of tags to be removed from the HTML input
855							prior to conversion to wiki markup. Tag names are the same as those
856							used in L<HTML::Element>. Defaults to C<[ '~comment', 'head',
857							'script', 'style' ]>.
858
859							=head2 user_agent
860
861							Specifies the L<LWP::UserAgent> object to be used when fetching the
862							URI passed to C<html2wiki()>. If unspecified and C<html2wiki()> is
863							passed a URI, a default user agent will be created.
864
865							=head2 wiki_uri
866
867							Takes a URI, regular expression, or coderef (or a reference to an
868							array of elements of these types) used to determine which links are to
869							wiki pages: a link whose C<href> parameter matches C<wiki_uri> will be
870							treated as a link to a wiki page. In addition, C<wiki_uri> will be
871							used to extract the title of the wiki page. The way this is done
872							depends on whether the C<wiki_uri> has been set to a string, regexp,
873							or coderef. The default is C<undef>, meaning that all links will be
874							treated as external links by default.
875
876							If C<wiki_uri> is a string, it is interpreted as a URI template, and
877							it will be assumed that URIs to wiki pages are created by joining
878							C<wiki_uri> with the wiki page title. For example, the English
879							Wikipedia might use C<"http://en.wikipedia.org/wiki/"> as the value of
880							C<wiki_uri>. Ward's wiki might use C<"http://c2.com/cgi/wiki?">. These
881							examples use an absolute C<wiki_uri>, but a relative URI can be used
882							as well; an absolute URI will be created based on the value of
883							C<base_uri>. For example, the Wikipedia example above can be rewritten
884							using C<base_uri> of C<"http://en.wikipedia.org"> and a C<wiki_uri> of
885							C<"/wiki/">.
886
887							C<wiki_uri> can also be a regexp that matches URIs to wiki pages and
888							also extracts the page title from them. For example, the English
889							Wikipedia might use
890							C<qr~http://en\.wikipedia\.org/w/index\.php\?title\=([^&]+)~>.
891
892							C<wiki_uri> can also be a coderef that takes the current
893							C<HTML::WikiConverter> object and a L<URI> object. It should return
894							the title of the wiki page extracted from the URI, or C<undef> if the
895							URI doesn't represent a link to a wiki page.
896
897							As mentioned above, the C<wiki_uri> attribute can either take a single
898							URI/regexp/coderef element or it may be assigned a reference to an
899							array of any number of these elements. This is useful for wikis that
900							have different ways of creating links to wiki pages. For example, the
901							English Wikipedia might use:
902
903							my $wc = new HTML::WikiConverter(
904							dialect => 'MediaWiki',
905							wiki_uri => [
906							'http://en.wikipiedia.org/wiki/',
907							sub { pop->query_param('title') } # requires URI::QueryParam
908							]
909							);
910
911							=head2 wrap_in_html
912
913							Helps L<HTML::TreeBuilder> parse HTML fragments by wrapping HTML in
914							C<E<lt>htmlE<gt>> and C<E<lt>/htmlE<gt>> before passing it through
915							C<html2wiki>. Boolean, enabled by default.
916
917							=cut
918
919							sub __default_attribute_specs { {
920	5			5		130	base_uri => { type => SCALAR, default => '' },
921							dialect => { type => SCALAR, optional => 0 },
922							encoding => { type => SCALAR, default => 'utf-8' },
923							escape_entities => { type => BOOLEAN, default => 1 },
924							normalize => { type => BOOLEAN, default => 1 },
925							p_strict => { type => BOOLEAN, default => 1 },
926							preprocess => { type => CODEREF \| UNDEF, default => undef },
927							strip_empty_tags => { type => BOOLEAN, default => 0 },
928							slurp => { type => BOOLEAN, default => 0 },
929							strip_tags => { type => ARRAYREF, default => [ qw/ ~comment head script style / ] },
930							passthrough_naked_tags => { type => ARRAYREF \| BOOLEAN, default => 0 },
931							user_agent => { type => OBJECT \| UNDEF, default => undef },
932							wiki_uri => { type => SCALAR \| ARRAYREF, default => '' },
933							wrap_in_html => { type => BOOLEAN, default => 1 },
934							} }
935
936							=head1 ADDING A DIALECT
937
938							Consult L<HTML::WikiConverter::Dialects> for documentation on how to
939							write your own dialect module for C<HTML::WikiConverter>. Or if you're
940							not up to the task, drop me an email and I'll have a go at it when I
941							get a spare moment.
942
943							=head1 SEE ALSO
944
945							L<HTML::Tree>, L<Convert::Wiki>
946
947							=head1 AUTHOR
948
949							David J. Iberri, C<< <diberri@cpan.org> >>
950
951							=head1 BUGS
952
953							Please report any bugs or feature requests to
954							C<bug-html-wikiconverter at rt.cpan.org>, or through the web interface at
955							L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter>.
956							I will be notified, and then you'll automatically be notified of progress on
957							your bug as I make changes.
958
959							=head1 SUPPORT
960
961							You can find documentation for this module with the perldoc command.
962
963							perldoc HTML::WikiConverter
964
965							You can also look for information at:
966
967							=over 4
968
969							=item * AnnoCPAN: Annotated CPAN documentation
970
971							L<http://annocpan.org/dist/HTML-WikiConverter>
972
973							=item * CPAN Ratings
974
975							L<http://cpanratings.perl.org/d/HTML-WikiConverter>
976
977							=item * RT: CPAN's request tracker
978
979							L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-WikiConverter>
980
981							=item * Search CPAN
982
983							L<http://search.cpan.org/dist/HTML-WikiConverter>
984
985							=back
986
987							=head1 ACKNOWLEDGEMENTS
988
989							Thanks to Tatsuhiko Miyagawa for suggesting
990							L<Bundle::HTMLWikiConverter> as well as providing code for the
991							C<available_dialects()> class method.
992
993							My thanks also goes to Martin Kudlvasr for catching (and fixing!) a
994							bug in the logic of how HTML files were processed.
995
996							Big thanks to Dave Schaefer for the PbWiki dialect and for the idea
997							behind the new C<attributes()> implementation.
998
999							=head1 COPYRIGHT & LICENSE
1000
1001							Copyright (c) David J. Iberri, all rights reserved.
1002
1003							This program is free software; you can redistribute it and/or modify it
1004							under the same terms as Perl itself.
1005
1006							=cut
1007
1008							1;