File Coverage

blib/lib/HTML/WikiConverter/Normalizer.pm
Criterion Covered Total %
statement 93 101 92.0
branch 24 36 66.6
condition 5 9 55.5
subroutine 18 20 90.0
pod 3 3 100.0
total 143 169 84.6


line stmt bran cond sub pod time code
1             package HTML::WikiConverter::Normalizer;
2              
3 3     3   25806 use warnings;
  3         6  
  3         106  
4 3     3   18 use strict;
  3         6  
  3         98  
5              
6 3     3   18 use Carp;
  3         6  
  3         215  
7              
8 3     3   3475 use CSS;
  3         73478  
  3         108  
9 3     3   21442 use HTML::Element;
  3         90053  
  3         25  
10              
11             =head1 NAME
12              
13             HTML::WikiConverter::Normalizer - Convert CSS styles to (roughly) corresponding HTML
14              
15             =head1 SYNOPSIS
16              
17             use HTML::TreeBuilder;
18             use HTML::WikiConverter::Normalizer;
19              
20             my $tree = new HTML::TreeBuilder();
21             $tree->parse( '<p><font style="font-style:italic; font-weight:bold">text</font></p>' );
22              
23             my $norm = new HTML::WikiConverter::Normalizer();
24             $norm->normalize($tree);
25              
26             # Roughly gives "<p><font><b><i>text</i></b></font></p>"
27             print $tree->as_HTML();
28              
29             =head1 DESCRIPTION
30              
31             L<HTML::WikiConverter> dialects convert HTML into wiki markup. Most
32             (if not all) know nothing about CSS, nor do they take it into
33             consideration when performing html-to-wiki conversion. But there is no
34             good reason for, say, C<E<lt>font
35             style="font-weight:bold"E<gt>textE<lt>/fontE<gt>> not to be converted
36             into C<'''text'''> in the MediaWiki dialect. The same is true of other
37             dialects, all of which should be able to use CSS information to
38             produce wiki markup.
39              
40             The issue becomes especially problematic when considering that several
41             WYSIWYG HTML editors (e.g. Mozilla's) produce this sort of CSS-heavy
42             HTML. Prior to C<HTML::WikiConverter::Normalizer>, this HTML would
43             have been essentially converted to text, the CSS information having
44             been ignored by C<HTML::WikiConverter>.
45              
46             C<HTML::WikiConverter::Normalizer> avoids this with a few simple
47             transformations that convert CSS styles into HTML tags.
48              
49             =head1 METHODS
50              
51             =head2 new
52              
53             my $norm = new HTML::WikiConverter::Normalizer();
54              
55             Constructs a new normalizer
56              
57             =cut
58              
59             sub new {
60 30     30 1 13113 my( $pkg, %attrs ) = @_;
61 30         167 my $self = bless \%attrs, $pkg;
62 30         250 $self->{_css} = new CSS( { parser => 'CSS::Parse::Lite' } );
63 30         828 $self->{_handlers} = $self->handlers;
64 30         134 return $self;
65             }
66              
67             =head2 normalize
68              
69             $norm->normalize($elem);
70              
71             Normalizes C<$elem> and all its descendents, where C<$elem> is an
72             L<HTML::Element> object.
73              
74             =cut
75              
76             sub normalize {
77 30     30 1 78 my( $self, $root ) = @_;
78 30         84 $self->_normalize($root);
79 29         697 $self->_postprocess($root);
80             }
81              
82             =head1 SUBCLASSING
83              
84             The following methods may be useful to subclasses.
85              
86             =cut
87              
88 31     31   168 sub _css { shift->{_css} }
89 60     60   174 sub _handlers { shift->{_handlers} }
90              
91             =head2 handlers
92              
93             my $handlers = $self->handlers;
94              
95             Class method returning reference to an array of handlers used to
96             convert CSS to HTML. Each handler is a hashref that specifies the CSS
97             properties and values to match, and the HTML tags and attributes the
98             matched properties will be converted to.
99              
100             The C<type>, C<name>, C<value>, and C<tag> keys may be used to match
101             an element's property or attribute. C<type> may be either C<"css"> if
102             matching a CSS property (in which case C<name> must contain the name
103             of the property, and C<value> must contain the property value to
104             match) or C<"attr"> if matching an HTML tag attribute (in which case
105             C<name> must contain the name of the attribute, and C<value> must
106             contain the attribute value to match).
107              
108             C<value> may be a string (for an exact match), regex (which will be
109             used to match against the element's property or attribute value),
110             coderef (which will be passed the property or attribute value and is
111             expected to return true on match, false otherwise), or C<"*"> (which
112             matches any property or attribute value). A tag or list of tags can
113             also be matched with the C<tag> key, which takes either a string or an
114             arrayref.
115              
116             To specify what actions the handler will take, the C<new_tag>,
117             C<new_attr>, and C<normalizer> keys are used. C<new_tag> is required
118             and indicates the name of the tag that will be created. C<attribute>
119             is optional and indicates the name of the attribute in the new tag
120             that will take the value of the original CSS property. If a coderef is
121             given as the C<normalizer>, it will be passed the value of the
122             property/attribute and should return one suitable to be assigned to
123             the new tag attribute.
124              
125             =cut
126              
127             sub handlers { [
128 30     30 1 825 { type => 'css', name => 'font-family', value => '*', new_tag => 'font', new_attr => 'face' },
129             { type => 'css', name => 'font-size', value => '*', new_tag => 'font', new_attr => 'size', normalizer => \&_normalize_fontsize },
130             { type => 'css', name => 'color', value => '*', new_tag => 'font', new_attr => 'color', normalizer => \&_normalize_color },
131             { type => 'css', name => 'font-weight', value => 'bold', new_tag => 'b' },
132             { type => 'css', name => 'text-decoration', value => 'underline', new_tag => 'u' },
133             { type => 'css', name => 'font-style', value => 'italic', new_tag => 'i' },
134             { type => 'attr', name => 'align', value => 'center', tag => [ qw/ div p / ], new_tag => 'center' },
135              
136             # { type => 'attr', tag => 'font', attr => 'size', value => '*', new_tag => 'span', style => 'font-size' },
137             ] }
138              
139             sub _new_handlers {
140 0     0   0 span_to_font => {
141             xpath => '//[@style[contains(., "font-size")]]',
142             },
143             }
144              
145             sub _normalize_color {
146 3     3   6 my( $self, $color ) = @_;
147 3         9 return $color;
148             }
149              
150             sub _normalize_fontsize {
151 0     0   0 my( $self, $size ) = @_;
152 0         0 return $size;
153             }
154              
155             sub _normalize {
156 30     30   44 my( $self, $node ) = @_;
157 30         209 $self->_normalize_css( $node );
158 29         372 $self->_normalize_attrs( $node );
159             }
160              
161             sub _normalize_css {
162 233     233   2136 my( $self, $node ) = @_;
163              
164 233         630 $node->objectify_text;
165              
166             # Recurse
167 233         13207 $self->_normalize_css($_) for $node->content_list;
168              
169 224 100       2392 my $style_text = $node->attr('style') or return;
170 11         296 my $full_css = "this { $style_text }";
171              
172 11         35 $self->_css->read_string( $full_css );
173 10         25567 my $style = $self->_css->get_style_by_selector('this');
174              
175 10 50       147 my @original_props = @{ $style->{properties} || [] };
  10         63  
176 10         27 my @new_props = ( );
177              
178 10         26 foreach my $prop ( @original_props ) {
179 21         144 my $handler = $self->_find_handler( type => 'css', name => $prop->{property}, value => $prop->{simple_value}, tag => $node->tag );
180 21 100       145 if( $handler ) {
181 9         37 $self->_handle( $handler, $node, $prop->{simple_value} );
182             } else {
183 12         31 push @new_props, $prop;
184             }
185             }
186              
187 10         118 $style->{properties} = \@new_props;
188 10         47 chomp( my $style_string = $style->to_string );
189 10         17135 $style_string =~ s/^this \{\s*(.*?)\s*\}$/$1/;
190 10         35 $style_string =~ s/\s+$//;
191 10   100     55 $style_string ||= undef;
192              
193 10         46 $node->attr( style => $style_string );
194 10         163 $self->_css->purge();
195             }
196              
197             sub _normalize_attrs {
198 174     174   1340 my( $self, $node ) = @_;
199              
200 174         442 $node->objectify_text();
201              
202             # Recurse
203 174         3373 $self->_normalize_attrs($_) for $node->content_list;
204            
205 174         1955 foreach my $attr ( $node->all_external_attr_names ) {
206 39         432 my $attr_value = $node->attr($attr);
207 39         529 my $handler = $self->_find_handler( type => 'attr', name => $attr, value => $attr_value, tag => $node->tag );
208 39 100       196 if( $handler ) {
209 1         6 $self->_handle( $handler, $node, $attr_value );
210 1         18 $node->attr( $attr => undef );
211             }
212             }
213             }
214              
215 29     29   79 sub _postprocess { }
216              
217             sub _handle {
218 10     10   25 my( $self, $handler, $node, $value ) = @_;
219 10 100       42 $value = $handler->{normalizer} ? $handler->{normalizer}->( $self, $value ) : $value;
220              
221 10         40 my %elem_attrs = ( );
222 10 100       126 $elem_attrs{$handler->{new_attr}} = $value if $handler->{new_attr};
223              
224 10         61 my $new_elem = new HTML::Element( $handler->{new_tag}, %elem_attrs );
225              
226 10         314 foreach my $c ( $node->content_list ) {
227 10         95 $c->detach;
228 10         173 $new_elem->push_content($c);
229             }
230              
231 10         237 $node->push_content($new_elem);
232             }
233              
234             sub _find_handler {
235 60     60   611 my( $self, %args ) = @_;
236              
237 60         151 my @arg_keys = qw/ type name value /;
238 60         102 for my $arg ( @arg_keys ) {
239 180 50       539 if( not exists $args{$arg} ) {
240 0         0 my( $t, $n, $v ) = @args{ @arg_keys };
241 0   0     0 $_ ||= '' for ( $t, $n, $v );
242              
243 0         0 croak sprintf "missing required '$arg' key (type: %s, name: %s, value: %s)", $t, $n, $v;
244             }
245             }
246              
247 60         141 my $type = $args{type};
248 60         181 my $name = $args{name};
249 60         91 my $value = $args{value};
250 60   50     163 my $tag = $args{tag} || '';
251              
252 60         88 foreach my $handler ( @{ $self->_handlers } ) {
  60         157  
253 393 100       860 next unless $handler->{type} eq $type;
254 147 100       717 next unless $handler->{name} eq $name;
255 10 50 66     41 next if $handler->{tag} and ! $self->_match_handler_tag( $handler->{tag}, $tag );
256              
257 10 50       68 if( ref $handler->{value} eq 'Regexp' ) {
    50          
    100          
258 0 0       0 return $handler if $value =~ $handler->{value}
259             } elsif( ref $handler->{value} eq 'CODE' ) {
260 0 0       0 return $handler if $handler->{value}->( $value );
261             } elsif( $handler->{value} eq '*') {
262 4         22 return $handler;
263             } else {
264 6 50       38 return $handler if $handler->{value} eq $value;
265             }
266             }
267              
268 50         193 return;
269             }
270              
271             sub _match_handler_tag {
272 1     1   4 my( $self, $handler_tag, $tag ) = @_;
273 1 50       7 my %handler_tags = map { $_ => 1 } ( ref $handler_tag eq 'ARRAY' ? @$handler_tag : $handler_tag );
  2         9  
274 1 50       278 return $handler_tags{$tag} ? 1 : 0;
275             }
276              
277             =head1 SEE ALSO
278              
279             L<CSS>
280              
281             =head1 AUTHOR
282              
283             David J. Iberri, C<< <diberri@cpan.org> >>
284              
285             =head1 BUGS
286              
287             Please report any bugs or feature requests to C<bug-html-wikiconverter
288             at rt.cpan.org>, or through the web interface at
289             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter>.
290             I will be notified, and then you'll automatically be notified of
291             progress on your bug as I make changes.
292              
293             =head1 COPYRIGHT & LICENSE
294              
295             Copyright 2006 David J. Iberri, all rights reserved.
296              
297             This program is free software; you can redistribute it and/or modify it
298             under the same terms as Perl itself.
299              
300             =cut
301              
302             1;