File Coverage

blib/lib/WWW/Webrobot/Html2XHtml.pm
Criterion Covered Total %
statement 38 46 82.6
branch 5 16 31.2
condition 2 18 11.1
subroutine 8 8 100.0
pod 2 3 66.6
total 55 91 60.4


line stmt bran cond sub pod time code
1             package WWW::Webrobot::Html2XHtml;
2 1     1   585 use strict;
  1         2  
  1         23  
3 1     1   4 use warnings;
  1         2  
  1         21  
4              
5             # Author: Stefan Trcek
6             # Copyright(c) 2004 ABAS Software AG
7              
8             =head1 NAME
9              
10             WWW::Webrobot::Html2XHtml - convert HTML to XML
11              
12             =head1 SYNOPSIS
13              
14             use WWW::Webrobot::Html2XHtml;
15             my $converter = WWW::Webrobot::Html2XHtml -> new();
16             $converter->to_xhtml($dirty_html, $encoding);
17              
18              
19             =head1 DESCRIPTION
20              
21             =head1 METHODS
22              
23             =over
24              
25             =cut
26              
27              
28 1     1   1001 use HTML::TreeBuilder;
  1         32369  
  1         12  
29 1     1   41 use HTML::Entities;
  1         3  
  1         7586  
30 1     1   680 use WWW::Webrobot::MyEncode qw/has_Encode octet_to_internal_utf8/;
  1         3  
  1         622  
31              
32              
33             my $XML_HEADER = qq(\n);
34              
35              
36             my %e2c =
37             map {$_ => pack("U", ord $HTML::Entities::entity2char{$_})}
38             grep {my $value = ord($HTML::Entities::entity2char{$_}); 128 <= $value && $value < 256}
39             keys %HTML::Entities::entity2char;
40              
41              
42             =item new
43              
44             Constructor
45              
46             =cut
47              
48             sub new {
49 4     4 1 5274 my $class = shift;
50 4   33     34 my $self = bless({}, ref($class) || $class);
51 4         12 return $self;
52             }
53              
54             sub html_decode_entities_utf8 {
55 4     4 0 8 my ($value) = @_;
56 4         26 foreach ($value) {
57 4 50 33     22 s/(&\#(\d+);?)/ 128<=$2 && $2<256 ? pack("U", $2) : $1 /eg;
  3         26  
58 4 0 0     15 s/(&\#[xX]([0-9a-fA-F]+);?)/ my $c = hex($2); 128<=$c && $c<256 ? pack("U", $c) : $1 /eg;
  0         0  
  0         0  
59 4 100       25 s/(&(\w+);?)/ $e2c{$2} || $1 /eg;
  5         36  
60             }
61 4         9 return $value;
62             }
63              
64             =item to_xhtml($dirty_html, $encoding)
65              
66             Convert C<$dirty_html> to XML.
67             C<$dirty_html> is a sequence of octets and is assumend to be
68             coded in C<$encoding>.
69              
70             =cut
71              
72             sub to_xhtml {
73 4     4 1 17 my ($self, $dirty_html, $encoding) = @_;
74             #return "NO VALID ENCODING='$encoding'\n" if ! $encoding;
75              
76 4         26 my $parser = new HTML::TreeBuilder();
77 4         831 $parser->no_space_compacting(1);
78 4         41 $parser->ignore_ignorable_whitespace(0);
79              
80             # Encode $dirty_html to Perls internal encoding UTF-8.
81 4         34 $dirty_html = octet_to_internal_utf8($encoding, $dirty_html);
82              
83             # Decode HTML entities, because HTML::TreeBuilder doesn't handle it right.
84             # Can't use HTML::Entities::decode_entities because it uses 'chr($x)'
85             # instead of 'pack("U",$x)'
86 4         10 $dirty_html = html_decode_entities_utf8($dirty_html);
87              
88             # Parse $dirty_html and encode all remaining bytes as html entities.
89             # That works because all non-ASCII UTF-8 character bytes are 1xxxxxxx
90 4         57 my $tree = $parser->parse($dirty_html);
91 4         3690 my $xml = $XML_HEADER . $tree->as_XML();
92             # $xml has all byte encoded as &#xx;
93 4         3684 $tree = $tree -> delete;
94              
95 4 50       321 if (! has_Encode()) {
    50          
96             # Decode UTF-8 characters and control characters, $xml is ASCII
97 0 0 0     0 $xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;
  0         0  
98             }
99             elsif (Encode::is_utf8($xml)) { # SunOS 5.7 / perl 5.8.5
100             # Decode UTF-8 characters and control characters, $xml is UTF-8
101 4 0 0     14 $xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("U", $2) /eg;
  0         0  
102             }
103             else { # Linux perl 5.8.0/5.8.5, Win32 perl 5.8.0
104             # Decode UTF-8 characters and control characters, $xml is ASCII
105 0 0 0     0 $xml =~ s/(&\#(\d+);)/ 32 <= $2 && $2 < 128 ? $1 : pack("C", $2) /eg;
  0         0  
106             # Now we have an UTF-8 string and must Perl believe so too.
107 0         0 Encode::_utf8_on($xml);
108             }
109              
110 4         65 return $xml;
111             }
112              
113             =back
114              
115             =cut
116              
117             1;