File Coverage

blib/lib/IO/HTML.pm
Criterion Covered Total %
statement 99 103 96.1
branch 75 94 79.7
condition 29 36 80.5
subroutine 14 14 100.0
pod 5 5 100.0
total 222 252 88.1


line stmt bran cond sub pod time code
1             #---------------------------------------------------------------------
2             package IO::HTML;
3             #
4             # Copyright 2015 Christopher J. Madsen
5             #
6             # Author: Christopher J. Madsen
7             # Created: 14 Jan 2012
8             #
9             # This program is free software; you can redistribute it and/or modify
10             # it under the same terms as Perl itself.
11             #
12             # This program is distributed in the hope that it will be useful,
13             # but WITHOUT ANY WARRANTY; without even the implied warranty of
14             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
15             # GNU General Public License or the Artistic License for more details.
16             #
17             # ABSTRACT: Open an HTML file with automatic charset detection
18             #---------------------------------------------------------------------
19              
20 4     4   77457 use 5.008;
  4         14  
21 4     4   21 use strict;
  4         8  
  4         81  
22 4     4   16 use warnings;
  4         7  
  4         126  
23              
24 4     4   15 use Carp 'croak';
  4         8  
  4         289  
25 4     4   3078 use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
  4         41569  
  4         330  
26 4     4   24 use Exporter 5.57 'import';
  4         48  
  4         6702  
27              
28             our $VERSION = '1.003'; # TRIAL VERSION
29             # This file is part of IO-HTML 1.003 (September 26, 2015)
30              
31              
32             our $bytes_to_check ||= 1024;
33             our $default_encoding ||= 'cp1252';
34              
35             our @EXPORT = qw(html_file);
36             our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
37             sniff_encoding);
38              
39             our %EXPORT_TAGS = (
40             rw => [qw( html_file html_file_and_encoding html_outfile )],
41             all => [ @EXPORT, @EXPORT_OK ],
42             );
43              
44             #=====================================================================
45              
46              
47             sub html_file
48             {
49 18     18 1 15325 (&html_file_and_encoding)[0]; # return just the filehandle
50             } # end html_file
51              
52              
53             # Note: I made html_file and html_file_and_encoding separate functions
54             # (instead of making html_file context-sensitive) because I wanted to
55             # use html_file in function calls (i.e. list context) without having
56             # to write "scalar html_file" all the time.
57              
58             sub html_file_and_encoding
59             {
60 36     36 1 69001 my ($filename, $options) = @_;
61              
62 36   100     160 $options ||= {};
63              
64 36 50       1199 open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
65              
66              
67 36         80 my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
68              
69 36 100       89 if (not defined $encoding) {
70 8 50       23 croak "No default encoding specified"
71             unless defined($encoding = $default_encoding);
72 8 50       23 $encoding = find_encoding($encoding) if $options->{encoding};
73             } # end if we didn't find an encoding
74              
75             binmode $in, sprintf(":encoding(%s):crlf",
76 36 100   2   417 $options->{encoding} ? $encoding->name : $encoding);
  2         21  
  2         5  
  2         20  
77              
78 36         5958 return ($in, $encoding, $bom);
79             } # end html_file_and_encoding
80             #---------------------------------------------------------------------
81              
82              
83             sub html_outfile
84             {
85 6     6 1 9989 my ($filename, $encoding, $bom) = @_;
86              
87 6 50       40 if (not defined $encoding) {
    100          
88 0 0       0 croak "No default encoding specified"
89             unless defined($encoding = $default_encoding);
90             } # end if we didn't find an encoding
91             elsif (ref $encoding) {
92 1         4 $encoding = $encoding->name;
93             }
94              
95 6 50       384 open(my $out, ">:encoding($encoding)", $filename)
96             or croak "Failed to open $filename: $!";
97              
98 6 100       36530 print $out "\x{FeFF}" if $bom;
99              
100 6         34 return $out;
101             } # end html_outfile
102             #---------------------------------------------------------------------
103              
104              
105             sub sniff_encoding
106             {
107 72     72 1 13666 my ($in, $filename, $options) = @_;
108              
109 72 100       168 $filename = 'file' unless defined $filename;
110 72   100     173 $options ||= {};
111              
112 72         122 my $pos = tell $in;
113 72 50       139 croak "Could not seek $filename: $!" if $pos < 0;
114              
115 72 50       556 croak "Could not read $filename: $!"
116             unless defined read $in, my($buf), $bytes_to_check;
117              
118 72 50       201 seek $in, $pos, 0 or croak "Could not seek $filename: $!";
119              
120              
121             # Check for BOM:
122 72         77 my $bom;
123 72         132 my $encoding = do {
124 72 100       290 if ($buf =~ /^\xFe\xFF/) {
    100          
    100          
125 4         12 $bom = 2;
126 4         12 'UTF-16BE';
127             } elsif ($buf =~ /^\xFF\xFe/) {
128 8         9 $bom = 2;
129 8         13 'UTF-16LE';
130             } elsif ($buf =~ /^\xEF\xBB\xBF/) {
131 4         5 $bom = 3;
132 4         8 'utf-8-strict';
133             } else {
134 56         106 find_charset_in($buf, $options); # check for
135             }
136             }; # end $encoding
137              
138 72 100       219 if ($bom) {
    100          
139 16 50       46 seek $in, $bom, 1 or croak "Could not seek $filename: $!";
140 16         21 $bom = 1;
141             }
142             elsif (not defined $encoding) { # try decoding as UTF-8
143 28         105 my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
144 28 100 100     1505 if ($buf =~ /^(?: # nothing left over
145             | [\xC2-\xDF] # incomplete 2-byte char
146             | [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char
147             | [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
148             )\z/x and $test =~ /[^\x00-\x7F]/) {
149 12         23 $encoding = 'utf-8-strict';
150             } # end if valid UTF-8 with at least one multi-byte character:
151             } # end if testing for UTF-8
152              
153 72 100 66     273 if (defined $encoding and $options->{encoding} and not ref $encoding) {
      100        
154 9         24 $encoding = find_encoding($encoding);
155             } # end if $encoding is a string and we want an object
156              
157 72 50       329 return wantarray ? ($encoding, $bom) : $encoding;
158             } # end sniff_encoding
159              
160             #=====================================================================
161             # Based on HTML5 8.2.2.2 Determining the character encoding:
162              
163             # Get attribute from current position of $_
164             sub _get_attribute
165             {
166 286     286   509 m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
167              
168 286 100 100     1447 return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
169              
170 94         276 my ($name, $value) = (lc $1, '');
171              
172 94 100       341 if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc) {
173 84 100       279 if (/\G"/gc) {
    50          
174             # Double-quoted attribute value
175 72         188 /\G([^"]*)("?)/gc;
176 72 100       215 return unless $2; # Incomplete attribute (missing closing quote)
177 63         153 $value = lc $1;
178             } elsif (/\G'/gc) {
179             # Single-quoted attribute value
180 0         0 /\G([^']*)('?)/gc;
181 0 0       0 return unless $2; # Incomplete attribute (missing closing quote)
182 0         0 $value = lc $1;
183             } else {
184             # Unquoted attribute value
185 12         31 /\G([^\x09\x0A\x0C\x0D >]*)/gc;
186 12         29 $value = lc $1;
187             }
188             } # end if attribute has value
189              
190 85 50       388 return wantarray ? ($name, $value) : 1;
191             } # end _get_attribute
192              
193             # Examine a meta value for a charset:
194             sub _get_charset_from_meta
195             {
196 19     19   36 for (shift) {
197 19         81 while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) {
198 16 50 33     197 return $1 if (/\G"([^"]*)"/gc or
      33        
199             /\G'([^']*)'/gc or
200             /\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
201             }
202             } # end for value
203              
204 3         14 return undef;
205             } # end _get_charset_from_meta
206             #---------------------------------------------------------------------
207              
208              
209             sub find_charset_in
210             {
211 81     81 1 17215 for (shift) {
212 81   100     272 my $options = shift || {};
213             # search only the first $bytes_to_check bytes (default 1024)
214 81 100       196 my $stop = length > $bytes_to_check ? $bytes_to_check : length;
215              
216             my $expect_pragma = (defined $options->{need_pragma}
217 81 100       188 ? $options->{need_pragma} : 1);
218              
219 81         217 pos() = 0;
220 81         244 while (pos() < $stop) {
221 211 100       975 if (/\G