File Coverage

blib/lib/IO/HTML.pm
Criterion Covered Total %
statement 99 103 96.1
branch 75 94 79.7
condition 31 36 86.1
subroutine 14 14 100.0
pod 5 5 100.0
total 224 252 88.8


line stmt bran cond sub pod time code
1             #---------------------------------------------------------------------
2             package IO::HTML;
3             #
4             # Copyright 2020 Christopher J. Madsen
5             #
6             # Author: Christopher J. Madsen
7             # Created: 14 Jan 2012
8             #
9             # This program is free software; you can redistribute it and/or modify
10             # it under the same terms as Perl itself.
11             #
12             # This program is distributed in the hope that it will be useful,
13             # but WITHOUT ANY WARRANTY; without even the implied warranty of
14             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
15             # GNU General Public License or the Artistic License for more details.
16             #
17             # ABSTRACT: Open an HTML file with automatic charset detection
18             #---------------------------------------------------------------------
19              
20 4     4   273882 use 5.008;
  4         47  
21 4     4   22 use strict;
  4         7  
  4         113  
22 4     4   37 use warnings;
  4         6  
  4         148  
23              
24 4     4   25 use Carp 'croak';
  4         5  
  4         284  
25 4     4   2292 use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
  4         41273  
  4         323  
26 4     4   32 use Exporter 5.57 'import';
  4         52  
  4         7321  
27              
28             our $VERSION = '1.004';
29             # This file is part of IO-HTML 1.004 (September 26, 2020)
30              
31              
32             our $bytes_to_check ||= 1024;
33             our $default_encoding ||= 'cp1252';
34              
35             our @EXPORT = qw(html_file);
36             our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
37             sniff_encoding);
38              
39             our %EXPORT_TAGS = (
40             rw => [qw( html_file html_file_and_encoding html_outfile )],
41             all => [ @EXPORT, @EXPORT_OK ],
42             );
43              
44             #=====================================================================
45              
46              
47             sub html_file
48             {
49 18     18 1 22584 (&html_file_and_encoding)[0]; # return just the filehandle
50             } # end html_file
51              
52              
53             # Note: I made html_file and html_file_and_encoding separate functions
54             # (instead of making html_file context-sensitive) because I wanted to
55             # use html_file in function calls (i.e. list context) without having
56             # to write "scalar html_file" all the time.
57              
58             sub html_file_and_encoding
59             {
60 36     36 1 28818 my ($filename, $options) = @_;
61              
62 36   100     172 $options ||= {};
63              
64 36 50       1232 open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
65              
66              
67 36         131 my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
68              
69 36 100       76 if (not defined $encoding) {
70 8 50       27 croak "No default encoding specified"
71             unless defined($encoding = $default_encoding);
72 8 50       20 $encoding = find_encoding($encoding) if $options->{encoding};
73             } # end if we didn't find an encoding
74              
75             binmode $in, sprintf(":encoding(%s):crlf",
76 36 100   2   559 $options->{encoding} ? $encoding->name : $encoding);
  2         13  
  2         4  
  2         15  
77              
78 36         2348 return ($in, $encoding, $bom);
79             } # end html_file_and_encoding
80             #---------------------------------------------------------------------
81              
82              
83             sub html_outfile
84             {
85 6     6 1 23188 my ($filename, $encoding, $bom) = @_;
86              
87 6 50       28 if (not defined $encoding) {
    100          
88 0 0       0 croak "No default encoding specified"
89             unless defined($encoding = $default_encoding);
90             } # end if we didn't find an encoding
91             elsif (ref $encoding) {
92 1         5 $encoding = $encoding->name;
93             }
94              
95 6 50       277 open(my $out, ">:encoding($encoding)", $filename)
96             or croak "Failed to open $filename: $!";
97              
98 6 100       4342 print $out "\x{FeFF}" if $bom;
99              
100 6         25 return $out;
101             } # end html_outfile
102             #---------------------------------------------------------------------
103              
104              
105             sub sniff_encoding
106             {
107 72     72 1 19964 my ($in, $filename, $options) = @_;
108              
109 72 100       179 $filename = 'file' unless defined $filename;
110 72   100     186 $options ||= {};
111              
112 72         167 my $pos = tell $in;
113 72 50       161 croak "Could not seek $filename: $!" if $pos < 0;
114              
115 72 50       702 croak "Could not read $filename: $!"
116             unless defined read $in, my($buf), $bytes_to_check;
117              
118 72 50       378 seek $in, $pos, 0 or croak "Could not seek $filename: $!";
119              
120              
121             # Check for BOM:
122 72         114 my $bom;
123 72         86 my $encoding = do {
124 72 100       289 if ($buf =~ /^\xFe\xFF/) {
    100          
    100          
125 4         7 $bom = 2;
126 4         9 'UTF-16BE';
127             } elsif ($buf =~ /^\xFF\xFe/) {
128 8         11 $bom = 2;
129 8         16 'UTF-16LE';
130             } elsif ($buf =~ /^\xEF\xBB\xBF/) {
131 4         8 $bom = 3;
132 4         9 'utf-8-strict';
133             } else {
134 56         131 find_charset_in($buf, $options); # check for
135             }
136             }; # end $encoding
137              
138 72 100       214 if ($bom) {
    100          
139 16 50       71 seek $in, $bom, 1 or croak "Could not seek $filename: $!";
140 16         25 $bom = 1;
141             }
142             elsif (not defined $encoding) { # try decoding as UTF-8
143 28         82 my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
144 28 100 100     1467 if ($buf =~ /^(?: # nothing left over
145             | [\xC2-\xDF] # incomplete 2-byte char
146             | [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char
147             | [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
148             )\z/x and $test =~ /[^\x00-\x7F]/) {
149 12         23 $encoding = 'utf-8-strict';
150             } # end if valid UTF-8 with at least one multi-byte character:
151             } # end if testing for UTF-8
152              
153 72 100 100     284 if (defined $encoding and $options->{encoding} and not ref $encoding) {
      100        
154 9         28 $encoding = find_encoding($encoding);
155             } # end if $encoding is a string and we want an object
156              
157 72 50       397 return wantarray ? ($encoding, $bom) : $encoding;
158             } # end sniff_encoding
159              
160             #=====================================================================
161             # Based on HTML5 8.2.2.2 Determining the character encoding:
162              
163             # Get attribute from current position of $_
164             sub _get_attribute
165             {
166 286     286   502 m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
167              
168 286 100 100     1070 return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
169              
170 94         321 my ($name, $value) = (lc $1, '');
171              
172 94 100       330 if (/\G[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/gc) {
173 84 100       186 if (/\G"/gc) {
    50          
174             # Double-quoted attribute value
175 72         176 /\G([^"]*)("?)/gc;
176 72 100       184 return unless $2; # Incomplete attribute (missing closing quote)
177 63         135 $value = lc $1;
178             } elsif (/\G'/gc) {
179             # Single-quoted attribute value
180 0         0 /\G([^']*)('?)/gc;
181 0 0       0 return unless $2; # Incomplete attribute (missing closing quote)
182 0         0 $value = lc $1;
183             } else {
184             # Unquoted attribute value
185 12         25 /\G([^\x09\x0A\x0C\x0D >]*)/gc;
186 12         24 $value = lc $1;
187             }
188             } # end if attribute has value
189              
190 85 50       301 return wantarray ? ($name, $value) : 1;
191             } # end _get_attribute
192              
193             # Examine a meta value for a charset:
194             sub _get_charset_from_meta
195             {
196 19     19   37 for (shift) {
197 19         84 while (/charset[\x09\x0A\x0C\x0D ]*=[\x09\x0A\x0C\x0D ]*/ig) {
198 16 50 33     171 return $1 if (/\G"([^"]*)"/gc or
      33        
199             /\G'([^']*)'/gc or
200             /\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
201             }
202             } # end for value
203              
204 3         12 return undef;
205             } # end _get_charset_from_meta
206             #---------------------------------------------------------------------
207              
208              
209             sub find_charset_in
210             {
211 81     81 1 14156 for (shift) {
212 81   100     228 my $options = shift || {};
213             # search only the first $bytes_to_check bytes (default 1024)
214 81 100       182 my $stop = length > $bytes_to_check ? $bytes_to_check : length;
215              
216             my $expect_pragma = (defined $options->{need_pragma}
217 81 100       186 ? $options->{need_pragma} : 1);
218              
219 81         230 pos() = 0;
220 81         234 while (pos() < $stop) {
221 211 100       909 if (/\G