File Coverage

blib/lib/IO/HTML.pm

Criterion	Covered	Total	%
statement	99	103	96.1
branch	75	94	79.7
condition	31	36	86.1
subroutine	14	14	100.0
pod	5	5	100.0
total	224	252	88.8

line	stmt	bran	cond	sub	pod	time	code
1							#---------------------------------------------------------------------
2							package IO::HTML;
3							#
4							# Copyright 2020 Christopher J. Madsen
5							#
6							# Author: Christopher J. Madsen
7							# Created: 14 Jan 2012
8							#
9							# This program is free software; you can redistribute it and/or modify
10							# it under the same terms as Perl itself.
11							#
12							# This program is distributed in the hope that it will be useful,
13							# but WITHOUT ANY WARRANTY; without even the implied warranty of
14							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
15							# GNU General Public License or the Artistic License for more details.
16							#
17							# ABSTRACT: Open an HTML file with automatic charset detection
18							#---------------------------------------------------------------------
19
20	4			4		273882	use 5.008;
	4					47
21	4			4		22	use strict;
	4					7
	4					113
22	4			4		37	use warnings;
	4					6
	4					148
23
24	4			4		25	use Carp 'croak';
	4					5
	4					284
25	4			4		2292	use Encode 2.10 qw(decode find_encoding); # need utf-8-strict encoding
	4					41273
	4					323
26	4			4		32	use Exporter 5.57 'import';
	4					52
	4					7321
27
28							our $VERSION = '1.004';
29							# This file is part of IO-HTML 1.004 (September 26, 2020)
30
31
32							our $bytes_to_check \|\|= 1024;
33							our $default_encoding \|\|= 'cp1252';
34
35							our @EXPORT = qw(html_file);
36							our @EXPORT_OK = qw(find_charset_in html_file_and_encoding html_outfile
37							sniff_encoding);
38
39							our %EXPORT_TAGS = (
40							rw => [qw( html_file html_file_and_encoding html_outfile )],
41							all => [ @EXPORT, @EXPORT_OK ],
42							);
43
44							#=====================================================================
45
46
47							sub html_file
48							{
49	18			18	1	22584	(&html_file_and_encoding)[0]; # return just the filehandle
50							} # end html_file
51
52
53							# Note: I made html_file and html_file_and_encoding separate functions
54							# (instead of making html_file context-sensitive) because I wanted to
55							# use html_file in function calls (i.e. list context) without having
56							# to write "scalar html_file" all the time.
57
58							sub html_file_and_encoding
59							{
60	36			36	1	28818	my ($filename, $options) = @_;
61
62	36		100			172	$options \|\|= {};
63
64	36	50				1232	open(my $in, '<:raw', $filename) or croak "Failed to open $filename: $!";
65
66
67	36					131	my ($encoding, $bom) = sniff_encoding($in, $filename, $options);
68
69	36	100				76	if (not defined $encoding) {
70	8	50				27	croak "No default encoding specified"
71							unless defined($encoding = $default_encoding);
72	8	50				20	$encoding = find_encoding($encoding) if $options->{encoding};
73							} # end if we didn't find an encoding
74
75							binmode $in, sprintf(":encoding(%s):crlf",
76	36	100		2		559	$options->{encoding} ? $encoding->name : $encoding);
	2					13
	2					4
	2					15
77
78	36					2348	return ($in, $encoding, $bom);
79							} # end html_file_and_encoding
80							#---------------------------------------------------------------------
81
82
83							sub html_outfile
84							{
85	6			6	1	23188	my ($filename, $encoding, $bom) = @_;
86
87	6	50				28	if (not defined $encoding) {
		100
88	0	0				0	croak "No default encoding specified"
89							unless defined($encoding = $default_encoding);
90							} # end if we didn't find an encoding
91							elsif (ref $encoding) {
92	1					5	$encoding = $encoding->name;
93							}
94
95	6	50				277	open(my $out, ">:encoding($encoding)", $filename)
96							or croak "Failed to open $filename: $!";
97
98	6	100				4342	print $out "\x{FeFF}" if $bom;
99
100	6					25	return $out;
101							} # end html_outfile
102							#---------------------------------------------------------------------
103
104
105							sub sniff_encoding
106							{
107	72			72	1	19964	my ($in, $filename, $options) = @_;
108
109	72	100				179	$filename = 'file' unless defined $filename;
110	72		100			186	$options \|\|= {};
111
112	72					167	my $pos = tell $in;
113	72	50				161	croak "Could not seek $filename: $!" if $pos < 0;
114
115	72	50				702	croak "Could not read $filename: $!"
116							unless defined read $in, my($buf), $bytes_to_check;
117
118	72	50				378	seek $in, $pos, 0 or croak "Could not seek $filename: $!";
119
120
121							# Check for BOM:
122	72					114	my $bom;
123	72					86	my $encoding = do {
124	72	100				289	if ($buf =~ /^\xFe\xFF/) {
		100
		100
125	4					7	$bom = 2;
126	4					9	'UTF-16BE';
127							} elsif ($buf =~ /^\xFF\xFe/) {
128	8					11	$bom = 2;
129	8					16	'UTF-16LE';
130							} elsif ($buf =~ /^\xEF\xBB\xBF/) {
131	4					8	$bom = 3;
132	4					9	'utf-8-strict';
133							} else {
134	56					131	find_charset_in($buf, $options); # check for
135							}
136							}; # end $encoding
137
138	72	100				214	if ($bom) {
		100
139	16	50				71	seek $in, $bom, 1 or croak "Could not seek $filename: $!";
140	16					25	$bom = 1;
141							}
142							elsif (not defined $encoding) { # try decoding as UTF-8
143	28					82	my $test = decode('utf-8-strict', $buf, Encode::FB_QUIET);
144	28	100	100			1467	if ($buf =~ /^(?: # nothing left over
145							\| [\xC2-\xDF] # incomplete 2-byte char
146							\| [\xE0-\xEF] [\x80-\xBF]? # incomplete 3-byte char
147							\| [\xF0-\xF4] [\x80-\xBF]{0,2} # incomplete 4-byte char
148							)\z/x and $test =~ /[^\x00-\x7F]/) {
149	12					23	$encoding = 'utf-8-strict';
150							} # end if valid UTF-8 with at least one multi-byte character:
151							} # end if testing for UTF-8
152
153	72	100	100			284	if (defined $encoding and $options->{encoding} and not ref $encoding) {
			100
154	9					28	$encoding = find_encoding($encoding);
155							} # end if $encoding is a string and we want an object
156
157	72	50				397	return wantarray ? ($encoding, $bom) : $encoding;
158							} # end sniff_encoding
159
160							#=====================================================================
161							# Based on HTML5 8.2.2.2 Determining the character encoding:
162
163							# Get attribute from current position of $_
164							sub _get_attribute
165							{
166	286			286		502	m!\G[\x09\x0A\x0C\x0D /]+!gc; # skip whitespace or /
167
168	286	100	100			1070	return if /\G>/gc or not /\G(=?[^\x09\x0A\x0C\x0D =]*)/gc;
169
170	94					321	my ($name, $value) = (lc $1, '');
171
172	94	100				330	if (/\G[\x09\x0A\x0C\x0D ]=[\x09\x0A\x0C\x0D ]/gc) {
173	84	100				186	if (/\G"/gc) {
		50
174							# Double-quoted attribute value
175	72					176	/\G([^"]*)("?)/gc;
176	72	100				184	return unless $2; # Incomplete attribute (missing closing quote)
177	63					135	$value = lc $1;
178							} elsif (/\G'/gc) {
179							# Single-quoted attribute value
180	0					0	/\G([^']*)('?)/gc;
181	0	0				0	return unless $2; # Incomplete attribute (missing closing quote)
182	0					0	$value = lc $1;
183							} else {
184							# Unquoted attribute value
185	12					25	/\G([^\x09\x0A\x0C\x0D >]*)/gc;
186	12					24	$value = lc $1;
187							}
188							} # end if attribute has value
189
190	85	50				301	return wantarray ? ($name, $value) : 1;
191							} # end _get_attribute
192
193							# Examine a meta value for a charset:
194							sub _get_charset_from_meta
195							{
196	19			19		37	for (shift) {
197	19					84	while (/charset[\x09\x0A\x0C\x0D ]=[\x09\x0A\x0C\x0D ]/ig) {
198	16	50	33			171	return $1 if (/\G"([^"]*)"/gc or
			33
199							/\G'([^']*)'/gc or
200							/\G(?!['"])([^\x09\x0A\x0C\x0D ;]+)/gc);
201							}
202							} # end for value
203
204	3					12	return undef;
205							} # end _get_charset_from_meta
206							#---------------------------------------------------------------------
207
208
209							sub find_charset_in
210							{
211	81			81	1	14156	for (shift) {
212	81		100			228	my $options = shift \|\| {};
213							# search only the first $bytes_to_check bytes (default 1024)
214	81	100				182	my $stop = length > $bytes_to_check ? $bytes_to_check : length;
215
216							my $expect_pragma = (defined $options->{need_pragma}
217	81	100				186	? $options->{need_pragma} : 1);
218
219	81					230	pos() = 0;
220	81					234	while (pos() < $stop) {
221	211	100				909	if (/\G