File Coverage

blib/lib/encoding/warnings.pm
Criterion Covered Total %
statement 37 40 92.5
branch 11 16 68.7
condition 3 5 60.0
subroutine 7 8 87.5
pod 0 3 0.0
total 58 72 80.5


line stmt bran cond sub pod time code
1             package encoding::warnings;
2             $encoding::warnings::VERSION = '0.11';
3              
4 4     4   58894 use strict;
  4         8  
  4         118  
5 4     4   70 use 5.007;
  4         14  
  4         1497  
6              
7             =head1 NAME
8              
9             encoding::warnings - Warn on implicit encoding conversions
10              
11             =head1 VERSION
12              
13             This document describes version 0.11 of encoding::warnings, released
14             June 5, 2007.
15              
16             =head1 SYNOPSIS
17              
18             use encoding::warnings; # or 'FATAL' to raise fatal exceptions
19              
20             utf8::encode($a = chr(20000)); # a byte-string (raw bytes)
21             $b = chr(20000); # a unicode-string (wide characters)
22              
23             # "Bytes implicitly upgraded into wide characters as iso-8859-1"
24             $c = $a . $b;
25              
26             =head1 DESCRIPTION
27              
28             =head2 Overview of the problem
29              
30             By default, there is a fundamental asymmetry in Perl's unicode model:
31             implicit upgrading from byte-strings to unicode-strings assumes that
32             they were encoded in I<ISO 8859-1 (Latin-1)>, but unicode-strings are
33             downgraded with UTF-8 encoding. This happens because the first 256
34             codepoints in Unicode happen to agree with Latin-1.
35              
36             However, this silent upgrading can easily cause problems, if you happen
37             to mix unicode strings with non-Latin1 data -- i.e. byte-strings encoded
38             in UTF-8 or other encodings. The error will not manifest until the
39             combined string is written to output, at which time it would be impossible
40             to see where the silent upgrading occurred.
41              
42             =head2 Detecting the problem
43              
44             This module simplifies the process of diagnosing such problems. Just put
45             this line on top of your main program:
46              
47             use encoding::warnings;
48              
49             Afterwards, implicit upgrading of high-bit bytes will raise a warning.
50             Ex.: C<Bytes implicitly upgraded into wide characters as iso-8859-1 at
51             - line 7>.
52              
53             However, strings composed purely of ASCII code points (C<0x00>..C<0x7F>)
54             will I<not> trigger this warning.
55              
56             You can also make the warnings fatal by importing this module as:
57              
58             use encoding::warnings 'FATAL';
59              
60             =head2 Solving the problem
61              
62             Most of the time, this warning occurs when a byte-string is concatenated
63             with a unicode-string. There are a number of ways to solve it:
64              
65             =over 4
66              
67             =item * Upgrade both sides to unicode-strings
68              
69             If your program does not need compatibility for Perl 5.6 and earlier,
70             the recommended approach is to apply appropriate IO disciplines, so all
71             data in your program become unicode-strings. See L<encoding>, L<open> and
72             L<perlfunc/binmode> for how.
73              
74             =item * Downgrade both sides to byte-strings
75              
76             The other way works too, especially if you are sure that all your data
77             are under the same encoding, or if compatibility with older versions
78             of Perl is desired.
79              
80             You may downgrade strings with C<Encode::encode> and C<utf8::encode>.
81             See L<Encode> and L<utf8> for details.
82              
83             =item * Specify the encoding for implicit byte-string upgrading
84              
85             If you are confident that all byte-strings will be in a specific
86             encoding like UTF-8, I<and> need not support older versions of Perl,
87             use the C<encoding> pragma:
88              
89             use encoding 'utf8';
90              
91             Similarly, this will silence warnings from this module, and preserve the
92             default behaviour:
93              
94             use encoding 'iso-8859-1';
95              
96             However, note that C<use encoding> actually had three distinct effects:
97              
98             =over 4
99              
100             =item * PerlIO layers for B<STDIN> and B<STDOUT>
101              
102             This is similar to what L<open> pragma does.
103              
104             =item * Literal conversions
105              
106             This turns I<all> literal string in your program into unicode-strings
107             (equivalent to a C<use utf8>), by decoding them using the specified
108             encoding.
109              
110             =item * Implicit upgrading for byte-strings
111              
112             This will silence warnings from this module, as shown above.
113              
114             =back
115              
116             Because literal conversions also work on empty strings, it may surprise
117             some people:
118              
119             use encoding 'big5';
120              
121             my $byte_string = pack("C*", 0xA4, 0x40);
122             print length $byte_string; # 2 here.
123             $byte_string .= ""; # concatenating with a unicode string...
124             print length $byte_string; # 1 here!
125              
126             In other words, do not C<use encoding> unless you are certain that the
127             program will not deal with any raw, 8-bit binary data at all.
128              
129             However, the C<Filter =E<gt> 1> flavor of C<use encoding> will I<not>
130             affect implicit upgrading for byte-strings, and is thus incapable of
131             silencing warnings from this module. See L<encoding/The Filter Option> for more details.
132              
133             =back
134              
135             =head1 CAVEATS
136              
137             For Perl 5.9.4 or later, this module's effect is lexical.
138              
139             For Perl versions prior to 5.9.4, this module affects the whole script,
140             instead of inside its lexical block.
141              
142             =cut
143              
144             # Constants.
145             sub ASCII () { 0 }
146             sub LATIN1 () { 1 }
147             sub FATAL () { 2 }
148              
149             # Install a ${^ENCODING} handler if no other one is already in place.
150             sub import {
151 4     4   27 my $class = shift;
152 4   100     25 my $fatal = shift || '';
153              
154 4         9 local $@;
155 4 50 33     24 return if ${^ENCODING} and ref(${^ENCODING}) ne $class;
156 4 50       8 return unless eval { require Encode; 1 };
  4         1726  
  4         24721  
157              
158 4 50       18 my $ascii = Encode::find_encoding('us-ascii') or return;
159 4 50       1715 my $latin1 = Encode::find_encoding('iso-8859-1') or return;
160              
161             # Have to undef explicitly here
162 4         47 undef ${^ENCODING};
163              
164             # Install a warning handler for decode()
165 4 100       25 my $decoder = bless(
166             [
167             $ascii,
168             $latin1,
169             (($fatal eq 'FATAL') ? 'Carp::croak' : 'Carp::carp'),
170             ], $class,
171             );
172              
173 4         9 ${^ENCODING} = $decoder;
174 4         1861 $^H{$class} = 1;
175             }
176              
177             sub unimport {
178 0     0   0 my $class = shift;
179 0         0 $^H{$class} = undef;
180 0         0 undef ${^ENCODING};
181             }
182              
183             # Don't worry about source code literals.
184             sub cat_decode {
185 6     6 0 10 my $self = shift;
186 6         126 return $self->[LATIN1]->cat_decode(@_);
187             }
188              
189             # Warn if the data is not purely US-ASCII.
190             sub decode {
191 1701     1701 0 34571 my $self = shift;
192              
193             DO_WARN: {
194 1701 50       1740 if ($] >= 5.009004) {
  1701         3535  
195 1701         5331 my $hints = (caller(0))[10];
196 1701 100       5772 $hints->{ref($self)} or last DO_WARN;
197             }
198              
199 6         8 local $@;
200 6         14 my $rv = eval { $self->[ASCII]->decode($_[0], Encode::FB_CROAK()) };
  6         74  
201 6 100       1996 return $rv unless $@;
202              
203 3         31 require Carp;
204 4     4   20 no strict 'refs';
  4         15  
  4         350  
205 3         663 $self->[FATAL]->(
206             "Bytes implicitly upgraded into wide characters as iso-8859-1"
207             );
208              
209             }
210              
211 1696         8389 return $self->[LATIN1]->decode(@_);
212             }
213              
214 2     2 0 56 sub name { 'iso-8859-1' }
215              
216             1;
217              
218             __END__