File Coverage

blib/lib/Test/utf8.pm
Criterion Covered Total %
statement 92 92 100.0
branch 12 12 100.0
condition 16 16 100.0
subroutine 21 21 100.0
pod 6 8 75.0
total 147 149 98.6


line stmt bran cond sub pod time code
1             package Test::utf8;
2              
3 7     7   198438 use 5.007003;
  7         25  
  7         283  
4              
5 7     7   41 use strict;
  7         16  
  7         242  
6 7     7   48 use warnings;
  7         14  
  7         259  
7              
8 7     7   38 use base qw(Exporter);
  7         9  
  7         877  
9              
10 7     7   7833 use Encode;
  7         115279  
  7         725  
11 7     7   11307 use charnames ':full';
  7         374943  
  7         54  
12              
13             our $VERSION = "1.01";
14              
15             our @EXPORT = qw(
16             is_valid_string is_dodgy_utf8 is_sane_utf8
17             is_within_ascii is_within_latin1 is_within_latin_1
18             is_flagged_utf8 isnt_flagged_utf8
19             );
20              
21             # A Regexp string to match valid UTF8 bytes
22             # this info comes from page 78 of "The Unicode Standard 4.0"
23             # published by the Unicode Consortium
24             our $valid_utf8_regexp = <<'REGEX' ;
25             [\x{00}-\x{7f}]
26             | [\x{c2}-\x{df}][\x{80}-\x{bf}]
27             | \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
28             | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
29             | \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
30             | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
31             | \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
32             | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
33             | \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
34             REGEX
35              
36             =head1 NAME
37              
38             Test::utf8 - handy utf8 tests
39              
40             =head1 SYNOPSIS
41              
42             # check the string is good
43             is_valid_string($string); # check the string is valid
44             is_sane_utf8($string); # check not double encoded
45              
46             # check the string has certain attributes
47             is_flagged_utf8($string1); # has utf8 flag set
48             is_within_ascii($string2); # only has ascii chars in it
49             isnt_within_ascii($string3); # has chars outside the ascii range
50             is_within_latin_1($string4); # only has latin-1 chars in it
51             isnt_within_ascii($string5); # has chars outside the latin-1 range
52              
53             =head1 DESCRIPTION
54              
55             This module is a collection of tests useful for dealing with utf8 strings in
56             Perl.
57              
58             This module has two types of tests: The validity tests check if a string is
59             valid and not corrupt, whereas the characteristics tests will check that string
60             has a given set of characteristics.
61              
62             =head2 Validity Tests
63              
64             =over
65              
66             =item is_valid_string($string, $testname)
67              
68             Checks if the string is "valid", i.e. this passes and returns true unless
69             the internal utf8 flag hasn't been set on scalar that isn't made up of a valid
70             utf-8 byte sequence.
71              
72             This should I<never> happen and, in theory, this test should always pass. Unless
73             you (or a module you use) goes monkeying around inside a scalar using Encode's
74             private functions or XS code you shouldn't ever end up in a situation where
75             you've got a corrupt scalar. But if you do, and you do, then this function
76             should help you detect the problem.
77              
78             To be clear, here's an example of the error case this can detect:
79              
80             my $mark = "Mark";
81             my $leon = "L\x{e9}on";
82             is_valid_string($mark); # passes, not utf-8
83             is_valid_string($leon); # passes, not utf-8
84              
85             my $iloveny = "I \x{2665} NY";
86             is_valid_string($iloveny); # passes, proper utf-8
87              
88             my $acme = "L\x{c3}\x{a9}on";
89             Encode::_utf8_on($acme); # (please don't do things like this)
90             is_valid_string($acme); # passes, proper utf-8 byte sequence upgraded
91              
92             Encode::_utf8_on($leon); # (this is why you don't do things like this)
93             is_valid_string($leon); # fails! the byte \x{e9} isn't valid utf-8
94              
95             =cut
96              
97             sub is_valid_string($;$)
98             {
99 12     12 1 8049 my $string = shift;
100 12   100     74 my $name = shift || "valid string test";
101              
102             # check we're a utf8 string - if not, we pass.
103 12 100       70 unless (Encode::is_utf8($string))
104 5         17 { return _pass($name) }
105              
106             # work out at what byte (if any) we have an invalid byte sequence
107             # and return the correct test result
108 7         20 my $pos = _invalid_sequence_at_byte($string);
109 7 100       22 if (_ok(!defined($pos), $name)) { return 1 }
  4         1256  
110 3         1944 _diag("malformed byte sequence starting at byte $pos");
111 3         9 return;
112             }
113              
114             sub _invalid_sequence_at_byte($)
115             {
116 7     7   14 my $string = shift;
117              
118             # examine the bytes that make up the string (not the chars)
119             # by turning off the utf8 flag (no, use bytes doesn't
120             # work, we're dealing with a regexp)
121 7         19 Encode::_utf8_off($string); ## no critic (ProtectPrivateSubs)
122              
123             # work out the index of the first non matching byte
124 7         264 my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
125              
126             # if we matched all the string return the empty list
127 7   100     43 my $pos = pos $string || 0;
128 7 100       27 return if $pos == length($string);
129              
130             # otherwise return the position we found
131 3         7 return $pos
132             }
133              
134             =item is_sane_utf8($string, $name)
135              
136             This test fails if the string contains something that looks like it
137             might be dodgy utf8, i.e. containing something that looks like the
138             multi-byte sequence for a latin-1 character but perl hasn't been
139             instructed to treat as such. Strings that are not utf8 always
140             automatically pass.
141              
142             Some examples may help:
143              
144             # This will pass as it's a normal latin-1 string
145             is_sane_utf8("Hello L\x{e9}eon");
146              
147             # this will fail because the \x{c3}\x{a9} looks like the
148             # utf8 byte sequence for e-acute
149             my $string = "Hello L\x{c3}\x{a9}on";
150             is_sane_utf8($string);
151              
152             # this will pass because the utf8 is correctly interpreted as utf8
153             Encode::_utf8_on($string)
154             is_sane_utf8($string);
155              
156             Obviously this isn't a hundred percent reliable. The edge case where
157             this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
158             LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
159             WITH TILDE") followed by one of the latin-1 punctuation symbols.
160              
161             # a capital letter A with tilde surrounded by smart quotes
162             # this will fail because it'll see the "\x{c2}\x{94}" and think
163             # it's actually the utf8 sequence for the end smart quote
164             is_sane_utf8("\x{93}\x{c2}\x{94}");
165              
166             However, since this hardly comes up this test is reasonably reliable
167             in most cases. Still, care should be applied in cases where dynamic
168             data is placed next to latin-1 punctuation to avoid false negatives.
169              
170             There exists two situations to cause this test to fail; The string
171             contains utf8 byte sequences and the string hasn't been flagged as
172             utf8 (this normally means that you got it from an external source like
173             a C library; When Perl needs to store a string internally as utf8 it
174             does it's own encoding and flagging transparently) or a utf8 flagged
175             string contains byte sequences that when translated to characters
176             themselves look like a utf8 byte sequence. The test diagnostics tells
177             you which is the case.
178              
179             =cut
180              
181             # build my regular expression out of the latin-1 bytes
182             # NOTE: This won't work if our locale is nonstandard will it?
183             my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);
184              
185             sub is_sane_utf8($;$)
186             {
187 6     6 1 862 my $string = shift;
188 6   100     36 my $name = shift || "sane utf8";
189              
190             # regexp in scalar context with 'g', meaning this loop will run for
191             # each match. Should only have to run it once, but will redo if
192             # the failing case turns out to be allowed in %allowed.
193 6         612 while ($string =~ /($re_bit)/o)
194             {
195             # work out what the double encoded string was
196 2         5 my $bytes = $1;
197              
198 2         10 my $index = $+[0] - length($bytes);
199 2         11 my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
  4         25  
200              
201             # what character does that represent?
202 2         15 my $char = Encode::decode("utf8",$bytes);
203 2         132 my $ord = ord($char);
204 2         6 my $hex = sprintf '%00x', $ord;
205 2         12 $char = charnames::viacode($ord);
206              
207             # print out diagnostic messages
208 2         463901 _fail($name);
209 2         1227 _diag(qq{Found dodgy chars "$codes" at char $index\n});
210 2 100       10 if (Encode::is_utf8($string))
211 1         3 { _diag("Chars in utf8 string look like utf8 byte sequence.") }
212             else
213 1         4 { _diag("String not flagged as utf8...was it meant to be?\n") }
214 2         16 _diag("Probably originally a $char char - codepoint $ord (dec),"
215             ." $hex (hex)\n");
216              
217 2         7 return 0;
218             }
219              
220             # got this far, must have passed.
221 4         43 _ok(1,$name);
222 4         1205 return 1;
223             }
224              
225             # historic name of method; deprecated
226 5     5 0 4427 sub is_dodgy_utf8 { goto &is_sane_utf8 }
227              
228             =back
229              
230             =head2 String Characteristic Tests
231              
232             These routines allow you to check the range of characters in a string.
233             Note that these routines are blind to the actual encoding perl
234             internally uses to store the characters, they just check if the
235             string contains only characters that can be represented in the named
236             encoding:
237              
238             =over
239              
240             =item is_within_ascii
241              
242             Tests that a string only contains characters that are in the ASCII
243             character set.
244              
245             =cut
246              
247             sub is_within_ascii($;$)
248             {
249 3     3 1 1664 my $string = shift;
250 3   100     13 my $name = shift || "within ascii";
251              
252             # look for anything that isn't ascii or pass
253 3 100       17 $string =~ /([^\x{00}-\x{7f}])/ or return _pass($name);
254              
255             # explain why we failed
256 1         3 my $dec = ord($1);
257 1         9 my $hex = sprintf '%02x', $dec;
258              
259 1         6 _fail($name);
260 1         460 _diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
261              
262 1         3 return 0;
263             }
264              
265             =item is_within_latin_1
266              
267             Tests that a string only contains characters that are in latin-1.
268              
269             =cut
270              
271             sub is_within_latin_1($;$)
272             {
273 4     4 1 1003 my $string = shift;
274 4   100     24 my $name = shift || "within latin-1";
275              
276             # look for anything that isn't ascii or pass
277 4 100       29 $string =~ /([^\x{00}-\x{ff}])/ or return _pass($name);
278              
279             # explain why we failed
280 1         4 my $dec = ord($1);
281 1         7 my $hex = sprintf '%x', $dec;
282              
283 1         5 _fail($name);
284 1         527 _diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
285              
286 1         3 return 0;
287             }
288              
289 3     3 0 2582 sub is_within_latin1 { goto &is_within_latin_1 }
290              
291             =back
292              
293             Simply check if a scalar is or isn't flagged as utf8 by perl's
294             internals:
295              
296             =over
297              
298             =item is_flagged_utf8($string, $name)
299              
300             Passes if the string is flagged by perl's internals as utf8, fails if
301             it's not.
302              
303             =cut
304              
305             sub is_flagged_utf8
306             {
307 3     3 1 1708 my $string = shift;
308 3   100     14 my $name = shift || "flagged as utf8";
309 3         18 return _ok(Encode::is_utf8($string),$name);
310             }
311              
312             =item isnt_flagged_utf8($string,$name)
313              
314             The opposite of C<is_flagged_utf8>, passes if and only if the string
315             isn't flagged as utf8 by perl's internals.
316              
317             Note: you can refer to this function as C<isn't_flagged_utf8> if you
318             really want to.
319              
320             =cut
321              
322             sub isnt_flagged_utf8($;$)
323             {
324 3     3 1 2648 my $string = shift;
325 3   100     14 my $name = shift || "not flagged as utf8";
326 3         11 return _ok(!Encode::is_utf8($string), $name);
327             }
328              
329             sub isn::t_flagged_utf8($;$)
330             {
331 3     3   2608 my $string = shift;
332 3   100     18 my $name = shift || "not flagged as utf8";
333 3         12 return _ok(!Encode::is_utf8($string), $name);
334             }
335              
336             =back
337              
338             =head1 AUTHOR
339              
340             Written by Mark Fowler B<mark@twoshortplanks.com>
341              
342             =head1 COPYRIGHT
343              
344             Copyright Mark Fowler 2004,2012. All rights reserved.
345              
346             This program is free software; you can redistribute it
347             and/or modify it under the same terms as Perl itself.
348              
349             =head1 BUGS
350              
351             None known. Please report any to me via the CPAN RT system. See
352             http://rt.cpan.org/ for more details.
353              
354             =head1 SEE ALSO
355              
356             L<Test::DoubleEncodedEntities> for testing for double encoded HTML
357             entities.
358              
359             =cut
360              
361             ##########
362              
363             # shortcuts for Test::Builder.
364              
365 7     7   14721 use Test::Builder;
  7         20  
  7         1893  
366             my $tester = Test::Builder->new();
367              
368             sub _ok
369             {
370 34     34   64 local $Test::Builder::Level = $Test::Builder::Level + 1;
371 34         138 return $tester->ok(@_)
372             }
373             sub _diag
374             {
375 11     11   28 local $Test::Builder::Level = $Test::Builder::Level + 1;
376 11         36 $tester->diag(@_);
377 11         766 return;
378             }
379              
380             sub _fail
381             {
382 4     4   16 local $Test::Builder::Level = $Test::Builder::Level + 1;
383 4         15 return _ok(0,@_)
384             }
385              
386             sub _pass
387             {
388 10     10   25 local $Test::Builder::Level = $Test::Builder::Level + 1;
389 10         30 return _ok(1,@_)
390             }
391              
392              
393             1;
394              
395