File Coverage

blib/lib/Search/Tools/UTF8.pm
Criterion Covered Total %
statement 68 86 79.0
branch 21 36 58.3
condition 12 16 75.0
subroutine 15 15 100.0
pod 7 7 100.0
total 123 160 76.8


line stmt bran cond sub pod time code
1             package Search::Tools::UTF8;
2 39     39   66142 use strict;
  39         47  
  39         922  
3 39     39   112 use warnings;
  39         40  
  39         708  
4 39     39   110 use Carp;
  39         40  
  39         1765  
5 39     39   3497 use Search::Tools; # XS stuff
  39         40  
  39         801  
6 39     39   14208 use Encode;
  39         209268  
  39         2359  
7 39     39   16651 use charnames ':full';
  39         875096  
  39         233  
8 39     39   10162 use Data::Dump qw( dump );
  39         35106  
  39         2136  
9 39     39   163 use base qw( Exporter );
  39         43  
  39         44051  
10             our @EXPORT = qw(
11             to_utf8
12             is_valid_utf8
13             is_flagged_utf8
14             is_perl_utf8_string
15             is_ascii
16             is_latin1
17             is_sane_utf8
18             find_bad_utf8
19             find_bad_ascii
20             find_bad_latin1
21             find_bad_latin1_report
22             byte_length
23             looks_like_cp1252
24             fix_cp1252_codepoints_in_utf8
25             debug_bytes
26             );
27              
28             our $Debug = ( $ENV{PERL_DEBUG} && $ENV{PERL_DEBUG} > 2 ) ? 1 : 0;
29              
30             our $VERSION = '1.004';
31              
32             sub to_utf8 {
33 604     604 1 2312 my $str = shift;
34 604 50       1000 Carp::cluck("\$str is undefined") unless defined $str;
35              
36 604   100     1229 my $charset = shift || 'iso-8859-1';
37              
38             # checks first
39 604 100       1000 if ( is_flagged_utf8($str) ) {
40 393 50       559 $Debug and carp "string '$str' is flagged utf8 already";
41 393         1052 return $str;
42             }
43 211 100       994 if ( is_ascii($str) ) {
44 193         488 Encode::_utf8_on($str);
45 193 50       329 $Debug and carp "string '$str' is ascii; utf8 flag turned on";
46 193         1161 return $str;
47             }
48 18 100       53 if ( is_valid_utf8($str) ) {
49              
50             # we got here only because the flag was off and it wasn't ascii.
51             # however, is_valid_utf8() claims that it is valid internal UTF8,
52             # so just turn the flag on.
53 6         18 Encode::_utf8_on($str);
54 6 50       15 $Debug and carp "string '$str' is valid utf8; utf8 flag turned on";
55 6         17 return $str;
56             }
57              
58             $Debug
59 12 50       31 and carp "converting $str from $charset -> utf8";
60 12         55 my $c = Encode::decode( $charset, $str );
61 12 50       957 $Debug and carp "converted $c";
62              
63 12 50       38 unless ( is_sane_utf8( $c, 1 ) ) {
64 0         0 carp "not sane: $c";
65             }
66              
67 12         108 return $c;
68             }
69              
70             sub is_flagged_utf8 {
71 611     611 1 1944 return Encode::is_utf8( $_[0] );
72             }
73              
74             my $re_bit = join "|",
75             map { Encode::encode( "utf8", chr($_) ) } ( 127 .. 255 );
76              
77             #binmode STDERR, ":utf8";
78             #print STDERR $re_bit;
79              
80             sub is_sane_utf8 {
81 15     15 1 30 my $string = shift;
82 15   50     55 my $warnings = shift || $Debug || 0;
83              
84 15         21 my $is_insane = 0;
85 15         2268 while ( $string =~ /($re_bit)/go ) {
86              
87             # work out what the double encoded string was
88 0         0 my $bytes = $1;
89              
90 0         0 my $index = $+[0] - length($bytes);
91 0         0 my $codes = join '', map { sprintf '<%00x>', ord($_) } split //,
  0         0  
92             $bytes;
93              
94             # what character does that represent?
95 0         0 my $char = Encode::decode( "utf8", $bytes );
96 0         0 my $ord = ord($char);
97 0         0 my $hex = sprintf '%00x', $ord;
98 0         0 $char = charnames::viacode($ord);
99              
100             # print out diagnostic messages
101 0 0       0 if ($warnings) {
102              
103 0         0 warn(qq{Found dodgy chars "$codes" at char $index\n});
104 0 0       0 if ( Encode::is_utf8($string) ) {
105 0         0 warn("Chars in utf8 string look like utf8 byte sequence.");
106             }
107             else {
108 0         0 warn("String not flagged as utf8...was it meant to be?\n");
109             }
110 0         0 warn(
111             "Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n"
112             );
113              
114             }
115 0         0 $is_insane++;
116             }
117              
118 15 50       186 return $is_insane ? 0 : 1;
119             }
120              
121             sub is_valid_utf8 {
122 39 100 100 39 1 6046 if ( is_latin1( $_[0] )
      100        
123             && !is_ascii( $_[0] )
124             && !is_perl_utf8_string( $_[0] ) )
125             {
126 8         26 return 0;
127             }
128 31         124 return is_perl_utf8_string( $_[0] );
129             }
130              
131             sub find_bad_latin1_report {
132 2     2 1 164 my $bad = find_bad_latin1(@_);
133 2 50       5 if ($bad) {
134              
135             # explain why we failed
136 2         4 my $char = substr( $_[0], $bad, 1 );
137 2         2 my $dec = ord($char);
138 2         6 my $hex = sprintf '%x', $dec;
139 2         252 carp("byte $bad ($char) is not Latin1 (it's $dec dec / $hex hex)");
140             }
141 2         7 return $bad;
142             }
143              
144             sub looks_like_cp1252 {
145 6 100 33 6 1 66 if ( !is_latin1( $_[0] )
      66        
146             && !is_ascii( $_[0] )
147             && $_[0] =~ m/[\x80-\x9f]/ )
148             {
149 5         21 return 1;
150             }
151 1         3 return 0;
152             }
153              
154             my %win1252 = (
155             "\x80" => "\x{20AC}", #EURO SIGN
156             "\x81" => '', #UNDEFINED
157             "\x82" => "\x{201A}", #SINGLE LOW-9 QUOTATION MARK
158             "\x83" => "\x{0192}", #LATIN SMALL LETTER F WITH HOOK
159             "\x84" => "\x{201E}", #DOUBLE LOW-9 QUOTATION MARK
160             "\x85" => "\x{2026}", #HORIZONTAL ELLIPSIS
161             "\x86" => "\x{2020}", #DAGGER
162             "\x87" => "\x{2021}", #DOUBLE DAGGER
163             "\x88" => "\x{02C6}", #MODIFIER LETTER CIRCUMFLEX ACCENT
164             "\x89" => "\x{2030}", #PER MILLE SIGN
165             "\x8A" => "\x{0160}", #LATIN CAPITAL LETTER S WITH CARON
166             "\x8B" => "\x{2039}", #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
167             "\x8C" => "\x{0152}", #LATIN CAPITAL LIGATURE OE
168             "\x8D" => '', #UNDEFINED
169             "\x8E" => "\x{017D}", #LATIN CAPITAL LETTER Z WITH CARON
170             "\x8F" => '', #UNDEFINED
171             "\x90" => '', #UNDEFINED
172             "\x91" => "\x{2018}", #LEFT SINGLE QUOTATION MARK
173             "\x92" => "\x{2019}", #RIGHT SINGLE QUOTATION MARK
174             "\x93" => "\x{201C}", #LEFT DOUBLE QUOTATION MARK
175             "\x94" => "\x{201D}", #RIGHT DOUBLE QUOTATION MARK
176             "\x95" => "\x{2022}", #BULLET
177             "\x96" => "\x{2013}", #EN DASH
178             "\x97" => "\x{2014}", #EM DASH
179             "\x98" => "\x{02DC}", #SMALL TILDE
180             "\x99" => "\x{2122}", #TRADE MARK SIGN
181             "\x9A" => "\x{0161}", #LATIN SMALL LETTER S WITH CARON
182             "\x9B" => "\x{203A}", #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
183             "\x9C" => "\x{0153}", #LATIN SMALL LIGATURE OE
184             "\x9D" => '', #UNDEFINED
185             "\x9E" => "\x{017E}", #LATIN SMALL LETTER Z WITH CARON
186             "\x9F" => "\x{0178}", #LATIN CAPITAL LETTER Y WITH DIAERESIS
187              
188             );
189              
190             # fix_latin (used in Transliterate) lacks the check for the
191             # prefixed \xc2 byte, but the UTF-8 encoding for these
192             # Windows codepoints has the leading \xc2 byte.
193             sub fix_cp1252_codepoints_in_utf8 {
194 1     1 1 170 my $buf = shift;
195 1 50       2 unless ( is_valid_utf8($buf) ) {
196 0         0 my $badbyte = find_bad_utf8($buf);
197 0         0 croak "bad UTF-8 byte(s) at $badbyte [ " . dump($buf) . " ]";
198             }
199 1 50       2 $Debug and warn "converting $buf\n";
200 1         2 my $bytes = Encode::encode_utf8( to_utf8($buf) );
201 1         10 $bytes =~ s/\xc2([\x80-\x9f])/$win1252{$1}/g;
202 1         7 return to_utf8($bytes);
203             }
204              
205             1;
206              
207             __END__