File Coverage

blib/lib/Encode/Guess.pm
Criterion Covered Total %
statement 105 110 95.4
branch 44 56 78.5
condition 10 20 50.0
subroutine 14 16 87.5
pod 7 7 100.0
total 180 209 86.1


line stmt bran cond sub pod time code
1             package Encode::Guess;
2 1     1   26114 use strict;
  1         4  
  1         39  
3 1     1   8 use warnings;
  1         4  
  1         47  
4 1     1   10 use Encode qw(:fallbacks find_encoding);
  1         4  
  1         289  
5             our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
6              
7             my $Canon = 'Guess';
8 1     1   17 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
  1         3  
  1         175  
9             our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
10             my $obj = bless {
11             Name => $Canon,
12             Suspects => {%DEF_SUSPECTS},
13             } => __PACKAGE__;
14             Encode::define_encoding($obj, $Canon);
15              
16 1     1   9 use parent qw(Encode::Encoding);
  1         3  
  1         16  
17 0     0 1 0 sub needs_lines { 1 }
18 0     0 1 0 sub perlio_ok { 0 }
19              
20             our @EXPORT = qw(guess_encoding);
21             our $NoUTFAutoGuess = 0;
22             our $UTF8_BOM = pack( "C3", 0xef, 0xbb, 0xbf );
23              
24             sub import { # Exporter not used so we do it on our own
25 1     1   17 my $callpkg = caller;
26 1         5 for my $item (@EXPORT) {
27 1     1   218 no strict 'refs';
  1         3  
  1         742  
28 1         3 *{"$callpkg\::$item"} = \&{"$item"};
  1         10  
  1         6  
29             }
30 1         6 set_suspects(@_);
31             }
32              
33             sub set_suspects {
34 4     4 1 2594 my $class = shift;
35 4 50       26 my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
36 4         37 $self->{Suspects} = {%DEF_SUSPECTS};
37 4         36 $self->add_suspects(@_);
38             }
39              
40             sub add_suspects {
41 4     4 1 13 my $class = shift;
42 4 50       23 my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
43 4         34 for my $c (@_) {
44 7 50       35 my $e = find_encoding($c) or die "Unknown encoding: $c";
45 7         74 $self->{Suspects}{ $e->name } = $e;
46 7         35 DEBUG and warn "Added: ", $e->name;
47             }
48             }
49              
50             sub decode($$;$) {
51 2     2 1 9 my ( $obj, $octet, $chk ) = @_;
52 2         8 my $guessed = guess( $obj, $octet );
53 2 50       11 unless ( ref($guessed) ) {
54 0         0 require Carp;
55 0         0 Carp::croak($guessed);
56             }
57 2   50     2775 my $utf8 = $guessed->decode( $octet, $chk || 0 );
58 2 50       14 $_[1] = $octet if $chk;
59 2         34 return $utf8;
60             }
61              
62             sub guess_encoding {
63 27     27 1 1348 guess( $Encode::Encoding{$Canon}, @_ );
64             }
65              
66             sub guess {
67 29     29 1 62 my $class = shift;
68 29 50       100 my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
69 29         66 my $octet = shift;
70              
71             # sanity check
72 29 100 66     179 return "Empty string, empty guess" unless defined $octet and length $octet;
73              
74             # cheat 0: utf8 flag;
75 28 100       134 if ( Encode::is_utf8($octet) ) {
76 1 50       8 return find_encoding('utf8') unless $NoUTFAutoGuess;
77 0         0 Encode::_utf8_off($octet);
78             }
79              
80             # cheat 1: BOM
81 1     1   529 use Encode::Unicode;
  1         4  
  1         685  
82 27 50       86 unless ($NoUTFAutoGuess) {
83 27         177 my $BOM = pack( 'C3', unpack( "C3", $octet ) );
84 27 50 33     158 return find_encoding('utf8')
85             if ( defined $BOM and $BOM eq $UTF8_BOM );
86 27         83 $BOM = unpack( 'N', $octet );
87 27 100 66     187 return find_encoding('UTF-32')
      33        
88             if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );
89 26         73 $BOM = unpack( 'n', $octet );
90 26 100 66     191 return find_encoding('UTF-16')
      33        
91             if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );
92 25 100       125 if ( $octet =~ /\x00/o )
93             { # if \x00 found, we assume UTF-(16|32)(BE|LE)
94 4         8 my $utf;
95 4         10 my ( $be, $le ) = ( 0, 0 );
96 4 100       14 if ( $octet =~ /\x00\x00/o ) { # UTF-32(BE|LE) assumed
97 2         3 $utf = "UTF-32";
98 2         9 for my $char ( unpack( 'N*', $octet ) ) {
99 100 100       171 $char & 0x0000ffff and $be++;
100 100 100       179 $char & 0xffff0000 and $le++;
101             }
102             }
103             else { # UTF-16(BE|LE) assumed
104 2         4 $utf = "UTF-16";
105 2         9 for my $char ( unpack( 'n*', $octet ) ) {
106 100 100       170 $char & 0x00ff and $be++;
107 100 100       194 $char & 0xff00 and $le++;
108             }
109             }
110 4         11 DEBUG and warn "$utf, be == $be, le == $le";
111 4 50       14 $be == $le
112             and return
113             "Encodings ambiguous between $utf BE and LE ($be, $le)";
114 4 100       13 $utf .= ( $be > $le ) ? 'BE' : 'LE';
115 4         11 return find_encoding($utf);
116             }
117             }
118 21         44 my %try = %{ $obj->{Suspects} };
  21         179  
119 21         75 for my $c (@_) {
120 7 50       28 my $e = find_encoding($c) or die "Unknown encoding: $c";
121 7         34 $try{ $e->name } = $e;
122 7         20 DEBUG and warn "Added: ", $e->name;
123             }
124 21         51 my $nline = 1;
125 21         2083 for my $line ( split /\r\n?|\n/, $octet ) {
126              
127             # cheat 2 -- \e in the string
128 69 100       209 if ( $line =~ /\e/o ) {
129 2         13 my @keys = keys %try;
130 2         11 delete @try{qw/utf8 ascii/};
131 2         6 for my $k (@keys) {
132 8 100       33 ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
133             }
134             }
135 69         252 my %ok = %try;
136              
137             # warn join(",", keys %try);
138 69         175 for my $k ( keys %try ) {
139 211         362 my $scratch = $line;
140 211         1275 $try{$k}->decode( $scratch, FB_QUIET );
141 211 100       490 if ( $scratch eq '' ) {
142 154         253 DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
143             }
144             else {
145 1     1   13 use bytes ();
  1         3  
  1         262  
146 57         92 DEBUG
147             and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
148             $nline, $k, bytes::length($scratch) );
149 57         132 delete $ok{$k};
150             }
151             }
152 69 100       191 %ok or return "No appropriate encodings found!";
153 66 100       156 if ( scalar( keys(%ok) ) == 1 ) {
154 14         39 my ($retval) = values(%ok);
155 14         282 return $retval;
156             }
157 52         174 %try = %ok;
158 52         117 $nline++;
159             }
160             $try{ascii}
161 4 50       13 or return "Encodings too ambiguous: ", join( " or ", keys %try );
162 4         30 return $try{ascii};
163             }
164              
165             1;
166             __END__