File Coverage

blib/lib/Image/ExifTool/Charset.pm
Criterion Covered Total %
statement 97 174 55.7
branch 59 134 44.0
condition 12 45 26.6
subroutine 7 8 87.5
pod 0 4 0.0
total 175 365 47.9


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: Charset.pm
3             #
4             # Description: ExifTool character encoding routines
5             #
6             # Revisions: 2009/08/28 - P. Harvey created
7             # 2010/01/20 - P. Harvey complete re-write
8             # 2010/07/16 - P. Harvey added UTF-16 support
9             #
10             # Notes: Charset lookups are generated using my convertCharset script
11             #------------------------------------------------------------------------------
12              
13             package Image::ExifTool::Charset;
14              
15 37     37   294 use strict;
  37         107  
  37         1534  
16 37     37   223 use vars qw($VERSION %csType);
  37         86  
  37         2304  
17 37     37   253 use Image::ExifTool qw(:DataAccess :Utils);
  37         87  
  37         16195  
18              
19             $VERSION = '1.11';
20              
21             my %charsetTable; # character set tables we've loaded
22              
23             # lookup for converting Unicode to 1-byte character sets
24             my %unicode2byte = (
25             Latin => { # pre-load Latin (cp1252) for speed
26             0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96,
27             0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97,
28             0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98,
29             0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99,
30             0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a,
31             0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b,
32             0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c,
33             0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e,
34             0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f,
35             },
36             );
37              
38             # bit flags for all supported character sets
39             # (this number must be correct because it dictates the decoding algorithm!)
40             # 0x001 = character set requires a translation module
41             # 0x002 = inverse conversion not yet supported by Recompose()
42             # 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
43             # 0x100 = 1-byte fixed-width characters
44             # 0x200 = 2-byte fixed-width characters
45             # 0x400 = 4-byte fixed-width characters
46             # 0x800 = 1- and 2-byte variable-width characters, or 1-byte
47             # fixed-width characters that map into multiple codepoints
48             # Note: In its public interface, ExifTool can currently only support type 0x101
49             # and lower character sets because strings are only converted if they
50             # contain characters above 0x7f and there is no provision for specifying
51             # the byte order for input/output values
52             %csType = (
53             UTF8 => 0x100,
54             ASCII => 0x100, # (treated like UTF8)
55             Arabic => 0x101,
56             Baltic => 0x101,
57             Cyrillic => 0x101,
58             Greek => 0x101,
59             Hebrew => 0x101,
60             Latin => 0x101,
61             Latin2 => 0x101,
62             DOSLatinUS => 0x101,
63             DOSLatin1 => 0x101,
64             DOSCyrillic => 0x101,
65             MacCroatian => 0x101,
66             MacCyrillic => 0x101,
67             MacGreek => 0x101,
68             MacIceland => 0x101,
69             MacLatin2 => 0x101,
70             MacRoman => 0x101,
71             MacRomanian => 0x101,
72             MacTurkish => 0x101,
73             Thai => 0x101,
74             Turkish => 0x101,
75             Vietnam => 0x101,
76             MacArabic => 0x103, # (directional characters not supported)
77             PDFDoc => 0x181,
78             Unicode => 0x200, # (UCS2)
79             UCS2 => 0x200,
80             UTF16 => 0x200,
81             Symbol => 0x201,
82             JIS => 0x201,
83             UCS4 => 0x400,
84             MacChineseCN => 0x803,
85             MacChineseTW => 0x803,
86             MacHebrew => 0x803, # (directional characters not supported)
87             MacKorean => 0x803,
88             MacRSymbol => 0x803,
89             MacThai => 0x803,
90             MacJapanese => 0x883,
91             ShiftJIS => 0x883,
92             );
93              
94             #------------------------------------------------------------------------------
95             # Load character set module
96             # Inputs: 0) Module name
97             # Returns: Reference to lookup hash, or undef on error
98             sub LoadCharset($)
99             {
100 149     149 0 305 my $charset = shift;
101 149         348 my $conv = $charsetTable{$charset};
102 149 100       384 unless ($conv) {
103             # load translation module
104 22         74 my $module = "Image::ExifTool::Charset::$charset";
105 37     37   311 no strict 'refs';
  37         105  
  37         66708  
106 22 50 33     2732 if (%$module or eval "require $module") {
107 22         181 $conv = $charsetTable{$charset} = \%$module;
108             }
109             }
110 149         371 return $conv;
111             }
112              
113             #------------------------------------------------------------------------------
114             # Does an array contain valid UTF-16 characters?
115             # Inputs: 0) array reference to list of UCS-2 values
116             # Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates
117             sub IsUTF16($)
118             {
119 0     0 0 0 local $_;
120 0         0 my $uni = shift;
121 0         0 my $surrogate;
122 0         0 foreach (@$uni) {
123 0         0 my $hiBits = ($_ & 0xfc00);
124 0 0       0 if ($hiBits == 0xfc00) {
    0          
125             # check for invalid values in UTF-16
126 0 0 0     0 return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef);
      0        
      0        
127             } elsif ($surrogate) {
128 0 0       0 return 0 if $hiBits != 0xdc00;
129 0         0 $surrogate = 0;
130             } else {
131 0 0       0 return 0 if $hiBits == 0xdc00;
132 0 0       0 $surrogate = 1 if $hiBits == 0xd800;
133             }
134             }
135 0 0       0 return 1 if not defined $surrogate;
136 0 0       0 return 2 unless $surrogate;
137 0         0 return 0;
138             }
139              
140             #------------------------------------------------------------------------------
141             # Decompose string with specified encoding into an array of integer code points
142             # Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
143             # 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
144             # Returns: Reference to array of Unicode values
145             # Notes: Accepts any type of character set
146             # - byte order only used for fixed-width 2-byte and 4-byte character sets
147             # - byte order mark observed and then removed with UCS2 and UCS4
148             # - no warnings are issued if ExifTool object is not provided
149             # - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
150             sub Decompose($$$;$)
151             {
152 776     776 0 1311 local $_;
153 776         1718 my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
154 776         1406 my $type = $csType{$charset};
155 776         1250 my (@uni, $conv);
156              
157 776 100       2342 if ($type & 0x001) {
    100          
158 51         139 $conv = LoadCharset($charset);
159 51 50       152 unless ($conv) {
160             # (shouldn't happen)
161 0 0       0 $et->Warn("Invalid character set $charset") if $et;
162 0         0 return \@uni; # error!
163             }
164             } elsif ($type == 0x100) {
165             # convert ASCII and UTF8 (treat ASCII as UTF8)
166 60 50       185 if ($] < 5.006001) {
167             # do it ourself
168 0         0 @uni = Image::ExifTool::UnpackUTF8($val);
169             } else {
170             # handle warnings from malformed UTF-8
171 60         131 undef $Image::ExifTool::evalWarning;
172 60         347 local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
173             # (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
174 60 50       530 @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
175             # issue warning if we had errors
176 60 0 33     327 if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
      33        
177 0         0 $et->Warn('Malformed UTF-8 character(s)');
178 0         0 $$et{WarnBadUTF8} = 1;
179             }
180             }
181 60         227 return \@uni; # all done!
182             }
183 716 100       1963 if ($type & 0x100) { # 1-byte fixed-width characters
    100          
184 45         199 @uni = unpack('C*', $val);
185 45         120 foreach (@uni) {
186 859 100       2022 $_ = $$conv{$_} if defined $$conv{$_};
187             }
188             } elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
189 665         959 my $unknown;
190 665         1100 my $byteOrder = $_[3];
191 665 100       1430 if (not $byteOrder) {
    50          
192 410         1112 $byteOrder = GetByteOrder();
193             } elsif ($byteOrder eq 'Unknown') {
194 0         0 $byteOrder = GetByteOrder();
195 0         0 $unknown = 1;
196             }
197 665 100       1664 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
198 665 50       2807 if ($type & 0x400) { # 4-byte
    50          
199 0         0 $fmt = uc $fmt; # unpack as 'N*' or 'V*'
200             # honour BOM if it exists
201 0 0       0 $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*';
    0          
202 0         0 undef $unknown; # (byte order logic applies to 2-byte only)
203             } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) {
204 0 0       0 $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*';
205 0         0 undef $unknown;
206             }
207             # convert from UCS2 or UCS4
208 665         2757 @uni = unpack($fmt, $val);
209              
210 665 50       1604 if (not $conv) {
    0          
211             # no translation necessary
212 665 50       1415 if ($unknown) {
213             # check the byte order
214 0         0 my (%bh, %bl);
215 0         0 my ($zh, $zl) = (0, 0);
216 0         0 foreach (@uni) {
217 0         0 $bh{$_ >> 8} = 1;
218 0         0 $bl{$_ & 0xff} = 1;
219 0 0       0 ++$zh unless $_ & 0xff00;
220 0 0       0 ++$zl unless $_ & 0x00ff;
221             }
222             # count the number of unique values in the hi and lo bytes
223 0         0 my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
224             # the byte with the greater number of unique values should be
225             # the low-order byte, otherwise the byte which is zero more
226             # often is likely the high-order byte
227 0 0 0     0 if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
      0        
228             # we guessed wrong, so decode using the other byte order
229 0         0 $fmt =~ tr/nvNV/vnVN/;
230 0         0 @uni = unpack($fmt, $val);
231 0         0 $$et{WrongByteOrder} = 1;
232             }
233             }
234             # handle surrogate pairs of UTF-16
235 665 50       1750 if ($charset eq 'UTF16') {
236 0         0 my $i;
237 0         0 for ($i=0; $i<$#uni; ++$i) {
238 0 0 0     0 next unless ($uni[$i] & 0xfc00) == 0xd800 and
239             ($uni[$i+1] & 0xfc00) == 0xdc00;
240 0         0 my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
241 0         0 splice(@uni, $i, 2, $cp);
242             }
243             }
244             } elsif ($unknown) {
245             # count encoding errors as we do the translation
246 0         0 my $e1 = 0;
247 0         0 foreach (@uni) {
248 0 0       0 defined $$conv{$_} and $_ = $$conv{$_}, next;
249 0         0 ++$e1;
250             }
251             # try the other byte order if we had any errors
252 0 0       0 if ($e1) {
253 0 0       0 $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed)
254 0         0 my @try = unpack($fmt, $val);
255 0         0 my $e2 = 0;
256 0         0 foreach (@try) {
257 0 0       0 defined $$conv{$_} and $_ = $$conv{$_}, next;
258 0         0 ++$e2;
259             }
260             # use this byte order if there are fewer errors
261 0 0       0 if ($e2 < $e1) {
262 0         0 $$et{WrongByteOrder} = 1;
263 0         0 return \@try;
264             }
265             }
266             } else {
267             # translate any characters found in the lookup
268 0         0 foreach (@uni) {
269 0 0       0 $_ = $$conv{$_} if defined $$conv{$_};
270             }
271             }
272             } else { # variable-width characters
273             # unpack into bytes
274 6         32 my @bytes = unpack('C*', $val);
275 6         21 while (@bytes) {
276 22         37 my $ch = shift @bytes;
277 22         42 my $cv = $$conv{$ch};
278             # pass straight through if no translation
279 22 50       40 $cv or push(@uni, $ch), next;
280             # byte translates into single Unicode character
281 22 100       50 ref $cv or push(@uni, $cv), next;
282             # byte maps into multiple Unicode characters
283 12 50       32 ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
284             # handle 2-byte character codes
285 12         17 $ch = shift @bytes;
286 12 50       24 if (defined $ch) {
287 12 50       28 if ($$cv{$ch}) {
288 12         19 $cv = $$cv{$ch};
289 12 50       48 ref $cv or push(@uni, $cv), next;
290 0         0 push @uni, @$cv; # multiple Unicode characters
291             } else {
292 0         0 push @uni, ord('?'); # encoding error
293 0         0 unshift @bytes, $ch;
294             }
295             } else {
296 0         0 push @uni, ord('?'); # encoding error
297             }
298             }
299             }
300 716         2088 return \@uni;
301             }
302              
303             #------------------------------------------------------------------------------
304             # Convert array of code point integers into a string with specified encoding
305             # Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
306             # 2) character set (note: not all types are supported)
307             # 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
308             # Returns: converted string (truncated at null character if it exists), empty on error
309             # Notes: converts elements of input character array to new code points
310             # - ExifTool ref may be undef provided $charset is defined
311             sub Recompose($$;$$)
312             {
313 799     799 0 1246 local $_;
314 799         1638 my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required)
315 799         1345 my ($outVal, $conv, $inv);
316 799 100       1734 $charset or $charset = $$et{OPTIONS}{Charset};
317 799         1486 my $csType = $csType{$charset};
318 799 100       1752 if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8)
319 667 50       1663 if ($] >= 5.006001) {
320             # let Perl do it
321 667         2649 $outVal = pack('C0U*', @$uni);
322             } else {
323             # do it ourself
324 0         0 $outVal = Image::ExifTool::PackUTF8(@$uni);
325             }
326 667         1871 $outVal =~ s/\0.*//s; # truncate at null terminator
327 667         2582 return $outVal;
328             }
329             # get references to forward and inverse lookup tables
330 132 100       738 if ($csType & 0x801) {
331 98         252 $conv = LoadCharset($charset);
332 98 50       326 unless ($conv) {
333 0 0       0 $et->Warn("Missing charset $charset") if $et;
334 0         0 return '';
335             }
336 98         218 $inv = $unicode2byte{$charset};
337             # generate inverse lookup if necessary
338 98 100       242 unless ($inv) {
339 2 50 33     23 if (not $csType or $csType & 0x802) {
340 0 0       0 $et->Warn("Invalid destination charset $charset") if $et;
341 0         0 return '';
342             }
343             # prepare table to convert from Unicode to 1-byte characters
344 2         8 my ($char, %inv);
345 2         36 foreach $char (keys %$conv) {
346 224         552 $inv{$$conv{$char}} = $char;
347             }
348 2         19 $inv = $unicode2byte{$charset} = \%inv;
349             }
350             }
351 132 100       372 if ($csType & 0x100) { # 1-byte fixed-width
352             # convert to specified character set
353 98         230 foreach (@$uni) {
354 2471 100       4306 next if $_ < 0x80;
355 450 100       1053 $$inv{$_} and $_ = $$inv{$_}, next;
356             # our tables omit 1-byte characters with the same values as Unicode,
357             # so pass them straight through after making sure there isn't a
358             # different character with this byte value
359 392 100 100     739 next if $_ < 0x100 and not $$conv{$_};
360 385         537 $_ = ord('?'); # set invalid characters to '?'
361 385 100 100     1062 if ($et and not $$et{EncodingError}) {
362 3         25 $et->Warn("Some character(s) could not be encoded in $charset");
363 3         18 $$et{EncodingError} = 1;
364             }
365             }
366             # repack as an 8-bit string and truncate at null
367 98         579 $outVal = pack('C*', @$uni);
368 98         351 $outVal =~ s/\0.*//s;
369             } else { # 2-byte and 4-byte fixed-width
370             # convert if required
371 34 50       100 if ($inv) {
372 0   0     0 $$inv{$_} and $_ = $$inv{$_} foreach @$uni;
373             }
374             # generate surrogate pairs of UTF-16
375 34 50       116 if ($charset eq 'UTF16') {
376 0         0 my $i;
377 0         0 for ($i=0; $i<@$uni; ++$i) {
378 0 0 0     0 next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
379 0         0 my $t = $$uni[$i] - 0x10000;
380 0         0 my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
381 0         0 my $w2 = 0xdc00 + ($t & 0x3ff);
382 0         0 splice(@$uni, $i, 1, $w1, $w2);
383 0         0 ++$i; # skip surrogate pair
384             }
385             }
386             # pack as 2- or 4-byte integer in specified byte order
387 34   66     116 my $byteOrder = $_[3] || GetByteOrder();
388 34 100       122 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
389 34 50       106 $fmt = uc($fmt) if $csType & 0x400;
390 34         176 $outVal = pack($fmt, @$uni);
391             }
392 132         604 return $outVal;
393             }
394              
395             1; # end
396              
397             __END__