File Coverage

blib/lib/Image/ExifTool/Charset.pm

Criterion	Covered	Total	%
statement	97	174	55.7
branch	59	134	44.0
condition	12	45	26.6
subroutine	7	8	87.5
pod	0	4	0.0
total	175	365	47.9

line	stmt	bran	cond	sub	pod	time	code
1							#------------------------------------------------------------------------------
2							# File: Charset.pm
3							#
4							# Description: ExifTool character encoding routines
5							#
6							# Revisions: 2009/08/28 - P. Harvey created
7							# 2010/01/20 - P. Harvey complete re-write
8							# 2010/07/16 - P. Harvey added UTF-16 support
9							#
10							# Notes: Charset lookups are generated using my convertCharset script
11							#------------------------------------------------------------------------------
12
13							package Image::ExifTool::Charset;
14
15	37			37		322	use strict;
	37					83
	37					1576
16	37			37		230	use vars qw($VERSION %csType);
	37					77
	37					2255
17	37			37		263	use Image::ExifTool qw(:DataAccess :Utils);
	37					83
	37					16105
18
19							$VERSION = '1.11';
20
21							my %charsetTable; # character set tables we've loaded
22
23							# lookup for converting Unicode to 1-byte character sets
24							my %unicode2byte = (
25							Latin => { # pre-load Latin (cp1252) for speed
26							0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96,
27							0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97,
28							0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98,
29							0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99,
30							0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a,
31							0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b,
32							0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c,
33							0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e,
34							0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f,
35							},
36							);
37
38							# bit flags for all supported character sets
39							# (this number must be correct because it dictates the decoding algorithm!)
40							# 0x001 = character set requires a translation module
41							# 0x002 = inverse conversion not yet supported by Recompose()
42							# 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
43							# 0x100 = 1-byte fixed-width characters
44							# 0x200 = 2-byte fixed-width characters
45							# 0x400 = 4-byte fixed-width characters
46							# 0x800 = 1- and 2-byte variable-width characters, or 1-byte
47							# fixed-width characters that map into multiple codepoints
48							# Note: In its public interface, ExifTool can currently only support type 0x101
49							# and lower character sets because strings are only converted if they
50							# contain characters above 0x7f and there is no provision for specifying
51							# the byte order for input/output values
52							%csType = (
53							UTF8 => 0x100,
54							ASCII => 0x100, # (treated like UTF8)
55							Arabic => 0x101,
56							Baltic => 0x101,
57							Cyrillic => 0x101,
58							Greek => 0x101,
59							Hebrew => 0x101,
60							Latin => 0x101,
61							Latin2 => 0x101,
62							DOSLatinUS => 0x101,
63							DOSLatin1 => 0x101,
64							DOSCyrillic => 0x101,
65							MacCroatian => 0x101,
66							MacCyrillic => 0x101,
67							MacGreek => 0x101,
68							MacIceland => 0x101,
69							MacLatin2 => 0x101,
70							MacRoman => 0x101,
71							MacRomanian => 0x101,
72							MacTurkish => 0x101,
73							Thai => 0x101,
74							Turkish => 0x101,
75							Vietnam => 0x101,
76							MacArabic => 0x103, # (directional characters not supported)
77							PDFDoc => 0x181,
78							Unicode => 0x200, # (UCS2)
79							UCS2 => 0x200,
80							UTF16 => 0x200,
81							Symbol => 0x201,
82							JIS => 0x201,
83							UCS4 => 0x400,
84							MacChineseCN => 0x803,
85							MacChineseTW => 0x803,
86							MacHebrew => 0x803, # (directional characters not supported)
87							MacKorean => 0x803,
88							MacRSymbol => 0x803,
89							MacThai => 0x803,
90							MacJapanese => 0x883,
91							ShiftJIS => 0x883,
92							);
93
94							#------------------------------------------------------------------------------
95							# Load character set module
96							# Inputs: 0) Module name
97							# Returns: Reference to lookup hash, or undef on error
98							sub LoadCharset($)
99							{
100	149			149	0	329	my $charset = shift;
101	149					304	my $conv = $charsetTable{$charset};
102	149	100				374	unless ($conv) {
103							# load translation module
104	22					78	my $module = "Image::ExifTool::Charset::$charset";
105	37			37		294	no strict 'refs';
	37					121
	37					67441
106	22	50	33			1919	if (%$module or eval "require $module") {
107	22					197	$conv = $charsetTable{$charset} = \%$module;
108							}
109							}
110	149					396	return $conv;
111							}
112
113							#------------------------------------------------------------------------------
114							# Does an array contain valid UTF-16 characters?
115							# Inputs: 0) array reference to list of UCS-2 values
116							# Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates
117							sub IsUTF16($)
118							{
119	0			0	0	0	local $_;
120	0					0	my $uni = shift;
121	0					0	my $surrogate;
122	0					0	foreach (@$uni) {
123	0					0	my $hiBits = ($_ & 0xfc00);
124	0	0				0	if ($hiBits == 0xfc00) {
		0
125							# check for invalid values in UTF-16
126	0	0	0			0	return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef);
			0
			0
127							} elsif ($surrogate) {
128	0	0				0	return 0 if $hiBits != 0xdc00;
129	0					0	$surrogate = 0;
130							} else {
131	0	0				0	return 0 if $hiBits == 0xdc00;
132	0	0				0	$surrogate = 1 if $hiBits == 0xd800;
133							}
134							}
135	0	0				0	return 1 if not defined $surrogate;
136	0	0				0	return 2 unless $surrogate;
137	0					0	return 0;
138							}
139
140							#------------------------------------------------------------------------------
141							# Decompose string with specified encoding into an array of integer code points
142							# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
143							# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
144							# Returns: Reference to array of Unicode values
145							# Notes: Accepts any type of character set
146							# - byte order only used for fixed-width 2-byte and 4-byte character sets
147							# - byte order mark observed and then removed with UCS2 and UCS4
148							# - no warnings are issued if ExifTool object is not provided
149							# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
150							sub Decompose($$$;$)
151							{
152	776			776	0	1340	local $_;
153	776					1724	my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
154	776					1392	my $type = $csType{$charset};
155	776					1275	my (@uni, $conv);
156
157	776	100				3260	if ($type & 0x001) {
		100
158	51					122	$conv = LoadCharset($charset);
159	51	50				168	unless ($conv) {
160							# (shouldn't happen)
161	0	0				0	$et->Warn("Invalid character set $charset") if $et;
162	0					0	return \@uni; # error!
163							}
164							} elsif ($type == 0x100) {
165							# convert ASCII and UTF8 (treat ASCII as UTF8)
166	60	50				172	if ($] < 5.006001) {
167							# do it ourself
168	0					0	@uni = Image::ExifTool::UnpackUTF8($val);
169							} else {
170							# handle warnings from malformed UTF-8
171	60					120	undef $Image::ExifTool::evalWarning;
172	60					328	local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
173							# (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
174	60	50				532	@uni = unpack($] < 5.010000 ? 'U0U' : 'C0U', $val);
175							# issue warning if we had errors
176	60	0	33			321	if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
			33
177	0					0	$et->Warn('Malformed UTF-8 character(s)');
178	0					0	$$et{WarnBadUTF8} = 1;
179							}
180							}
181	60					235	return \@uni; # all done!
182							}
183	716	100				1946	if ($type & 0x100) { # 1-byte fixed-width characters
		100
184	45					214	@uni = unpack('C*', $val);
185	45					115	foreach (@uni) {
186	859	100				2013	$_ = $$conv{$_} if defined $$conv{$_};
187							}
188							} elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
189	665					956	my $unknown;
190	665					1090	my $byteOrder = $_[3];
191	665	100				1483	if (not $byteOrder) {
		50
192	410					1078	$byteOrder = GetByteOrder();
193							} elsif ($byteOrder eq 'Unknown') {
194	0					0	$byteOrder = GetByteOrder();
195	0					0	$unknown = 1;
196							}
197	665	100				1972	my $fmt = $byteOrder eq 'MM' ? 'n' : 'v';
198	665	50				2821	if ($type & 0x400) { # 4-byte
		50
199	0					0	$fmt = uc $fmt; # unpack as 'N' or 'V'
200							# honour BOM if it exists
201	0	0				0	$val =~ s/^(\0\0\xfe\xff\|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N' : 'V';
		0
202	0					0	undef $unknown; # (byte order logic applies to 2-byte only)
203							} elsif ($val =~ s/^(\xfe\xff\|\xff\xfe)//) {
204	0	0				0	$fmt = $1 eq "\xfe\xff" ? 'n' : 'v';
205	0					0	undef $unknown;
206							}
207							# convert from UCS2 or UCS4
208	665					2804	@uni = unpack($fmt, $val);
209
210	665	50				1570	if (not $conv) {
		0
211							# no translation necessary
212	665	50				1431	if ($unknown) {
213							# check the byte order
214	0					0	my (%bh, %bl);
215	0					0	my ($zh, $zl) = (0, 0);
216	0					0	foreach (@uni) {
217	0					0	$bh{$_ >> 8} = 1;
218	0					0	$bl{$_ & 0xff} = 1;
219	0	0				0	++$zh unless $_ & 0xff00;
220	0	0				0	++$zl unless $_ & 0x00ff;
221							}
222							# count the number of unique values in the hi and lo bytes
223	0					0	my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
224							# the byte with the greater number of unique values should be
225							# the low-order byte, otherwise the byte which is zero more
226							# often is likely the high-order byte
227	0	0	0			0	if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
			0
228							# we guessed wrong, so decode using the other byte order
229	0					0	$fmt =~ tr/nvNV/vnVN/;
230	0					0	@uni = unpack($fmt, $val);
231	0					0	$$et{WrongByteOrder} = 1;
232							}
233							}
234							# handle surrogate pairs of UTF-16
235	665	50				1686	if ($charset eq 'UTF16') {
236	0					0	my $i;
237	0					0	for ($i=0; $i<$#uni; ++$i) {
238	0	0	0			0	next unless ($uni[$i] & 0xfc00) == 0xd800 and
239							($uni[$i+1] & 0xfc00) == 0xdc00;
240	0					0	my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
241	0					0	splice(@uni, $i, 2, $cp);
242							}
243							}
244							} elsif ($unknown) {
245							# count encoding errors as we do the translation
246	0					0	my $e1 = 0;
247	0					0	foreach (@uni) {
248	0	0				0	defined $$conv{$_} and $_ = $$conv{$_}, next;
249	0					0	++$e1;
250							}
251							# try the other byte order if we had any errors
252	0	0				0	if ($e1) {
253	0	0				0	$fmt = $byteOrder eq 'MM' ? 'v' : 'n'; #(reversed)
254	0					0	my @try = unpack($fmt, $val);
255	0					0	my $e2 = 0;
256	0					0	foreach (@try) {
257	0	0				0	defined $$conv{$_} and $_ = $$conv{$_}, next;
258	0					0	++$e2;
259							}
260							# use this byte order if there are fewer errors
261	0	0				0	if ($e2 < $e1) {
262	0					0	$$et{WrongByteOrder} = 1;
263	0					0	return \@try;
264							}
265							}
266							} else {
267							# translate any characters found in the lookup
268	0					0	foreach (@uni) {
269	0	0				0	$_ = $$conv{$_} if defined $$conv{$_};
270							}
271							}
272							} else { # variable-width characters
273							# unpack into bytes
274	6					42	my @bytes = unpack('C*', $val);
275	6					22	while (@bytes) {
276	22					40	my $ch = shift @bytes;
277	22					40	my $cv = $$conv{$ch};
278							# pass straight through if no translation
279	22	50				44	$cv or push(@uni, $ch), next;
280							# byte translates into single Unicode character
281	22	100				59	ref $cv or push(@uni, $cv), next;
282							# byte maps into multiple Unicode characters
283	12	50				26	ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
284							# handle 2-byte character codes
285	12					62	$ch = shift @bytes;
286	12	50				29	if (defined $ch) {
287	12	50				32	if ($$cv{$ch}) {
288	12					22	$cv = $$cv{$ch};
289	12	50				55	ref $cv or push(@uni, $cv), next;
290	0					0	push @uni, @$cv; # multiple Unicode characters
291							} else {
292	0					0	push @uni, ord('?'); # encoding error
293	0					0	unshift @bytes, $ch;
294							}
295							} else {
296	0					0	push @uni, ord('?'); # encoding error
297							}
298							}
299							}
300	716					2040	return \@uni;
301							}
302
303							#------------------------------------------------------------------------------
304							# Convert array of code point integers into a string with specified encoding
305							# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
306							# 2) character set (note: not all types are supported)
307							# 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
308							# Returns: converted string (truncated at null character if it exists), empty on error
309							# Notes: converts elements of input character array to new code points
310							# - ExifTool ref may be undef provided $charset is defined
311							sub Recompose($$;$$)
312							{
313	799			799	0	1232	local $_;
314	799					1673	my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required)
315	799					1309	my ($outVal, $conv, $inv);
316	799	100				1716	$charset or $charset = $$et{OPTIONS}{Charset};
317	799					1392	my $csType = $csType{$charset};
318	799	100				1781	if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8)
319	667	50				1448	if ($] >= 5.006001) {
320							# let Perl do it
321	667					2715	$outVal = pack('C0U*', @$uni);
322							} else {
323							# do it ourself
324	0					0	$outVal = Image::ExifTool::PackUTF8(@$uni);
325							}
326	667					2233	$outVal =~ s/\0.*//s; # truncate at null terminator
327	667					2681	return $outVal;
328							}
329							# get references to forward and inverse lookup tables
330	132	100				374	if ($csType & 0x801) {
331	98					247	$conv = LoadCharset($charset);
332	98	50				279	unless ($conv) {
333	0	0				0	$et->Warn("Missing charset $charset") if $et;
334	0					0	return '';
335							}
336	98					203	$inv = $unicode2byte{$charset};
337							# generate inverse lookup if necessary
338	98	100				239	unless ($inv) {
339	2	50	33			21	if (not $csType or $csType & 0x802) {
340	0	0				0	$et->Warn("Invalid destination charset $charset") if $et;
341	0					0	return '';
342							}
343							# prepare table to convert from Unicode to 1-byte characters
344	2					6	my ($char, %inv);
345	2					35	foreach $char (keys %$conv) {
346	224					537	$inv{$$conv{$char}} = $char;
347							}
348	2					19	$inv = $unicode2byte{$charset} = \%inv;
349							}
350							}
351	132	100				381	if ($csType & 0x100) { # 1-byte fixed-width
352							# convert to specified character set
353	98					240	foreach (@$uni) {
354	2471	100				4254	next if $_ < 0x80;
355	450	100				928	$$inv{$_} and $_ = $$inv{$_}, next;
356							# our tables omit 1-byte characters with the same values as Unicode,
357							# so pass them straight through after making sure there isn't a
358							# different character with this byte value
359	392	100	100			724	next if $_ < 0x100 and not $$conv{$_};
360	385					480	$_ = ord('?'); # set invalid characters to '?'
361	385	100	100			1089	if ($et and not $$et{EncodingError}) {
362	3					26	$et->Warn("Some character(s) could not be encoded in $charset");
363	3					12	$$et{EncodingError} = 1;
364							}
365							}
366							# repack as an 8-bit string and truncate at null
367	98					554	$outVal = pack('C*', @$uni);
368	98					319	$outVal =~ s/\0.*//s;
369							} else { # 2-byte and 4-byte fixed-width
370							# convert if required
371	34	50				92	if ($inv) {
372	0		0			0	$$inv{$_} and $_ = $$inv{$_} foreach @$uni;
373							}
374							# generate surrogate pairs of UTF-16
375	34	50				140	if ($charset eq 'UTF16') {
376	0					0	my $i;
377	0					0	for ($i=0; $i<@$uni; ++$i) {
378	0	0	0			0	next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
379	0					0	my $t = $$uni[$i] - 0x10000;
380	0					0	my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
381	0					0	my $w2 = 0xdc00 + ($t & 0x3ff);
382	0					0	splice(@$uni, $i, 1, $w1, $w2);
383	0					0	++$i; # skip surrogate pair
384							}
385							}
386							# pack as 2- or 4-byte integer in specified byte order
387	34		66			120	my $byteOrder = $_[3] \|\| GetByteOrder();
388	34	100				109	my $fmt = $byteOrder eq 'MM' ? 'n' : 'v';
389	34	50				112	$fmt = uc($fmt) if $csType & 0x400;
390	34					194	$outVal = pack($fmt, @$uni);
391							}
392	132					613	return $outVal;
393							}
394
395							1; # end
396
397							__END__