File Coverage

blib/lib/Locale/Recode/_Conversions.pm
Criterion Covered Total %
statement 51 61 83.6
branch 31 46 67.3
condition 6 12 50.0
subroutine 6 7 85.7
pod 0 4 0.0
total 94 130 72.3


line stmt bran cond sub pod time code
1             #! /bin/false
2             # vim: set autoindent shiftwidth=4 tabstop=4:
3              
4             # List of internally known conversions.
5             # Copyright (C) 2002-2017 Guido Flohr <guido.flohr@cantanea.com>,
6             # all rights reserved.
7              
8             # This program is free software: you can redistribute it and/or modify
9             # it under the terms of the GNU General Public License as published by
10             # the Free Software Foundation; either version 3 of the License, or
11             # (at your option) any later version.
12              
13             # This program is distributed in the hope that it will be useful,
14             # but WITHOUT ANY WARRANTY; without even the implied warranty of
15             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16             # GNU General Public License for more details.
17              
18             # You should have received a copy of the GNU General Public License
19             # along with this program. If not, see <http://www.gnu.org/licenses/>.
20              
21             package Locale::Recode::_Conversions;
22              
23 161     161   1146 use strict;
  161         331  
  161         4785  
24 161     161   78026 use integer;
  161         2354  
  161         887  
25              
26 161     161   5514 use vars qw ($conversions $optional_conversions);
  161         310  
  161         159278  
27              
# Built-in conversions that are always available.
#
# Keys are the canonical (upper-case) encoding names, values are the
# identifiers of the conversion modules that handle them.  isSupported()
# may overwrite an entry with '_Encode' once Encode has been found to
# support the same encoding.
$conversions = {
    'ASMO_449'         => 'ASMO_449',
    'ATARI-ST-EURO'    => 'ATARI_ST_EURO',
    'ATARI-ST'         => 'ATARI_ST',
    'CP10007'          => 'CP10007',
    'CSN_369103'       => 'CSN_369103',
    'CWI'              => 'CWI',
    'DEC-MCS'          => 'DEC_MCS',
    'EBCDIC-AT-DE-A'   => 'EBCDIC_AT_DE_A',
    'EBCDIC-AT-DE'     => 'EBCDIC_AT_DE',
    'EBCDIC-CA-FR'     => 'EBCDIC_CA_FR',
    'EBCDIC-DK-NO-A'   => 'EBCDIC_DK_NO_A',
    'EBCDIC-DK-NO'     => 'EBCDIC_DK_NO',
    'EBCDIC-ES-A'      => 'EBCDIC_ES_A',
    'EBCDIC-ES-S'      => 'EBCDIC_ES_S',
    'EBCDIC-ES'        => 'EBCDIC_ES',
    'EBCDIC-FI-SE-A'   => 'EBCDIC_FI_SE_A',
    'EBCDIC-FI-SE'     => 'EBCDIC_FI_SE',
    'EBCDIC-FR'        => 'EBCDIC_FR',
    'EBCDIC-IS-FRISS'  => 'EBCDIC_IS_FRISS',
    'EBCDIC-IT'        => 'EBCDIC_IT',
    'EBCDIC-PT'        => 'EBCDIC_PT',
    'EBCDIC-UK'        => 'EBCDIC_UK',
    'EBCDIC-US'        => 'EBCDIC_US',
    'ECMA-CYRILLIC'    => 'ECMA_CYRILLIC',
    'GEORGIAN-ACADEMY' => 'GEORGIAN_ACADEMY',
    'GEORGIAN-PS'      => 'GEORGIAN_PS',
    'GOST_19768-74'    => 'GOST_19768_74',
    'GREEK-CCITT'      => 'GREEK_CCITT',
    'GREEK7-OLD'       => 'GREEK7_OLD',
    'GREEK7'           => 'GREEK7',
    'HP-ROMAN8'        => 'HP_ROMAN8',
    'IBM037'           => 'IBM037',
    'IBM038'           => 'IBM038',
    'IBM1004'          => 'IBM1004',
    'IBM1026'          => 'IBM1026',
    'IBM1047'          => 'IBM1047',
    'IBM256'           => 'IBM256',
    'IBM273'           => 'IBM273',
    'IBM274'           => 'IBM274',
    'IBM275'           => 'IBM275',
    'IBM277'           => 'IBM277',
    'IBM278'           => 'IBM278',
    'IBM280'           => 'IBM280',
    'IBM281'           => 'IBM281',
    'IBM284'           => 'IBM284',
    'IBM285'           => 'IBM285',
    'IBM290'           => 'IBM290',
    'IBM297'           => 'IBM297',
    'IBM420'           => 'IBM420',
    'IBM423'           => 'IBM423',
    'IBM424'           => 'IBM424',
    'IBM437'           => 'IBM437',
    'IBM500'           => 'IBM500',
    'IBM850'           => 'IBM850',
    'IBM851'           => 'IBM851',
    'IBM852'           => 'IBM852',
    'IBM855'           => 'IBM855',
    'IBM857'           => 'IBM857',
    'IBM860'           => 'IBM860',
    'IBM861'           => 'IBM861',
    'IBM862'           => 'IBM862',
    'IBM863'           => 'IBM863',
    'IBM864'           => 'IBM864',
    'IBM865'           => 'IBM865',
    'IBM866'           => 'IBM866',
    'IBM868'           => 'IBM868',
    'IBM869'           => 'IBM869',
    'IBM870'           => 'IBM870',
    'IBM871'           => 'IBM871',
    'IBM874'           => 'IBM874',
    'IBM875'           => 'IBM875',
    'IBM880'           => 'IBM880',
    'IBM891'           => 'IBM891',
    'IBM903'           => 'IBM903',
    'IBM904'           => 'IBM904',
    'IBM905'           => 'IBM905',
    'IBM918'           => 'IBM918',
    'IEC_P27-1'        => 'IEC_P27_1',
    'INIS-8'           => 'INIS_8',
    'INIS-CYRILLIC'    => 'INIS_CYRILLIC',
    'INIS'             => 'INIS',
    'ISO-8859-1'       => 'ISO_8859_1',
    'ISO-8859-10'      => 'ISO_8859_10',
    'ISO-8859-11'      => 'ISO_8859_11',
    'ISO-8859-13'      => 'ISO_8859_13',
    'ISO-8859-14'      => 'ISO_8859_14',
    'ISO-8859-15'      => 'ISO_8859_15',
    'ISO-8859-16'      => 'ISO_8859_16',
    'ISO-8859-2'       => 'ISO_8859_2',
    'ISO-8859-3'       => 'ISO_8859_3',
    'ISO-8859-4'       => 'ISO_8859_4',
    'ISO-8859-5'       => 'ISO_8859_5',
    'ISO-8859-6'       => 'ISO_8859_6',
    'ISO-8859-7'       => 'ISO_8859_7',
    'ISO-8859-8'       => 'ISO_8859_8',
    'ISO-8859-9'       => 'ISO_8859_9',
    'ISO_10367-BOX'    => 'ISO_10367_BOX',
    'ISO_2033-1983'    => 'ISO_2033_1983',
    'ISO_5427-EXT'     => 'ISO_5427_EXT',
    'ISO_5427'         => 'ISO_5427',
    'ISO_5428'         => 'ISO_5428',
    'KOI-8'            => 'KOI_8',
    'KOI8-R'           => 'KOI8_R',
    'KOI8-RU'          => 'KOI8_RU',
    'KOI8-T'           => 'KOI8_T',
    'KOI8-U'           => 'KOI8_U',
    'LATIN-GREEK-1'    => 'LATIN_GREEK_1',
    'LATIN-GREEK'      => 'LATIN_GREEK',
    'MACINTOSH'        => 'MACINTOSH',
    'MACARABIC'        => 'MACARABIC',
    'MACCYRILLIC'      => 'MACCYRILLIC',
    'MACCROATIAN'      => 'MACCROATIAN',
    'MACGREEK'         => 'MACGREEK',
    'MACHEBREW'        => 'MACHEBREW',
    'MACICELAND'       => 'MACICELAND',
    'MACROMANIA'       => 'MACROMANIA',
    'MACTHAI'          => 'MACTHAI',
    'MACTURKISH'       => 'MACTURKISH',
    'MACUKRAINE'       => 'MACUKRAINE',
    'MAC-IS'           => 'MAC_IS',
    'MAC-SAMI'         => 'MAC_SAMI',
    'MAC-UK'           => 'MAC_UK',
    'NATS-DANO'        => 'NATS_DANO',
    'NATS-SEFI'        => 'NATS_SEFI',
    'NEXTSTEP'         => 'NEXTSTEP',
    'TIS-620'          => 'TIS_620',
    'UTF-8'            => 'UTF_8',
    'VISCII'           => 'VISCII',
    'WIN-SAMI-2'       => 'SAMI_WS2',
    'WINDOWS-1250'     => 'CP1250',
    'WINDOWS-1251'     => 'CP1251',
    'WINDOWS-1252'     => 'CP1252',
    'WINDOWS-1253'     => 'CP1253',
    'WINDOWS-1254'     => 'CP1254',
    'WINDOWS-1256'     => 'CP1256',
    'WINDOWS-1257'     => 'CP1257',
    'US-ASCII'         => 'US_ASCII',
};
168              
# Encodings that may additionally be available via Encode(3pm).
#
# Only the keys matter; every value is undef.  isSupported() removes a key
# once Encode has been probed for it.
#
# Deliberately NOT listed, because Encode's tables were found wanting:
#   HP-ROMAN8    mapping from 0xef to 0xff missing.
#   ISO-8859-11  broken in some versions of Encode.
#   ISO-8859-16  errors at 0xa5 and 0xab.
#   ISO-8859-6   uses arabic digits in ascii range?!
#   ISO-8859-7   0xa1 and 0xa2 are incorrectly encoded.
#   ISO-8859-8   0xfd and 0xfe are missing.
#   KOI8-U       0x95 is BULLET, not BULLET OPERATOR.
#   MACINTOSH    seems to be messed up in certain Encode versions.
#   NEXTSTEP     completely broken in the original author's version of Encode.
#   VISCII       0x86 is missing, 0xa6 is incorrectly encoded.
# TODO: Check other Mac encodings for correctness.
$optional_conversions = {
    map { $_ => undef } qw(
        BIG5 BIG5-HKSCS CN-GB CN-GB-ISOIR165
        CP1006 CP1026 CP1047 CP1361 CP949 CP37 CP424 CP500
        CP737 CP775 CP856 CP874 CP875 CP932 CP936 CP950
        EUC-JP EUC-KR EUC-TW
        GB18030 HZ
        IBM437 IBM850 IBM852 IBM855 IBM857 IBM860 IBM861
        IBM862 IBM863 IBM864 IBM865 IBM866 IBM869
        ISO-10646-UCS-2 ISO-10646-UCS-4
        ISO-2022-JP ISO-2022-JP-1 ISO-2022-KR
        ISO-8859-1 ISO-8859-10 ISO-8859-13 ISO-8859-14 ISO-8859-15
        ISO-8859-2 ISO-8859-3 ISO-8859-4 ISO-8859-5 ISO-8859-9
        ISO-IR-149 KOI8-R SHIFT_JIS
        UCS-2BE UCS-2LE UCS-4BE UCS-4LE
        US-ASCII
        UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE UTF-8
        WINDOWS-1250 WINDOWS-1251 WINDOWS-1252 WINDOWS-1253 WINDOWS-1254
        WINDOWS-1255 WINDOWS-1256 WINDOWS-1257 WINDOWS-1258
    )
};
270              
271             my $has_encode; # Cached Encode probe result: undef until isSupported() first tries to load Encode, then true/false.
272              
# Map an encoding name or alias to its canonical upper-case name.
#
# Called as a class method; the invocant is ignored.  The name is
# upper-cased and looked up first among the built-in conversions, then
# among the optional ones.  Only when both lookups fail is
# Locale::Recode::_Aliases loaded and its ALIASES() table consulted.
#
# Returns the canonical name, or nothing (undef in scalar context) when
# the name is not known.
sub resolveAlias
{
    my (undef, $encoding) = @_;

    my $name = uc $encoding;

    # Canonical names resolve to themselves.
    foreach my $table ($conversions, $optional_conversions) {
        return $name if exists $table->{$name};
    }

    # The alias table is loaded on demand only.
    require Locale::Recode::_Aliases;

    my $canonical = Locale::Recode::_Aliases::ALIASES()->{$name};
    return unless $canonical;

    return $canonical;
}
290              
# Report which conversion module handles an encoding.
#
# $encoding may be a canonical name or an alias, in any case.  Returns the
# module identifier stored in $conversions for it ('_Encode' for encodings
# served by Encode), or nothing when the name is undefined, empty or unknown.
#
# Side effects: the first lookup of an optional encoding tries to load
# Encode; the outcome is cached in $has_encode.  While Encode is available,
# each optional encoding is probed at most once: it is removed from
# $optional_conversions and, if Encode accepts it, registered in
# $conversions as '_Encode' (replacing a built-in entry of the same name).
sub isSupported
{
    my ($class, $encoding) = @_;

    return unless defined $encoding && length $encoding;

    my $mimename = $class->resolveAlias (uc $encoding);
    return unless $mimename;

    # Determine the correct module.
    if (exists $optional_conversions->{$mimename}) {
        unless (defined $has_encode) {
            # Test eval's own return value; $@ alone is not a reliable
            # success indicator.
            $has_encode = eval { require Encode; 1 } ? 1 : 0;

            if ($has_encode) {
                require Encode::Alias;

                # Add missing real names.
                Encode::Alias::define_alias (MS_KANJI => 'ShiftJIS');
                Encode::Alias::define_alias ('CN-GB' => 'EUC-CN');
            }
        }

        if ($has_encode) {
            # Now check whether Encode really supports that encoding.  The
            # name is passed as data to a block eval instead of being
            # spliced into a string of Perl source.
            my $probe = 'x';
            if (eval { Encode::encode ($mimename, $probe); 1 }) {
                $conversions->{$mimename} = '_Encode';
            }

            # Probed once, successfully or not; never try again.
            delete $optional_conversions->{$mimename};
        }
    }

    return $conversions->{$mimename} if exists $conversions->{$mimename};

    return;
}
332              
# List every encoding that can currently be converted.
#
# Each encoding still pending in $optional_conversions is first run through
# isSupported(), which may register it in $conversions.  Returns the keys of
# $conversions afterwards (their number in scalar context).
sub listSupported
{
    my ($class) = @_;

    # Snapshot the keys: isSupported() deletes from the hash as it goes.
    my @pending = keys %$optional_conversions;
    foreach my $name (@pending) {
        $class->isSupported ($name);
    }

    return keys %$conversions;
}
344              
# Find a conversion path between two encodings.
#
# $from and $to are encoding names or aliases; the pseudo encoding
# 'INTERNAL' (any case) is accepted for either.  Returns
#   - nothing, if a name cannot be resolved or has no conversion module,
#   - a reference to an empty array, if both resolve to the same encoding,
#   - otherwise a reference to an array of [ module, from, to ] steps: one
#     step when a single module can do the job, else two steps that go
#     through 'INTERNAL'.
sub findPath
{
    my ($class, $from, $to) = @_;

    # Normalise both names in place; the loop variable aliases $from and $to.
    foreach my $name ($from, $to) {
        $name = 'INTERNAL' eq uc ($name)
            ? 'INTERNAL'
            : $class->resolveAlias ($name);
    }

    return unless $from && $to;

    return [] if $from eq $to;

    my $from_module = $class->isSupported ($from);
    my $to_module = $class->isSupported ($to);

    # A side without a module is only acceptable when it is INTERNAL, in
    # which case the module of the other side is used.
    unless ($from_module) {
        return unless 'INTERNAL' eq $from && $to_module;
        $from_module = $to_module;
    }

    unless ($to_module) {
        return unless 'INTERNAL' eq $to && $from_module;
        $to_module = $from_module;
    }

    my $single_step = $from_module eq $to_module
        || $to eq 'INTERNAL'
        || $to eq 'UTF-8';

    return [[ $from_module, $from, $to ]] if $single_step;

    return [[ $to_module, $from, $to ]] if $from eq 'INTERNAL';

    return [
        [ $from_module, $from, 'INTERNAL' ],
        [ $to_module, 'INTERNAL', $to ],
    ];
}
387              
388             # TODO: check for
389             # 7bit-jis
390             # AdobeStandardEncoding
391             # AdobeSymbol
392             # AdobeZdingbat
393             # ascii-ctrl
394             # big5ext
395             # big5plus
396             # cccii
397             # cns11643-1
398             # cns11643-2
399             # cns11643-3
400             # cns11643-4
401             # cns11643-5
402             # cns11643-6
403             # cns11643-7
404             # cns11643-f
405             # dingbats
406             # gb12345-raw
407             # gb2312-raw
408             # gsm0338
409             # jis0201-raw
410             # jis0208-raw
411             # jis0212-raw
412             # koi8-f
413             # MIME-B
414             # MIME-Header
415             # MIME-Q
416             # posix-bc
417             # symbol
418             # unisys
419              
420             1;
421              
422             __END__