File Coverage

blib/lib/HTML/HTML5/Parser/Charset/Info.pm
Criterion Covered Total %
statement 55 110 50.0
branch 25 68 36.7
condition 12 39 30.7
subroutine 8 11 72.7
pod 0 7 0.0
total 100 235 42.5


line stmt bran cond sub pod time code
1             package HTML::HTML5::Parser::Charset::Info;
2             ## skip Test::Tabs
3 9     9   66 use strict;
  9         18  
  9         313  
4 9     9   59 use warnings;
  9         32  
  9         66439  
5             our $VERSION='0.991';
6              
7             ## TODO: Certain encodings MUST NOT be implemented [HTML5].
8              
9             ## ISSUE: Should we convert unassigned code point with trivial Unicode
10             ## mapping into U+FFFD? Or, should we return that Unicode character
11             ## with an error? (For example, Windows-1252's 0x81 should be converted
12             ## to U+FFFD or U+0081?)
13              
14             sub UNREGISTERED_CHARSET_NAME () { 0b1 }
15             ## Names for non-standard encodings/implementations for Perl encodings
16             sub REGISTERED_CHARSET_NAME () { 0b10 }
17             ## Names for standard encodings for Perl encodings
18             sub PRIMARY_CHARSET_NAME () { 0b100 }
19             ## "Name:" field for IANA names
20             ## Canonical name for Perl encodings
21             sub PREFERRED_CHARSET_NAME () { 0b1000 }
22             ## "preferred MIME name" for IANA names
23              
24             sub FALLBACK_ENCODING_IMPL () { 0b10000 }
25             ## For Perl encodings: Not a name of the encoding, the encoding
26             ## for the name might be useful as a fallback when the correct
27             ## encoding is not supported.
28             sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
29             ## For Perl encodings: Not a conforming implementation of the encoding,
30             ## though it seems that the intention was to implement that encoding.
31             sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
32             ## For Perl encodings: The implementation itself (returned by
33             ## |get_perl_encoding|) is non-conforming. The decode handle
34             ## implementation (returned by |get_decode_handle|) is conforming.
35             sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
36             ## For Perl encodings: Support error reporting via |manakai_onerror|
37             ## handler when the encoding is handled with decode handle.
38              
39             ## iana_status
40             sub STATUS_COMMON () { 0b1 }
41             sub STATUS_LIMITED_USE () { 0b10 }
42             sub STATUS_OBSOLETE () { 0b100 }
43              
44             ## category
45             sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
46             ## NOTE: Stateless
47             sub CHARSET_CATEGORY_EUCJP () { 0b10 }
48             sub CHARSET_CATEGORY_SJIS () { 0b100 }
49             sub CHARSET_CATEGORY_UTF16 () { 0b1000 }
50             ## NOTE: "A UTF-16 encoding" in HTML5.
51             sub CHARSET_CATEGORY_ASCII_COMPAT () { 0b10000 }
52             ## NOTE: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
53             ## for bytes in the range 0x09-0x0A, 0x0C-0x0D, 0x20-0x22, 0x26, 0x27,
54             ## 0x2C-0x3F, 0x41-0x5A, and 0x61-0x7A" [HTML5]
55             sub CHARSET_CATEGORY_EBCDIC () { 0b100000 }
56             ## NOTE: "based on EBCDIC" in HTML5.
57             sub CHARSET_CATEGORY_MIME_TEXT () { 0b1000000 }
58             ## NOTE: Suitable as MIME text.
59              
60             ## ISSUE: Shift_JIS is a superset of US-ASCII? ISO-2022-JP is?
61             ## ISSUE: 0x5F (_) should be added to the range?
62              
63             my $Charset; ## TODO: this is obsolete.
64              
65             our $IANACharset;
66             ## NOTE: Charset names used where IANA charset names are allowed, either
67             ## registered or not.
68             our $HTMLCharset;
69             ## NOTE: Same as charset names in $IANACharset, except all ASCII
70             ## punctuations are dropped and letters/digits only names are not included.
71              
72             $Charset->{'us-ascii'}
73             = $IANACharset->{'ansi_x3.4-1968'}
74             = $IANACharset->{'iso-ir-6'}
75             = $IANACharset->{'ansi_x3.4-1986'}
76             = $IANACharset->{'iso_646.irv:1991'}
77             = $IANACharset->{'ascii'}
78             = $IANACharset->{'iso646-us'}
79             = $IANACharset->{'us-ascii'}
80             = $IANACharset->{'us'}
81             = $IANACharset->{'ibm367'}
82             = $IANACharset->{'cp367'}
83             = $IANACharset->{'csascii'}
84             = $HTMLCharset->{'ansix341968'}
85             = $HTMLCharset->{'isoir6'}
86             = $HTMLCharset->{'ansix341986'}
87             = $HTMLCharset->{'iso646irv1991'}
88             = $HTMLCharset->{'iso646us'}
89             = $HTMLCharset->{'usascii'}
90             = __PACKAGE__->new ({
91             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
92             iana_names => {
93             'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
94             'iso-ir-6' => REGISTERED_CHARSET_NAME,
95             'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
96             'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
97             'ascii' => REGISTERED_CHARSET_NAME,
98             'iso646-us' => REGISTERED_CHARSET_NAME,
99             'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
100             'us' => REGISTERED_CHARSET_NAME,
101             'ibm367' => REGISTERED_CHARSET_NAME,
102             'cp367' => REGISTERED_CHARSET_NAME,
103             'csascii' => REGISTERED_CHARSET_NAME,
104             },
105             perl_names => {
106             'web-latin1-us-ascii' => UNREGISTERED_CHARSET_NAME |
107             SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
108             'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
109             },
110             fallback => {
111             "\x80" => "\x{20AC}",
112             "\x81" => undef,
113             "\x82" => "\x{201A}",
114             "\x83" => "\x{0192}",
115             "\x84" => "\x{201E}",
116             "\x85" => "\x{2026}",
117             "\x86" => "\x{2020}",
118             "\x87" => "\x{2021}",
119             "\x88" => "\x{02C6}",
120             "\x89" => "\x{2030}",
121             "\x8A" => "\x{0160}",
122             "\x8B" => "\x{2039}",
123             "\x8C" => "\x{0152}",
124             "\x8D" => undef,
125             "\x8E" => "\x{017D}",
126             "\x8F" => undef,
127             "\x90" => undef,
128             "\x91" => "\x{2018}",
129             "\x92" => "\x{2019}",
130             "\x93" => "\x{201C}",
131             "\x94" => "\x{201D}",
132             "\x95" => "\x{2022}",
133             "\x96" => "\x{2013}",
134             "\x97" => "\x{2014}",
135             "\x98" => "\x{02DC}",
136             "\x99" => "\x{2122}",
137             "\x9A" => "\x{0161}",
138             "\x9B" => "\x{203A}",
139             "\x9C" => "\x{0153}",
140             "\x9D" => undef,
141             "\x9E" => "\x{017E}",
142             "\x9F" => "\x{0178}",
143             "\xA0" => "\xA0", "\xA1" => "\xA1", "\xA2" => "\xA2", "\xA3" => "\xA3",
144             "\xA4" => "\xA4", "\xA5" => "\xA5", "\xA6" => "\xA6", "\xA7" => "\xA7",
145             "\xA8" => "\xA8", "\xA9" => "\xA9", "\xAA" => "\xAA", "\xAB" => "\xAB",
146             "\xAC" => "\xAC", "\xAD" => "\xAD", "\xAE" => "\xAE", "\xAF" => "\xAF",
147             "\xB0" => "\xB0", "\xB1" => "\xB1", "\xB2" => "\xB2", "\xB3" => "\xB3",
148             "\xB4" => "\xB4", "\xB5" => "\xB5", "\xB6" => "\xB6", "\xB7" => "\xB7",
149             "\xB8" => "\xB8", "\xB9" => "\xB9", "\xBA" => "\xBA", "\xBB" => "\xBB",
150             "\xBC" => "\xBC", "\xBD" => "\xBD", "\xBE" => "\xBE", "\xBF" => "\xBF",
151             "\xC0" => "\xC0", "\xC1" => "\xC1", "\xC2" => "\xC2", "\xC3" => "\xC3",
152             "\xC4" => "\xC4", "\xC5" => "\xC5", "\xC6" => "\xC6", "\xC7" => "\xC7",
153             "\xC8" => "\xC8", "\xC9" => "\xC9", "\xCA" => "\xCA", "\xCB" => "\xCB",
154             "\xCC" => "\xCC", "\xCD" => "\xCD", "\xCE" => "\xCE", "\xCF" => "\xCF",
155             "\xD0" => "\xD0", "\xD1" => "\xD1", "\xD2" => "\xD2", "\xD3" => "\xD3",
156             "\xD4" => "\xD4", "\xD5" => "\xD5", "\xD6" => "\xD6", "\xD7" => "\xD7",
157             "\xD8" => "\xD8", "\xD9" => "\xD9", "\xDA" => "\xDA", "\xDB" => "\xDB",
158             "\xDC" => "\xDC", "\xDD" => "\xDD", "\xDE" => "\xDE", "\xDF" => "\xDF",
159             "\xE0" => "\xE0", "\xE1" => "\xE1", "\xE2" => "\xE2", "\xE3" => "\xE3",
160             "\xE4" => "\xE4", "\xE5" => "\xE5", "\xE6" => "\xE6", "\xE7" => "\xE7",
161             "\xE8" => "\xE8", "\xE9" => "\xE9", "\xEA" => "\xEA", "\xEB" => "\xEB",
162             "\xEC" => "\xEC", "\xED" => "\xED", "\xEE" => "\xEE", "\xEF" => "\xEF",
163             "\xF0" => "\xF0", "\xF1" => "\xF1", "\xF2" => "\xF2", "\xF3" => "\xF3",
164             "\xF4" => "\xF4", "\xF5" => "\xF5", "\xF6" => "\xF6", "\xF7" => "\xF7",
165             "\xF8" => "\xF8", "\xF9" => "\xF9", "\xFA" => "\xFA", "\xFB" => "\xFB",
166             "\xFC" => "\xFC", "\xFD" => "\xFD", "\xFE" => "\xFE", "\xFF" => "\xFF",
167             },
168             ## NOTE: Treated as |windows-1252|. Properties of this charset
169             ## should be consistent with those of that charset.
170             });
171              
172             $Charset->{'iso-8859-1'}
173             = $IANACharset->{'iso_8859-1:1987'}
174             = $IANACharset->{'iso-ir-100'}
175             = $IANACharset->{'iso_8859-1'}
176             = $IANACharset->{'iso-8859-1'}
177             = $IANACharset->{'latin1'}
178             = $IANACharset->{'l1'}
179             = $IANACharset->{'ibm819'}
180             = $IANACharset->{'cp819'}
181             = $IANACharset->{'csisolatin1'}
182             = $HTMLCharset->{'iso885911987'}
183             = $HTMLCharset->{'isoir100'}
184             = $HTMLCharset->{'iso88591'}
185             = __PACKAGE__->new ({
186             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
187             iana_names => {
188             'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
189             'iso-ir-100' => REGISTERED_CHARSET_NAME,
190             'iso_8859-1' => REGISTERED_CHARSET_NAME,
191             'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
192             'latin1' => REGISTERED_CHARSET_NAME,
193             'l1' => REGISTERED_CHARSET_NAME,
194             'ibm819' => REGISTERED_CHARSET_NAME,
195             'cp819' => REGISTERED_CHARSET_NAME,
196             'csisolatin1' => REGISTERED_CHARSET_NAME,
197             },
198             perl_names => {
199             'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
200             ERROR_REPORTING_ENCODING_IMPL,
201             'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
202             },
203             fallback => {
204             "\x80" => "\x{20AC}",
205             "\x81" => undef,
206             "\x82" => "\x{201A}",
207             "\x83" => "\x{0192}",
208             "\x84" => "\x{201E}",
209             "\x85" => "\x{2026}",
210             "\x86" => "\x{2020}",
211             "\x87" => "\x{2021}",
212             "\x88" => "\x{02C6}",
213             "\x89" => "\x{2030}",
214             "\x8A" => "\x{0160}",
215             "\x8B" => "\x{2039}",
216             "\x8C" => "\x{0152}",
217             "\x8D" => undef,
218             "\x8E" => "\x{017D}",
219             "\x8F" => undef,
220             "\x90" => undef,
221             "\x91" => "\x{2018}",
222             "\x92" => "\x{2019}",
223             "\x93" => "\x{201C}",
224             "\x94" => "\x{201D}",
225             "\x95" => "\x{2022}",
226             "\x96" => "\x{2013}",
227             "\x97" => "\x{2014}",
228             "\x98" => "\x{02DC}",
229             "\x99" => "\x{2122}",
230             "\x9A" => "\x{0161}",
231             "\x9B" => "\x{203A}",
232             "\x9C" => "\x{0153}",
233             "\x9D" => undef,
234             "\x9E" => "\x{017E}",
235             "\x9F" => "\x{0178}",
236             },
237             ## NOTE: Treated as |windows-1252|. Properties of this charset
238             ## should be consistent with those of that charset.
239             });
240              
241             $Charset->{'iso-8859-2'}
242             = $IANACharset->{'iso_8859-2:1987'}
243             = $IANACharset->{'iso-ir-101'}
244             = $IANACharset->{'iso_8859-2'}
245             = $IANACharset->{'iso-8859-2'}
246             = $IANACharset->{'latin2'}
247             = $IANACharset->{'l2'}
248             = $IANACharset->{'csisolatin2'}
249             = $HTMLCharset->{'iso885921987'}
250             = $HTMLCharset->{'isoir101'}
251             = $HTMLCharset->{'iso88592'}
252             = __PACKAGE__->new ({
253             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
254             iana_names => {
255             'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
256             'iso-ir-101' => REGISTERED_CHARSET_NAME,
257             'iso_8859-2' => REGISTERED_CHARSET_NAME,
258             'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
259             'latin2' => REGISTERED_CHARSET_NAME,
260             'l2' => REGISTERED_CHARSET_NAME,
261             'csisolatin2' => REGISTERED_CHARSET_NAME,
262             },
263             });
264              
265             $Charset->{'iso-8859-3'}
266             = $IANACharset->{'iso_8859-3:1988'}
267             = $IANACharset->{'iso-ir-109'}
268             = $IANACharset->{'iso_8859-3'}
269             = $IANACharset->{'iso-8859-3'}
270             = $IANACharset->{'latin3'}
271             = $IANACharset->{'l3'}
272             = $IANACharset->{'csisolatin3'}
273             = $HTMLCharset->{'iso885931988'}
274             = $HTMLCharset->{'isoir109'}
275             = $HTMLCharset->{'iso88593'}
276             = __PACKAGE__->new ({
277             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
278             iana_names => {
279             'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
280             'iso-ir-109' => REGISTERED_CHARSET_NAME,
281             'iso_8859-3' => REGISTERED_CHARSET_NAME,
282             'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
283             'latin3' => REGISTERED_CHARSET_NAME,
284             'l3' => REGISTERED_CHARSET_NAME,
285             'csisolatin3' => REGISTERED_CHARSET_NAME,
286             },
287             error_level => {
288             'unassigned-code-point-error' => 'iso_shall',
289             ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
290             ## unassigned code points, but ECMA-94:1986 (whose content considered
291             ## as equivalent to ISO 8859/1-4) disallows the use of them.
292             },
293             });
294              
295             $Charset->{'iso-8859-4'}
296             = $IANACharset->{'iso_8859-4:1988'}
297             = $IANACharset->{'iso-ir-110'}
298             = $IANACharset->{'iso_8859-4'}
299             = $IANACharset->{'iso-8859-4'}
300             = $IANACharset->{'latin4'}
301             = $IANACharset->{'l4'}
302             = $IANACharset->{'csisolatin4'}
303             = $HTMLCharset->{'iso885941988'}
304             = $HTMLCharset->{'isoir110'}
305             = $HTMLCharset->{'iso88594'}
306             = __PACKAGE__->new ({
307             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
308             iana_names => {
309             'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
310             'iso-ir-110' => REGISTERED_CHARSET_NAME,
311             'iso_8859-4' => REGISTERED_CHARSET_NAME,
312             'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
313             'latin4' => REGISTERED_CHARSET_NAME,
314             'l4' => REGISTERED_CHARSET_NAME,
315             'csisolatin4' => REGISTERED_CHARSET_NAME,
316             },
317             error_level => {
318             'unassigned-code-point-error' => 'iso_shall',
319             ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
320             ## unassigned code points, but ECMA-94:1986 (whose content considered
321             ## as equivalent to ISO 8859/1-4) disallows the use of them.
322             },
323             });
324              
325             $Charset->{'iso-8859-5'}
326             = $IANACharset->{'iso_8859-5:1988'}
327             = $IANACharset->{'iso-ir-144'}
328             = $IANACharset->{'iso_8859-5'}
329             = $IANACharset->{'iso-8859-5'}
330             = $IANACharset->{'cyrillic'}
331             = $IANACharset->{'csisolatincyrillic'}
332             = $HTMLCharset->{'iso885951988'}
333             = $HTMLCharset->{'isoir144'}
334             = $HTMLCharset->{'iso88595'}
335             = __PACKAGE__->new ({
336             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
337             iana_names => {
338             'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
339             'iso-ir-144' => REGISTERED_CHARSET_NAME,
340             'iso_8859-5' => REGISTERED_CHARSET_NAME,
341             'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
342             'cyrillic' => REGISTERED_CHARSET_NAME,
343             'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
344             },
345             });
346              
347             $Charset->{'iso-8859-6'}
348             = $IANACharset->{'iso_8859-6:1987'}
349             = $IANACharset->{'iso-ir-127'}
350             = $IANACharset->{'iso_8859-6'}
351             = $IANACharset->{'iso-8859-6'}
352             = $IANACharset->{'ecma-114'}
353             = $IANACharset->{'asmo-708'}
354             = $IANACharset->{'arabic'}
355             = $IANACharset->{'csisolatinarabic'}
356             = $HTMLCharset->{'iso885961987'}
357             = $HTMLCharset->{'isoir127'}
358             = $HTMLCharset->{'iso88596'}
359             = $HTMLCharset->{'ecma114'}
360             = $HTMLCharset->{'asmo708'}
361             = __PACKAGE__->new ({
362             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
363             ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
364             ## but have same character names (maybe).
365             ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
366             ## is same as "us-ascii".
367             ## TODO: RFC 1345 def?
368             iana_names => {
369             'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
370             'iso-ir-127' => REGISTERED_CHARSET_NAME,
371             'iso_8859-6' => REGISTERED_CHARSET_NAME,
372             'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
373             'ecma-114' => REGISTERED_CHARSET_NAME,
374             'asmo-708' => REGISTERED_CHARSET_NAME,
375             'arabic' => REGISTERED_CHARSET_NAME,
376             'csisolatinarabic' => REGISTERED_CHARSET_NAME,
377             },
378             ## TODO: |error_level|
379             });
380              
381             $Charset->{'iso-8859-7'}
382             = $IANACharset->{'iso_8859-7:1987'}
383             = $IANACharset->{'iso-ir-126'}
384             = $IANACharset->{'iso_8859-7'}
385             = $IANACharset->{'iso-8859-7'}
386             = $IANACharset->{'elot_928'}
387             = $IANACharset->{'ecma-118'}
388             = $IANACharset->{'greek'}
389             = $IANACharset->{'greek8'}
390             = $IANACharset->{'csisolatingreek'}
391             = $HTMLCharset->{'iso885971987'}
392             = $HTMLCharset->{'isoir126'}
393             = $HTMLCharset->{'iso88597'}
394             = $HTMLCharset->{'elot928'}
395             = $HTMLCharset->{'ecma118'}
396             = __PACKAGE__->new ({
397             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
398             iana_names => {
399             'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
400             'iso-ir-126' => REGISTERED_CHARSET_NAME,
401             'iso_8859-7' => REGISTERED_CHARSET_NAME,
402             'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
403             'elot_928' => REGISTERED_CHARSET_NAME,
404             'ecma-118' => REGISTERED_CHARSET_NAME,
405             'greek' => REGISTERED_CHARSET_NAME,
406             'greek8' => REGISTERED_CHARSET_NAME,
407             'csisolatingreek' => REGISTERED_CHARSET_NAME,
408             },
409             ## TODO: |error_level|
410             });
411              
412             $Charset->{'iso-8859-8'}
413             = $IANACharset->{'iso_8859-8:1988'}
414             = $IANACharset->{'iso-ir-138'}
415             = $IANACharset->{'iso_8859-8'}
416             = $IANACharset->{'iso-8859-8'}
417             = $IANACharset->{'hebrew'}
418             = $IANACharset->{'csisolatinhebrew'}
419             = $HTMLCharset->{'iso885981988'}
420             = $HTMLCharset->{'isoir138'}
421             = $HTMLCharset->{'iso88598'}
422             = __PACKAGE__->new ({
423             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
424             iana_names => {
425             'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
426             'iso-ir-138' => REGISTERED_CHARSET_NAME,
427             'iso_8859-8' => REGISTERED_CHARSET_NAME,
428             'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
429             'hebrew' => REGISTERED_CHARSET_NAME,
430             'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
431             },
432             ## TODO: |error_level|
433             });
434              
435             $Charset->{'iso-8859-9'}
436             = $IANACharset->{'iso_8859-9:1989'}
437             = $IANACharset->{'iso-ir-148'}
438             = $IANACharset->{'iso_8859-9'}
439             = $IANACharset->{'iso-8859-9'}
440             = $IANACharset->{'latin5'}
441             = $IANACharset->{'l5'}
442             = $IANACharset->{'csisolatin5'}
443             = $HTMLCharset->{'iso885991989'}
444             = $HTMLCharset->{'isoir148'}
445             = $HTMLCharset->{'iso88599'}
446             = __PACKAGE__->new ({
447             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
448             iana_names => {
449             'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
450             'iso-ir-148' => REGISTERED_CHARSET_NAME,
451             'iso_8859-9' => REGISTERED_CHARSET_NAME,
452             'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
453             'latin5' => REGISTERED_CHARSET_NAME,
454             'l5' => REGISTERED_CHARSET_NAME,
455             'csisolatin5' => REGISTERED_CHARSET_NAME,
456             },
457             perl_names => {
458             'web-latin5' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
459             ERROR_REPORTING_ENCODING_IMPL,
460             'cp1254' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
461             },
462             fallback => {
463             "\x80" => "\x{20AC}",
464             "\x81" => undef,
465             "\x82" => "\x{201A}",
466             "\x83" => "\x{0192}",
467             "\x84" => "\x{201E}",
468             "\x85" => "\x{2026}",
469             "\x86" => "\x{2020}",
470             "\x87" => "\x{2021}",
471             "\x88" => "\x{02C6}",
472             "\x89" => "\x{2030}",
473             "\x8A" => "\x{0160}",
474             "\x8B" => "\x{2039}",
475             "\x8C" => "\x{0152}",
476             "\x8D" => undef,
477             "\x8E" => undef,
478             "\x8F" => undef,
479             "\x90" => undef,
480             "\x91" => "\x{2018}",
481             "\x92" => "\x{2019}",
482             "\x93" => "\x{201C}",
483             "\x94" => "\x{201D}",
484             "\x95" => "\x{2022}",
485             "\x96" => "\x{2013}",
486             "\x97" => "\x{2014}",
487             "\x98" => "\x{02DC}",
488             "\x99" => "\x{2122}",
489             "\x9A" => "\x{0161}",
490             "\x9B" => "\x{203A}",
491             "\x9C" => "\x{0153}",
492             "\x9D" => undef,
493             "\x9E" => undef,
494             "\x9F" => "\x{0178}",
495             },
496             ## NOTE: Treated as |windows-1254|. Properties of this charset
497             ## should be consistent with those of that charset.
498             });
499              
500             $Charset->{'iso-8859-10'}
501             = $IANACharset->{'iso-8859-10'}
502             = $IANACharset->{'iso-ir-157'}
503             = $IANACharset->{'l6'}
504             = $IANACharset->{'iso_8859-10:1992'}
505             = $IANACharset->{'csisolatin6'}
506             = $IANACharset->{'latin6'}
507             = $HTMLCharset->{'iso885910'}
508             = $HTMLCharset->{'isoir157'}
509             = $HTMLCharset->{'iso8859101992'}
510             = __PACKAGE__->new ({
511             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
512             iana_names => {
513             'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
514             'iso-ir-157' => REGISTERED_CHARSET_NAME,
515             'l6' => REGISTERED_CHARSET_NAME,
516             'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
517             'csisolatin6' => REGISTERED_CHARSET_NAME,
518             'latin6' => REGISTERED_CHARSET_NAME,
519             },
520             ## TODO: |error_level|
521             });
522              
523             $Charset->{'iso_6937-2-add'}
524             = $IANACharset->{'iso_6937-2-add'}
525             = $IANACharset->{'iso-ir-142'}
526             = $IANACharset->{'csisotextcomm'}
527             = $HTMLCharset->{'iso69372add'}
528             = $HTMLCharset->{'isoir142'}
529             = __PACKAGE__->new ({
530             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
531             iana_names => {
532             'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
533             'iso-ir-142' => REGISTERED_CHARSET_NAME,
534             'csisotextcomm' => REGISTERED_CHARSET_NAME,
535             },
536             ## TODO: |error_level|
537             });
538              
539             $Charset->{'jis_x0201'}
540             = $IANACharset->{'jis_x0201'}
541             = $IANACharset->{'x0201'}
542             = $IANACharset->{'cshalfwidthkatakana'}
543             = $HTMLCharset->{'jisx0201'}
544             = __PACKAGE__->new ({
545             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
546             iana_names => {
547             'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
548             'x0201' => REGISTERED_CHARSET_NAME,
549             'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
550             },
551             ## TODO: |error_level|
552             });
553              
554             $Charset->{'jis_encoding'}
555             = $IANACharset->{'jis_encoding'}
556             = $IANACharset->{'csjisencoding'}
557             = $HTMLCharset->{'jisencoding'}
558             = __PACKAGE__->new ({
559             category => 0,
560             iana_names => {
561             'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
562             'csjisencoding' => REGISTERED_CHARSET_NAME,
563             },
564             ## NOTE: What is this?
565             });
566              
567             $Charset->{'shift_jis'}
568             = $IANACharset->{'shift_jis'}
569             = $IANACharset->{'ms_kanji'}
570             = $IANACharset->{'csshiftjis'}
571             = $HTMLCharset->{'shiftjis'}
572             = $HTMLCharset->{'mskanji'}
573             = __PACKAGE__->new ({
574             category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
575             CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
576             iana_names => {
577             'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
578             'ms_kanji' => REGISTERED_CHARSET_NAME,
579             'csshiftjis' => REGISTERED_CHARSET_NAME,
580             },
581             perl_names => {
582             'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
583             SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
584             shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
585             ERROR_REPORTING_ENCODING_IMPL,
586             ## NOTE: Unicode mapping is wrong.
587             },
588             ## TODO: |error_level|
589             });
590              
591             $Charset->{'x-sjis'}
592             = $IANACharset->{'x-sjis'}
593             = $HTMLCharset->{'xsjis'}
594             = __PACKAGE__->new ({
595             category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
596             CHARSET_CATEGORY_ASCII_COMPAT,
597             iana_names => {
598             'x-sjis' => UNREGISTERED_CHARSET_NAME,
599             },
600             perl_names => {
601             'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
602             },
603             ## TODO: |error_level|
604             });
605              
606             $Charset->{shift_jisx0213}
607             = $IANACharset->{shift_jisx0213}
608             = $HTMLCharset->{shiftjisx0213}
609             = __PACKAGE__->new ({
610             category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
611             CHARSET_CATEGORY_MIME_TEXT,
612             iana_names => {
613             shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
614             },
615             perl_names => {
616             #shift_jisx0213 (non-standard - i don't know its conformance)
617             'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
618             'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
619             },
620             ## TODO: |error_level|
621             });
622              
623             $Charset->{'euc-jp'}
624             = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
625             = $IANACharset->{'cseucpkdfmtjapanese'}
626             = $IANACharset->{'euc-jp'}
627             = $HTMLCharset->{'extendedunixcodepackedformatforjapanese'}
628             = $HTMLCharset->{'cseucpkdfmtjapanese'}
629             = $HTMLCharset->{'eucjp'}
630             = __PACKAGE__->new ({
631             category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
632             CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
633             iana_names => {
634             'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
635             'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
636             'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
637             },
638             perl_names => {
639             'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
640             SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
641             ## NOTE: Though the IANA definition references the 1990 version
642             ## of EUC-JP, the 1997 version of JIS standard claims that the version
643             ## is same coded character set as the 1990 version, such that we
644             ## consider the EUC-JP 1990 version is same as the 1997 version.
645             'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
646             ERROR_REPORTING_ENCODING_IMPL,
647             ## NOTE: Unicode mapping is wrong.
648             },
649             ## TODO: |error_level|
650             });
651              
652             $Charset->{'x-euc-jp'}
653             = $IANACharset->{'x-euc-jp'}
654             = $HTMLCharset->{'xeucjp'}
655             = __PACKAGE__->new ({
656             category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
657             CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
658             iana_names => {
659             'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
660             },
661             perl_names => {
662             'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
663             'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
664             },
665             });
666              
667             $Charset->{'extended_unix_code_fixed_width_for_japanese'}
668             = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
669             = $IANACharset->{'cseucfixwidjapanese'}
670             = $HTMLCharset->{'extendedunixcodefixedwidthforjapanese'}
671             = __PACKAGE__->new ({
672             category => CHARSET_CATEGORY_BLOCK_SAFE,
673             iana_names => {
674             'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
675             'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
676             },
677             ## TODO: |error_level|
678             });
679              
680             ## TODO: ...
681              
682             $Charset->{'euc-kr'}
683             = $IANACharset->{'euc-kr'}
684             = $IANACharset->{'cseuckr'}
685             = $HTMLCharset->{'euckr'}
686             = __PACKAGE__->new ({
687             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
688             iana_names => {
689             'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
690             'cseuckr' => REGISTERED_CHARSET_NAME,
691             },
692             perl_names => {
693             ## TODO: We need a parse error generating wrapper for the decoder.
694             'cp949' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
695             },
696             ## NOTE: |euc-kr| is handled as |windows-949|, such that properties
697             ## should be consistent with that encoding's properties.
698             });
699              
700             $Charset->{'iso-2022-jp'}
701             = $IANACharset->{'iso-2022-jp'}
702             = $IANACharset->{'csiso2022jp'}
703             = $IANACharset->{'iso2022jp'}
704             = $IANACharset->{'junet-code'}
705             = $HTMLCharset->{'iso2022jp'}
706             = $HTMLCharset->{'junetcode'}
707             = __PACKAGE__->new ({
708             category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
709             iana_names => {
710             'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
711             'csiso2022jp' => REGISTERED_CHARSET_NAME,
712             'iso2022jp' => UNREGISTERED_CHARSET_NAME,
713             'junet-code' => UNREGISTERED_CHARSET_NAME,
714             },
715             ## TODO: |error_level|
716             });
717              
718             $Charset->{'iso-2022-jp-2'}
719             = $IANACharset->{'iso-2022-jp-2'}
720             = $IANACharset->{'csiso2022jp2'}
721             = $HTMLCharset->{'iso2022jp2'}
722             = __PACKAGE__->new ({
723             category => CHARSET_CATEGORY_MIME_TEXT,
724             iana_names => {
725             'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
726             'csiso2022jp2' => REGISTERED_CHARSET_NAME,
727             },
728             ## TODO: |error_level|
729             });
730              
731             ## TODO: ...
732              
733             $IANACharset->{'gb_2312-80'}
734             = $IANACharset->{'iso-ir-58'}
735             = $IANACharset->{chinese}
736             = $HTMLCharset->{gb231280}
737             = $HTMLCharset->{isoir58}
738             = __PACKAGE__->new ({
739             ## NOTE: What is represented by this charset is unclear... I don't
740             ## understand what RFC 1945 describes...
741             category => 0,
742             iana_names => {
743             'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
744             'iso-ir-58' => REGISTERED_CHARSET_NAME,
745             'chinese' => REGISTERED_CHARSET_NAME,
746             'csiso58gb231280' => REGISTERED_CHARSET_NAME,
747             },
748             perl_names => {
749             ## TODO: GB2312->GBK Parse Error wrapper
750             'cp936' => FALLBACK_ENCODING_IMPL,
751             },
752             ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
753             ## consistent.
754             });
755              
756             ## TODO: ...
757              
758             $Charset->{'utf-8'}
759             = $IANACharset->{'utf-8'}
760             = $IANACharset->{'x-utf-8'}
761             = $HTMLCharset->{'utf8'}
762             = $HTMLCharset->{'xutf8'}
763             = __PACKAGE__->new ({
764             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
765             CHARSET_CATEGORY_MIME_TEXT,
766             iana_names => {
767             'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
768             ## NOTE: IANA name "utf-8" references RFC 3629. According to the RFC,
769             ## the definitive definition is one specified in the Unicode Standard.
770             'x-utf-8' => UNREGISTERED_CHARSET_NAME,
771             ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
772             ## other charset like |x-sjis| or |x-euc-jp|, there is no major
773             ## variant for the UTF-8 encoding.
774             ## TODO: We might ought to reconsider this policy, since
775             ## there are UTF-8 variant in fact, such as
776             ## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and as
777             ## such.
778             },
779             perl_names => {
780             'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
781             ERROR_REPORTING_ENCODING_IMPL,
782             ## NOTE: It does not support non-Unicode UCS characters (conforming).
783             ## It does detect illegal sequences (conforming).
784             ## It does not support surrpgate pairs (conforming).
785             ## It does not support BOMs (non-conforming).
786             },
787             ## TODO: |error_level|
788             bom_pattern => qr/\xEF\xBB\xBF/,
789             });
790              
791             $Charset->{'utf-8n'}
792             = $IANACharset->{'utf-8n'}
793             = $HTMLCharset->{'utf-8'}
794             = __PACKAGE__->new ({
795             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
796             CHARSET_CATEGORY_ASCII_COMPAT,
797             iana_names => {
798             'utf-8n' => UNREGISTERED_CHARSET_NAME,
799             ## NOTE: Is there any normative definition for the charset?
800             ## What variant of UTF-8 should we use for the charset?
801             },
802             perl_names => {
803             'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
804             },
805             ## TODO: |error_level|
806             });
807              
808             ## TODO: ...
809              
810             $Charset->{'gbk'}
811             = $IANACharset->{'gbk'}
812             = $IANACharset->{'cp936'}
813             = $IANACharset->{'ms936'}
814             = $IANACharset->{'windows-936'}
815             = $HTMLCharset->{'windows936'}
816             = __PACKAGE__->new ({
817             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
818             iana_names => {
819             'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
820             'cp936' => REGISTERED_CHARSET_NAME,
821             'ms936' => REGISTERED_CHARSET_NAME,
822             'windows-936' => REGISTERED_CHARSET_NAME,
823             },
824             ## TODO: |error_level|
825             iana_status => STATUS_COMMON | STATUS_OBSOLETE,
826             });
827              
828             $Charset->{'gb18030'}
829             = $IANACharset->{'gb18030'}
830             = __PACKAGE__->new ({
831             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
832             iana_names => {
833             'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
834             },
835             iana_status => STATUS_COMMON,
836             mime_text_suitable => 1,
837             });
838              
839             ## TODO: ...
840              
841             $Charset->{'utf-16be'}
842             = $IANACharset->{'utf-16be'}
843             = $HTMLCharset->{'utf16be'}
844             = __PACKAGE__->new ({
845             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
846             iana_names => {
847             'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
848             },
849             ## TODO: |error_level|
850             });
851              
852             $Charset->{'utf-16le'}
853             = $IANACharset->{'utf-16le'}
854             = $HTMLCharset->{'utf16le'}
855             = __PACKAGE__->new ({
856             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
857             iana_names => {
858             'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
859             },
860             ## TODO: |error_level|
861             });
862              
863             $Charset->{'utf-16'}
864             = $IANACharset->{'utf-16'}
865             = $HTMLCharset->{'utf16'}
866             = __PACKAGE__->new ({
867             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
868             iana_names => {
869             'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
870             },
871             ## TODO: |error_level|
872             });
873              
874             ## TODO: ...
875              
876             $Charset->{'windows-31j'}
877             = $IANACharset->{'windows-31j'}
878             = $IANACharset->{'cswindows31j'}
879             = $HTMLCharset->{'windows31j'}
880             = __PACKAGE__->new ({
881             category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
882             CHARSET_CATEGORY_MIME_TEXT,
883             iana_names => {
884             'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
885             'cswindows31j' => REGISTERED_CHARSET_NAME,
886             },
887             iana_status => STATUS_LIMITED_USE, # maybe
888             ## TODO: |error_level|
889             });
890              
891             $Charset->{'gb2312'}
892             = $IANACharset->{'gb2312'}
893             = $IANACharset->{'csgb2312'}
894             = __PACKAGE__->new ({
895             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
896             CHARSET_CATEGORY_ASCII_COMPAT,
897             iana_names => {
898             'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
899             'csgb2312' => REGISTERED_CHARSET_NAME,
900             },
901             perl_names => {
902             ## TODO: GB2312->GBK Parse Error wrapper
903             'cp936' => FALLBACK_ENCODING_IMPL,
904             },
905             ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
906             ## consistent.
907             });
908              
909             $Charset->{'big5'}
910             = $IANACharset->{'big5'}
911             = $IANACharset->{'csbig5'}
912             = $IANACharset->{'x-x-big5'}
913             = $HTMLCharset->{xxbig5}
914             = __PACKAGE__->new ({
915             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
916             iana_names => {
917             'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
918             'csbig5' => REGISTERED_CHARSET_NAME,
919             'x-x-big5' => UNREGISTERED_CHARSET_NAME,
920             ## NOTE: In HTML5, |x-x-big5| is defined as an alias of |big5|.
921             ## According to that spec, if there is any difference between
922             ## input and replacement encodings, the result is parse error.
923             ## However, since there is no formal definition for |x-x-big5|
924             ## charset, we cannot raise such errors.
925             },
926             ## TODO: |error_level|
927             });
928              
929             ## TODO: ...
930              
931             $Charset->{'big5-hkscs'}
932             = $IANACharset->{'big5-hkscs'}
933             = $HTMLCharset->{'big5hkscs'}
934             = __PACKAGE__->new ({
935             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
936             iana_names => {
937             'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
938             },
939             ## TODO: |error_level|
940             });
941              
942             ## TODO: ...
943              
944             $Charset->{'windows-1252'}
945             = $IANACharset->{'windows-1252'}
946             = $HTMLCharset->{'windows1252'}
947             = __PACKAGE__->new ({
948             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
949             CHARSET_CATEGORY_MIME_TEXT,
950             iana_names => {
951             'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
952             },
953             ## TODO: Check whether use of 0x81 is conforming or not...
954             });
955              
956             $Charset->{'windows-1253'}
957             = $IANACharset->{'windows-1253'}
958             = $HTMLCharset->{'windows1253'}
959             = __PACKAGE__->new ({
960             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
961             CHARSET_CATEGORY_MIME_TEXT,
962             iana_names => {
963             'windows-1253' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
964             },
965             ## TODO: Check whether use of 0x81 is conforming or not...
966             });
967              
968             $Charset->{'windows-1254'}
969             = $IANACharset->{'windows-1254'}
970             = $HTMLCharset->{'windows1254'}
971             = __PACKAGE__->new ({
972             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
973             CHARSET_CATEGORY_MIME_TEXT,
974             iana_names => {
975             'windows-1254' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
976             },
977             ## TODO: Check whether use of 0x81 is conforming or not...
978             });
979              
980             ## TODO: ...
981              
982             $Charset->{'tis-620'}
983             = $IANACharset->{'tis-620'}
984             = $HTMLCharset->{'tis620'}
985             = __PACKAGE__->new ({
986             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
987             iana_names => {
988             'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
989             },
990             perl_names => {
991             'web-tis-620' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
992             'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
993             },
994             fallback => {
995             "\x80" => "\x{20AC}",
996             "\x81" => undef, "\x82" => undef, "\x83" => undef, "\x84" => undef,
997             "\x85" => "\x{2026}",
998             "\x86" => undef, "\x87" => undef, "\x88" => undef, "\x89" => undef,
999             "\x8A" => undef, "\x8B" => undef, "\x8C" => undef, "\x8D" => undef,
1000             "\x8E" => undef, "\x8F" => undef, "\x90" => undef,
1001             "\x91" => "\x{2018}",
1002             "\x92" => "\x{2019}",
1003             "\x93" => "\x{201C}",
1004             "\x94" => "\x{201D}",
1005             "\x95" => "\x{2022}",
1006             "\x96" => "\x{2013}",
1007             "\x97" => "\x{2014}",
1008             "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
1009             "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
1010             "\xA0" => "\xA0",
1011             },
1012             ## NOTE: |tis-620| is treated as |windows-874|, so ensure that
1013             ## they are consistent.
1014             });
1015              
1016             $Charset->{'iso-8859-11'}
1017             = $IANACharset->{'iso-8859-11'}
1018             = $HTMLCharset->{'iso885911'}
1019             = __PACKAGE__->new ({
1020             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
1021             iana_names => {
1022             'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
1023             ## NOTE: The Web Thai encoding, i.e. windows-874.
1024             },
1025             perl_names => {
1026             'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
1027             'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
1028             },
1029             fallback => {
1030             "\x80" => "\x{20AC}",
1031             "\x81" => undef, "\x82" => undef, "\x83" => undef, "\x84" => undef,
1032             "\x85" => "\x{2026}",
1033             "\x86" => undef, "\x87" => undef, "\x88" => undef, "\x89" => undef,
1034             "\x8A" => undef, "\x8B" => undef, "\x8C" => undef, "\x8D" => undef,
1035             "\x8E" => undef, "\x8F" => undef, "\x90" => undef,
1036             "\x91" => "\x{2018}",
1037             "\x92" => "\x{2019}",
1038             "\x93" => "\x{201C}",
1039             "\x94" => "\x{201D}",
1040             "\x95" => "\x{2022}",
1041             "\x96" => "\x{2013}",
1042             "\x97" => "\x{2014}",
1043             "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
1044             "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
1045             },
1046             ## NOTE: |iso-8859-11| is treated as |windows-874|, so ensure that
1047             ## they are consistent.
1048             });
1049              
1050             $Charset->{'windows-874'}
1051             = $IANACharset->{'windows-874'}
1052             = $HTMLCharset->{'windows874'}
1053             = __PACKAGE__->new ({
1054             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
1055             iana_names => {
1056             'windows-874' => UNREGISTERED_CHARSET_NAME,
1057             },
1058             perl_names => {
1059             'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
1060             },
1061             ## TODO: |error_level|
1062             });
1063              
1064             $IANACharset->{'windows-949'}
1065             = $HTMLCharset->{windows949}
1066             = __PACKAGE__->new ({
1067             category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
1068             iana_names => {
1069             'windows-949' => UNREGISTERED_CHARSET_NAME,
1070             },
1071             perl_names => {
1072             'cp949' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
1073             ERROR_REPORTING_ENCODING_IMPL,
1074             ## TODO: Is this implementation conforming?
1075             },
1076             ## NOTE: |error_level| is same as default, since we can't find any formal
1077             ## definition for this charset.
1078             });
1079              
1080             sub new ($$) {
1081 378     378 0 1748 return bless $_[1], $_[0];
1082             } # new
1083              
1084             ## NOTE: A class method
1085             sub get_by_html_name ($$) {
1086 716     716 0 1530 my $name = $_[1];
1087 716         1416 $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
1088 716         1414 my $iana_name = $name;
1089 716         3771 $name =~ s/[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]//g;
1090             ## NOTE: U+000B is included.
1091 716 50 66     2265 unless ($HTMLCharset->{$name} || $IANACharset->{$name}) {
1092             $IANACharset->{$iana_name} =
1093 0         0 $HTMLCharset->{$name} = __PACKAGE__->new ({
1094             iana_names => {
1095             $iana_name => UNREGISTERED_CHARSET_NAME,
1096             },
1097             });
1098             }
1099 716   66     2430 return $HTMLCharset->{$name} || $IANACharset->{$name};
1100             } # get_by_html_name
1101              
1102             ## NOTE: A class method
1103             sub get_by_iana_name ($$) {
1104 0     0 0 0 my $name = $_[1];
1105 0         0 $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
1106 0 0       0 unless ($IANACharset->{$name}) {
1107 0         0 $IANACharset->{$name} = __PACKAGE__->new ({
1108             iana_names => {
1109             $name => UNREGISTERED_CHARSET_NAME,
1110             },
1111             });
1112             }
1113 0         0 return $IANACharset->{$name};
1114             } # get_by_iana_name
1115              
1116             sub get_decode_handle ($$;%) {
1117 713     713 0 1172 my $self = shift;
1118 713         1082 my $byte_stream = shift;
1119 713         2722 my %opt = @_;
1120              
1121             my $obj = {
1122             category => $self->{category},
1123             char_buffer => \(my $s = ''),
1124             char_buffer_pos => 0,
1125             character_queue => [],
1126             filehandle => $byte_stream,
1127             charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
1128 713         14584 byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
1129       0     onerror => $opt{onerror} || sub {},
1130             #onerror_set
1131             level => $opt{level} || {
1132             must => 'm',
1133             charset_variant => 'm',
1134             charset_fact => 'm',
1135             iso_shall => 'm',
1136             },
1137             error_level => $self->{error_level} || {
1138             ## HTML5 charset name aliases
1139             ## NOTE: Use of code points in the variant whose definition differs
1140             ## from the original charset is a parse error in HTML5. However,
1141             ## it does not affect the document conformance; the HTML5 spec
1142             ## does not define the conformance of the input stream against the
1143             ## charset in use.
1144 713 50 50     2887 'fallback-char-error' => 'charset_variant',
      50        
      50        
1145             #'fallback-illegal-error' => 'charset_variant',
1146             'fallback-unassigned-error' => 'charset_variant',
1147             ## NOTE: An appropriate error level should be set for each charset
1148             ## (many charset prohibits use of unassigned code points).
1149              
1150             'illegal-octets-error' => 'charset_fact',
1151             'unassigned-code-point-error' => 'charset_fact',
1152             'invalid-state-error' => 'charset_fact',
1153             },
1154             };
1155              
1156 713         3926 require HTML::HTML5::Parser::Charset::DecodeHandle;
1157 713 50       2679 if ($self->{iana_names}->{'iso-2022-jp'}) {
    50          
1158 0         0 $obj->{state_2440} = 'gl-jis-1978';
1159 0         0 $obj->{state_2442} = 'gl-jis-1983';
1160 0         0 $obj->{state} = 'state_2842';
1161 0         0 eval {
1162 0         0 require Encode::GLJIS1978;
1163 0         0 require Encode::GLJIS1983;
1164             };
1165 0 0 0     0 if (Encode::find_encoding ($obj->{state_2440}) and
1166             Encode::find_encoding ($obj->{state_2442})) {
1167 0         0 return ((bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP'),
1168             PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
1169             }
1170             } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
1171 0         0 $obj->{state_2440} = 'gl-jis-1997-swapped';
1172 0         0 $obj->{state_2442} = 'gl-jis-1997';
1173 0         0 $obj->{state} = 'state_2842';
1174 0         0 eval {
1175 0         0 require Encode::GLJIS1997Swapped;
1176 0         0 require Encode::GLJIS1997;
1177             };
1178 0 0 0     0 if (Encode::find_encoding ($obj->{state_2440}) and
1179             Encode::find_encoding ($obj->{state_2442})) {
1180 0         0 return ((bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP'),
1181             PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
1182             }
1183             }
1184              
1185 713         2836 my ($e, $e_status) = $self->get_perl_encoding
1186             (%opt, allow_semiconforming => 1);
1187 713 50       2051 if ($e) {
1188 713         1629 $obj->{perl_encoding_name} = $e->name;
1189 713 50       3625 unless ($self->{category} & CHARSET_CATEGORY_BLOCK_SAFE) {
1190 0         0 $e_status |= FALLBACK_ENCODING_IMPL;
1191             }
1192 713         1335 $obj->{bom_pattern} = $self->{bom_pattern};
1193 713         1456 $obj->{fallback} = $self->{fallback};
1194 713         5002 return ((bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode'), $e_status);
1195             } else {
1196 0         0 return (undef, 0);
1197             }
1198             } # get_decode_handle
1199              
1200             sub get_perl_encoding ($;%) {
1201 713     713 0 2483 my ($self, %opt) = @_;
1202            
1203 713         3016 require Encode;
1204             my $load_encode = sub {
1205 713     713   1255 my $name = shift;
1206 713 50 33     5786 if ($name eq 'euc-jp-1997') {
    50          
    100          
    50          
1207 0         0 require Encode::EUCJP1997;
1208             } elsif ($name eq 'shift-jis-1997') {
1209 0         0 require Encode::ShiftJIS1997;
1210             } elsif ({'web-latin1' => 1,
1211             'web-latin1-us-ascii' => 1,
1212             'web-latin5' => 1}->{$name}) {
1213 2         771 require HTML::HTML5::Parser::Charset::WebLatin1;
1214             } elsif ($name eq 'web-thai' or $name eq 'web-tis-620') {
1215 0         0 require HTML::HTML5::Parser::Charset::WebThai;
1216             }
1217 713         2817 }; # $load_encode
1218              
1219 713 50       2030 if ($opt{allow_error_reporting}) {
1220 713 50       1113 for my $perl_name (keys %{$self->{perl_names} or {}}) {
  713         2701  
1221 715         1309 my $perl_status = $self->{perl_names}->{$perl_name};
1222 715 100       1731 next unless $perl_status & ERROR_REPORTING_ENCODING_IMPL;
1223 713 50       1565 next if $perl_status & FALLBACK_ENCODING_IMPL;
1224             next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
1225 713 50 33     2705 not $opt{allow_semiconforming};
1226            
1227 713         1890 $load_encode->($perl_name);
1228 713         3201 my $e = Encode::find_encoding ($perl_name);
1229 713 50 33     14650 if ($e and $e->name eq $perl_name) {
1230             ## NOTE: Don't return $e unless $e eq $perl_name, since
1231             ## |find_encoding| resolves e.g. |foobarlatin-1| to |iso-8859-1|,
1232             ## which might return wrong encoding object when a dedicated
1233             ## implementation not part of the standard Perl distribution is
1234             ## desired.
1235 713         7775 return ($e, $perl_status);
1236             }
1237             }
1238             }
1239            
1240 0 0       0 for my $perl_name (keys %{$self->{perl_names} or {}}) {
  0         0  
1241 0         0 my $perl_status = $self->{perl_names}->{$perl_name};
1242 0 0       0 next if $perl_status & ERROR_REPORTING_ENCODING_IMPL;
1243 0 0       0 next if $perl_status & FALLBACK_ENCODING_IMPL;
1244             next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
1245 0 0 0     0 not $opt{allow_semiconforming};
1246              
1247 0         0 $load_encode->($perl_name);
1248 0         0 my $e = Encode::find_encoding ($perl_name);
1249 0 0       0 if ($e) {
1250 0         0 return ($e, $perl_status);
1251             }
1252             }
1253            
1254 0 0       0 if ($opt{allow_fallback}) {
1255 0 0       0 for my $perl_name (keys %{$self->{perl_names} or {}}) {
  0         0  
1256 0         0 my $perl_status = $self->{perl_names}->{$perl_name};
1257 0 0 0     0 next unless $perl_status & FALLBACK_ENCODING_IMPL or
1258             $perl_status & SEMICONFORMING_ENCODING_IMPL;
1259             ## NOTE: We don't prefer semi-conforming implementations to
1260             ## non-conforming implementations, since semi-conforming implementations
1261             ## will never be conforming without assist of the callee, and in such
1262             ## cases the callee should set the |allow_semiconforming| option upon
1263             ## the invocation of the method anyway.
1264            
1265 0         0 $load_encode->($perl_name);
1266 0         0 my $e = Encode::find_encoding ($perl_name);
1267 0 0       0 if ($e) {
1268 0         0 return ($e, $perl_status);
1269             }
1270             }
1271              
1272 0 0       0 for my $iana_name (keys %{$self->{iana_names} or {}}) {
  0         0  
1273 0         0 $load_encode->($iana_name);
1274 0         0 my $e = Encode::find_encoding ($iana_name);
1275 0 0       0 if ($e) {
1276 0         0 return ($e, FALLBACK_ENCODING_IMPL);
1277             }
1278             }
1279             }
1280            
1281 0         0 return (undef, 0);
1282             } # get_perl_encoding
1283              
1284             sub get_iana_name ($) {
1285 714     714 0 1156 my $self = shift;
1286            
1287 714         1320 my $primary;
1288             my $other;
1289 714 50       1189 for my $iana_name (keys %{$self->{iana_names} or {}}) {
  714         2750  
1290 1428         2398 my $name_status = $self->{iana_names}->{$iana_name};
1291 1428 100       3529 if ($name_status & PREFERRED_CHARSET_NAME) {
    100          
    100          
1292 4         17 return $iana_name;
1293             } elsif ($name_status & PRIMARY_CHARSET_NAME) {
1294 710         1362 $primary = $iana_name;
1295             } elsif ($name_status & REGISTERED_CHARSET_NAME) {
1296 4         7 $other = $iana_name;
1297             } else {
1298 710   33     2500 $other ||= $iana_name;
1299             }
1300             }
1301              
1302 710   33     3772 return $primary || $other;
1303             } # get_iana_name
1304              
1305             ## NOTE: A non-method function
1306             sub is_syntactically_valid_iana_charset_name ($) {
1307 0     0 0   my $name = shift;
1308 0           return $name =~ /\A[\x20-\x7E]{1,40}\z/;
1309              
1310             ## NOTE: According to IANAREG, "The character set names may be up to 40
1311             ## characters taken from the printable characters of US-ASCII. However,
1312             ## no distinction is made between use of upper and lower case letters.".
1313             } # is_suntactically_valid_iana_charset_name
1314              
1315             1;
1316             ## $Date: 2008/09/15 07:19:33 $
1317