File Coverage

blib/lib/MIME/Charset.pm
Criterion Covered Total %
statement 214 291 73.5
branch 92 168 54.7
condition 43 99 43.4
subroutine 29 35 82.8
pod 20 20 100.0
total 398 613 64.9


line stmt bran cond sub pod time code
1             #-*- perl -*-
2              
3             package MIME::Charset;
4 9     9   200987 use 5.005;
  9         81  
5              
6             =head1 NAME
7              
8             MIME::Charset - Charset Information for MIME
9              
10             =head1 SYNOPSIS
11              
12             use MIME::Charset:
13              
14             $charset = MIME::Charset->new("euc-jp");
15              
16             Getting charset information:
17              
18             $benc = $charset->body_encoding; # e.g. "Q"
19             $cset = $charset->as_string; # e.g. "US-ASCII"
20             $henc = $charset->header_encoding; # e.g. "S"
21             $cset = $charset->output_charset; # e.g. "ISO-2022-JP"
22              
23             Translating text data:
24              
25             ($text, $charset, $encoding) =
26             $charset->header_encode(
27             "\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa".
28             "\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef",
29             Charset => 'euc-jp');
30             # ...returns e.g. (, "ISO-2022-JP", "B").
31              
32             ($text, $charset, $encoding) =
33             $charset->body_encode(
34             "Collectioneur path\xe9tiquement ".
35             "\xe9clectique de d\xe9chets",
36             Charset => 'latin1');
37             # ...returns e.g. (, "ISO-8859-1", "QUOTED-PRINTABLE").
38              
39             $len = $charset->encoded_header_len(
40             "Perl\xe8\xa8\x80\xe8\xaa\x9e",
41             Charset => 'utf-8',
42             Encoding => "b");
43             # ...returns e.g. 28.
44              
45             Manipulating module defaults:
46              
47             MIME::Charset::alias("csEUCKR", "euc-kr");
48             MIME::Charset::default("iso-8859-1");
49             MIME::Charset::fallback("us-ascii");
50              
51             Non-OO functions (may be deprecated in near future):
52              
53             use MIME::Charset qw(:info);
54              
55             $benc = body_encoding("iso-8859-2"); # "Q"
56             $cset = canonical_charset("ANSI X3.4-1968"); # "US-ASCII"
57             $henc = header_encoding("utf-8"); # "S"
58             $cset = output_charset("shift_jis"); # "ISO-2022-JP"
59              
60             use MIME::Charset qw(:trans);
61              
62             ($text, $charset, $encoding) =
63             header_encode(
64             "\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa".
65             "\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef",
66             "euc-jp");
67             # ...returns (, "ISO-2022-JP", "B");
68              
69             ($text, $charset, $encoding) =
70             body_encode(
71             "Collectioneur path\xe9tiquement ".
72             "\xe9clectique de d\xe9chets",
73             "latin1");
74             # ...returns (, "ISO-8859-1", "QUOTED-PRINTABLE");
75              
76             $len = encoded_header_len(
77             "Perl\xe8\xa8\x80\xe8\xaa\x9e", "b", "utf-8"); # 28
78              
79             =head1 DESCRIPTION
80              
81             MIME::Charset provides information about character sets used for
82             MIME messages on Internet.
83              
84             =head2 Definitions
85              
86             The B is ``character set'' used in MIME to refer to a
87             method of converting a sequence of octets into a sequence of characters.
88             It includes both concepts of ``coded character set'' (CCS) and
89             ``character encoding scheme'' (CES) of ISO/IEC.
90              
91             The B is that used in MIME to refer to a method of representing
92             a body part or a header body as sequence(s) of printable US-ASCII
93             characters.
94              
95             =cut
96              
97 9     9   42 use strict;
  9         12  
  9         243  
98 9     9   33 use vars qw(@ISA $VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS $Config);
  9         13  
  9         656  
99 9     9   53 use Exporter;
  9         17  
  9         819  
100             @ISA = qw(Exporter);
101             @EXPORT = qw(body_encoding canonical_charset header_encoding output_charset
102             body_encode encoded_header_len header_encode);
103             @EXPORT_OK = qw(alias default fallback recommended);
104             %EXPORT_TAGS = (
105             "info" => [qw(body_encoding header_encoding
106             canonical_charset output_charset)],
107             "trans" =>[ qw(body_encode encoded_header_len
108             header_encode)],
109             );
110 9     9   55 use Carp qw(croak);
  9         21  
  9         667  
111              
112 9 50   9   52 use constant USE_ENCODE => ($] >= 5.007003)? 'Encode': '';
  9         22  
  9         1375  
113              
114             my @ENCODE_SUBS = qw(FB_CROAK FB_PERLQQ FB_HTMLCREF FB_XMLCREF
115             is_utf8 resolve_alias);
116             if (USE_ENCODE) {
117 9     9   4662 eval "use ".USE_ENCODE." \@ENCODE_SUBS;";
  9         80726  
  9         3718  
118             if ($@) { # Perl 5.7.3 + Encode 0.40
119             eval "use ".USE_ENCODE." qw(is_utf8);";
120             require MIME::Charset::_Compat;
121             for my $sub (@ENCODE_SUBS) {
122 9     9   53 no strict "refs";
  9         13  
  9         742  
123             *{$sub} = \&{"MIME::Charset::_Compat::$sub"}
124             unless $sub eq 'is_utf8';
125             }
126             }
127             } else {
128             require MIME::Charset::_Compat;
129             for my $sub (@ENCODE_SUBS) {
130 9     9   61 no strict "refs";
  9         14  
  9         33756  
131             *{$sub} = \&{"MIME::Charset::_Compat::$sub"};
132             }
133             }
134              
135             $VERSION = '1.013.1';
136              
137             ######## Private Attributes ########
138              
139             my $DEFAULT_CHARSET = 'US-ASCII';
140             my $FALLBACK_CHARSET = 'UTF-8';
141              
142             # This table was initially borrowed from Python email package.
143              
144             my %CHARSETS = (# input header enc body enc output conv
145             'DIN_66003' => ['Q', undef, undef],
146             'ISO-8859-1' => ['Q', 'Q', undef],
147             'ISO-8859-2' => ['Q', 'Q', undef],
148             'ISO-8859-3' => ['Q', 'Q', undef],
149             'ISO-8859-4' => ['Q', 'Q', undef],
150             # ISO-8859-5 is Cyrillic, and not especially used
151             # ISO-8859-6 is Arabic, also not particularly used
152             # ISO-8859-7 is Greek, 'Q' will not make it readable
153             # ISO-8859-8 is Hebrew, 'Q' will not make it readable
154             'ISO-8859-9' => ['Q', 'Q', undef],
155             'ISO-8859-10' => ['Q', 'Q', undef],
156             # ISO-8859-11 is Thai, 'Q' will not make it readable
157             'ISO-8859-13' => ['Q', 'Q', undef],
158             'ISO-8859-14' => ['Q', 'Q', undef],
159             'ISO-8859-15' => ['Q', 'Q', undef],
160             'ISO-8859-16' => ['Q', 'Q', undef],
161             'WINDOWS-1252' => ['Q', 'Q', undef],
162             'VISCII' => ['Q', 'Q', undef],
163             'US-ASCII' => [undef, undef, undef],
164             'BIG5' => ['B', 'B', undef],
165             'GB2312' => ['B', 'B', undef],
166             'HZ-GB-2312' => ['B', undef, undef],
167             'EUC-JP' => ['B', undef, 'ISO-2022-JP'],
168             'SHIFT_JIS' => ['B', undef, 'ISO-2022-JP'],
169             'ISO-2022-JP' => ['B', undef, undef],
170             'ISO-2022-JP-1' => ['B', undef, undef],
171             'ISO-2022-JP-2' => ['B', undef, undef],
172             'EUC-JISX0213' => ['B', undef, 'ISO-2022-JP-3'],
173             'SHIFT_JISX0213' => ['B', undef, 'ISO-2022-JP-3'],
174             'ISO-2022-JP-3' => ['B', undef, undef],
175             'EUC-JIS-2004' => ['B', undef, 'ISO-2022-JP-2004'],
176             'SHIFT_JIS-2004' => ['B', undef, 'ISO-2022-JP-2004'],
177             'ISO-2022-JP-2004' => ['B', undef, undef],
178             'KOI8-R' => ['B', 'B', undef],
179             'TIS-620' => ['B', 'B', undef], # cf. Mew
180             'UTF-16' => ['B', 'B', undef],
181             'UTF-16BE' => ['B', 'B', undef],
182             'UTF-16LE' => ['B', 'B', undef],
183             'UTF-32' => ['B', 'B', undef],
184             'UTF-32BE' => ['B', 'B', undef],
185             'UTF-32LE' => ['B', 'B', undef],
186             'UTF-7' => ['Q', undef, undef],
187             'UTF-8' => ['S', 'S', undef],
188             'GSM03.38' => [undef, undef, undef], # not for MIME
189             # We're making this one up to represent raw unencoded 8bit
190             '8BIT' => [undef, 'B', 'ISO-8859-1'],
191             );
192              
193             # Fix some unexpected or unpreferred names returned by
194             # Encode::resolve_alias() or used by somebodies else.
195             my %CHARSET_ALIASES = (# unpreferred preferred
196             "ASCII" => "US-ASCII",
197             "BIG5-ETEN" => "BIG5",
198             "CP1250" => "WINDOWS-1250",
199             "CP1251" => "WINDOWS-1251",
200             "CP1252" => "WINDOWS-1252",
201             "CP1253" => "WINDOWS-1253",
202             "CP1254" => "WINDOWS-1254",
203             "CP1255" => "WINDOWS-1255",
204             "CP1256" => "WINDOWS-1256",
205             "CP1257" => "WINDOWS-1257",
206             "CP1258" => "WINDOWS-1258",
207             "CP874" => "WINDOWS-874",
208             "CP936" => "GBK",
209             "CP949" => "KS_C_5601-1987",
210             "DIN66003" => "DIN_66003",
211             "EUC-CN" => "GB2312",
212             "HZ" => "HZ-GB-2312", # RFC 1842
213             "KS_C_5601" => "KS_C_5601-1987",
214             "SHIFTJIS" => "SHIFT_JIS",
215             "SHIFTJISX0213" => "SHIFT_JISX0213",
216             "TIS620" => "TIS-620", # IANA MIBenum 2259
217             "UNICODE-1-1-UTF-7" => "UTF-7", # RFC 1642 (obs.)
218             "UTF8" => "UTF-8",
219             "UTF-8-STRICT" => "UTF-8", # Perl internal use
220             "GSM0338" => "GSM03.38", # not for MIME
221             );
222              
223             # Some vendors encode characters beyond standardized mappings using extended
224             # encoders. Some other standard encoders need additional encode modules.
225             my %ENCODERS = (
226             'EXTENDED' => {
227             'ISO-8859-1' => [['cp1252'], ], # Encode::Byte
228             'ISO-8859-2' => [['cp1250'], ], # Encode::Byte
229             'ISO-8859-5' => [['cp1251'], ], # Encode::Byte
230             'ISO-8859-6' => [
231             ['cp1256'], # Encode::Byte
232             # ['cp1006'], # ditto, for Farsi
233             ],
234             'ISO-8859-6-I'=>[['cp1256'], ], # ditto
235             'ISO-8859-7' => [['cp1253'], ], # Encode::Byte
236             'ISO-8859-8' => [['cp1255'], ], # Encode::Byte
237             'ISO-8859-8-I'=>[['cp1255'], ], # ditto
238             'ISO-8859-9' => [['cp1254'], ], # Encode::Byte
239             'ISO-8859-13'=> [['cp1257'], ], # Encode::Byte
240             'GB2312' => [
241             ['gb18030', 'Encode::HanExtra'],
242             ['cp936'], # Encode::CN
243             ],
244             'EUC-JP' => [
245             ['eucJP-ascii', 'Encode::EUCJPASCII'],
246             # ['cp51932', 'Encode::EUCJPMS'],
247             ],
248             'ISO-2022-JP'=> [
249             ['x-iso2022jp-ascii',
250             'Encode::EUCJPASCII'],
251             # ['iso-2022-jp-ms','Encode::ISO2022JPMS'],
252             # ['cp50220', 'Encode::EUCJPMS'],
253             # ['cp50221', 'Encode::EUCJPMS'],
254             ['iso-2022-jp-1'], # Encode::JP (note*)
255             ],
256             'SHIFT_JIS' => [
257             ['cp932'], # Encode::JP
258             ],
259             'EUC-JISX0213' => [['euc-jis-2004', 'Encode::JISX0213'], ],
260             'ISO-2022-JP-3' => [['iso-2022-jp-2004', 'Encode::JISX0213'], ],
261             'SHIFT_JISX0213'=> [['shift_jis-2004', 'Encode::ShiftJIS2004'], ],
262             'EUC-KR' => [['cp949'], ], # Encode::KR
263             'BIG5' => [
264             # ['big5plus', 'Encode::HanExtra'],
265             # ['big5-2003', 'Encode::HanExtra'],
266             ['cp950'], # Encode::TW
267             # ['big5-1984', 'Encode::HanExtra'],
268             ],
269             'TIS-620' => [['cp874'], ], # Encode::Byte
270             'UTF-8' => [['utf8'], ], # Special name on Perl
271             },
272             'STANDARD' => {
273             'DIN_66003' => [['din66003', 'Endode::DIN66003'], ],
274             'ISO-8859-6-E' => [['iso-8859-6'],],# Encode::Byte
275             'ISO-8859-6-I' => [['iso-8859-6'],],# ditto
276             'ISO-8859-8-E' => [['iso-8859-8'],],# Encode::Byte
277             'ISO-8859-8-I' => [['iso-8859-8'],],# ditto
278             'GB18030' => [['gb18030', 'Encode::HanExtra'], ],
279             'ISO-2022-JP-2' => [['iso-2022-jp-2','Encode::ISO2022JP2'], ],
280             'EUC-JISX0213' => [['euc-jisx0213', 'Encode::JISX0213'], ],
281             'ISO-2022-JP-3' => [['iso-2022-jp-3', 'Encode::JISX0213'], ],
282             'EUC-JIS-2004' => [['euc-jis-2004', 'Encode::JISX0213'], ],
283             'ISO-2022-JP-2004' => [['iso-2022-jp-2004', 'Encode::JISX0213'], ],
284             'SHIFT_JIS-2004'=> [['shift_jis-2004', 'Encode::ShiftJIS2004'], ],
285             'EUC-TW' => [['euc-tw', 'Encode::HanExtra'], ],
286             'HZ-GB-2312' => [['hz'], ], # Encode::CN
287             'TIS-620' => [['tis620'], ], # (note*)
288             'UTF-16' => [['x-utf16auto', 'MIME::Charset::UTF'],],
289             'UTF-32' => [['x-utf32auto', 'MIME::Charset::UTF'],],
290             'GSM03.38' => [['gsm0338'], ], # Encode::GSM0338
291              
292             # (note*) ISO-8859-11 was not registered by IANA.
293             # L treats it as canonical name of ``tis-?620''.
294             },
295             );
296              
297             # ISO-2022-* escape sequences etc. to detect charset from unencoded data.
298             my @ESCAPE_SEQS = (
299             # ISO-2022-* sequences
300             # escape seq, possible charset
301             # Following sequences are commonly used.
302             ["\033\$\@", "ISO-2022-JP"], # RFC 1468
303             ["\033\$B", "ISO-2022-JP"], # ditto
304             ["\033(J", "ISO-2022-JP"], # ditto
305             ["\033(I", "ISO-2022-JP"], # ditto (nonstandard)
306             ["\033\$(D", "ISO-2022-JP"], # RFC 2237 (note*)
307             # Following sequences are less commonly used.
308             ["\033.A", "ISO-2022-JP-2"], # RFC 1554
309             ["\033.F", "ISO-2022-JP-2"], # ditto
310             ["\033\$(C", "ISO-2022-JP-2"], # ditto
311             ["\033\$(O", "ISO-2022-JP-3"], # JIS X 0213:2000
312             ["\033\$(P", "ISO-2022-JP-2004"], # JIS X 0213:2000/2004
313             ["\033\$(Q", "ISO-2022-JP-2004"], # JIS X 0213:2004
314             ["\033\$)C", "ISO-2022-KR"], # RFC 1557
315             ["\033\$)A", "ISO-2022-CN"], # RFC 1922
316             ["\033\$A", "ISO-2022-CN"], # ditto (nonstandard)
317             ["\033\$)G", "ISO-2022-CN"], # ditto
318             ["\033\$*H", "ISO-2022-CN"], # ditto
319             # Other sequences will be used with appropriate charset
320             # parameters, or hardly used.
321              
322             # note*: This RFC defines ISO-2022-JP-1, superset of
323             # ISO-2022-JP. But that charset name is rarely used.
324             # OTOH many of encoders for ISO-2022-JP recognize this
325             # sequence so that comatibility with EUC-JP will be
326             # guaranteed.
327              
328             # Singlebyte 7-bit sequences
329             # escape seq, possible charset
330             ["\033e", "GSM03.38"], # ESTI GSM 03.38 (note*)
331             ["\033\012", "GSM03.38"], # ditto
332             ["\033<", "GSM03.38"], # ditto
333             ["\033/", "GSM03.38"], # ditto
334             ["\033>", "GSM03.38"], # ditto
335             ["\033\024", "GSM03.38"], # ditto
336             ["\033(", "GSM03.38"], # ditto
337             ["\033\@", "GSM03.38"], # ditto
338             ["\033)", "GSM03.38"], # ditto
339             ["\033=", "GSM03.38"], # ditto
340              
341             # note*: This is not used for MIME message.
342             );
343              
344             ######## Public Configuration Attributes ########
345              
346             $Config = {
347             Detect7bit => 'YES',
348             Mapping => 'EXTENDED',
349             Replacement => 'DEFAULT',
350             };
351             local @INC = @INC;
352             pop @INC if $INC[-1] eq '.';
353             eval { require MIME::Charset::Defaults; };
354              
355             ######## Private Constants ########
356              
357             my $NON7BITRE = qr{
358             [^\x01-\x7e]
359             }x;
360              
361             my $NONASCIIRE = qr{
362             [^\x09\x0a\x0d\x20\x21-\x7e]
363             }x;
364              
365             my $ISO2022RE = qr{
366             ISO-2022-.+
367             }ix;
368              
369             my $ASCIITRANSRE = qr{
370             HZ-GB-2312 | UTF-7
371             }ix;
372              
373              
374             ######## Public Functions ########
375              
376             =head2 Constructor
377              
378             =over
379              
380             =item $charset = MIME::Charset->new([CHARSET [, OPTS]])
381              
382             Create charset object.
383              
384             OPTS may accept following key-value pair.
385             B:
386             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
387             conversion will not be performed. So this option do not have any effects.
388              
389             =over 4
390              
391             =item Mapping => MAPTYPE
392              
393             Whether to extend mappings actually used for charset names or not.
394             C<"EXTENDED"> uses extended mappings.
395             C<"STANDARD"> uses standardized strict mappings.
396             Default is C<"EXTENDED">.
397              
398             =back
399              
400             =cut
401              
402             sub new {
403 75     75 1 1531 my $class = shift;
404 75         99 my $charset = shift;
405 75 100       141 return bless {}, $class unless $charset;
406 70 50       136 return bless {}, $class if 75 < length $charset; # w/a for CPAN RT #65796.
407 70         110 my %params = @_;
408 70   33     258 my $mapping = uc($params{'Mapping'} || $Config->{Mapping});
409              
410 70 100       299 if ($charset =~ /\bhz.?gb.?2312$/i) {
    100          
    100          
411             # workaround: "HZ-GB-2312" mistakenly treated as "EUC-CN" by Encode
412             # (2.12).
413 4         7 $charset = "HZ-GB-2312";
414             } elsif ($charset =~ /\btis-?620$/i) {
415             # workaround: "TIS620" treated as ISO-8859-11 by Encode.
416             # And "TIS-620" not known by some versions of Encode (cf.
417             # CPAN RT #20781).
418 1         2 $charset = "TIS-620";
419             } elsif ($charset =~ /\biso[-_]8859[-_]8[-_]i$/i) {
420             # workaround: "ISO-8859-8-I" is treated as an alias of "ISO-8859-8"
421             # by Encode (3.19): See the note in
422             # https://encoding.spec.whatwg.org/#legacy-single-byte-encodings
423             # However we'll treat these as separate names for compatibility.
424 1         2 $charset = "ISO-8859-8-I";
425             } else {
426 64   66     159 $charset = resolve_alias($charset) || $charset
427             }
428 70   66     114022 $charset = $CHARSET_ALIASES{uc($charset)} || uc($charset);
429 70         115 my ($henc, $benc, $outcset);
430 70         145 my $spec = $CHARSETS{$charset};
431 70 100       118 if ($spec) {
432 60         149 ($henc, $benc, $outcset) =
433             ($$spec[0], $$spec[1], USE_ENCODE? $$spec[2]: undef);
434             } else {
435 10         19 ($henc, $benc, $outcset) = ('S', 'B', undef);
436             }
437 70         93 my ($decoder, $encoder);
438 70         79 if (USE_ENCODE) {
439 70         128 $decoder = _find_encoder($charset, $mapping);
440 70         118 $encoder = _find_encoder($outcset, $mapping);
441             } else {
442             $decoder = $encoder = undef;
443             }
444              
445 70   66     604 bless {
      100        
446             InputCharset => $charset,
447             Decoder => $decoder,
448             HeaderEncoding => $henc,
449             BodyEncoding => $benc,
450             OutputCharset => ($outcset || $charset),
451             Encoder => ($encoder || $decoder),
452             }, $class;
453             }
454              
455             my %encoder_cache = ();
456              
457             sub _find_encoder($$) {
458 140   100 140   345 my $charset = uc(shift || "");
459 140 100       243 return undef unless $charset;
460 78         116 my $mapping = uc(shift);
461 78         93 my ($spec, $name, $module, $encoder);
462              
463 78         97 local($@);
464 78         180 $encoder = $encoder_cache{$charset, $mapping};
465 78 100       198 return $encoder if ref $encoder;
466              
467 55         88 foreach my $m (('EXTENDED', 'STANDARD')) {
468 82 50 66     233 next if $m eq 'EXTENDED' and $mapping ne 'EXTENDED';
469 82         155 $spec = $ENCODERS{$m}->{$charset};
470 82 100       146 next unless $spec;
471 41         53 foreach my $s (@{$spec}) {
  41         68  
472 47         67 ($name, $module) = @{$s};
  47         96  
473 47 100       89 if ($module) {
474 14 100       792 next unless eval "require $module;";
475             }
476 37         136 $encoder = Encode::find_encoding($name);
477 37 50       21314 last if ref $encoder;
478             }
479 41 100       90 last if ref $encoder;
480             }
481 55   100     186 $encoder ||= Encode::find_encoding($charset);
482 55 100       1405 $encoder_cache{$charset, $mapping} = $encoder if $encoder;
483 55         123 return $encoder;
484             }
485              
486             =back
487              
488             =head2 Getting Information of Charsets
489              
490             =over
491              
492             =item $charset->body_encoding
493              
494             =item body_encoding CHARSET
495              
496             Get recommended transfer-encoding of CHARSET for message body.
497              
498             Returned value will be one of C<"B"> (BASE64), C<"Q"> (QUOTED-PRINTABLE),
499             C<"S"> (shorter one of either) or
500             C (might not be transfer-encoded; either 7BIT or 8BIT). This may
501             not be same as encoding for message header.
502              
503             =cut
504              
505             sub body_encoding($) {
506 4     4 1 15 my $self = shift;
507 4 50       22 return undef unless $self;
508 4 100       16 $self = __PACKAGE__->new($self) unless ref $self;
509 4         31 $self->{BodyEncoding};
510             }
511              
512             =item $charset->as_string
513              
514             =item canonical_charset CHARSET
515              
516             Get canonical name for charset.
517              
518             =cut
519              
520             sub canonical_charset($) {
521 4     4 1 12 my $self = shift;
522 4 50       14 return undef unless $self;
523 4 100       11 $self = __PACKAGE__->new($self) unless ref $self;
524 4         25 $self->{InputCharset};
525             }
526              
527             sub as_string($) {
528 29     29 1 91 my $self = shift;
529 29         78 $self->{InputCharset};
530             }
531              
532             =item $charset->decoder
533              
534             Get L<"Encode::Encoding"> object to decode strings to Unicode by charset.
535             If charset is not specified or not known by this module,
536             undef will be returned.
537              
538             =cut
539              
540             sub decoder($) {
541 15     15 1 34 my $self = shift;
542 15         46 $self->{Decoder};
543             }
544              
545             =item $charset->dup
546              
547             Get a copy of charset object.
548              
549             =cut
550              
551             sub dup($) {
552 2     2 1 4 my $self = shift;
553 2         6 my $obj = __PACKAGE__->new(undef);
554 2         4 %{$obj} = %{$self};
  2         7  
  2         8  
555 2         6 $obj;
556             }
557              
558             =item $charset->encoder([CHARSET])
559              
560             Get L<"Encode::Encoding"> object to encode Unicode string using compatible
561             charset recommended to be used for messages on Internet.
562              
563             If optional CHARSET is specified, replace encoder (and output charset
564             name) of $charset object with those of CHARSET, therefore,
565             $charset object will be a converter between original charset and
566             new CHARSET.
567              
568             =cut
569              
570             sub encoder($$;) {
571 6     6 1 8 my $self = shift;
572 6         9 my $charset = shift;
573 6 100       13 if ($charset) {
574 2 50       10 $charset = __PACKAGE__->new($charset) unless ref $charset;
575 2         6 $self->{OutputCharset} = $charset->{InputCharset};
576 2         6 $self->{Encoder} = $charset->{Decoder};
577 2         3 $self->{BodyEncoding} = $charset->{BodyEncoding};
578 2         4 $self->{HeaderEncoding} = $charset->{HeaderEncoding};
579             }
580 6         45 $self->{Encoder};
581             }
582              
583             =item $charset->header_encoding
584              
585             =item header_encoding CHARSET
586              
587             Get recommended encoding scheme of CHARSET for message header.
588              
589             Returned value will be one of C<"B">, C<"Q">, C<"S"> (shorter one of either)
590             or C (might not be encoded). This may not be same as encoding
591             for message body.
592              
593             =cut
594              
595             sub header_encoding($) {
596 4     4 1 12 my $self = shift;
597 4 50       10 return undef unless $self;
598 4 100       12 $self = __PACKAGE__->new($self) unless ref $self;
599 4         27 $self->{HeaderEncoding};
600             }
601              
602             =item $charset->output_charset
603              
604             =item output_charset CHARSET
605              
606             Get a charset which is compatible with given CHARSET and is recommended
607             to be used for MIME messages on Internet (if it is known by this module).
608              
609             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
610             this function will simply
611             return the result of L<"canonical_charset">.
612              
613             =cut
614              
615             sub output_charset($) {
616 4     4 1 13 my $self = shift;
617 4 50       9 return undef unless $self;
618 4 100       13 $self = __PACKAGE__->new($self) unless ref $self;
619 4         17 $self->{OutputCharset};
620             }
621              
622             =back
623              
624             =head2 Translating Text Data
625              
626             =over
627              
628             =item $charset->body_encode(STRING [, OPTS])
629              
630             =item body_encode STRING, CHARSET [, OPTS]
631              
632             Get converted (if needed) data of STRING and recommended transfer-encoding
633             of that data for message body. CHARSET is the charset by which STRING
634             is encoded.
635              
636             OPTS may accept following key-value pairs.
637             B:
638             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
639             conversion will not be performed. So these options do not have any effects.
640              
641             =over 4
642              
643             =item Detect7bit => YESNO
644              
645             Try auto-detecting 7-bit charset when CHARSET is not given.
646             Default is C<"YES">.
647              
648             =item Replacement => REPLACEMENT
649              
650             Specifies error handling scheme. See L<"Error Handling">.
651              
652             =back
653              
654             3-item list of (I, I,
655             I) will be returned.
656             I will be either C<"BASE64">, C<"QUOTED-PRINTABLE">,
657             C<"7BIT"> or C<"8BIT">. If I could not be determined
658             and I contains non-ASCII byte(s), I will
659             be C and I will be C<"BASE64">.
660             I will be C<"US-ASCII"> if and only if string does not
661             contain any non-ASCII bytes.
662              
663             =cut
664              
665             sub body_encode {
666 4     4 1 394 my $self = shift;
667 4         7 my $text;
668 4 100       10 if (ref $self) {
669 2         3 $text = shift;
670             } else {
671 2         3 $text = $self;
672 2         15 $self = __PACKAGE__->new(shift);
673             }
674 4         16 my ($encoded, $charset) = $self->_text_encode($text, @_);
675             return ($encoded, undef, 'BASE64')
676 4 50 33     24 unless $charset and $charset->{InputCharset};
677 4         8 my $cset = $charset->{OutputCharset};
678              
679             # Determine transfer-encoding.
680 4         14 my $enc = $charset->{BodyEncoding};
681              
682 4 50 33     20 if (!$enc and $encoded !~ /\x00/) { # Eliminate hostile NUL character.
    0          
    0          
    0          
683 4 50       111 if ($encoded =~ $NON7BITRE) { # String contains 8bit char(s).
    50          
684 0         0 $enc = '8BIT';
685             } elsif ($cset =~ /^($ISO2022RE|$ASCIITRANSRE)$/) { # 7BIT.
686 4         14 $enc = '7BIT';
687             } else { # Pure ASCII.
688 0         0 $enc = '7BIT';
689 0         0 $cset = 'US-ASCII';
690             }
691             } elsif ($enc eq 'S') {
692 0         0 $enc = _resolve_S($encoded, 1);
693             } elsif ($enc eq 'B') {
694 0         0 $enc = 'BASE64';
695             } elsif ($enc eq 'Q') {
696 0         0 $enc = 'QUOTED-PRINTABLE';
697             } else {
698 0         0 $enc = 'BASE64';
699             }
700 4         38 return ($encoded, $cset, $enc);
701             }
702              
703             =item $charset->decode(STRING [,CHECK])
704              
705             Decode STRING to Unicode.
706              
707             B:
708             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
709             this function will die.
710              
711             =cut
712              
713             sub decode($$$;) {
714 14     14 1 1658 my $self = shift;
715 14         19 my $s = shift;
716 14   100     38 my $check = shift || 0;
717 14         59 $self->{Decoder}->decode($s, $check);
718             }
719              
720             =item detect_7bit_charset STRING
721              
722             Guess 7-bit charset that may encode a string STRING.
723             If STRING contains any 8-bit bytes, C will be returned.
724             Otherwise, Default Charset will be returned for unknown charset.
725              
726             =cut
727              
728             sub detect_7bit_charset($) {
729 4 50   4 1 12 return $DEFAULT_CHARSET unless &USE_ENCODE;
730 4         8 my $s = shift;
731 4 50       8 return $DEFAULT_CHARSET unless $s;
732              
733             # Non-7bit string
734 4 50       14 return undef if $s =~ $NON7BITRE;
735              
736             # Try to detect 7-bit escape sequences.
737 4         10 foreach (@ESCAPE_SEQS) {
738 8         18 my ($seq, $cset) = @$_;
739 8 100       20 if (index($s, $seq) >= 0) {
740 4         12 my $decoder = __PACKAGE__->new($cset);
741 4 50       11 next unless $decoder->{Decoder};
742 4         7 eval {
743 4         13 my $dummy = $s;
744 4         25 $decoder->decode($dummy, FB_CROAK());
745             };
746 4 50       299 if ($@) {
747 0         0 next;
748             }
749 4         24 return $decoder->{InputCharset};
750             }
751             }
752              
753             # How about HZ, VIQR, UTF-7, ...?
754              
755 0         0 return $DEFAULT_CHARSET;
756             }
757              
758             sub _detect_7bit_charset {
759 0     0   0 detect_7bit_charset(@_);
760             }
761              
762             =item $charset->encode(STRING [, CHECK])
763              
764             Encode STRING (Unicode or non-Unicode) using compatible charset recommended
765             to be used for messages on Internet (if this module knows it).
766             Note that string will be decoded to Unicode then encoded even if compatible charset
767             was equal to original charset.
768              
769             B:
770             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
771             this function will die.
772              
773             =cut
774              
775             sub encode($$$;) {
776 10     10 1 666 my $self = shift;
777 10         13 my $s = shift;
778 10   50     32 my $check = shift || 0;
779              
780 10 100 66     50 unless (is_utf8($s) or $s =~ /[^\x00-\xFF]/) {
781 4 50       34 $s = $self->{Decoder}->decode($s, ($check & 0x1)? FB_CROAK(): 0);
782             }
783 10         37 my $enc = $self->{Encoder}->encode($s, $check);
784 10 50       4464 Encode::_utf8_off($enc) if is_utf8($enc); # workaround for RT #35120
785 10         34 $enc;
786             }
787              
788             =item $charset->encoded_header_len(STRING [, ENCODING])
789              
790             =item encoded_header_len STRING, ENCODING, CHARSET
791              
792             Get length of encoded STRING for message header
793             (without folding).
794              
795             ENCODING may be one of C<"B">, C<"Q"> or C<"S"> (shorter
796             one of either C<"B"> or C<"Q">).
797              
798             =cut
799              
800             sub encoded_header_len($$$;) {
801 14     14 1 33 my $self = shift;
802 14         16 my ($encoding, $s);
803 14 100       27 if (ref $self) {
804 8         11 $s = shift;
805 8   66     22 $encoding = uc(shift || $self->{HeaderEncoding});
806             } else {
807 6         8 $s = $self;
808 6         9 $encoding = uc(shift);
809 6         7 $self = shift;
810 6 50       22 $self = __PACKAGE__->new($self) unless ref $self;
811             }
812              
813             #FIXME:$encoding === undef
814              
815 14         17 my $enclen;
816 14 100 66     39 if ($encoding eq 'Q') {
    50          
817 4         10 $enclen = _enclen_Q($s);
818             } elsif ($encoding eq 'S' and _resolve_S($s) eq 'Q') {
819 0         0 $enclen = _enclen_Q($s);
820             } else { # "B"
821 10         19 $enclen = _enclen_B($s);
822             }
823              
824 14         75 length($self->{OutputCharset})+$enclen+7;
825             }
826              
827             sub _enclen_B($) {
828 10     10   27 int((length(shift) + 2) / 3) * 4;
829             }
830              
831             sub _enclen_Q($;$) {
832 4     4   7 my $s = shift;
833 4         6 my $in_body = shift;
834 4         5 my @o;
835 4 50       7 if ($in_body) {
836 0         0 @o = ($s =~ m{([^-\t\r\n !*+/0-9A-Za-z])}go);
837             } else {
838 4         45 @o = ($s =~ m{([^- !*+/0-9A-Za-z])}gos);
839             }
840 4         14 length($s) + scalar(@o) * 2;
841             }
842              
843             sub _resolve_S($;$) {
844 6     6   9 my $s = shift;
845 6         8 my $in_body = shift;
846 6         7 my $e;
847 6 50       10 if ($in_body) {
848 0         0 $e = scalar(() = $s =~ m{[^-\t\r\n !*+/0-9A-Za-z]}g);
849 0 0       0 return (length($s) + 8 < $e * 6) ? 'BASE64' : 'QUOTED-PRINTABLE';
850             } else {
851 6         48 $e = scalar(() = $s =~ m{[^- !*+/0-9A-Za-z]}g);
852 6 50       31 return (length($s) + 8 < $e * 6) ? 'B' : 'Q';
853             }
854             }
855              
856             =item $charset->header_encode(STRING [, OPTS])
857              
858             =item header_encode STRING, CHARSET [, OPTS]
859              
860             Get converted (if needed) data of STRING and recommended encoding scheme of
861             that data for message headers. CHARSET is the charset by which STRING
862             is encoded.
863              
864             OPTS may accept following key-value pairs.
865             B:
866             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
867             conversion will not be performed. So these options do not have any effects.
868              
869             =over 4
870              
871             =item Detect7bit => YESNO
872              
873             Try auto-detecting 7-bit charset when CHARSET is not given.
874             Default is C<"YES">.
875              
876             =item Replacement => REPLACEMENT
877              
878             Specifies error handling scheme. See L<"Error Handling">.
879              
880             =back
881              
882             3-item list of (I, I,
883             I) will be returned. I will be
884             either C<"B">, C<"Q"> or C (might not be encoded).
885             If I could not be determined and I
886             contains non-ASCII byte(s), I will be C<"8BIT">
887             (this is I charset name but a special value to represent unencodable
888             data) and I will be C (should not be encoded).
889             I will be C<"US-ASCII"> if and only if string does not
890             contain any non-ASCII bytes.
891              
892             =cut
893              
894             sub header_encode {
895 8     8 1 1103 my $self = shift;
896 8         9 my $text;
897 8 100       20 if (ref $self) {
898 4         5 $text = shift;
899             } else {
900 4         4 $text = $self;
901 4         12 $self = __PACKAGE__->new(shift);
902             }
903 8         27 my ($encoded, $charset) = $self->_text_encode($text, @_);
904             return ($encoded, '8BIT', undef)
905 8 50 33     41 unless $charset and $charset->{InputCharset};
906 8         14 my $cset = $charset->{OutputCharset};
907              
908             # Determine encoding scheme.
909 8         12 my $enc = $charset->{HeaderEncoding};
910              
911 8 100 66     55 if (!$enc and $encoded !~ $NON7BITRE) {
    50          
    50          
912 2 50       70 unless ($cset =~ /^($ISO2022RE|$ASCIITRANSRE)$/) { # 7BIT.
913 2         5 $cset = 'US-ASCII';
914             }
915             } elsif ($enc eq 'S') {
916 0         0 $enc = _resolve_S($encoded);
917             } elsif ($enc !~ /^[BQ]$/) {
918 0         0 $enc = 'B';
919             }
920 8         40 return ($encoded, $cset, $enc);
921             }
922              
923             sub _text_encode {
924 12     12   18 my $charset = shift;
925 12         14 my $s = shift;
926 12         19 my %params = @_;
927 12   33     53 my $replacement = uc($params{'Replacement'} || $Config->{Replacement});
928 12   33     42 my $detect7bit = uc($params{'Detect7bit'} || $Config->{Detect7bit});
929             my $encoding = $params{'Encoding'} ||
930 12   33     43 (exists $params{'Encoding'}? undef: 'A'); # undocumented
931              
932 12 50 33     47 if (!$encoding or $encoding ne 'A') { # no 7-bit auto-detection
933 0         0 $detect7bit = 'NO';
934             }
935 12 100       30 unless ($charset->{InputCharset}) {
936 4 50       37 if ($s =~ $NON7BITRE) {
    50          
937 0         0 return ($s, undef);
938             } elsif ($detect7bit ne "NO") {
939 4         11 $charset = __PACKAGE__->new(&detect_7bit_charset($s));
940             } else {
941 0         0 $charset = __PACKAGE__->new($DEFAULT_CHARSET,
942             Mapping => 'STANDARD');
943             }
944             }
945 12 50 33     40 if (!$encoding or $encoding ne 'A') { # no conversion
946 0         0 $charset = $charset->dup;
947 0         0 $charset->encoder($charset);
948 0         0 $charset->{HeaderEncoding} = $encoding;
949 0         0 $charset->{BodyEncoding} = $encoding;
950             }
951             my $check = ($replacement and $replacement =~ /^\d+$/)?
952             $replacement:
953             {
954             'CROAK' => FB_CROAK(),
955             'STRICT' => FB_CROAK(),
956             'FALLBACK' => FB_CROAK(), # special
957             'PERLQQ' => FB_PERLQQ(),
958             'HTMLCREF' => FB_HTMLCREF(),
959             'XMLCREF' => FB_XMLCREF(),
960 12 50 33     155 }->{$replacement || ""} || 0;
      50        
961              
962             # Encode data by output charset if required. If failed, fallback to
963             # fallback charset.
964 12         29 my $encoded;
965 12 100 33     103 if (is_utf8($s) or $s =~ /[^\x00-\xFF]/ or
      50        
      50        
      66        
966             ($charset->{InputCharset} || "") ne ($charset->{OutputCharset} || "")) {
967 4 50       12 if ($check & 0x1) { # CROAK or FALLBACK
968 0         0 eval {
969 0         0 $encoded = $s;
970 0         0 $encoded = $charset->encode($encoded, FB_CROAK());
971             };
972 0 0       0 if ($@) {
973 0 0 0     0 if ($replacement eq "FALLBACK" and $FALLBACK_CHARSET) {
974 0         0 my $cset = __PACKAGE__->new($FALLBACK_CHARSET,
975             Mapping => 'STANDARD');
976             # croak unknown charset
977             croak "unknown charset ``$FALLBACK_CHARSET''"
978 0 0       0 unless $cset->{Decoder};
979             # charset translation
980 0         0 $charset = $charset->dup;
981 0         0 $charset->encoder($cset);
982 0         0 $encoded = $s;
983 0         0 $encoded = $charset->encode($encoded, 0);
984             # replace input & output charsets with fallback charset
985 0         0 $cset->encoder($cset);
986 0         0 $charset = $cset;
987             } else {
988 0         0 $@ =~ s/ at .+$//;
989 0         0 croak $@;
990             }
991             }
992             } else {
993 4         7 $encoded = $s;
994 4         11 $encoded = $charset->encode($encoded, $check);
995             }
996             } else {
997 8         13 $encoded = $s;
998             }
999              
1000 12 100       72 if ($encoded !~ /$NONASCIIRE/) { # maybe ASCII
1001             # check ``ASCII transformation'' charsets
1002 4 50       124 if ($charset->{OutputCharset} =~ /^($ASCIITRANSRE)$/) {
    0          
1003 4         10 my $u = $encoded;
1004 4         6 if (USE_ENCODE) {
1005 4         13 $u = $charset->encoder->decode($encoded); # dec. by output
1006             } elsif ($encoded =~ /[+~]/) { # workaround for pre-Encode env.
1007             $u = "x$u";
1008             }
1009 4 100       209 if ($u eq $encoded) {
1010 2         10 $charset = $charset->dup;
1011 2         5 $charset->encoder($DEFAULT_CHARSET);
1012             }
1013             } elsif ($charset->{OutputCharset} ne "US-ASCII") {
1014 0         0 $charset = $charset->dup;
1015 0         0 $charset->encoder($DEFAULT_CHARSET);
1016             }
1017             }
1018              
1019 12         42 return ($encoded, $charset);
1020             }
1021              
1022             =item $charset->undecode(STRING [,CHECK])
1023              
1024             Encode Unicode string STRING to byte string by input charset of $charset.
1025             This is equivalent to C<$charset-Edecoder-Eencode()>.
1026              
1027             B:
1028             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
1029             this function will die.
1030              
1031             =cut
1032              
1033             sub undecode($$$;) {
1034 0     0 1   my $self = shift;
1035 0           my $s = shift;
1036 0   0       my $check = shift || 0;
1037 0           my $enc = $self->{Decoder}->encode($s, $check);
1038 0           Encode::_utf8_off($enc); # workaround for RT #35120
1039 0           $enc;
1040             }
1041              
1042             =back
1043              
1044             =head2 Manipulating Module Defaults
1045              
1046             =over
1047              
1048             =item alias ALIAS [, CHARSET]
1049              
1050             Get/set charset alias for canonical names determined by
1051             L<"canonical_charset">.
1052              
1053             If CHARSET is given and isn't false, ALIAS will be assigned as an alias of
1054             CHARSET. Otherwise, alias won't be changed. In both cases,
1055             current charset name that ALIAS is assigned will be returned.
1056              
1057             =cut
1058              
1059             sub alias ($;$) {
1060 0     0 1   my $alias = uc(shift);
1061 0           my $charset = uc(shift);
1062              
1063 0 0         return $CHARSET_ALIASES{$alias} unless $charset;
1064              
1065 0           $CHARSET_ALIASES{$alias} = $charset;
1066 0           return $charset;
1067             }
1068              
1069             =item default [CHARSET]
1070              
1071             Get/set default charset.
1072              
1073             B is used by this module when charset context is
1074             unknown. Modules using this module are recommended to use this
1075             charset when charset context is unknown or implicit default is
1076             expected. By default, it is C<"US-ASCII">.
1077              
1078             If CHARSET is given and isn't false, it will be set to default charset.
1079             Otherwise, default charset won't be changed. In both cases,
1080             current default charset will be returned.
1081              
1082             B: Default charset I be changed.
1083              
1084             =cut
1085              
1086             sub default(;$) {
1087 0     0 1   my $charset = &canonical_charset(shift);
1088              
1089 0 0         if ($charset) {
1090 0 0         croak "Unknown charset '$charset'"
1091             unless resolve_alias($charset);
1092 0           $DEFAULT_CHARSET = $charset;
1093             }
1094 0           return $DEFAULT_CHARSET;
1095             }
1096              
1097             =item fallback [CHARSET]
1098              
1099             Get/set fallback charset.
1100              
1101             B is used by this module when conversion by given
1102             charset is failed and C<"FALLBACK"> error handling scheme is specified.
1103             Modules using this module may use this charset as last resort of charset
1104             for conversion. By default, it is C<"UTF-8">.
1105              
1106             If CHARSET is given and isn't false, it will be set to fallback charset.
1107             If CHARSET is C<"NONE">, fallback charset will be undefined.
1108             Otherwise, fallback charset won't be changed. In any cases,
1109             current fallback charset will be returned.
1110              
1111             B: It I useful that C<"US-ASCII"> is specified as fallback charset,
1112             since result of conversion will be readable without charset information.
1113              
1114             =cut
1115              
1116             sub fallback(;$) {
1117 0     0 1   my $charset = &canonical_charset(shift);
1118              
1119 0 0         if ($charset eq "NONE") {
    0          
1120 0           $FALLBACK_CHARSET = undef;
1121             } elsif ($charset) {
1122 0 0         croak "Unknown charset '$charset'"
1123             unless resolve_alias($charset);
1124 0           $FALLBACK_CHARSET = $charset;
1125             }
1126 0           return $FALLBACK_CHARSET;
1127             }
1128              
1129             =item recommended CHARSET [, HEADERENC, BODYENC [, ENCCHARSET]]
1130              
1131             Get/set charset profiles.
1132              
1133             If optional arguments are given and any of them are not false, profiles
1134             for CHARSET will be set by those arguments. Otherwise, profiles
1135             won't be changed. In both cases, current profiles for CHARSET will be
1136             returned as 3-item list of (HEADERENC, BODYENC, ENCCHARSET).
1137              
1138             HEADERENC is recommended encoding scheme for message header.
1139             It may be one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or
1140             C (might not be encoded).
1141              
1142             BODYENC is recommended transfer-encoding for message body. It may be
1143             one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or
1144             C (might not be transfer-encoded).
1145              
1146             ENCCHARSET is a charset which is compatible with given CHARSET and
1147             is recommended to be used for MIME messages on Internet.
1148             If conversion is not needed (or this module doesn't know appropriate
1149             charset), ENCCHARSET is C.
1150              
1151             B: This function in the future releases can accept more optional
1152             arguments (for example, properties to handle character widths, line folding
1153             behavior, ...). So format of returned value may probably be changed.
1154             Use L<"header_encoding">, L<"body_encoding"> or L<"output_charset"> to get
1155             particular profile.
1156              
1157             =cut
1158              
1159             sub recommended ($;$;$;$) {
1160 0     0 1   my $charset = &canonical_charset(shift);
1161 0   0       my $henc = uc(shift) || undef;
1162 0   0       my $benc = uc(shift) || undef;
1163 0           my $cset = &canonical_charset(shift);
1164              
1165 0 0         croak "CHARSET is not specified" unless $charset;
1166 0 0 0       croak "Unknown header encoding" unless !$henc or $henc =~ /^[BQS]$/;
1167 0 0 0       croak "Unknown body encoding" unless !$benc or $benc =~ /^[BQ]$/;
1168              
1169 0 0 0       if ($henc or $benc or $cset) {
      0        
1170 0 0         $cset = undef if $charset eq $cset;
1171 0           my @spec = ($henc, $benc, USE_ENCODE? $cset: undef);
1172 0           $CHARSETS{$charset} = \@spec;
1173 0           return @spec;
1174             } else {
1175 0 0         $charset = __PACKAGE__->new($charset) unless ref $charset;
1176 0           return map { $charset->{$_} } qw(HeaderEncoding BodyEncoding
  0            
1177             OutputCharset);
1178             }
1179             }
1180              
1181             =back
1182              
1183             =head2 Constants
1184              
1185             =over
1186              
1187             =item USE_ENCODE
1188              
1189             Unicode/multibyte support flag.
1190             Non-empty string will be set when Unicode and multibyte support is enabled.
1191             Currently, this flag will be non-empty on Perl 5.7.3 or later and
1192             empty string on earlier versions of Perl.
1193              
1194             =back
1195              
1196             =head2 Error Handling
1197              
1198             L<"body_encode"> and L<"header_encode"> accept following C
1199             options:
1200              
1201             =over
1202              
1203             =item C<"DEFAULT">
1204              
1205             Put a substitution character in place of a malformed character.
1206             For UCM-based encodings, will be used.
1207              
1208             =item C<"FALLBACK">
1209              
1210             Try C<"DEFAULT"> scheme using I (see L<"fallback">).
1211             When fallback charset is undefined and conversion causes error,
1212             code will die on error with an error message.
1213              
1214             =item C<"CROAK">
1215              
1216             Code will die on error immediately with an error message.
1217             Therefore, you should trap the fatal error with eval{} unless you
1218             really want to let it die on error.
1219             Synonym is C<"STRICT">.
1220              
1221             =item C<"PERLQQ">
1222              
1223             =item C<"HTMLCREF">
1224              
1225             =item C<"XMLCREF">
1226              
1227             Use C, C or C
1228             scheme defined by L module.
1229              
1230             =item numeric values
1231              
1232             Numeric values are also allowed.
1233             For more details see L.
1234              
1235             =back
1236              
1237             If error handling scheme is not specified or unknown scheme is specified,
1238             C<"DEFAULT"> will be assumed.
1239              
1240             =head2 Configuration File
1241              
1242             Built-in defaults for option parameters can be overridden by configuration
1243             file: F.
1244             For more details read F.
1245              
1246             =head1 VERSION
1247              
1248             Consult $VERSION variable.
1249              
1250             Development versions of this module may be found at
1251             L.
1252              
1253             =head2 Incompatible Changes
1254              
1255             =over 4
1256              
1257             =item Release 1.001
1258              
1259             =over 4
1260              
1261             =item *
1262              
1263             new() method returns an object when CHARSET argument is not specified.
1264              
1265             =back
1266              
1267             =item Release 1.005
1268              
1269             =over 4
1270              
1271             =item *
1272              
1273             Restrict characters in encoded-word according to RFC 2047 section 5 (3).
1274             This also affects return value of encoded_header_len() method.
1275              
1276             =back
1277              
1278             =item Release 1.008.2
1279              
1280             =over 4
1281              
1282             =item *
1283              
1284             body_encoding() method may also returns C<"S">.
1285              
1286             =item *
1287              
1288             Return value of body_encode() method for UTF-8 may include
1289             C<"QUOTED-PRINTABLE"> encoding item that in earlier versions was fixed to
1290             C<"BASE64">.
1291              
1292             =back
1293              
1294             =back
1295              
1296             =head1 SEE ALSO
1297              
1298             Multipurpose Internet Mail Extensions (MIME).
1299              
1300             =head1 AUTHOR
1301              
1302             Hatuka*nezumi - IKEDA Soji
1303              
1304             =head1 COPYRIGHT
1305              
1306             Copyright (C) 2006-2017 Hatuka*nezumi - IKEDA Soji.
1307             This program is free software; you can redistribute it and/or modify it
1308             under the same terms as Perl itself.
1309              
1310             =cut
1311              
1312             1;