File Coverage

blib/lib/MIME/Charset.pm
Criterion Covered Total %
statement 214 291 73.5
branch 92 168 54.7
condition 43 99 43.4
subroutine 29 35 82.8
pod 20 20 100.0
total 398 613 64.9


line stmt bran cond sub pod time code
1             #-*- perl -*-
2              
3             package MIME::Charset;
4 9     9   201910 use 5.005;
  9         92  
5              
6             =head1 NAME
7              
8             MIME::Charset - Charset Information for MIME
9              
10             =head1 SYNOPSIS
11              
12             use MIME::Charset:
13              
14             $charset = MIME::Charset->new("euc-jp");
15              
16             Getting charset information:
17              
18             $benc = $charset->body_encoding; # e.g. "Q"
19             $cset = $charset->as_string; # e.g. "US-ASCII"
20             $henc = $charset->header_encoding; # e.g. "S"
21             $cset = $charset->output_charset; # e.g. "ISO-2022-JP"
22              
23             Translating text data:
24              
25             ($text, $charset, $encoding) =
26             $charset->header_encode(
27             "\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa".
28             "\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef",
29             Charset => 'euc-jp');
30             # ...returns e.g. (, "ISO-2022-JP", "B").
31              
32             ($text, $charset, $encoding) =
33             $charset->body_encode(
34             "Collectioneur path\xe9tiquement ".
35             "\xe9clectique de d\xe9chets",
36             Charset => 'latin1');
37             # ...returns e.g. (, "ISO-8859-1", "QUOTED-PRINTABLE").
38              
39             $len = $charset->encoded_header_len(
40             "Perl\xe8\xa8\x80\xe8\xaa\x9e",
41             Charset => 'utf-8',
42             Encoding => "b");
43             # ...returns e.g. 28.
44              
45             Manipulating module defaults:
46              
47             MIME::Charset::alias("csEUCKR", "euc-kr");
48             MIME::Charset::default("iso-8859-1");
49             MIME::Charset::fallback("us-ascii");
50              
51             Non-OO functions (may be deprecated in near future):
52              
53             use MIME::Charset qw(:info);
54              
55             $benc = body_encoding("iso-8859-2"); # "Q"
56             $cset = canonical_charset("ANSI X3.4-1968"); # "US-ASCII"
57             $henc = header_encoding("utf-8"); # "S"
58             $cset = output_charset("shift_jis"); # "ISO-2022-JP"
59              
60             use MIME::Charset qw(:trans);
61              
62             ($text, $charset, $encoding) =
63             header_encode(
64             "\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa".
65             "\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef",
66             "euc-jp");
67             # ...returns (, "ISO-2022-JP", "B");
68              
69             ($text, $charset, $encoding) =
70             body_encode(
71             "Collectioneur path\xe9tiquement ".
72             "\xe9clectique de d\xe9chets",
73             "latin1");
74             # ...returns (, "ISO-8859-1", "QUOTED-PRINTABLE");
75              
76             $len = encoded_header_len(
77             "Perl\xe8\xa8\x80\xe8\xaa\x9e", "b", "utf-8"); # 28
78              
79             =head1 DESCRIPTION
80              
81             MIME::Charset provides information about character sets used for
82             MIME messages on Internet.
83              
84             =head2 Definitions
85              
86             The B is ``character set'' used in MIME to refer to a
87             method of converting a sequence of octets into a sequence of characters.
88             It includes both concepts of ``coded character set'' (CCS) and
89             ``character encoding scheme'' (CES) of ISO/IEC.
90              
91             The B is that used in MIME to refer to a method of representing
92             a body part or a header body as sequence(s) of printable US-ASCII
93             characters.
94              
95             =cut
96              
97 9     9   44 use strict;
  9         11  
  9         246  
98 9     9   39 use vars qw(@ISA $VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS $Config);
  9         14  
  9         653  
99 9     9   55 use Exporter;
  9         14  
  9         854  
100             @ISA = qw(Exporter);
101             @EXPORT = qw(body_encoding canonical_charset header_encoding output_charset
102             body_encode encoded_header_len header_encode);
103             @EXPORT_OK = qw(alias default fallback recommended);
104             %EXPORT_TAGS = (
105             "info" => [qw(body_encoding header_encoding
106             canonical_charset output_charset)],
107             "trans" =>[ qw(body_encode encoded_header_len
108             header_encode)],
109             );
110 9     9   62 use Carp qw(croak);
  9         15  
  9         656  
111              
112 9 50   9   56 use constant USE_ENCODE => ($] >= 5.007003)? 'Encode': '';
  9         15  
  9         1430  
113              
114             my @ENCODE_SUBS = qw(FB_CROAK FB_PERLQQ FB_HTMLCREF FB_XMLCREF
115             is_utf8 resolve_alias);
116             if (USE_ENCODE) {
117 9     9   4830 eval "use ".USE_ENCODE." \@ENCODE_SUBS;";
  9         84376  
  9         5381  
118             if ($@) { # Perl 5.7.3 + Encode 0.40
119             eval "use ".USE_ENCODE." qw(is_utf8);";
120             require MIME::Charset::_Compat;
121             for my $sub (@ENCODE_SUBS) {
122 9     9   55 no strict "refs";
  9         11  
  9         742  
123             *{$sub} = \&{"MIME::Charset::_Compat::$sub"}
124             unless $sub eq 'is_utf8';
125             }
126             }
127             } else {
128             require MIME::Charset::_Compat;
129             for my $sub (@ENCODE_SUBS) {
130 9     9   53 no strict "refs";
  9         19  
  9         33495  
131             *{$sub} = \&{"MIME::Charset::_Compat::$sub"};
132             }
133             }
134              
135             $VERSION = '1.013_01';
136              
137             ######## Private Attributes ########
138              
139             my $DEFAULT_CHARSET = 'US-ASCII';
140             my $FALLBACK_CHARSET = 'UTF-8';
141              
142             # This table was initially borrowed from Python email package.
143              
144             my %CHARSETS = (# input header enc body enc output conv
145             'DIN_66003' => ['Q', undef, undef],
146             'ISO-8859-1' => ['Q', 'Q', undef],
147             'ISO-8859-2' => ['Q', 'Q', undef],
148             'ISO-8859-3' => ['Q', 'Q', undef],
149             'ISO-8859-4' => ['Q', 'Q', undef],
150             # ISO-8859-5 is Cyrillic, and not especially used
151             # ISO-8859-6 is Arabic, also not particularly used
152             # ISO-8859-7 is Greek, 'Q' will not make it readable
153             # ISO-8859-8 is Hebrew, 'Q' will not make it readable
154             'ISO-8859-9' => ['Q', 'Q', undef],
155             'ISO-8859-10' => ['Q', 'Q', undef],
156             # ISO-8859-11 is Thai, 'Q' will not make it readable
157             'ISO-8859-13' => ['Q', 'Q', undef],
158             'ISO-8859-14' => ['Q', 'Q', undef],
159             'ISO-8859-15' => ['Q', 'Q', undef],
160             'ISO-8859-16' => ['Q', 'Q', undef],
161             'WINDOWS-1252' => ['Q', 'Q', undef],
162             'VISCII' => ['Q', 'Q', undef],
163             'US-ASCII' => [undef, undef, undef],
164             'BIG5' => ['B', 'B', undef],
165             'GB2312' => ['B', 'B', undef],
166             'HZ-GB-2312' => ['B', undef, undef],
167             'EUC-JP' => ['B', undef, 'ISO-2022-JP'],
168             'SHIFT_JIS' => ['B', undef, 'ISO-2022-JP'],
169             'ISO-2022-JP' => ['B', undef, undef],
170             'ISO-2022-JP-1' => ['B', undef, undef],
171             'ISO-2022-JP-2' => ['B', undef, undef],
172             'EUC-JISX0213' => ['B', undef, 'ISO-2022-JP-3'],
173             'SHIFT_JISX0213' => ['B', undef, 'ISO-2022-JP-3'],
174             'ISO-2022-JP-3' => ['B', undef, undef],
175             'EUC-JIS-2004' => ['B', undef, 'ISO-2022-JP-2004'],
176             'SHIFT_JIS-2004' => ['B', undef, 'ISO-2022-JP-2004'],
177             'ISO-2022-JP-2004' => ['B', undef, undef],
178             'KOI8-R' => ['B', 'B', undef],
179             'TIS-620' => ['B', 'B', undef], # cf. Mew
180             'UTF-16' => ['B', 'B', undef],
181             'UTF-16BE' => ['B', 'B', undef],
182             'UTF-16LE' => ['B', 'B', undef],
183             'UTF-32' => ['B', 'B', undef],
184             'UTF-32BE' => ['B', 'B', undef],
185             'UTF-32LE' => ['B', 'B', undef],
186             'UTF-7' => ['Q', undef, undef],
187             'UTF-8' => ['S', 'S', undef],
188             'GSM03.38' => [undef, undef, undef], # not for MIME
189             # We're making this one up to represent raw unencoded 8bit
190             '8BIT' => [undef, 'B', 'ISO-8859-1'],
191             );
192              
193             # Fix some unexpected or unpreferred names returned by
194             # Encode::resolve_alias() or used by somebodies else.
195             my %CHARSET_ALIASES = (# unpreferred preferred
196             "ASCII" => "US-ASCII",
197             "BIG5-ETEN" => "BIG5",
198             "CP1250" => "WINDOWS-1250",
199             "CP1251" => "WINDOWS-1251",
200             "CP1252" => "WINDOWS-1252",
201             "CP1253" => "WINDOWS-1253",
202             "CP1254" => "WINDOWS-1254",
203             "CP1255" => "WINDOWS-1255",
204             "CP1256" => "WINDOWS-1256",
205             "CP1257" => "WINDOWS-1257",
206             "CP1258" => "WINDOWS-1258",
207             "CP874" => "WINDOWS-874",
208             "CP936" => "GBK",
209             "CP949" => "KS_C_5601-1987",
210             "DIN66003" => "DIN_66003",
211             "EUC-CN" => "GB2312",
212             "HZ" => "HZ-GB-2312", # RFC 1842
213             "KS_C_5601" => "KS_C_5601-1987",
214             "SHIFTJIS" => "SHIFT_JIS",
215             "SHIFTJISX0213" => "SHIFT_JISX0213",
216             "TIS620" => "TIS-620", # IANA MIBenum 2259
217             "UNICODE-1-1-UTF-7" => "UTF-7", # RFC 1642 (obs.)
218             "UTF8" => "UTF-8",
219             "UTF-8-STRICT" => "UTF-8", # Perl internal use
220             "GSM0338" => "GSM03.38", # not for MIME
221             );
222              
223             # Some vendors encode characters beyond standardized mappings using extended
224             # encoders. Some other standard encoders need additional encode modules.
225             my %ENCODERS = (
226             'EXTENDED' => {
227             'ISO-8859-1' => [['cp1252'], ], # Encode::Byte
228             'ISO-8859-2' => [['cp1250'], ], # Encode::Byte
229             'ISO-8859-5' => [['cp1251'], ], # Encode::Byte
230             'ISO-8859-6' => [
231             ['cp1256'], # Encode::Byte
232             # ['cp1006'], # ditto, for Farsi
233             ],
234             'ISO-8859-6-I'=>[['cp1256'], ], # ditto
235             'ISO-8859-7' => [['cp1253'], ], # Encode::Byte
236             'ISO-8859-8' => [['cp1255'], ], # Encode::Byte
237             'ISO-8859-8-I'=>[['cp1255'], ], # ditto
238             'ISO-8859-9' => [['cp1254'], ], # Encode::Byte
239             'ISO-8859-13'=> [['cp1257'], ], # Encode::Byte
240             'GB2312' => [
241             ['gb18030', 'Encode::HanExtra'],
242             ['cp936'], # Encode::CN
243             ],
244             'EUC-JP' => [
245             ['eucJP-ascii', 'Encode::EUCJPASCII'],
246             # ['cp51932', 'Encode::EUCJPMS'],
247             ],
248             'ISO-2022-JP'=> [
249             ['x-iso2022jp-ascii',
250             'Encode::EUCJPASCII'],
251             # ['iso-2022-jp-ms','Encode::ISO2022JPMS'],
252             # ['cp50220', 'Encode::EUCJPMS'],
253             # ['cp50221', 'Encode::EUCJPMS'],
254             ['iso-2022-jp-1'], # Encode::JP (note*)
255             ],
256             'SHIFT_JIS' => [
257             ['cp932'], # Encode::JP
258             ],
259             'EUC-JISX0213' => [['euc-jis-2004', 'Encode::JISX0213'], ],
260             'ISO-2022-JP-3' => [['iso-2022-jp-2004', 'Encode::JISX0213'], ],
261             'SHIFT_JISX0213'=> [['shift_jis-2004', 'Encode::ShiftJIS2004'], ],
262             'EUC-KR' => [['cp949'], ], # Encode::KR
263             'BIG5' => [
264             # ['big5plus', 'Encode::HanExtra'],
265             # ['big5-2003', 'Encode::HanExtra'],
266             ['cp950'], # Encode::TW
267             # ['big5-1984', 'Encode::HanExtra'],
268             ],
269             'TIS-620' => [['cp874'], ], # Encode::Byte
270             'UTF-8' => [['utf8'], ], # Special name on Perl
271             },
272             'STANDARD' => {
273             'DIN_66003' => [['din66003', 'Endode::DIN66003'], ],
274             'ISO-8859-6-E' => [['iso-8859-6'],],# Encode::Byte
275             'ISO-8859-6-I' => [['iso-8859-6'],],# ditto
276             'ISO-8859-8-E' => [['iso-8859-8'],],# Encode::Byte
277             'ISO-8859-8-I' => [['iso-8859-8'],],# ditto
278             'GB18030' => [['gb18030', 'Encode::HanExtra'], ],
279             'ISO-2022-JP-2' => [['iso-2022-jp-2','Encode::ISO2022JP2'], ],
280             'EUC-JISX0213' => [['euc-jisx0213', 'Encode::JISX0213'], ],
281             'ISO-2022-JP-3' => [['iso-2022-jp-3', 'Encode::JISX0213'], ],
282             'EUC-JIS-2004' => [['euc-jis-2004', 'Encode::JISX0213'], ],
283             'ISO-2022-JP-2004' => [['iso-2022-jp-2004', 'Encode::JISX0213'], ],
284             'SHIFT_JIS-2004'=> [['shift_jis-2004', 'Encode::ShiftJIS2004'], ],
285             'EUC-TW' => [['euc-tw', 'Encode::HanExtra'], ],
286             'HZ-GB-2312' => [['hz'], ], # Encode::CN
287             'TIS-620' => [['tis620'], ], # (note*)
288             'UTF-16' => [['x-utf16auto', 'MIME::Charset::UTF'],],
289             'UTF-32' => [['x-utf32auto', 'MIME::Charset::UTF'],],
290             'GSM03.38' => [['gsm0338'], ], # Encode::GSM0338
291              
292             # (note*) ISO-8859-11 was not registered by IANA.
293             # L treats it as canonical name of ``tis-?620''.
294             },
295             );
296              
297             # ISO-2022-* escape sequences etc. to detect charset from unencoded data.
298             my @ESCAPE_SEQS = (
299             # ISO-2022-* sequences
300             # escape seq, possible charset
301             # Following sequences are commonly used.
302             ["\033\$\@", "ISO-2022-JP"], # RFC 1468
303             ["\033\$B", "ISO-2022-JP"], # ditto
304             ["\033(J", "ISO-2022-JP"], # ditto
305             ["\033(I", "ISO-2022-JP"], # ditto (nonstandard)
306             ["\033\$(D", "ISO-2022-JP"], # RFC 2237 (note*)
307             # Following sequences are less commonly used.
308             ["\033.A", "ISO-2022-JP-2"], # RFC 1554
309             ["\033.F", "ISO-2022-JP-2"], # ditto
310             ["\033\$(C", "ISO-2022-JP-2"], # ditto
311             ["\033\$(O", "ISO-2022-JP-3"], # JIS X 0213:2000
312             ["\033\$(P", "ISO-2022-JP-2004"], # JIS X 0213:2000/2004
313             ["\033\$(Q", "ISO-2022-JP-2004"], # JIS X 0213:2004
314             ["\033\$)C", "ISO-2022-KR"], # RFC 1557
315             ["\033\$)A", "ISO-2022-CN"], # RFC 1922
316             ["\033\$A", "ISO-2022-CN"], # ditto (nonstandard)
317             ["\033\$)G", "ISO-2022-CN"], # ditto
318             ["\033\$*H", "ISO-2022-CN"], # ditto
319             # Other sequences will be used with appropriate charset
320             # parameters, or hardly used.
321              
322             # note*: This RFC defines ISO-2022-JP-1, superset of
323             # ISO-2022-JP. But that charset name is rarely used.
324             # OTOH many of encoders for ISO-2022-JP recognize this
325             # sequence so that comatibility with EUC-JP will be
326             # guaranteed.
327              
328             # Singlebyte 7-bit sequences
329             # escape seq, possible charset
330             ["\033(K", "DIN_66003"], # ISO-IR-21
331             ["\033)K", "DIN_66003"], # ditto
332             ["\033e", "GSM03.38"], # ESTI GSM 03.38 (note*)
333             ["\033\012", "GSM03.38"], # ditto
334             ["\033<", "GSM03.38"], # ditto
335             ["\033/", "GSM03.38"], # ditto
336             ["\033>", "GSM03.38"], # ditto
337             ["\033\024", "GSM03.38"], # ditto
338             ["\033(", "GSM03.38"], # ditto
339             ["\033\@", "GSM03.38"], # ditto
340             ["\033)", "GSM03.38"], # ditto
341             ["\033=", "GSM03.38"], # ditto
342              
343             # note*: This is not used for MIME message.
344             );
345              
346             ######## Public Configuration Attributes ########
347              
348             $Config = {
349             Detect7bit => 'YES',
350             Mapping => 'EXTENDED',
351             Replacement => 'DEFAULT',
352             };
353             local @INC = @INC;
354             pop @INC if $INC[-1] eq '.';
355             eval { require MIME::Charset::Defaults; };
356              
357             ######## Private Constants ########
358              
359             my $NON7BITRE = qr{
360             [^\x01-\x7e]
361             }x;
362              
363             my $NONASCIIRE = qr{
364             [^\x09\x0a\x0d\x20\x21-\x7e]
365             }x;
366              
367             my $ISO2022RE = qr{
368             ISO-2022-.+
369             }ix;
370              
371             my $ASCIITRANSRE = qr{
372             HZ-GB-2312 | UTF-7
373             }ix;
374              
375              
376             ######## Public Functions ########
377              
378             =head2 Constructor
379              
380             =over
381              
382             =item $charset = MIME::Charset->new([CHARSET [, OPTS]])
383              
384             Create charset object.
385              
386             OPTS may accept following key-value pair.
387             B:
388             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
389             conversion will not be performed. So this option do not have any effects.
390              
391             =over 4
392              
393             =item Mapping => MAPTYPE
394              
395             Whether to extend mappings actually used for charset names or not.
396             C<"EXTENDED"> uses extended mappings.
397             C<"STANDARD"> uses standardized strict mappings.
398             Default is C<"EXTENDED">.
399              
400             =back
401              
402             =cut
403              
404             sub new {
405 75     75 1 1408 my $class = shift;
406 75         107 my $charset = shift;
407 75 100       164 return bless {}, $class unless $charset;
408 70 50       165 return bless {}, $class if 75 < length $charset; # w/a for CPAN RT #65796.
409 70         119 my %params = @_;
410 70   33     308 my $mapping = uc($params{'Mapping'} || $Config->{Mapping});
411              
412 70 100       305 if ($charset =~ /\bhz.?gb.?2312$/i) {
    100          
    100          
413             # workaround: "HZ-GB-2312" mistakenly treated as "EUC-CN" by Encode
414             # (2.12).
415 4         10 $charset = "HZ-GB-2312";
416             } elsif ($charset =~ /\btis-?620$/i) {
417             # workaround: "TIS620" treated as ISO-8859-11 by Encode.
418             # And "TIS-620" not known by some versions of Encode (cf.
419             # CPAN RT #20781).
420 1         4 $charset = "TIS-620";
421             } elsif ($charset =~ /\biso[-_]8859[-_]8[-_]i$/i) {
422             # workaround: "ISO-8859-8-I" is treated as an alias of "ISO-8859-8"
423             # by Encode (3.18): See the note in
424             # https://encoding.spec.whatwg.org/#legacy-single-byte-encodings
425             # However we'll treat these as separate names for compatibility.
426 1         3 $charset = "ISO-8859-8-I";
427             } else {
428 64   66     183 $charset = resolve_alias($charset) || $charset
429             }
430 70   66     120754 $charset = $CHARSET_ALIASES{uc($charset)} || uc($charset);
431 70         125 my ($henc, $benc, $outcset);
432 70         160 my $spec = $CHARSETS{$charset};
433 70 100       139 if ($spec) {
434 60         183 ($henc, $benc, $outcset) =
435             ($$spec[0], $$spec[1], USE_ENCODE? $$spec[2]: undef);
436             } else {
437 10         20 ($henc, $benc, $outcset) = ('S', 'B', undef);
438             }
439 70         110 my ($decoder, $encoder);
440 70         97 if (USE_ENCODE) {
441 70         153 $decoder = _find_encoder($charset, $mapping);
442 70         131 $encoder = _find_encoder($outcset, $mapping);
443             } else {
444             $decoder = $encoder = undef;
445             }
446              
447 70   66     696 bless {
      100        
448             InputCharset => $charset,
449             Decoder => $decoder,
450             HeaderEncoding => $henc,
451             BodyEncoding => $benc,
452             OutputCharset => ($outcset || $charset),
453             Encoder => ($encoder || $decoder),
454             }, $class;
455             }
456              
457             my %encoder_cache = ();
458              
459             sub _find_encoder($$) {
460 140   100 140   360 my $charset = uc(shift || "");
461 140 100       265 return undef unless $charset;
462 78         117 my $mapping = uc(shift);
463 78         112 my ($spec, $name, $module, $encoder);
464              
465 78         109 local($@);
466 78         228 $encoder = $encoder_cache{$charset, $mapping};
467 78 100       166 return $encoder if ref $encoder;
468              
469 55         129 foreach my $m (('EXTENDED', 'STANDARD')) {
470 82 50 66     256 next if $m eq 'EXTENDED' and $mapping ne 'EXTENDED';
471 82         197 $spec = $ENCODERS{$m}->{$charset};
472 82 100       160 next unless $spec;
473 41         63 foreach my $s (@{$spec}) {
  41         87  
474 47         70 ($name, $module) = @{$s};
  47         107  
475 47 100       122 if ($module) {
476 14 100       940 next unless eval "require $module;";
477             }
478 37         154 $encoder = Encode::find_encoding($name);
479 37 50       21748 last if ref $encoder;
480             }
481 41 100       111 last if ref $encoder;
482             }
483 55   100     214 $encoder ||= Encode::find_encoding($charset);
484 55 100       1419 $encoder_cache{$charset, $mapping} = $encoder if $encoder;
485 55         126 return $encoder;
486             }
487              
488             =back
489              
490             =head2 Getting Information of Charsets
491              
492             =over
493              
494             =item $charset->body_encoding
495              
496             =item body_encoding CHARSET
497              
498             Get recommended transfer-encoding of CHARSET for message body.
499              
500             Returned value will be one of C<"B"> (BASE64), C<"Q"> (QUOTED-PRINTABLE),
501             C<"S"> (shorter one of either) or
502             C (might not be transfer-encoded; either 7BIT or 8BIT). This may
503             not be same as encoding for message header.
504              
505             =cut
506              
507             sub body_encoding($) {
508 4     4 1 17 my $self = shift;
509 4 50       28 return undef unless $self;
510 4 100       22 $self = __PACKAGE__->new($self) unless ref $self;
511 4         38 $self->{BodyEncoding};
512             }
513              
514             =item $charset->as_string
515              
516             =item canonical_charset CHARSET
517              
518             Get canonical name for charset.
519              
520             =cut
521              
522             sub canonical_charset($) {
523 4     4 1 12 my $self = shift;
524 4 50       13 return undef unless $self;
525 4 100       13 $self = __PACKAGE__->new($self) unless ref $self;
526 4         18 $self->{InputCharset};
527             }
528              
529             sub as_string($) {
530 29     29 1 114 my $self = shift;
531 29         118 $self->{InputCharset};
532             }
533              
534             =item $charset->decoder
535              
536             Get L<"Encode::Encoding"> object to decode strings to Unicode by charset.
537             If charset is not specified or not known by this module,
538             undef will be returned.
539              
540             =cut
541              
542             sub decoder($) {
543 12     12 1 34 my $self = shift;
544 12         47 $self->{Decoder};
545             }
546              
547             =item $charset->dup
548              
549             Get a copy of charset object.
550              
551             =cut
552              
553             sub dup($) {
554 2     2 1 5 my $self = shift;
555 2         6 my $obj = __PACKAGE__->new(undef);
556 2         4 %{$obj} = %{$self};
  2         9  
  2         9  
557 2         5 $obj;
558             }
559              
560             =item $charset->encoder([CHARSET])
561              
562             Get L<"Encode::Encoding"> object to encode Unicode string using compatible
563             charset recommended to be used for messages on Internet.
564              
565             If optional CHARSET is specified, replace encoder (and output charset
566             name) of $charset object with those of CHARSET, therefore,
567             $charset object will be a converter between original charset and
568             new CHARSET.
569              
570             =cut
571              
572             sub encoder($$;) {
573 6     6 1 9 my $self = shift;
574 6         9 my $charset = shift;
575 6 100       16 if ($charset) {
576 2 50       13 $charset = __PACKAGE__->new($charset) unless ref $charset;
577 2         6 $self->{OutputCharset} = $charset->{InputCharset};
578 2         5 $self->{Encoder} = $charset->{Decoder};
579 2         4 $self->{BodyEncoding} = $charset->{BodyEncoding};
580 2         4 $self->{HeaderEncoding} = $charset->{HeaderEncoding};
581             }
582 6         45 $self->{Encoder};
583             }
584              
585             =item $charset->header_encoding
586              
587             =item header_encoding CHARSET
588              
589             Get recommended encoding scheme of CHARSET for message header.
590              
591             Returned value will be one of C<"B">, C<"Q">, C<"S"> (shorter one of either)
592             or C (might not be encoded). This may not be same as encoding
593             for message body.
594              
595             =cut
596              
597             sub header_encoding($) {
598 4     4 1 10 my $self = shift;
599 4 50       9 return undef unless $self;
600 4 100       12 $self = __PACKAGE__->new($self) unless ref $self;
601 4         17 $self->{HeaderEncoding};
602             }
603              
604             =item $charset->output_charset
605              
606             =item output_charset CHARSET
607              
608             Get a charset which is compatible with given CHARSET and is recommended
609             to be used for MIME messages on Internet (if it is known by this module).
610              
611             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
612             this function will simply
613             return the result of L<"canonical_charset">.
614              
615             =cut
616              
617             sub output_charset($) {
618 4     4 1 11 my $self = shift;
619 4 50       10 return undef unless $self;
620 4 100       15 $self = __PACKAGE__->new($self) unless ref $self;
621 4         21 $self->{OutputCharset};
622             }
623              
624             =back
625              
626             =head2 Translating Text Data
627              
628             =over
629              
630             =item $charset->body_encode(STRING [, OPTS])
631              
632             =item body_encode STRING, CHARSET [, OPTS]
633              
634             Get converted (if needed) data of STRING and recommended transfer-encoding
635             of that data for message body. CHARSET is the charset by which STRING
636             is encoded.
637              
638             OPTS may accept following key-value pairs.
639             B:
640             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
641             conversion will not be performed. So these options do not have any effects.
642              
643             =over 4
644              
645             =item Detect7bit => YESNO
646              
647             Try auto-detecting 7-bit charset when CHARSET is not given.
648             Default is C<"YES">.
649              
650             =item Replacement => REPLACEMENT
651              
652             Specifies error handling scheme. See L<"Error Handling">.
653              
654             =back
655              
656             3-item list of (I, I,
657             I) will be returned.
658             I will be either C<"BASE64">, C<"QUOTED-PRINTABLE">,
659             C<"7BIT"> or C<"8BIT">. If I could not be determined
660             and I contains non-ASCII byte(s), I will
661             be C and I will be C<"BASE64">.
662             I will be C<"US-ASCII"> if and only if string does not
663             contain any non-ASCII bytes.
664              
665             =cut
666              
667             sub body_encode {
668 4     4 1 421 my $self = shift;
669 4         8 my $text;
670 4 100       12 if (ref $self) {
671 2         5 $text = shift;
672             } else {
673 2         4 $text = $self;
674 2         10 $self = __PACKAGE__->new(shift);
675             }
676 4         17 my ($encoded, $charset) = $self->_text_encode($text, @_);
677             return ($encoded, undef, 'BASE64')
678 4 50 33     25 unless $charset and $charset->{InputCharset};
679 4         9 my $cset = $charset->{OutputCharset};
680              
681             # Determine transfer-encoding.
682 4         7 my $enc = $charset->{BodyEncoding};
683              
684 4 50 33     20 if (!$enc and $encoded !~ /\x00/) { # Eliminate hostile NUL character.
    0          
    0          
    0          
685 4 50       100 if ($encoded =~ $NON7BITRE) { # String contains 8bit char(s).
    50          
686 0         0 $enc = '8BIT';
687             } elsif ($cset =~ /^($ISO2022RE|$ASCIITRANSRE)$/) { # 7BIT.
688 4         10 $enc = '7BIT';
689             } else { # Pure ASCII.
690 0         0 $enc = '7BIT';
691 0         0 $cset = 'US-ASCII';
692             }
693             } elsif ($enc eq 'S') {
694 0         0 $enc = _resolve_S($encoded, 1);
695             } elsif ($enc eq 'B') {
696 0         0 $enc = 'BASE64';
697             } elsif ($enc eq 'Q') {
698 0         0 $enc = 'QUOTED-PRINTABLE';
699             } else {
700 0         0 $enc = 'BASE64';
701             }
702 4         42 return ($encoded, $cset, $enc);
703             }
704              
705             =item $charset->decode(STRING [,CHECK])
706              
707             Decode STRING to Unicode.
708              
709             B:
710             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
711             this function will die.
712              
713             =cut
714              
715             sub decode($$$;) {
716 14     14 1 1671 my $self = shift;
717 14         18 my $s = shift;
718 14   100     45 my $check = shift || 0;
719 14         62 $self->{Decoder}->decode($s, $check);
720             }
721              
722             =item detect_7bit_charset STRING
723              
724             Guess 7-bit charset that may encode a string STRING.
725             If STRING contains any 8-bit bytes, C will be returned.
726             Otherwise, Default Charset will be returned for unknown charset.
727              
728             =cut
729              
730             sub detect_7bit_charset($) {
731 4 50   4 1 16 return $DEFAULT_CHARSET unless &USE_ENCODE;
732 4         6 my $s = shift;
733 4 50       10 return $DEFAULT_CHARSET unless $s;
734              
735             # Non-7bit string
736 4 50       15 return undef if $s =~ $NON7BITRE;
737              
738             # Try to detect 7-bit escape sequences.
739 4         10 foreach (@ESCAPE_SEQS) {
740 8         18 my ($seq, $cset) = @$_;
741 8 100       24 if (index($s, $seq) >= 0) {
742 4         15 my $decoder = __PACKAGE__->new($cset);
743 4 50       19 next unless $decoder->{Decoder};
744 4         8 eval {
745 4         5 my $dummy = $s;
746 4         18 $decoder->decode($dummy, FB_CROAK());
747             };
748 4 50       322 if ($@) {
749 0         0 next;
750             }
751 4         27 return $decoder->{InputCharset};
752             }
753             }
754              
755             # How about HZ, VIQR, UTF-7, ...?
756              
757 0         0 return $DEFAULT_CHARSET;
758             }
759              
760             sub _detect_7bit_charset {
761 0     0   0 detect_7bit_charset(@_);
762             }
763              
764             =item $charset->encode(STRING [, CHECK])
765              
766             Encode STRING (Unicode or non-Unicode) using compatible charset recommended
767             to be used for messages on Internet (if this module knows it).
768             Note that string will be decoded to Unicode then encoded even if compatible charset
769             was equal to original charset.
770              
771             B:
772             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
773             this function will die.
774              
775             =cut
776              
777             sub encode($$$;) {
778 10     10 1 663 my $self = shift;
779 10         17 my $s = shift;
780 10   50     32 my $check = shift || 0;
781              
782 10 100 66     57 unless (is_utf8($s) or $s =~ /[^\x00-\xFF]/) {
783 4 50       51 $s = $self->{Decoder}->decode($s, ($check & 0x1)? FB_CROAK(): 0);
784             }
785 10         44 my $enc = $self->{Encoder}->encode($s, $check);
786 10 50       5211 Encode::_utf8_off($enc) if is_utf8($enc); # workaround for RT #35120
787 10         33 $enc;
788             }
789              
790             =item $charset->encoded_header_len(STRING [, ENCODING])
791              
792             =item encoded_header_len STRING, ENCODING, CHARSET
793              
794             Get length of encoded STRING for message header
795             (without folding).
796              
797             ENCODING may be one of C<"B">, C<"Q"> or C<"S"> (shorter
798             one of either C<"B"> or C<"Q">).
799              
800             =cut
801              
802             sub encoded_header_len($$$;) {
803 14     14 1 33 my $self = shift;
804 14         19 my ($encoding, $s);
805 14 100       26 if (ref $self) {
806 8         9 $s = shift;
807 8   66     23 $encoding = uc(shift || $self->{HeaderEncoding});
808             } else {
809 6         7 $s = $self;
810 6         10 $encoding = uc(shift);
811 6         7 $self = shift;
812 6 50       18 $self = __PACKAGE__->new($self) unless ref $self;
813             }
814              
815             #FIXME:$encoding === undef
816              
817 14         17 my $enclen;
818 14 100 66     44 if ($encoding eq 'Q') {
    50          
819 4         9 $enclen = _enclen_Q($s);
820             } elsif ($encoding eq 'S' and _resolve_S($s) eq 'Q') {
821 0         0 $enclen = _enclen_Q($s);
822             } else { # "B"
823 10         26 $enclen = _enclen_B($s);
824             }
825              
826 14         64 length($self->{OutputCharset})+$enclen+7;
827             }
828              
829             sub _enclen_B($) {
830 10     10   39 int((length(shift) + 2) / 3) * 4;
831             }
832              
833             sub _enclen_Q($;$) {
834 4     4   15 my $s = shift;
835 4         7 my $in_body = shift;
836 4         6 my @o;
837 4 50       11 if ($in_body) {
838 0         0 @o = ($s =~ m{([^-\t\r\n !*+/0-9A-Za-z])}go);
839             } else {
840 4         33 @o = ($s =~ m{([^- !*+/0-9A-Za-z])}gos);
841             }
842 4         13 length($s) + scalar(@o) * 2;
843             }
844              
845             sub _resolve_S($;$) {
846 6     6   8 my $s = shift;
847 6         8 my $in_body = shift;
848 6         7 my $e;
849 6 50       10 if ($in_body) {
850 0         0 $e = scalar(() = $s =~ m{[^-\t\r\n !*+/0-9A-Za-z]}g);
851 0 0       0 return (length($s) + 8 < $e * 6) ? 'BASE64' : 'QUOTED-PRINTABLE';
852             } else {
853 6         33 $e = scalar(() = $s =~ m{[^- !*+/0-9A-Za-z]}g);
854 6 50       27 return (length($s) + 8 < $e * 6) ? 'B' : 'Q';
855             }
856             }
857              
858             =item $charset->header_encode(STRING [, OPTS])
859              
860             =item header_encode STRING, CHARSET [, OPTS]
861              
862             Get converted (if needed) data of STRING and recommended encoding scheme of
863             that data for message headers. CHARSET is the charset by which STRING
864             is encoded.
865              
866             OPTS may accept following key-value pairs.
867             B:
868             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
869             conversion will not be performed. So these options do not have any effects.
870              
871             =over 4
872              
873             =item Detect7bit => YESNO
874              
875             Try auto-detecting 7-bit charset when CHARSET is not given.
876             Default is C<"YES">.
877              
878             =item Replacement => REPLACEMENT
879              
880             Specifies error handling scheme. See L<"Error Handling">.
881              
882             =back
883              
884             3-item list of (I, I,
885             I) will be returned. I will be
886             either C<"B">, C<"Q"> or C (might not be encoded).
887             If I could not be determined and I
888             contains non-ASCII byte(s), I will be C<"8BIT">
889             (this is I charset name but a special value to represent unencodable
890             data) and I will be C (should not be encoded).
891             I will be C<"US-ASCII"> if and only if string does not
892             contain any non-ASCII bytes.
893              
894             =cut
895              
896             sub header_encode {
897 8     8 1 1143 my $self = shift;
898 8         13 my $text;
899 8 100       19 if (ref $self) {
900 4         8 $text = shift;
901             } else {
902 4         7 $text = $self;
903 4         11 $self = __PACKAGE__->new(shift);
904             }
905 8         35 my ($encoded, $charset) = $self->_text_encode($text, @_);
906             return ($encoded, '8BIT', undef)
907 8 50 33     36 unless $charset and $charset->{InputCharset};
908 8         15 my $cset = $charset->{OutputCharset};
909              
910             # Determine encoding scheme.
911 8         13 my $enc = $charset->{HeaderEncoding};
912              
913 8 100 66     63 if (!$enc and $encoded !~ $NON7BITRE) {
    50          
    50          
914 2 50       75 unless ($cset =~ /^($ISO2022RE|$ASCIITRANSRE)$/) { # 7BIT.
915 2         6 $cset = 'US-ASCII';
916             }
917             } elsif ($enc eq 'S') {
918 0         0 $enc = _resolve_S($encoded);
919             } elsif ($enc !~ /^[BQ]$/) {
920 0         0 $enc = 'B';
921             }
922 8         44 return ($encoded, $cset, $enc);
923             }
924              
925             sub _text_encode {
926 12     12   18 my $charset = shift;
927 12         18 my $s = shift;
928 12         24 my %params = @_;
929 12   33     68 my $replacement = uc($params{'Replacement'} || $Config->{Replacement});
930 12   33     44 my $detect7bit = uc($params{'Detect7bit'} || $Config->{Detect7bit});
931             my $encoding = $params{'Encoding'} ||
932 12   33     54 (exists $params{'Encoding'}? undef: 'A'); # undocumented
933              
934 12 50 33     50 if (!$encoding or $encoding ne 'A') { # no 7-bit auto-detection
935 0         0 $detect7bit = 'NO';
936             }
937 12 100       34 unless ($charset->{InputCharset}) {
938 4 50       29 if ($s =~ $NON7BITRE) {
    50          
939 0         0 return ($s, undef);
940             } elsif ($detect7bit ne "NO") {
941 4         14 $charset = __PACKAGE__->new(&detect_7bit_charset($s));
942             } else {
943 0         0 $charset = __PACKAGE__->new($DEFAULT_CHARSET,
944             Mapping => 'STANDARD');
945             }
946             }
947 12 50 33     45 if (!$encoding or $encoding ne 'A') { # no conversion
948 0         0 $charset = $charset->dup;
949 0         0 $charset->encoder($charset);
950 0         0 $charset->{HeaderEncoding} = $encoding;
951 0         0 $charset->{BodyEncoding} = $encoding;
952             }
953             my $check = ($replacement and $replacement =~ /^\d+$/)?
954             $replacement:
955             {
956             'CROAK' => FB_CROAK(),
957             'STRICT' => FB_CROAK(),
958             'FALLBACK' => FB_CROAK(), # special
959             'PERLQQ' => FB_PERLQQ(),
960             'HTMLCREF' => FB_HTMLCREF(),
961             'XMLCREF' => FB_XMLCREF(),
962 12 50 33     163 }->{$replacement || ""} || 0;
      50        
963              
964             # Encode data by output charset if required. If failed, fallback to
965             # fallback charset.
966 12         29 my $encoded;
967 12 100 33     105 if (is_utf8($s) or $s =~ /[^\x00-\xFF]/ or
      50        
      50        
      66        
968             ($charset->{InputCharset} || "") ne ($charset->{OutputCharset} || "")) {
969 4 50       11 if ($check & 0x1) { # CROAK or FALLBACK
970 0         0 eval {
971 0         0 $encoded = $s;
972 0         0 $encoded = $charset->encode($encoded, FB_CROAK());
973             };
974 0 0       0 if ($@) {
975 0 0 0     0 if ($replacement eq "FALLBACK" and $FALLBACK_CHARSET) {
976 0         0 my $cset = __PACKAGE__->new($FALLBACK_CHARSET,
977             Mapping => 'STANDARD');
978             # croak unknown charset
979             croak "unknown charset ``$FALLBACK_CHARSET''"
980 0 0       0 unless $cset->{Decoder};
981             # charset translation
982 0         0 $charset = $charset->dup;
983 0         0 $charset->encoder($cset);
984 0         0 $encoded = $s;
985 0         0 $encoded = $charset->encode($encoded, 0);
986             # replace input & output charsets with fallback charset
987 0         0 $cset->encoder($cset);
988 0         0 $charset = $cset;
989             } else {
990 0         0 $@ =~ s/ at .+$//;
991 0         0 croak $@;
992             }
993             }
994             } else {
995 4         7 $encoded = $s;
996 4         13 $encoded = $charset->encode($encoded, $check);
997             }
998             } else {
999 8         12 $encoded = $s;
1000             }
1001              
1002 12 100       72 if ($encoded !~ /$NONASCIIRE/) { # maybe ASCII
1003             # check ``ASCII transformation'' charsets
1004 4 50       162 if ($charset->{OutputCharset} =~ /^($ASCIITRANSRE)$/) {
    0          
1005 4         10 my $u = $encoded;
1006 4         6 if (USE_ENCODE) {
1007 4         16 $u = $charset->encoder->decode($encoded); # dec. by output
1008             } elsif ($encoded =~ /[+~]/) { # workaround for pre-Encode env.
1009             $u = "x$u";
1010             }
1011 4 100       214 if ($u eq $encoded) {
1012 2         10 $charset = $charset->dup;
1013 2         5 $charset->encoder($DEFAULT_CHARSET);
1014             }
1015             } elsif ($charset->{OutputCharset} ne "US-ASCII") {
1016 0         0 $charset = $charset->dup;
1017 0         0 $charset->encoder($DEFAULT_CHARSET);
1018             }
1019             }
1020              
1021 12         51 return ($encoded, $charset);
1022             }
1023              
1024             =item $charset->undecode(STRING [,CHECK])
1025              
1026             Encode Unicode string STRING to byte string by input charset of $charset.
1027             This is equivalent to C<$charset-Edecoder-Eencode()>.
1028              
1029             B:
1030             When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
1031             this function will die.
1032              
1033             =cut
1034              
1035             sub undecode($$$;) {
1036 0     0 1   my $self = shift;
1037 0           my $s = shift;
1038 0   0       my $check = shift || 0;
1039 0           my $enc = $self->{Decoder}->encode($s, $check);
1040 0           Encode::_utf8_off($enc); # workaround for RT #35120
1041 0           $enc;
1042             }
1043              
1044             =back
1045              
1046             =head2 Manipulating Module Defaults
1047              
1048             =over
1049              
1050             =item alias ALIAS [, CHARSET]
1051              
1052             Get/set charset alias for canonical names determined by
1053             L<"canonical_charset">.
1054              
1055             If CHARSET is given and isn't false, ALIAS will be assigned as an alias of
1056             CHARSET. Otherwise, alias won't be changed. In both cases,
1057             current charset name that ALIAS is assigned will be returned.
1058              
1059             =cut
1060              
1061             sub alias ($;$) {
1062 0     0 1   my $alias = uc(shift);
1063 0           my $charset = uc(shift);
1064              
1065 0 0         return $CHARSET_ALIASES{$alias} unless $charset;
1066              
1067 0           $CHARSET_ALIASES{$alias} = $charset;
1068 0           return $charset;
1069             }
1070              
1071             =item default [CHARSET]
1072              
1073             Get/set default charset.
1074              
1075             B is used by this module when charset context is
1076             unknown. Modules using this module are recommended to use this
1077             charset when charset context is unknown or implicit default is
1078             expected. By default, it is C<"US-ASCII">.
1079              
1080             If CHARSET is given and isn't false, it will be set to default charset.
1081             Otherwise, default charset won't be changed. In both cases,
1082             current default charset will be returned.
1083              
1084             B: Default charset I be changed.
1085              
1086             =cut
1087              
1088             sub default(;$) {
1089 0     0 1   my $charset = &canonical_charset(shift);
1090              
1091 0 0         if ($charset) {
1092 0 0         croak "Unknown charset '$charset'"
1093             unless resolve_alias($charset);
1094 0           $DEFAULT_CHARSET = $charset;
1095             }
1096 0           return $DEFAULT_CHARSET;
1097             }
1098              
1099             =item fallback [CHARSET]
1100              
1101             Get/set fallback charset.
1102              
1103             B is used by this module when conversion by given
1104             charset is failed and C<"FALLBACK"> error handling scheme is specified.
1105             Modules using this module may use this charset as last resort of charset
1106             for conversion. By default, it is C<"UTF-8">.
1107              
1108             If CHARSET is given and isn't false, it will be set to fallback charset.
1109             If CHARSET is C<"NONE">, fallback charset will be undefined.
1110             Otherwise, fallback charset won't be changed. In any cases,
1111             current fallback charset will be returned.
1112              
1113             B: It I useful that C<"US-ASCII"> is specified as fallback charset,
1114             since result of conversion will be readable without charset information.
1115              
1116             =cut
1117              
1118             sub fallback(;$) {
1119 0     0 1   my $charset = &canonical_charset(shift);
1120              
1121 0 0         if ($charset eq "NONE") {
    0          
1122 0           $FALLBACK_CHARSET = undef;
1123             } elsif ($charset) {
1124 0 0         croak "Unknown charset '$charset'"
1125             unless resolve_alias($charset);
1126 0           $FALLBACK_CHARSET = $charset;
1127             }
1128 0           return $FALLBACK_CHARSET;
1129             }
1130              
1131             =item recommended CHARSET [, HEADERENC, BODYENC [, ENCCHARSET]]
1132              
1133             Get/set charset profiles.
1134              
1135             If optional arguments are given and any of them are not false, profiles
1136             for CHARSET will be set by those arguments. Otherwise, profiles
1137             won't be changed. In both cases, current profiles for CHARSET will be
1138             returned as 3-item list of (HEADERENC, BODYENC, ENCCHARSET).
1139              
1140             HEADERENC is recommended encoding scheme for message header.
1141             It may be one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or
1142             C (might not be encoded).
1143              
1144             BODYENC is recommended transfer-encoding for message body. It may be
1145             one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or
1146             C (might not be transfer-encoded).
1147              
1148             ENCCHARSET is a charset which is compatible with given CHARSET and
1149             is recommended to be used for MIME messages on Internet.
1150             If conversion is not needed (or this module doesn't know appropriate
1151             charset), ENCCHARSET is C.
1152              
1153             B: This function in the future releases can accept more optional
1154             arguments (for example, properties to handle character widths, line folding
1155             behavior, ...). So format of returned value may probably be changed.
1156             Use L<"header_encoding">, L<"body_encoding"> or L<"output_charset"> to get
1157             particular profile.
1158              
1159             =cut
1160              
1161             sub recommended ($;$;$;$) {
1162 0     0 1   my $charset = &canonical_charset(shift);
1163 0   0       my $henc = uc(shift) || undef;
1164 0   0       my $benc = uc(shift) || undef;
1165 0           my $cset = &canonical_charset(shift);
1166              
1167 0 0         croak "CHARSET is not specified" unless $charset;
1168 0 0 0       croak "Unknown header encoding" unless !$henc or $henc =~ /^[BQS]$/;
1169 0 0 0       croak "Unknown body encoding" unless !$benc or $benc =~ /^[BQ]$/;
1170              
1171 0 0 0       if ($henc or $benc or $cset) {
      0        
1172 0 0         $cset = undef if $charset eq $cset;
1173 0           my @spec = ($henc, $benc, USE_ENCODE? $cset: undef);
1174 0           $CHARSETS{$charset} = \@spec;
1175 0           return @spec;
1176             } else {
1177 0 0         $charset = __PACKAGE__->new($charset) unless ref $charset;
1178 0           return map { $charset->{$_} } qw(HeaderEncoding BodyEncoding
  0            
1179             OutputCharset);
1180             }
1181             }
1182              
1183             =back
1184              
1185             =head2 Constants
1186              
1187             =over
1188              
1189             =item USE_ENCODE
1190              
1191             Unicode/multibyte support flag.
1192             Non-empty string will be set when Unicode and multibyte support is enabled.
1193             Currently, this flag will be non-empty on Perl 5.7.3 or later and
1194             empty string on earlier versions of Perl.
1195              
1196             =back
1197              
1198             =head2 Error Handling
1199              
1200             L<"body_encode"> and L<"header_encode"> accept following C
1201             options:
1202              
1203             =over
1204              
1205             =item C<"DEFAULT">
1206              
1207             Put a substitution character in place of a malformed character.
1208             For UCM-based encodings, will be used.
1209              
1210             =item C<"FALLBACK">
1211              
1212             Try C<"DEFAULT"> scheme using I (see L<"fallback">).
1213             When fallback charset is undefined and conversion causes error,
1214             code will die on error with an error message.
1215              
1216             =item C<"CROAK">
1217              
1218             Code will die on error immediately with an error message.
1219             Therefore, you should trap the fatal error with eval{} unless you
1220             really want to let it die on error.
1221             Synonym is C<"STRICT">.
1222              
1223             =item C<"PERLQQ">
1224              
1225             =item C<"HTMLCREF">
1226              
1227             =item C<"XMLCREF">
1228              
1229             Use C, C or C
1230             scheme defined by L module.
1231              
1232             =item numeric values
1233              
1234             Numeric values are also allowed.
1235             For more details see L.
1236              
1237             =back
1238              
1239             If error handling scheme is not specified or unknown scheme is specified,
1240             C<"DEFAULT"> will be assumed.
1241              
1242             =head2 Configuration File
1243              
1244             Built-in defaults for option parameters can be overridden by configuration
1245             file: F.
1246             For more details read F.
1247              
1248             =head1 VERSION
1249              
1250             Consult $VERSION variable.
1251              
1252             Development versions of this module may be found at
1253             L.
1254              
1255             =head2 Incompatible Changes
1256              
1257             =over 4
1258              
1259             =item Release 1.001
1260              
1261             =over 4
1262              
1263             =item *
1264              
1265             new() method returns an object when CHARSET argument is not specified.
1266              
1267             =back
1268              
1269             =item Release 1.005
1270              
1271             =over 4
1272              
1273             =item *
1274              
1275             Restrict characters in encoded-word according to RFC 2047 section 5 (3).
1276             This also affects return value of encoded_header_len() method.
1277              
1278             =back
1279              
1280             =item Release 1.008.2
1281              
1282             =over 4
1283              
1284             =item *
1285              
1286             body_encoding() method may also returns C<"S">.
1287              
1288             =item *
1289              
1290             Return value of body_encode() method for UTF-8 may include
1291             C<"QUOTED-PRINTABLE"> encoding item that in earlier versions was fixed to
1292             C<"BASE64">.
1293              
1294             =back
1295              
1296             =back
1297              
1298             =head1 SEE ALSO
1299              
1300             Multipurpose Internet Mail Extensions (MIME).
1301              
1302             =head1 AUTHOR
1303              
1304             Hatuka*nezumi - IKEDA Soji
1305              
1306             =head1 COPYRIGHT
1307              
1308             Copyright (C) 2006-2017 Hatuka*nezumi - IKEDA Soji.
1309             This program is free software; you can redistribute it and/or modify it
1310             under the same terms as Perl itself.
1311              
1312             =cut
1313              
1314             1;