File Coverage

blib/lib/MIME/WordDecoder.pm
Criterion Covered Total %
statement 103 142 72.5
branch 17 44 38.6
condition 14 32 43.7
subroutine 28 34 82.3
pod 7 9 77.7
total 169 261 64.7


line stmt bran cond sub pod time code
1             package MIME::WordDecoder;
2              
3             =head1 NAME
4              
5             MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6              
7             WARNING: Most of this module is deprecated and may disappear. The only
8             function you should use for MIME decoding is "mime_to_perl_string".
9              
10             =head1 SYNOPSIS
11              
12             See L for the basics of encoded words.
13             See L<"DESCRIPTION"> for how this class works.
14              
15             use MIME::WordDecoder;
16              
17              
18             ### Get the default word-decoder (used by unmime()):
19             $wd = default MIME::WordDecoder;
20              
21             ### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22             $wd = supported MIME::WordDecoder "ISO-8859-1";
23              
24              
25             ### Decode a MIME string (e.g., into Latin1) via the default decoder:
26             $str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27              
28             ### Decode a string using the default decoder, non-OO style:
29             $str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30              
31             ### Decode a string to an internal Perl string, non-OO style
32             ### The result is likely to have the UTF8 flag ON.
33             $str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34              
35             =head1 DESCRIPTION
36              
37             WARNING: Most of this module is deprecated and may disappear. It
38             duplicates (badly) the function of the standard 'Encode' module. The
39             only function you should rely on is mime_to_perl_string.
40              
41             A MIME::WordDecoder consists, fundamentally, of a hash which maps
42             a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43             knows how to take bytes in that character set and turn them into
44             the target string representation. Ideally, this target representation
45             would be Unicode, but we don't want to overspecify the translation
46             that takes place: if you want to convert MIME strings directly to Big5,
47             that's your own decision.
48              
49             The subroutine will be invoked with two arguments: DATA (the data in
50             the given character set), and CHARSET (the upcased character set name).
51              
52             For example:
53              
54             ### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55             sub keep7bit {
56             local $_ = shift;
57             tr/\x00-\x7F/#/c;
58             $_;
59             }
60              
61             Here's a decoder which uses that:
62              
63             ### Construct a decoder:
64             $wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65             'ISO-8859-1' => \&keep7bit,
66             'ISO-8859-2' => \&keep7bit,
67             'Big5' => "WARN",
68             '*' => "DIE"});
69              
70             ### Convert some MIME text to a pure ASCII string...
71             $ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72              
73             ### ...which will now hold: "To: Keld J#rn Simonsen "
74              
75             The UTF-8 built-in decoder decodes everything into Perl's internal
76             string format, possibly turning on the internal UTF8 flag. Use it like
77             this:
78              
79             $wd = supported MIME::WordDecoder 'UTF-8';
80             $perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81             # perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82              
83             Generally, you should use the UTF-8 decoder in preference to "unmime".
84              
85             =head1 PUBLIC INTERFACE
86              
87             =over
88              
89             =cut
90              
91 25     25   56520 use strict;
  25         38  
  25         641  
92 25     25   76 use Carp qw( carp croak );
  25         28  
  25         1023  
93 25     25   7221 use MIME::Words qw(decode_mimewords);
  25         38  
  25         1098  
94 25     25   102 use Exporter;
  25         43  
  25         678  
95 25     25   80 use vars qw(@ISA @EXPORT);
  25         29  
  25         14479  
96              
97             @ISA = qw(Exporter);
98             @EXPORT = qw( unmime mime_to_perl_string );
99              
100              
101              
102             #------------------------------
103             #
104             # Globals
105             #
106             #------------------------------
107              
108             ### Decoders.
109             my %DecoderFor = ();
110              
111             ### Standard handlers.
112             my %Handler =
113             (
114             KEEP => sub {$_[0]},
115             IGNORE => sub {''},
116             WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117             DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118             );
119              
120             ### Global default decoder. We init it below.
121             my $Default;
122              
123             ### Global UTF8 decoder.
124             my $DefaultUTF8;
125              
126             #------------------------------
127              
128             =item default [DECODER]
129              
130             I
131             Get/set the default DECODER object.
132              
133             =cut
134              
135             sub default {
136 25     25 1 110 my $class = shift;
137 25 50       51 if (@_) {
138 25         35 $Default = shift;
139             }
140 25         81 $Default;
141             }
142              
143             #------------------------------
144              
145             =item supported CHARSET, [DECODER]
146              
147             I
148             If just CHARSET is given, returns a decoder object which maps
149             data into that character set (the character set is forced to
150             all-uppercase).
151              
152             $wd = supported MIME::WordDecoder "ISO-8859-1";
153              
154             If DECODER is given, installs such an object:
155              
156             MIME::WordDecoder->supported("ISO-8859-1" =>
157             (new MIME::WordDecoder::ISO_8859 "1"));
158              
159             You should not override this method.
160              
161             =cut
162              
163             sub supported {
164 215     215 1 1195 my ($class, $charset, $decoder) = @_;
165 215 50       326 $DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166 215         443 $DecoderFor{uc($charset)};
167             }
168              
169             #------------------------------
170              
171             =item new [\@HANDLERS]
172              
173             I
174             If \@HANDLERS is given, then @HANDLERS is passed to handler()
175             to initialize the internal map.
176              
177             =cut
178              
179             sub new {
180 476     476 1 370 my ($class, $h) = @_;
181 476         808 my $self = bless { MWD_Map=>{} }, $class;
182              
183             ### Init the map:
184 476         1556 $self->handler(@$h);
185              
186             ### Add fallbacks:
187 476   66     1556 $self->{MWD_Map}{'*'} ||= $Handler{WARN};
188 476   66     1125 $self->{MWD_Map}{'raw'} ||= $self->{MWD_Map}{'US-ASCII'};
189 476         503 $self;
190             }
191              
192             #------------------------------
193              
194             =item handler CHARSET=>\&SUBREF, ...
195              
196             I
197             Set the handler SUBREF for a given CHARSET, for as many pairs
198             as you care to supply.
199              
200             When performing the translation of a MIME-encoded string, a
201             given SUBREF will be invoked when translating a block of text
202             in character set CHARSET. The subroutine will be invoked with
203             the following arguments:
204              
205             DATA - the data in the given character set.
206             CHARSET - the upcased character set name, which may prove useful
207             if you are using the same SUBREF for multiple CHARSETs.
208             DECODER - the decoder itself, if it contains configuration information
209             that your handler function needs.
210              
211             For example:
212              
213             $wd = new MIME::WordDecoder;
214             $wd->handler('US-ASCII' => "KEEP");
215             $wd->handler('ISO-8859-1' => \&handle_latin1,
216             'ISO-8859-2' => \&handle_latin1,
217             '*' => "DIE");
218              
219             Notice that, much as with %SIG, the SUBREF can also be taken from
220             a set of special keywords:
221              
222             KEEP Pass data through unchanged.
223             IGNORE Ignore data in this character set, without warning.
224             WARN Ignore data in this character set, with warning.
225             DIE Fatal exception with "can't handle character set" message.
226              
227             The subroutine for the special CHARSET of 'raw' is used for raw
228             (non-MIME-encoded) text, which is supposed to be US-ASCII.
229             The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230             at the time of construction.
231              
232             The subroutine for the special CHARSET of '*' is used for any
233             unrecognized character set. The default action for '*' is WARN.
234              
235             =cut
236              
237             sub handler {
238 951     951 1 651 my $self = shift;
239              
240             ### Copy the hash, and edit it:
241 951         1409 while (@_) {
242 928         678 my $c = shift;
243 928         553 my $sub = shift;
244 928         1074 $self->{MWD_Map}{$c} = $self->real_handler($sub);
245             }
246 951         627 $self;
247             }
248              
249             #------------------------------
250              
251             =item decode STRING
252              
253             I
254             Decode a STRING which might contain MIME-encoded components into a
255             local representation (e.g., UTF-8, etc.).
256              
257             =cut
258              
259             sub decode {
260 94     94 1 937 my ($self, $str) = @_;
261 94 50       172 defined($str) or return undef;
262             join('', map {
263             ### Get the data and (upcased) charset:
264 94         236 my $data = $_->[0];
  119         146  
265 119 100       194 my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266 119         121 $charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267              
268             ### Get the handler; guess if never seen before:
269             defined($self->{MWD_Map}{$charset}) or
270 119 100 100     402 $self->{MWD_Map}{$charset} =
271             ($self->real_handler($self->guess_handler($charset)) || 0);
272 119   66     323 my $subr = $self->{MWD_Map}{$charset} || $self->{MWD_Map}{'*'};
273              
274             ### Map this chunk:
275 119         193 &$subr($data, $charset, $self);
276             } decode_mimewords($str));
277             }
278              
279             #------------------------------
280             #
281             # guess_handler CHARSET
282             #
283             # Instance method.
284             # An unrecognized charset has been seen. Guess a handler subref
285             # for the given charset, returning false if there is none.
286             # Successful mappings will be cached in the main map.
287             #
288             sub guess_handler {
289 14     14 0 41 undef;
290             }
291              
292             #------------------------------
293             #
294             # real_handler HANDLER
295             #
296             # Instance method.
297             # Translate the given handler, which might be a subref or a string.
298             #
299             sub real_handler {
300 944     944 0 745 my ($self, $sub) = @_;
301             (!$sub) or
302             (ref($sub) eq 'CODE') or
303 944 100 33     3174 $sub = ($Handler{$sub} || croak "bad named handler: $sub\n");
      100        
304 944         1866 $sub;
305             }
306              
307             #------------------------------
308              
309             =item unmime STRING
310              
311             I
312             Decode the given STRING using the default() decoder.
313             See L.
314              
315             You should consider using the UTF-8 decoder instead. It decodes
316             MIME strings into Perl's internal string format.
317              
318             =cut
319              
320             sub unmime($) {
321 0     0 1 0 my $str = shift;
322 0         0 $Default->decode($str);
323             }
324              
325             =item mime_to_perl_string
326              
327             I
328             Decode the given STRING into an internal Perl Unicode string.
329             You should use this function in preference to all others.
330              
331             The result of mime_to_perl_string is likely to have Perl's
332             UTF8 flag set.
333              
334             =cut
335              
336             sub mime_to_perl_string($) {
337 1     1 1 802 my $str = shift;
338 1         4 $DecoderFor{'UTF-8'}->decode($str);
339             }
340              
341             =back
342              
343             =cut
344              
345              
346              
347              
348              
349             =head1 SUBCLASSES
350              
351             =over
352              
353             =cut
354              
355             #------------------------------------------------------------
356             #------------------------------------------------------------
357              
358             =item MIME::WordDecoder::ISO_8859
359              
360             A simple decoder which keeps US-ASCII and the 7-bit characters
361             of ISO-8859 character sets and UTF8, and also keeps 8-bit
362             characters from the indicated character set.
363              
364             ### Construct:
365             $wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366              
367             ### What to translate unknown characters to (can also use empty):
368             ### Default is "?".
369             $wd->unknown("?");
370              
371             ### Collapse runs of unknown characters to a single unknown()?
372             ### Default is false.
373             $wd->collapse(1);
374              
375              
376             According to B
377             (ca. November 2000):
378              
379             ISO 8859 is a full series of 10 (and soon even more) standardized
380             multilingual single-byte coded (8bit) graphic character sets for
381             writing in alphabetic languages:
382              
383             1. Latin1 (West European)
384             2. Latin2 (East European)
385             3. Latin3 (South European)
386             4. Latin4 (North European)
387             5. Cyrillic
388             6. Arabic
389             7. Greek
390             8. Hebrew
391             9. Latin5 (Turkish)
392             10. Latin6 (Nordic)
393              
394             The ISO 8859 charsets are not even remotely as complete as the truly
395             great Unicode but they have been around and usable for quite a while
396             (first registered Internet charsets for use with MIME) and have
397             already offered a major improvement over the plain 7bit US-ASCII.
398              
399             Characters 0 to 127 are always identical with US-ASCII and the
400             positions 128 to 159 hold some less used control characters: the
401             so-called C1 set from ISO 6429.
402              
403             =cut
404              
405             package MIME::WordDecoder::ISO_8859;
406              
407 25     25   113 use strict;
  25         35  
  25         479  
408 25     25   75 use vars qw(@ISA);
  25         27  
  25         15795  
409             @ISA = qw( MIME::WordDecoder );
410              
411              
412             #------------------------------
413             #
414             # HANDLERS
415             #
416             #------------------------------
417              
418             ### Keep 7bit characters.
419             ### Turn all else to the special \x00.
420             sub h_keep7bit {
421 2     2   3 local $_ = $_[0];
422             # my $unknown = $_[2]->{MWDI_Unknown};
423              
424 2         3 s{[\x80-\xFF]}{\x00}g;
425 2         3 $_;
426             }
427              
428             ### Note: should use Unicode::String, converting/manipulating
429             ### everything into full Unicode form.
430              
431             ### Keep 7bit UTF8 characters (ASCII).
432             ### Keep ISO-8859-1 if this decoder is for Latin-1.
433             ### Turn all else to the special \x00.
434             sub h_utf8 {
435 0     0   0 local $_ = $_[0];
436             # my $unknown = $_[2]->{MWDI_Unknown};
437 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
438             #print STDERR "UTF8 in: <$_>\n";
439              
440 0         0 local($1,$2,$3);
441 0         0 my $tgt = '';
442 0   0     0 while (m{\G(
443             ([\x00-\x7F]) | # 0xxxxxxx
444             ([\xC0-\xDF] [\x80-\xBF]) | # 110yyyyy 10xxxxxx
445             ([\xE0-\xEF] [\x80-\xBF]{2}) | # 1110zzzz 10yyyyyy 10xxxxxx
446             ([\xF0-\xF7] [\x80-\xBF]{3}) | # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447             . # error; synch
448             )}gcsx and ($1 ne '')) {
449              
450 0 0 0     0 if (defined($2)) { $tgt .= $2 }
  0 0       0  
451 0         0 elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452 0         0 else { $tgt .= "\x00" }
453             }
454              
455             #print STDERR "UTF8 out: <$tgt>\n";
456 0         0 $tgt;
457             }
458              
459             ### Keep characters which are 7bit in UTF8 (ASCII).
460             ### Keep ISO-8859-1 if this decoder is for Latin-1.
461             ### Turn all else to the special \x00.
462             sub h_utf16 {
463 0     0   0 local $_ = $_[0];
464             # my $unknown = $_[2]->{MWDI_Unknown};
465 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
466             #print STDERR "UTF16 in: <$_>\n";
467              
468 0         0 local($1,$2,$3,$4,$5);
469 0         0 my $tgt = '';
470 0   0     0 while (m{\G(
471             ( \x00 ([\x00-\x7F])) | # 00000000 0xxxxxxx
472             ( \x00 ([\x80-\xFF])) | # 00000000 1xxxxxxx
473             ( [^\x00] [\x00-\xFF]) | # etc
474             )
475             }gcsx and ($1 ne '')) {
476              
477 0 0 0     0 if (defined($2)) { $tgt .= $3 }
  0 0       0  
478 0         0 elsif (defined($4) && $latin1) { $tgt .= $5 }
479 0         0 else { $tgt .= "\x00" }
480             }
481              
482             #print STDERR "UTF16 out: <$tgt>\n";
483 0         0 $tgt;
484             }
485              
486              
487             #------------------------------
488             #
489             # PUBLIC INTERFACE
490             #
491             #------------------------------
492              
493             #------------------------------
494             #
495             # new NUMBER
496             #
497             sub new {
498 450     450   849 my ($class, $num) = @_;
499              
500 450         593 my $self = $class->SUPER::new();
501 450         544 $self->handler('raw' => 'KEEP',
502             'US-ASCII' => 'KEEP');
503              
504 450         415 $self->{MWDI_Num} = $num;
505 450         397 $self->{MWDI_Unknown} = "?";
506 450         339 $self->{MWDI_Collapse} = 0;
507 450         899 $self;
508             }
509              
510             #------------------------------
511             #
512             # guess_handler CHARSET
513             #
514             sub guess_handler {
515 2     2   3 my ($self, $charset) = @_;
516             return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517 2 100 66     18 ($1 eq $self->{MWDI_Num}));
518 1 50       7 return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519 0 0       0 return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520 0 0       0 return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521 0         0 undef;
522             }
523              
524             #------------------------------
525             #
526             # unknown [REPLACEMENT]
527             #
528             sub unknown {
529 0     0   0 my $self = shift;
530 0 0       0 $self->{MWDI_Unknown} = shift if @_;
531 0         0 $self->{MWDI_Unknown};
532             }
533              
534             #------------------------------
535             #
536             # collapse [YESNO]
537             #
538             sub collapse {
539 0     0   0 my $self = shift;
540 0 0       0 $self->{MWDI_Collapse} = shift if @_;
541 0         0 $self->{MWDI_Collapse};
542             }
543              
544             #------------------------------
545             #
546             # decode STRING
547             #
548             sub decode {
549 10     10   6387 my $self = shift;
550              
551             ### Do inherited action:
552 10         26 my $basic = $self->SUPER::decode(@_);
553 10 50       22 defined($basic) or return undef;
554              
555             ### Translate/consolidate illegal characters:
556 10 50       19 $basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557 10         8 $basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558 10         15 $basic;
559             }
560              
561             #------------------------------------------------------------
562             #------------------------------------------------------------
563              
564             =item MIME::WordDecoder::US_ASCII
565              
566             A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567             You're probably better off using ISO-8859-1.
568              
569             =cut
570              
571             package MIME::WordDecoder::US_ASCII;
572              
573 25     25   113 use strict;
  25         29  
  25         459  
574 25     25   74 use vars qw(@ISA);
  25         24  
  25         3223  
575             @ISA = qw( MIME::WordDecoder::ISO_8859 );
576              
577             sub new {
578 25     25   40 my ($class) = @_;
579 25         107 return $class->SUPER::new("1");
580             }
581              
582             sub decode {
583 0     0   0 my $self = shift;
584              
585             ### Do inherited action:
586 0         0 my $basic = $self->SUPER::decode(@_);
587 0 0       0 defined($basic) or return undef;
588              
589             ### Translate/consolidate 8-bit characters:
590 0 0       0 $basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591 0         0 $basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592 0         0 $basic;
593             }
594              
595             =back
596              
597             =cut
598              
599             package MIME::WordDecoder::UTF_8;
600 25     25   142 use strict;
  25         25  
  25         410  
601 25     25   12636 use Encode qw();
  25         182119  
  25         533  
602 25     25   129 use Carp qw( carp );
  25         29  
  25         1105  
603 25     25   93 use vars qw(@ISA);
  25         23  
  25         5317  
604              
605             @ISA = qw( MIME::WordDecoder );
606              
607             sub h_convert_to_utf8
608             {
609 96     96   107 my ($data, $charset, $decoder) = @_;
610 96 100       187 $charset = 'US-ASCII' if ($charset eq 'raw');
611 96         226 my $enc = Encode::find_encoding($charset);
612 96 50       6910 if (!$enc) {
613 0         0 carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614 0         0 return '';
615             }
616 96         503 my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617 96         498 return $ans;
618             }
619              
620             sub new {
621 25     25   70 my ($class) = @_;
622 25         96 my $self = $class->SUPER::new();
623 25         59 $self->handler('*' => \&h_convert_to_utf8);
624             }
625              
626              
627             #------------------------------------------------------------
628             #------------------------------------------------------------
629              
630             package MIME::WordDecoder;
631              
632             ### Now we can init the default handler.
633             $Default = (MIME::WordDecoder::ISO_8859->new('1'));
634              
635              
636             ### Add US-ASCII handler:
637             $DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638              
639             ### Add ISO-8859-{1..15} handlers:
640             for (1..15) {
641             $DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642             }
643              
644             ### UTF-8
645             $DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646              
647             1; # end the module
648             __END__