File Coverage

blib/lib/MIME/WordDecoder.pm
Criterion Covered Total %
statement 103 142 72.5
branch 17 44 38.6
condition 14 32 43.7
subroutine 28 34 82.3
pod 7 9 77.7
total 169 261 64.7


line stmt bran cond sub pod time code
1             package MIME::WordDecoder;
2              
3             =head1 NAME
4              
5             MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6              
7             WARNING: Most of this module is deprecated and may disappear. The only
8             function you should use for MIME decoding is "mime_to_perl_string".
9              
10             =head1 SYNOPSIS
11              
12             See L for the basics of encoded words.
13             See L<"DESCRIPTION"> for how this class works.
14              
15             use MIME::WordDecoder;
16              
17              
18             ### Get the default word-decoder (used by unmime()):
19             $wd = default MIME::WordDecoder;
20              
21             ### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22             $wd = supported MIME::WordDecoder "ISO-8859-1";
23              
24              
25             ### Decode a MIME string (e.g., into Latin1) via the default decoder:
26             $str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27              
28             ### Decode a string using the default decoder, non-OO style:
29             $str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30              
31             ### Decode a string to an internal Perl string, non-OO style
32             ### The result is likely to have the UTF8 flag ON.
33             $str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34              
35             =head1 DESCRIPTION
36              
37             WARNING: Most of this module is deprecated and may disappear. It
38             duplicates (badly) the function of the standard 'Encode' module. The
39             only function you should rely on is mime_to_perl_string.
40              
41             A MIME::WordDecoder consists, fundamentally, of a hash which maps
42             a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43             knows how to take bytes in that character set and turn them into
44             the target string representation. Ideally, this target representation
45             would be Unicode, but we don't want to overspecify the translation
46             that takes place: if you want to convert MIME strings directly to Big5,
47             that's your own decision.
48              
49             The subroutine will be invoked with two arguments: DATA (the data in
50             the given character set), and CHARSET (the upcased character set name).
51              
52             For example:
53              
54             ### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55             sub keep7bit {
56             local $_ = shift;
57             tr/\x00-\x7F/#/c;
58             $_;
59             }
60              
61             Here's a decoder which uses that:
62              
63             ### Construct a decoder:
64             $wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65             'ISO-8859-1' => \&keep7bit,
66             'ISO-8859-2' => \&keep7bit,
67             'Big5' => "WARN",
68             '*' => "DIE"});
69              
70             ### Convert some MIME text to a pure ASCII string...
71             $ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72              
73             ### ...which will now hold: "To: Keld J#rn Simonsen "
74              
75             The UTF-8 built-in decoder decodes everything into Perl's internal
76             string format, possibly turning on the internal UTF8 flag. Use it like
77             this:
78              
79             $wd = supported MIME::WordDecoder 'UTF-8';
80             $perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81             # perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82              
83             Generally, you should use the UTF-8 decoder in preference to "unmime".
84              
85             =head1 PUBLIC INTERFACE
86              
87             =over
88              
89             =cut
90              
91 27     27   53830 use strict;
  27         40  
  27         747  
92 27     27   89 use Carp qw( carp croak );
  27         33  
  27         1222  
93 27     27   8470 use MIME::Words qw(decode_mimewords);
  27         46  
  27         1345  
94 27     27   115 use Exporter;
  27         46  
  27         771  
95 27     27   99 use vars qw(@ISA @EXPORT);
  27         30  
  27         18424  
96              
97             @ISA = qw(Exporter);
98             @EXPORT = qw( unmime mime_to_perl_string );
99              
100              
101              
102             #------------------------------
103             #
104             # Globals
105             #
106             #------------------------------
107              
108             ### Decoders.
109             my %DecoderFor = ();
110              
111             ### Standard handlers.
112             my %Handler =
113             (
114             KEEP => sub {$_[0]},
115             IGNORE => sub {''},
116             WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117             DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118             );
119              
120             ### Global default decoder. We init it below.
121             my $Default;
122              
123             ### Global UTF8 decoder.
124             my $DefaultUTF8;
125              
126             #------------------------------
127              
128             =item default [DECODER]
129              
130             I
131             Get/set the default DECODER object.
132              
133             =cut
134              
135             sub default {
136 25     25 1 147 my $class = shift;
137 25 50       54 if (@_) {
138 25         28 $Default = shift;
139             }
140 25         94 $Default;
141             }
142              
143             #------------------------------
144              
145             =item supported CHARSET, [DECODER]
146              
147             I
148             If just CHARSET is given, returns a decoder object which maps
149             data into that character set (the character set is forced to
150             all-uppercase).
151              
152             $wd = supported MIME::WordDecoder "ISO-8859-1";
153              
154             If DECODER is given, installs such an object:
155              
156             MIME::WordDecoder->supported("ISO-8859-1" =>
157             (new MIME::WordDecoder::ISO_8859 "1"));
158              
159             You should not override this method.
160              
161             =cut
162              
163             sub supported {
164 218     218 1 1092 my ($class, $charset, $decoder) = @_;
165 218 50       348 $DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166 218         485 $DecoderFor{uc($charset)};
167             }
168              
169             #------------------------------
170              
171             =item new [\@HANDLERS]
172              
173             I
174             If \@HANDLERS is given, then @HANDLERS is passed to handler()
175             to initialize the internal map.
176              
177             =cut
178              
179             sub new {
180 512     512 1 426 my ($class, $h) = @_;
181 512         962 my $self = bless { MWD_Map=>{} }, $class;
182              
183             ### Init the map:
184 512         848 $self->handler(@$h);
185              
186             ### Add fallbacks:
187 512   66     1751 $self->{MWD_Map}{'*'} ||= $Handler{WARN};
188 512   66     1272 $self->{MWD_Map}{'raw'} ||= $self->{MWD_Map}{'US-ASCII'};
189 512         549 $self;
190             }
191              
192             #------------------------------
193              
194             =item handler CHARSET=>\&SUBREF, ...
195              
196             I
197             Set the handler SUBREF for a given CHARSET, for as many pairs
198             as you care to supply.
199              
200             When performing the translation of a MIME-encoded string, a
201             given SUBREF will be invoked when translating a block of text
202             in character set CHARSET. The subroutine will be invoked with
203             the following arguments:
204              
205             DATA - the data in the given character set.
206             CHARSET - the upcased character set name, which may prove useful
207             if you are using the same SUBREF for multiple CHARSETs.
208             DECODER - the decoder itself, if it contains configuration information
209             that your handler function needs.
210              
211             For example:
212              
213             $wd = new MIME::WordDecoder;
214             $wd->handler('US-ASCII' => "KEEP");
215             $wd->handler('ISO-8859-1' => \&handle_latin1,
216             'ISO-8859-2' => \&handle_latin1,
217             '*' => "DIE");
218              
219             Notice that, much as with %SIG, the SUBREF can also be taken from
220             a set of special keywords:
221              
222             KEEP Pass data through unchanged.
223             IGNORE Ignore data in this character set, without warning.
224             WARN Ignore data in this character set, with warning.
225             DIE Fatal exception with "can't handle character set" message.
226              
227             The subroutine for the special CHARSET of 'raw' is used for raw
228             (non-MIME-encoded) text, which is supposed to be US-ASCII.
229             The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230             at the time of construction.
231              
232             The subroutine for the special CHARSET of '*' is used for any
233             unrecognized character set. The default action for '*' is WARN.
234              
235             =cut
236              
237             sub handler {
238 1023     1023 1 711 my $self = shift;
239              
240             ### Copy the hash, and edit it:
241 1023         1409 while (@_) {
242 998         702 my $c = shift;
243 998         655 my $sub = shift;
244 998         1231 $self->{MWD_Map}{$c} = $self->real_handler($sub);
245             }
246 1023         751 $self;
247             }
248              
249             #------------------------------
250              
251             =item decode STRING
252              
253             I
254             Decode a STRING which might contain MIME-encoded components into a
255             local representation (e.g., UTF-8, etc.).
256              
257             =cut
258              
259             sub decode {
260 95     95 1 809 my ($self, $str) = @_;
261 95 50       174 defined($str) or return undef;
262             join('', map {
263             ### Get the data and (upcased) charset:
264 95         258 my $data = $_->[0];
  120         139  
265 120 100       238 my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266 120         153 $charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267              
268             ### Get the handler; guess if never seen before:
269             defined($self->{MWD_Map}{$charset}) or
270 120 100 100     466 $self->{MWD_Map}{$charset} =
271             ($self->real_handler($self->guess_handler($charset)) || 0);
272 120   66     1181 my $subr = $self->{MWD_Map}{$charset} || $self->{MWD_Map}{'*'};
273              
274             ### Map this chunk:
275 120         212 &$subr($data, $charset, $self);
276             } decode_mimewords($str));
277             }
278              
279             #------------------------------
280             #
281             # guess_handler CHARSET
282             #
283             # Instance method.
284             # An unrecognized charset has been seen. Guess a handler subref
285             # for the given charset, returning false if there is none.
286             # Successful mappings will be cached in the main map.
287             #
288             sub guess_handler {
289 15     15 0 46 undef;
290             }
291              
292             #------------------------------
293             #
294             # real_handler HANDLER
295             #
296             # Instance method.
297             # Translate the given handler, which might be a subref or a string.
298             #
299             sub real_handler {
300 1015     1015 0 812 my ($self, $sub) = @_;
301             (!$sub) or
302             (ref($sub) eq 'CODE') or
303 1015 100 33     3600 $sub = ($Handler{$sub} || croak "bad named handler: $sub\n");
      100        
304 1015         2131 $sub;
305             }
306              
307             #------------------------------
308              
309             =item unmime STRING
310              
311             I
312             Decode the given STRING using the default() decoder.
313             See L.
314              
315             You should consider using the UTF-8 decoder instead. It decodes
316             MIME strings into Perl's internal string format.
317              
318             =cut
319              
320             sub unmime($) {
321 0     0 1 0 my $str = shift;
322 0         0 $Default->decode($str);
323             }
324              
325             =item mime_to_perl_string
326              
327             I
328             Decode the given STRING into an internal Perl Unicode string.
329             You should use this function in preference to all others.
330              
331             The result of mime_to_perl_string is likely to have Perl's
332             UTF8 flag set.
333              
334             =cut
335              
336             sub mime_to_perl_string($) {
337 1     1 1 681 my $str = shift;
338 1         4 $DecoderFor{'UTF-8'}->decode($str);
339             }
340              
341             =back
342              
343             =cut
344              
345              
346              
347              
348              
349             =head1 SUBCLASSES
350              
351             =over
352              
353             =cut
354              
355             #------------------------------------------------------------
356             #------------------------------------------------------------
357              
358             =item MIME::WordDecoder::ISO_8859
359              
360             A simple decoder which keeps US-ASCII and the 7-bit characters
361             of ISO-8859 character sets and UTF8, and also keeps 8-bit
362             characters from the indicated character set.
363              
364             ### Construct:
365             $wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366              
367             ### What to translate unknown characters to (can also use empty):
368             ### Default is "?".
369             $wd->unknown("?");
370              
371             ### Collapse runs of unknown characters to a single unknown()?
372             ### Default is false.
373             $wd->collapse(1);
374              
375              
376             According to B
377             (ca. November 2000):
378              
379             ISO 8859 is a full series of 10 (and soon even more) standardized
380             multilingual single-byte coded (8bit) graphic character sets for
381             writing in alphabetic languages:
382              
383             1. Latin1 (West European)
384             2. Latin2 (East European)
385             3. Latin3 (South European)
386             4. Latin4 (North European)
387             5. Cyrillic
388             6. Arabic
389             7. Greek
390             8. Hebrew
391             9. Latin5 (Turkish)
392             10. Latin6 (Nordic)
393              
394             The ISO 8859 charsets are not even remotely as complete as the truly
395             great Unicode but they have been around and usable for quite a while
396             (first registered Internet charsets for use with MIME) and have
397             already offered a major improvement over the plain 7bit US-ASCII.
398              
399             Characters 0 to 127 are always identical with US-ASCII and the
400             positions 128 to 159 hold some less used control characters: the
401             so-called C1 set from ISO 6429.
402              
403             =cut
404              
405             package MIME::WordDecoder::ISO_8859;
406              
407 27     27   136 use strict;
  27         38  
  27         680  
408 27     27   93 use vars qw(@ISA);
  27         28  
  27         18756  
409             @ISA = qw( MIME::WordDecoder );
410              
411              
412             #------------------------------
413             #
414             # HANDLERS
415             #
416             #------------------------------
417              
418             ### Keep 7bit characters.
419             ### Turn all else to the special \x00.
420             sub h_keep7bit {
421 2     2   4 local $_ = $_[0];
422             # my $unknown = $_[2]->{MWDI_Unknown};
423              
424 2         3 s{[\x80-\xFF]}{\x00}g;
425 2         4 $_;
426             }
427              
428             ### Note: should use Unicode::String, converting/manipulating
429             ### everything into full Unicode form.
430              
431             ### Keep 7bit UTF8 characters (ASCII).
432             ### Keep ISO-8859-1 if this decoder is for Latin-1.
433             ### Turn all else to the special \x00.
434             sub h_utf8 {
435 0     0   0 local $_ = $_[0];
436             # my $unknown = $_[2]->{MWDI_Unknown};
437 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
438             #print STDERR "UTF8 in: <$_>\n";
439              
440 0         0 local($1,$2,$3);
441 0         0 my $tgt = '';
442 0   0     0 while (m{\G(
443             ([\x00-\x7F]) | # 0xxxxxxx
444             ([\xC0-\xDF] [\x80-\xBF]) | # 110yyyyy 10xxxxxx
445             ([\xE0-\xEF] [\x80-\xBF]{2}) | # 1110zzzz 10yyyyyy 10xxxxxx
446             ([\xF0-\xF7] [\x80-\xBF]{3}) | # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447             . # error; synch
448             )}gcsx and ($1 ne '')) {
449              
450 0 0 0     0 if (defined($2)) { $tgt .= $2 }
  0 0       0  
451 0         0 elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452 0         0 else { $tgt .= "\x00" }
453             }
454              
455             #print STDERR "UTF8 out: <$tgt>\n";
456 0         0 $tgt;
457             }
458              
459             ### Keep characters which are 7bit in UTF8 (ASCII).
460             ### Keep ISO-8859-1 if this decoder is for Latin-1.
461             ### Turn all else to the special \x00.
462             sub h_utf16 {
463 0     0   0 local $_ = $_[0];
464             # my $unknown = $_[2]->{MWDI_Unknown};
465 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
466             #print STDERR "UTF16 in: <$_>\n";
467              
468 0         0 local($1,$2,$3,$4,$5);
469 0         0 my $tgt = '';
470 0   0     0 while (m{\G(
471             ( \x00 ([\x00-\x7F])) | # 00000000 0xxxxxxx
472             ( \x00 ([\x80-\xFF])) | # 00000000 1xxxxxxx
473             ( [^\x00] [\x00-\xFF]) | # etc
474             )
475             }gcsx and ($1 ne '')) {
476              
477 0 0 0     0 if (defined($2)) { $tgt .= $3 }
  0 0       0  
478 0         0 elsif (defined($4) && $latin1) { $tgt .= $5 }
479 0         0 else { $tgt .= "\x00" }
480             }
481              
482             #print STDERR "UTF16 out: <$tgt>\n";
483 0         0 $tgt;
484             }
485              
486              
487             #------------------------------
488             #
489             # PUBLIC INTERFACE
490             #
491             #------------------------------
492              
493             #------------------------------
494             #
495             # new NUMBER
496             #
497             sub new {
498 484     484   931 my ($class, $num) = @_;
499              
500 484         727 my $self = $class->SUPER::new();
501 484         599 $self->handler('raw' => 'KEEP',
502             'US-ASCII' => 'KEEP');
503              
504 484         442 $self->{MWDI_Num} = $num;
505 484         458 $self->{MWDI_Unknown} = "?";
506 484         373 $self->{MWDI_Collapse} = 0;
507 484         1040 $self;
508             }
509              
510             #------------------------------
511             #
512             # guess_handler CHARSET
513             #
514             sub guess_handler {
515 2     2   3 my ($self, $charset) = @_;
516             return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517 2 100 66     24 ($1 eq $self->{MWDI_Num}));
518 1 50       7 return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519 0 0       0 return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520 0 0       0 return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521 0         0 undef;
522             }
523              
524             #------------------------------
525             #
526             # unknown [REPLACEMENT]
527             #
528             sub unknown {
529 0     0   0 my $self = shift;
530 0 0       0 $self->{MWDI_Unknown} = shift if @_;
531 0         0 $self->{MWDI_Unknown};
532             }
533              
534             #------------------------------
535             #
536             # collapse [YESNO]
537             #
538             sub collapse {
539 0     0   0 my $self = shift;
540 0 0       0 $self->{MWDI_Collapse} = shift if @_;
541 0         0 $self->{MWDI_Collapse};
542             }
543              
544             #------------------------------
545             #
546             # decode STRING
547             #
548             sub decode {
549 10     10   4764 my $self = shift;
550              
551             ### Do inherited action:
552 10         27 my $basic = $self->SUPER::decode(@_);
553 10 50       21 defined($basic) or return undef;
554              
555             ### Translate/consolidate illegal characters:
556 10 50       18 $basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557 10         10 $basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558 10         17 $basic;
559             }
560              
561             #------------------------------------------------------------
562             #------------------------------------------------------------
563              
564             =item MIME::WordDecoder::US_ASCII
565              
566             A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567             You're probably better off using ISO-8859-1.
568              
569             =cut
570              
571             package MIME::WordDecoder::US_ASCII;
572              
573 27     27   139 use strict;
  27         29  
  27         641  
574 27     27   102 use vars qw(@ISA);
  27         79  
  27         4078  
575             @ISA = qw( MIME::WordDecoder::ISO_8859 );
576              
577             sub new {
578 27     27   63 my ($class) = @_;
579 27         123 return $class->SUPER::new("1");
580             }
581              
582             sub decode {
583 0     0   0 my $self = shift;
584              
585             ### Do inherited action:
586 0         0 my $basic = $self->SUPER::decode(@_);
587 0 0       0 defined($basic) or return undef;
588              
589             ### Translate/consolidate 8-bit characters:
590 0 0       0 $basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591 0         0 $basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592 0         0 $basic;
593             }
594              
595             =back
596              
597             =cut
598              
599             package MIME::WordDecoder::UTF_8;
600 27     27   139 use strict;
  27         32  
  27         575  
601 27     27   14821 use Encode qw();
  27         219813  
  27         786  
602 27     27   171 use Carp qw( carp );
  27         32  
  27         1365  
603 27     27   112 use vars qw(@ISA);
  27         28  
  27         6326  
604              
605             @ISA = qw( MIME::WordDecoder );
606              
607             sub h_convert_to_utf8
608             {
609 97     97   142 my ($data, $charset, $decoder) = @_;
610 97 100       200 $charset = 'US-ASCII' if ($charset eq 'raw');
611 97         289 my $enc = Encode::find_encoding($charset);
612 97 50       7930 if (!$enc) {
613 0         0 carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614 0         0 return '';
615             }
616 97         524 my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617 97         569 return $ans;
618             }
619              
620             sub new {
621 27     27   42 my ($class) = @_;
622 27         117 my $self = $class->SUPER::new();
623 27         83 $self->handler('*' => \&h_convert_to_utf8);
624             }
625              
626              
627             #------------------------------------------------------------
628             #------------------------------------------------------------
629              
630             package MIME::WordDecoder;
631              
632             ### Now we can init the default handler.
633             $Default = (MIME::WordDecoder::ISO_8859->new('1'));
634              
635              
636             ### Add US-ASCII handler:
637             $DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638              
639             ### Add ISO-8859-{1..15} handlers:
640             for (1..15) {
641             $DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642             }
643              
644             ### UTF-8
645             $DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646              
647             1; # end the module
648             __END__