File Coverage

blib/lib/MIME/WordDecoder.pm
Criterion Covered Total %
statement 103 142 72.5
branch 17 44 38.6
condition 14 32 43.7
subroutine 28 34 82.3
pod 7 9 77.7
total 169 261 64.7


line stmt bran cond sub pod time code
1             package MIME::WordDecoder;
2              
3             =head1 NAME
4              
5             MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6              
7             WARNING: Most of this module is deprecated and may disappear. The only
8             function you should use for MIME decoding is "mime_to_perl_string".
9              
10             =head1 SYNOPSIS
11              
12             See L for the basics of encoded words.
13             See L<"DESCRIPTION"> for how this class works.
14              
15             use MIME::WordDecoder;
16              
17              
18             ### Get the default word-decoder (used by unmime()):
19             $wd = default MIME::WordDecoder;
20              
21             ### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22             $wd = supported MIME::WordDecoder "ISO-8859-1";
23              
24              
25             ### Decode a MIME string (e.g., into Latin1) via the default decoder:
26             $str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27              
28             ### Decode a string using the default decoder, non-OO style:
29             $str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30              
31             ### Decode a string to an internal Perl string, non-OO style
32             ### The result is likely to have the UTF8 flag ON.
33             $str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34              
35             =head1 DESCRIPTION
36              
37             WARNING: Most of this module is deprecated and may disappear. It
38             duplicates (badly) the function of the standard 'Encode' module. The
39             only function you should rely on is mime_to_perl_string.
40              
41             A MIME::WordDecoder consists, fundamentally, of a hash which maps
42             a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43             knows how to take bytes in that character set and turn them into
44             the target string representation. Ideally, this target representation
45             would be Unicode, but we don't want to overspecify the translation
46             that takes place: if you want to convert MIME strings directly to Big5,
47             that's your own decision.
48              
49             The subroutine will be invoked with two arguments: DATA (the data in
50             the given character set), and CHARSET (the upcased character set name).
51              
52             For example:
53              
54             ### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55             sub keep7bit {
56             local $_ = shift;
57             tr/\x00-\x7F/#/c;
58             $_;
59             }
60              
61             Here's a decoder which uses that:
62              
63             ### Construct a decoder:
64             $wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65             'ISO-8859-1' => \&keep7bit,
66             'ISO-8859-2' => \&keep7bit,
67             'Big5' => "WARN",
68             '*' => "DIE"});
69              
70             ### Convert some MIME text to a pure ASCII string...
71             $ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72              
73             ### ...which will now hold: "To: Keld J#rn Simonsen "
74              
75             The UTF-8 built-in decoder decodes everything into Perl's internal
76             string format, possibly turning on the internal UTF8 flag. Use it like
77             this:
78              
79             $wd = supported MIME::WordDecoder 'UTF-8';
80             $perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81             # perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82              
83             Generally, you should use the UTF-8 decoder in preference to "unmime".
84              
85             =head1 PUBLIC INTERFACE
86              
87             =over
88              
89             =cut
90              
91 25     25   26995 use strict;
  25         56  
  25         702  
92 25     25   109 use Carp qw( carp croak );
  25         45  
  25         1264  
93 25     25   10844 use MIME::Words qw(decode_mimewords);
  25         56  
  25         1366  
94 25     25   130 use Exporter;
  25         136  
  25         839  
95 25     25   121 use vars qw(@ISA @EXPORT);
  25         40  
  25         20762  
96              
97             @ISA = qw(Exporter);
98             @EXPORT = qw( unmime mime_to_perl_string );
99              
100              
101              
102             #------------------------------
103             #
104             # Globals
105             #
106             #------------------------------
107              
108             ### Decoders.
109             my %DecoderFor = ();
110              
111             ### Standard handlers.
112             my %Handler =
113             (
114             KEEP => sub {$_[0]},
115             IGNORE => sub {''},
116             WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117             DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118             );
119              
120             ### Global default decoder. We init it below.
121             my $Default;
122              
123             ### Global UTF8 decoder.
124             my $DefaultUTF8;
125              
126             #------------------------------
127              
128             =item default [DECODER]
129              
130             I
131             Get/set the default DECODER object.
132              
133             =cut
134              
135             sub default {
136 25     25 1 157 my $class = shift;
137 25 50       57 if (@_) {
138 25         45 $Default = shift;
139             }
140 25         132 $Default;
141             }
142              
143             #------------------------------
144              
145             =item supported CHARSET, [DECODER]
146              
147             I
148             If just CHARSET is given, returns a decoder object which maps
149             data into that character set (the character set is forced to
150             all-uppercase).
151              
152             $wd = supported MIME::WordDecoder "ISO-8859-1";
153              
154             If DECODER is given, installs such an object:
155              
156             MIME::WordDecoder->supported("ISO-8859-1" =>
157             (new MIME::WordDecoder::ISO_8859 "1"));
158              
159             You should not override this method.
160              
161             =cut
162              
163             sub supported {
164 212     212 1 1253 my ($class, $charset, $decoder) = @_;
165 212 50       478 $DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166 212         705 $DecoderFor{uc($charset)};
167             }
168              
169             #------------------------------
170              
171             =item new [\@HANDLERS]
172              
173             I
174             If \@HANDLERS is given, then @HANDLERS is passed to handler()
175             to initialize the internal map.
176              
177             =cut
178              
179             sub new {
180 476     476 1 686 my ($class, $h) = @_;
181 476         1252 my $self = bless { MWD_Map=>{} }, $class;
182              
183             ### Init the map:
184 476         1205 $self->handler(@$h);
185              
186             ### Add fallbacks:
187 476   66     2332 $self->{MWD_Map}{'*'} ||= $Handler{WARN};
188 476   66     1675 $self->{MWD_Map}{'raw'} ||= $self->{MWD_Map}{'US-ASCII'};
189 476         879 $self;
190             }
191              
192             #------------------------------
193              
194             =item handler CHARSET=>\&SUBREF, ...
195              
196             I
197             Set the handler SUBREF for a given CHARSET, for as many pairs
198             as you care to supply.
199              
200             When performing the translation of a MIME-encoded string, a
201             given SUBREF will be invoked when translating a block of text
202             in character set CHARSET. The subroutine will be invoked with
203             the following arguments:
204              
205             DATA - the data in the given character set.
206             CHARSET - the upcased character set name, which may prove useful
207             if you are using the same SUBREF for multiple CHARSETs.
208             DECODER - the decoder itself, if it contains configuration information
209             that your handler function needs.
210              
211             For example:
212              
213             $wd = new MIME::WordDecoder;
214             $wd->handler('US-ASCII' => "KEEP");
215             $wd->handler('ISO-8859-1' => \&handle_latin1,
216             'ISO-8859-2' => \&handle_latin1,
217             '*' => "DIE");
218              
219             Notice that, much as with %SIG, the SUBREF can also be taken from
220             a set of special keywords:
221              
222             KEEP Pass data through unchanged.
223             IGNORE Ignore data in this character set, without warning.
224             WARN Ignore data in this character set, with warning.
225             DIE Fatal exception with "can't handle character set" message.
226              
227             The subroutine for the special CHARSET of 'raw' is used for raw
228             (non-MIME-encoded) text, which is supposed to be US-ASCII.
229             The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230             at the time of construction.
231              
232             The subroutine for the special CHARSET of '*' is used for any
233             unrecognized character set. The default action for '*' is WARN.
234              
235             =cut
236              
237             sub handler {
238 951     951 1 1095 my $self = shift;
239              
240             ### Copy the hash, and edit it:
241 951         2043 while (@_) {
242 928         1154 my $c = shift;
243 928         1088 my $sub = shift;
244 928         2038 $self->{MWD_Map}{$c} = $self->real_handler($sub);
245             }
246 951         1189 $self;
247             }
248              
249             #------------------------------
250              
251             =item decode STRING
252              
253             I
254             Decode a STRING which might contain MIME-encoded components into a
255             local representation (e.g., UTF-8, etc.).
256              
257             =cut
258              
259             sub decode {
260 94     94 1 716 my ($self, $str) = @_;
261 94 50       215 defined($str) or return undef;
262             join('', map {
263             ### Get the data and (upcased) charset:
264 94         410 my $data = $_->[0];
  119         228  
265 119 100       315 my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266 119         219 $charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267              
268             ### Get the handler; guess if never seen before:
269             defined($self->{MWD_Map}{$charset}) or
270 119 100 100     602 $self->{MWD_Map}{$charset} =
271             ($self->real_handler($self->guess_handler($charset)) || 0);
272 119   66     422 my $subr = $self->{MWD_Map}{$charset} || $self->{MWD_Map}{'*'};
273              
274             ### Map this chunk:
275 119         326 &$subr($data, $charset, $self);
276             } decode_mimewords($str));
277             }
278              
279             #------------------------------
280             #
281             # guess_handler CHARSET
282             #
283             # Instance method.
284             # An unrecognized charset has been seen. Guess a handler subref
285             # for the given charset, returning false if there is none.
286             # Successful mappings will be cached in the main map.
287             #
288             sub guess_handler {
289 14     14 0 62 undef;
290             }
291              
292             #------------------------------
293             #
294             # real_handler HANDLER
295             #
296             # Instance method.
297             # Translate the given handler, which might be a subref or a string.
298             #
299             sub real_handler {
300 944     944 0 1196 my ($self, $sub) = @_;
301             (!$sub) or
302             (ref($sub) eq 'CODE') or
303 944 100 33     4589 $sub = ($Handler{$sub} || croak "bad named handler: $sub\n");
      100        
304 944         3274 $sub;
305             }
306              
307             #------------------------------
308              
309             =item unmime STRING
310              
311             I
312             Decode the given STRING using the default() decoder.
313             See L.
314              
315             You should consider using the UTF-8 decoder instead. It decodes
316             MIME strings into Perl's internal string format.
317              
318             =cut
319              
320             sub unmime($) {
321 0     0 1 0 my $str = shift;
322 0         0 $Default->decode($str);
323             }
324              
325             =item mime_to_perl_string
326              
327             I
328             Decode the given STRING into an internal Perl Unicode string.
329             You should use this function in preference to all others.
330              
331             The result of mime_to_perl_string is likely to have Perl's
332             UTF8 flag set.
333              
334             =cut
335              
336             sub mime_to_perl_string($) {
337 1     1 1 717 my $str = shift;
338 1         4 $DecoderFor{'UTF-8'}->decode($str);
339             }
340              
341             =back
342              
343             =cut
344              
345              
346              
347              
348              
349             =head1 SUBCLASSES
350              
351             =over
352              
353             =cut
354              
355             #------------------------------------------------------------
356             #------------------------------------------------------------
357              
358             =item MIME::WordDecoder::ISO_8859
359              
360             A simple decoder which keeps US-ASCII and the 7-bit characters
361             of ISO-8859 character sets and UTF8, and also keeps 8-bit
362             characters from the indicated character set.
363              
364             ### Construct:
365             $wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366              
367             ### What to translate unknown characters to (can also use empty):
368             ### Default is "?".
369             $wd->unknown("?");
370              
371             ### Collapse runs of unknown characters to a single unknown()?
372             ### Default is false.
373             $wd->collapse(1);
374              
375              
376             According to B
377             (ca. November 2000):
378              
379             ISO 8859 is a full series of 10 (and soon even more) standardized
380             multilingual single-byte coded (8bit) graphic character sets for
381             writing in alphabetic languages:
382              
383             1. Latin1 (West European)
384             2. Latin2 (East European)
385             3. Latin3 (South European)
386             4. Latin4 (North European)
387             5. Cyrillic
388             6. Arabic
389             7. Greek
390             8. Hebrew
391             9. Latin5 (Turkish)
392             10. Latin6 (Nordic)
393              
394             The ISO 8859 charsets are not even remotely as complete as the truly
395             great Unicode but they have been around and usable for quite a while
396             (first registered Internet charsets for use with MIME) and have
397             already offered a major improvement over the plain 7bit US-ASCII.
398              
399             Characters 0 to 127 are always identical with US-ASCII and the
400             positions 128 to 159 hold some less used control characters: the
401             so-called C1 set from ISO 6429.
402              
403             =cut
404              
405             package MIME::WordDecoder::ISO_8859;
406              
407 25     25   154 use strict;
  25         47  
  25         700  
408 25     25   124 use vars qw(@ISA);
  25         39  
  25         21135  
409             @ISA = qw( MIME::WordDecoder );
410              
411              
412             #------------------------------
413             #
414             # HANDLERS
415             #
416             #------------------------------
417              
418             ### Keep 7bit characters.
419             ### Turn all else to the special \x00.
420             sub h_keep7bit {
421 2     2   5 local $_ = $_[0];
422             # my $unknown = $_[2]->{MWDI_Unknown};
423              
424 2         4 s{[\x80-\xFF]}{\x00}g;
425 2         8 $_;
426             }
427              
428             ### Note: should use Unicode::String, converting/manipulating
429             ### everything into full Unicode form.
430              
431             ### Keep 7bit UTF8 characters (ASCII).
432             ### Keep ISO-8859-1 if this decoder is for Latin-1.
433             ### Turn all else to the special \x00.
434             sub h_utf8 {
435 0     0   0 local $_ = $_[0];
436             # my $unknown = $_[2]->{MWDI_Unknown};
437 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
438             #print STDERR "UTF8 in: <$_>\n";
439              
440 0         0 local($1,$2,$3);
441 0         0 my $tgt = '';
442 0   0     0 while (m{\G(
443             ([\x00-\x7F]) | # 0xxxxxxx
444             ([\xC0-\xDF] [\x80-\xBF]) | # 110yyyyy 10xxxxxx
445             ([\xE0-\xEF] [\x80-\xBF]{2}) | # 1110zzzz 10yyyyyy 10xxxxxx
446             ([\xF0-\xF7] [\x80-\xBF]{3}) | # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447             . # error; synch
448             )}gcsx and ($1 ne '')) {
449              
450 0 0 0     0 if (defined($2)) { $tgt .= $2 }
  0 0       0  
451 0         0 elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452 0         0 else { $tgt .= "\x00" }
453             }
454              
455             #print STDERR "UTF8 out: <$tgt>\n";
456 0         0 $tgt;
457             }
458              
459             ### Keep characters which are 7bit in UTF8 (ASCII).
460             ### Keep ISO-8859-1 if this decoder is for Latin-1.
461             ### Turn all else to the special \x00.
462             sub h_utf16 {
463 0     0   0 local $_ = $_[0];
464             # my $unknown = $_[2]->{MWDI_Unknown};
465 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
466             #print STDERR "UTF16 in: <$_>\n";
467              
468 0         0 local($1,$2,$3,$4,$5);
469 0         0 my $tgt = '';
470 0   0     0 while (m{\G(
471             ( \x00 ([\x00-\x7F])) | # 00000000 0xxxxxxx
472             ( \x00 ([\x80-\xFF])) | # 00000000 1xxxxxxx
473             ( [^\x00] [\x00-\xFF]) | # etc
474             )
475             }gcsx and ($1 ne '')) {
476              
477 0 0 0     0 if (defined($2)) { $tgt .= $3 }
  0 0       0  
478 0         0 elsif (defined($4) && $latin1) { $tgt .= $5 }
479 0         0 else { $tgt .= "\x00" }
480             }
481              
482             #print STDERR "UTF16 out: <$tgt>\n";
483 0         0 $tgt;
484             }
485              
486              
487             #------------------------------
488             #
489             # PUBLIC INTERFACE
490             #
491             #------------------------------
492              
493             #------------------------------
494             #
495             # new NUMBER
496             #
497             sub new {
498 450     450   1116 my ($class, $num) = @_;
499              
500 450         978 my $self = $class->SUPER::new();
501 450         931 $self->handler('raw' => 'KEEP',
502             'US-ASCII' => 'KEEP');
503              
504 450         815 $self->{MWDI_Num} = $num;
505 450         683 $self->{MWDI_Unknown} = "?";
506 450         626 $self->{MWDI_Collapse} = 0;
507 450         2641 $self;
508             }
509              
510             #------------------------------
511             #
512             # guess_handler CHARSET
513             #
514             sub guess_handler {
515 2     2   5 my ($self, $charset) = @_;
516             return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517 2 100 66     20 ($1 eq $self->{MWDI_Num}));
518 1 50       7 return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519 0 0       0 return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520 0 0       0 return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521 0         0 undef;
522             }
523              
524             #------------------------------
525             #
526             # unknown [REPLACEMENT]
527             #
528             sub unknown {
529 0     0   0 my $self = shift;
530 0 0       0 $self->{MWDI_Unknown} = shift if @_;
531 0         0 $self->{MWDI_Unknown};
532             }
533              
534             #------------------------------
535             #
536             # collapse [YESNO]
537             #
538             sub collapse {
539 0     0   0 my $self = shift;
540 0 0       0 $self->{MWDI_Collapse} = shift if @_;
541 0         0 $self->{MWDI_Collapse};
542             }
543              
544             #------------------------------
545             #
546             # decode STRING
547             #
548             sub decode {
549 10     10   5646 my $self = shift;
550              
551             ### Do inherited action:
552 10         26 my $basic = $self->SUPER::decode(@_);
553 10 50       33 defined($basic) or return undef;
554              
555             ### Translate/consolidate illegal characters:
556 10 50       24 $basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557 10         15 $basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558 10         21 $basic;
559             }
560              
561             #------------------------------------------------------------
562             #------------------------------------------------------------
563              
564             =item MIME::WordDecoder::US_ASCII
565              
566             A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567             You're probably better off using ISO-8859-1.
568              
569             =cut
570              
571             package MIME::WordDecoder::US_ASCII;
572              
573 25     25   138 use strict;
  25         45  
  25         597  
574 25     25   119 use vars qw(@ISA);
  25         44  
  25         4310  
575             @ISA = qw( MIME::WordDecoder::ISO_8859 );
576              
577             sub new {
578 25     25   50 my ($class) = @_;
579 25         143 return $class->SUPER::new("1");
580             }
581              
582             sub decode {
583 0     0   0 my $self = shift;
584              
585             ### Do inherited action:
586 0         0 my $basic = $self->SUPER::decode(@_);
587 0 0       0 defined($basic) or return undef;
588              
589             ### Translate/consolidate 8-bit characters:
590 0 0       0 $basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591 0         0 $basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592 0         0 $basic;
593             }
594              
595             =back
596              
597             =cut
598              
599             package MIME::WordDecoder::UTF_8;
600 25     25   148 use strict;
  25         39  
  25         543  
601 25     25   22722 use Encode qw();
  25         274178  
  25         711  
602 25     25   175 use Carp qw( carp );
  25         48  
  25         1380  
603 25     25   127 use vars qw(@ISA);
  25         42  
  25         7319  
604              
605             @ISA = qw( MIME::WordDecoder );
606              
607             sub h_convert_to_utf8
608             {
609 96     96   171 my ($data, $charset, $decoder) = @_;
610 96 100       276 $charset = 'US-ASCII' if ($charset eq 'raw');
611 96         352 my $enc = Encode::find_encoding($charset);
612 96 50       30003 if (!$enc) {
613 0         0 carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614 0         0 return '';
615             }
616 96         709 my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617 96         801 return $ans;
618             }
619              
620             sub new {
621 25     25   56 my ($class) = @_;
622 25         139 my $self = $class->SUPER::new();
623 25         85 $self->handler('*' => \&h_convert_to_utf8);
624             }
625              
626              
627             #------------------------------------------------------------
628             #------------------------------------------------------------
629              
630             package MIME::WordDecoder;
631              
632             ### Now we can init the default handler.
633             $Default = (MIME::WordDecoder::ISO_8859->new('1'));
634              
635              
636             ### Add US-ASCII handler:
637             $DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638              
639             ### Add ISO-8859-{1..15} handlers:
640             for (1..15) {
641             $DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642             }
643              
644             ### UTF-8
645             $DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646              
647             1; # end the module
648             __END__