File Coverage

blib/lib/MIME/WordDecoder.pm

Criterion	Covered	Total	%
statement	103	142	72.5
branch	17	44	38.6
condition	14	32	43.7
subroutine	28	34	82.3
pod	7	9	77.7
total	169	261	64.7

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::WordDecoder;
2
3							=head1 NAME
4
5							MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6
7							WARNING: Most of this module is deprecated and may disappear. The only
8							function you should use for MIME decoding is "mime_to_perl_string".
9
10							=head1 SYNOPSIS
11
12							See L for the basics of encoded words.
13							See L<"DESCRIPTION"> for how this class works.
14
15							use MIME::WordDecoder;
16
17
18							### Get the default word-decoder (used by unmime()):
19							$wd = default MIME::WordDecoder;
20
21							### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22							$wd = supported MIME::WordDecoder "ISO-8859-1";
23
24
25							### Decode a MIME string (e.g., into Latin1) via the default decoder:
26							$str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27
28							### Decode a string using the default decoder, non-OO style:
29							$str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30
31							### Decode a string to an internal Perl string, non-OO style
32							### The result is likely to have the UTF8 flag ON.
33							$str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34
35							=head1 DESCRIPTION
36
37							WARNING: Most of this module is deprecated and may disappear. It
38							duplicates (badly) the function of the standard 'Encode' module. The
39							only function you should rely on is mime_to_perl_string.
40
41							A MIME::WordDecoder consists, fundamentally, of a hash which maps
42							a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43							knows how to take bytes in that character set and turn them into
44							the target string representation. Ideally, this target representation
45							would be Unicode, but we don't want to overspecify the translation
46							that takes place: if you want to convert MIME strings directly to Big5,
47							that's your own decision.
48
49							The subroutine will be invoked with two arguments: DATA (the data in
50							the given character set), and CHARSET (the upcased character set name).
51
52							For example:
53
54							### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55							sub keep7bit {
56							local $_ = shift;
57							tr/\x00-\x7F/#/c;
58							$_;
59							}
60
61							Here's a decoder which uses that:
62
63							### Construct a decoder:
64							$wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65							'ISO-8859-1' => \&keep7bit,
66							'ISO-8859-2' => \&keep7bit,
67							'Big5' => "WARN",
68							'*' => "DIE"});
69
70							### Convert some MIME text to a pure ASCII string...
71							$ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72
73							### ...which will now hold: "To: Keld J#rn Simonsen "
74
75							The UTF-8 built-in decoder decodes everything into Perl's internal
76							string format, possibly turning on the internal UTF8 flag. Use it like
77							this:
78
79							$wd = supported MIME::WordDecoder 'UTF-8';
80							$perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81							# perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82
83							Generally, you should use the UTF-8 decoder in preference to "unmime".
84
85							=head1 PUBLIC INTERFACE
86
87							=over
88
89							=cut
90
91	27			27		53830	use strict;
	27					40
	27					747
92	27			27		89	use Carp qw( carp croak );
	27					33
	27					1222
93	27			27		8470	use MIME::Words qw(decode_mimewords);
	27					46
	27					1345
94	27			27		115	use Exporter;
	27					46
	27					771
95	27			27		99	use vars qw(@ISA @EXPORT);
	27					30
	27					18424
96
97							@ISA = qw(Exporter);
98							@EXPORT = qw( unmime mime_to_perl_string );
99
100
101
102							#------------------------------
103							#
104							# Globals
105							#
106							#------------------------------
107
108							### Decoders.
109							my %DecoderFor = ();
110
111							### Standard handlers.
112							my %Handler =
113							(
114							KEEP => sub {$_[0]},
115							IGNORE => sub {''},
116							WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117							DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118							);
119
120							### Global default decoder. We init it below.
121							my $Default;
122
123							### Global UTF8 decoder.
124							my $DefaultUTF8;
125
126							#------------------------------
127
128							=item default [DECODER]
129
130							I
131							Get/set the default DECODER object.
132
133							=cut
134
135							sub default {
136	25			25	1	147	my $class = shift;
137	25	50				54	if (@_) {
138	25					28	$Default = shift;
139							}
140	25					94	$Default;
141							}
142
143							#------------------------------
144
145							=item supported CHARSET, [DECODER]
146
147							I
148							If just CHARSET is given, returns a decoder object which maps
149							data into that character set (the character set is forced to
150							all-uppercase).
151
152							$wd = supported MIME::WordDecoder "ISO-8859-1";
153
154							If DECODER is given, installs such an object:
155
156							MIME::WordDecoder->supported("ISO-8859-1" =>
157							(new MIME::WordDecoder::ISO_8859 "1"));
158
159							You should not override this method.
160
161							=cut
162
163							sub supported {
164	218			218	1	1092	my ($class, $charset, $decoder) = @_;
165	218	50				348	$DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166	218					485	$DecoderFor{uc($charset)};
167							}
168
169							#------------------------------
170
171							=item new [\@HANDLERS]
172
173							I
174							If \@HANDLERS is given, then @HANDLERS is passed to handler()
175							to initialize the internal map.
176
177							=cut
178
179							sub new {
180	512			512	1	426	my ($class, $h) = @_;
181	512					962	my $self = bless { MWD_Map=>{} }, $class;
182
183							### Init the map:
184	512					848	$self->handler(@$h);
185
186							### Add fallbacks:
187	512		66			1751	$self->{MWD_Map}{'*'} \|\|= $Handler{WARN};
188	512		66			1272	$self->{MWD_Map}{'raw'} \|\|= $self->{MWD_Map}{'US-ASCII'};
189	512					549	$self;
190							}
191
192							#------------------------------
193
194							=item handler CHARSET=>\&SUBREF, ...
195
196							I
197							Set the handler SUBREF for a given CHARSET, for as many pairs
198							as you care to supply.
199
200							When performing the translation of a MIME-encoded string, a
201							given SUBREF will be invoked when translating a block of text
202							in character set CHARSET. The subroutine will be invoked with
203							the following arguments:
204
205							DATA - the data in the given character set.
206							CHARSET - the upcased character set name, which may prove useful
207							if you are using the same SUBREF for multiple CHARSETs.
208							DECODER - the decoder itself, if it contains configuration information
209							that your handler function needs.
210
211							For example:
212
213							$wd = new MIME::WordDecoder;
214							$wd->handler('US-ASCII' => "KEEP");
215							$wd->handler('ISO-8859-1' => \&handle_latin1,
216							'ISO-8859-2' => \&handle_latin1,
217							'*' => "DIE");
218
219							Notice that, much as with %SIG, the SUBREF can also be taken from
220							a set of special keywords:
221
222							KEEP Pass data through unchanged.
223							IGNORE Ignore data in this character set, without warning.
224							WARN Ignore data in this character set, with warning.
225							DIE Fatal exception with "can't handle character set" message.
226
227							The subroutine for the special CHARSET of 'raw' is used for raw
228							(non-MIME-encoded) text, which is supposed to be US-ASCII.
229							The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230							at the time of construction.
231
232							The subroutine for the special CHARSET of '*' is used for any
233							unrecognized character set. The default action for '*' is WARN.
234
235							=cut
236
237							sub handler {
238	1023			1023	1	711	my $self = shift;
239
240							### Copy the hash, and edit it:
241	1023					1409	while (@_) {
242	998					702	my $c = shift;
243	998					655	my $sub = shift;
244	998					1231	$self->{MWD_Map}{$c} = $self->real_handler($sub);
245							}
246	1023					751	$self;
247							}
248
249							#------------------------------
250
251							=item decode STRING
252
253							I
254							Decode a STRING which might contain MIME-encoded components into a
255							local representation (e.g., UTF-8, etc.).
256
257							=cut
258
259							sub decode {
260	95			95	1	809	my ($self, $str) = @_;
261	95	50				174	defined($str) or return undef;
262							join('', map {
263							### Get the data and (upcased) charset:
264	95					258	my $data = $_->[0];
	120					139
265	120	100				238	my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266	120					153	$charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267
268							### Get the handler; guess if never seen before:
269							defined($self->{MWD_Map}{$charset}) or
270	120	100	100			466	$self->{MWD_Map}{$charset} =
271							($self->real_handler($self->guess_handler($charset)) \|\| 0);
272	120		66			1181	my $subr = $self->{MWD_Map}{$charset} \|\| $self->{MWD_Map}{'*'};
273
274							### Map this chunk:
275	120					212	&$subr($data, $charset, $self);
276							} decode_mimewords($str));
277							}
278
279							#------------------------------
280							#
281							# guess_handler CHARSET
282							#
283							# Instance method.
284							# An unrecognized charset has been seen. Guess a handler subref
285							# for the given charset, returning false if there is none.
286							# Successful mappings will be cached in the main map.
287							#
288							sub guess_handler {
289	15			15	0	46	undef;
290							}
291
292							#------------------------------
293							#
294							# real_handler HANDLER
295							#
296							# Instance method.
297							# Translate the given handler, which might be a subref or a string.
298							#
299							sub real_handler {
300	1015			1015	0	812	my ($self, $sub) = @_;
301							(!$sub) or
302							(ref($sub) eq 'CODE') or
303	1015	100	33			3600	$sub = ($Handler{$sub} \|\| croak "bad named handler: $sub\n");
			100
304	1015					2131	$sub;
305							}
306
307							#------------------------------
308
309							=item unmime STRING
310
311							I
312							Decode the given STRING using the default() decoder.
313							See L.
314
315							You should consider using the UTF-8 decoder instead. It decodes
316							MIME strings into Perl's internal string format.
317
318							=cut
319
320							sub unmime($) {
321	0			0	1	0	my $str = shift;
322	0					0	$Default->decode($str);
323							}
324
325							=item mime_to_perl_string
326
327							I
328							Decode the given STRING into an internal Perl Unicode string.
329							You should use this function in preference to all others.
330
331							The result of mime_to_perl_string is likely to have Perl's
332							UTF8 flag set.
333
334							=cut
335
336							sub mime_to_perl_string($) {
337	1			1	1	681	my $str = shift;
338	1					4	$DecoderFor{'UTF-8'}->decode($str);
339							}
340
341							=back
342
343							=cut
344
345
346
347
348
349							=head1 SUBCLASSES
350
351							=over
352
353							=cut
354
355							#------------------------------------------------------------
356							#------------------------------------------------------------
357
358							=item MIME::WordDecoder::ISO_8859
359
360							A simple decoder which keeps US-ASCII and the 7-bit characters
361							of ISO-8859 character sets and UTF8, and also keeps 8-bit
362							characters from the indicated character set.
363
364							### Construct:
365							$wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366
367							### What to translate unknown characters to (can also use empty):
368							### Default is "?".
369							$wd->unknown("?");
370
371							### Collapse runs of unknown characters to a single unknown()?
372							### Default is false.
373							$wd->collapse(1);
374
375
376							According to B
377							(ca. November 2000):
378
379							ISO 8859 is a full series of 10 (and soon even more) standardized
380							multilingual single-byte coded (8bit) graphic character sets for
381							writing in alphabetic languages:
382
383							1. Latin1 (West European)
384							2. Latin2 (East European)
385							3. Latin3 (South European)
386							4. Latin4 (North European)
387							5. Cyrillic
388							6. Arabic
389							7. Greek
390							8. Hebrew
391							9. Latin5 (Turkish)
392							10. Latin6 (Nordic)
393
394							The ISO 8859 charsets are not even remotely as complete as the truly
395							great Unicode but they have been around and usable for quite a while
396							(first registered Internet charsets for use with MIME) and have
397							already offered a major improvement over the plain 7bit US-ASCII.
398
399							Characters 0 to 127 are always identical with US-ASCII and the
400							positions 128 to 159 hold some less used control characters: the
401							so-called C1 set from ISO 6429.
402
403							=cut
404
405							package MIME::WordDecoder::ISO_8859;
406
407	27			27		136	use strict;
	27					38
	27					680
408	27			27		93	use vars qw(@ISA);
	27					28
	27					18756
409							@ISA = qw( MIME::WordDecoder );
410
411
412							#------------------------------
413							#
414							# HANDLERS
415							#
416							#------------------------------
417
418							### Keep 7bit characters.
419							### Turn all else to the special \x00.
420							sub h_keep7bit {
421	2			2		4	local $_ = $_[0];
422							# my $unknown = $_[2]->{MWDI_Unknown};
423
424	2					3	s{[\x80-\xFF]}{\x00}g;
425	2					4	$_;
426							}
427
428							### Note: should use Unicode::String, converting/manipulating
429							### everything into full Unicode form.
430
431							### Keep 7bit UTF8 characters (ASCII).
432							### Keep ISO-8859-1 if this decoder is for Latin-1.
433							### Turn all else to the special \x00.
434							sub h_utf8 {
435	0			0		0	local $_ = $_[0];
436							# my $unknown = $_[2]->{MWDI_Unknown};
437	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
438							#print STDERR "UTF8 in: <$_>\n";
439
440	0					0	local($1,$2,$3);
441	0					0	my $tgt = '';
442	0		0			0	while (m{\G(
443							([\x00-\x7F]) \| # 0xxxxxxx
444							([\xC0-\xDF] [\x80-\xBF]) \| # 110yyyyy 10xxxxxx
445							([\xE0-\xEF] [\x80-\xBF]{2}) \| # 1110zzzz 10yyyyyy 10xxxxxx
446							([\xF0-\xF7] [\x80-\xBF]{3}) \| # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447							. # error; synch
448							)}gcsx and ($1 ne '')) {
449
450	0	0	0			0	if (defined($2)) { $tgt .= $2 }
	0	0				0
451	0					0	elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452	0					0	else { $tgt .= "\x00" }
453							}
454
455							#print STDERR "UTF8 out: <$tgt>\n";
456	0					0	$tgt;
457							}
458
459							### Keep characters which are 7bit in UTF8 (ASCII).
460							### Keep ISO-8859-1 if this decoder is for Latin-1.
461							### Turn all else to the special \x00.
462							sub h_utf16 {
463	0			0		0	local $_ = $_[0];
464							# my $unknown = $_[2]->{MWDI_Unknown};
465	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
466							#print STDERR "UTF16 in: <$_>\n";
467
468	0					0	local($1,$2,$3,$4,$5);
469	0					0	my $tgt = '';
470	0		0			0	while (m{\G(
471							( \x00 ([\x00-\x7F])) \| # 00000000 0xxxxxxx
472							( \x00 ([\x80-\xFF])) \| # 00000000 1xxxxxxx
473							( [^\x00] [\x00-\xFF]) \| # etc
474							)
475							}gcsx and ($1 ne '')) {
476
477	0	0	0			0	if (defined($2)) { $tgt .= $3 }
	0	0				0
478	0					0	elsif (defined($4) && $latin1) { $tgt .= $5 }
479	0					0	else { $tgt .= "\x00" }
480							}
481
482							#print STDERR "UTF16 out: <$tgt>\n";
483	0					0	$tgt;
484							}
485
486
487							#------------------------------
488							#
489							# PUBLIC INTERFACE
490							#
491							#------------------------------
492
493							#------------------------------
494							#
495							# new NUMBER
496							#
497							sub new {
498	484			484		931	my ($class, $num) = @_;
499
500	484					727	my $self = $class->SUPER::new();
501	484					599	$self->handler('raw' => 'KEEP',
502							'US-ASCII' => 'KEEP');
503
504	484					442	$self->{MWDI_Num} = $num;
505	484					458	$self->{MWDI_Unknown} = "?";
506	484					373	$self->{MWDI_Collapse} = 0;
507	484					1040	$self;
508							}
509
510							#------------------------------
511							#
512							# guess_handler CHARSET
513							#
514							sub guess_handler {
515	2			2		3	my ($self, $charset) = @_;
516							return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517	2	100	66			24	($1 eq $self->{MWDI_Num}));
518	1	50				7	return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519	0	0				0	return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520	0	0				0	return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521	0					0	undef;
522							}
523
524							#------------------------------
525							#
526							# unknown [REPLACEMENT]
527							#
528							sub unknown {
529	0			0		0	my $self = shift;
530	0	0				0	$self->{MWDI_Unknown} = shift if @_;
531	0					0	$self->{MWDI_Unknown};
532							}
533
534							#------------------------------
535							#
536							# collapse [YESNO]
537							#
538							sub collapse {
539	0			0		0	my $self = shift;
540	0	0				0	$self->{MWDI_Collapse} = shift if @_;
541	0					0	$self->{MWDI_Collapse};
542							}
543
544							#------------------------------
545							#
546							# decode STRING
547							#
548							sub decode {
549	10			10		4764	my $self = shift;
550
551							### Do inherited action:
552	10					27	my $basic = $self->SUPER::decode(@_);
553	10	50				21	defined($basic) or return undef;
554
555							### Translate/consolidate illegal characters:
556	10	50				18	$basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557	10					10	$basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558	10					17	$basic;
559							}
560
561							#------------------------------------------------------------
562							#------------------------------------------------------------
563
564							=item MIME::WordDecoder::US_ASCII
565
566							A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567							You're probably better off using ISO-8859-1.
568
569							=cut
570
571							package MIME::WordDecoder::US_ASCII;
572
573	27			27		139	use strict;
	27					29
	27					641
574	27			27		102	use vars qw(@ISA);
	27					79
	27					4078
575							@ISA = qw( MIME::WordDecoder::ISO_8859 );
576
577							sub new {
578	27			27		63	my ($class) = @_;
579	27					123	return $class->SUPER::new("1");
580							}
581
582							sub decode {
583	0			0		0	my $self = shift;
584
585							### Do inherited action:
586	0					0	my $basic = $self->SUPER::decode(@_);
587	0	0				0	defined($basic) or return undef;
588
589							### Translate/consolidate 8-bit characters:
590	0	0				0	$basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591	0					0	$basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592	0					0	$basic;
593							}
594
595							=back
596
597							=cut
598
599							package MIME::WordDecoder::UTF_8;
600	27			27		139	use strict;
	27					32
	27					575
601	27			27		14821	use Encode qw();
	27					219813
	27					786
602	27			27		171	use Carp qw( carp );
	27					32
	27					1365
603	27			27		112	use vars qw(@ISA);
	27					28
	27					6326
604
605							@ISA = qw( MIME::WordDecoder );
606
607							sub h_convert_to_utf8
608							{
609	97			97		142	my ($data, $charset, $decoder) = @_;
610	97	100				200	$charset = 'US-ASCII' if ($charset eq 'raw');
611	97					289	my $enc = Encode::find_encoding($charset);
612	97	50				7930	if (!$enc) {
613	0					0	carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614	0					0	return '';
615							}
616	97					524	my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617	97					569	return $ans;
618							}
619
620							sub new {
621	27			27		42	my ($class) = @_;
622	27					117	my $self = $class->SUPER::new();
623	27					83	$self->handler('*' => \&h_convert_to_utf8);
624							}
625
626
627							#------------------------------------------------------------
628							#------------------------------------------------------------
629
630							package MIME::WordDecoder;
631
632							### Now we can init the default handler.
633							$Default = (MIME::WordDecoder::ISO_8859->new('1'));
634
635
636							### Add US-ASCII handler:
637							$DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638
639							### Add ISO-8859-{1..15} handlers:
640							for (1..15) {
641							$DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642							}
643
644							### UTF-8
645							$DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646
647							1; # end the module
648							__END__