File Coverage

blib/lib/MIME/WordDecoder.pm

Criterion	Covered	Total	%
statement	103	142	72.5
branch	17	44	38.6
condition	14	32	43.7
subroutine	28	34	82.3
pod	7	9	77.7
total	169	261	64.7

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::WordDecoder;
2
3							=head1 NAME
4
5							MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6
7							WARNING: Most of this module is deprecated and may disappear. The only
8							function you should use for MIME decoding is "mime_to_perl_string".
9
10							=head1 SYNOPSIS
11
12							See L for the basics of encoded words.
13							See L<"DESCRIPTION"> for how this class works.
14
15							use MIME::WordDecoder;
16
17
18							### Get the default word-decoder (used by unmime()):
19							$wd = default MIME::WordDecoder;
20
21							### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22							$wd = supported MIME::WordDecoder "ISO-8859-1";
23
24
25							### Decode a MIME string (e.g., into Latin1) via the default decoder:
26							$str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27
28							### Decode a string using the default decoder, non-OO style:
29							$str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30
31							### Decode a string to an internal Perl string, non-OO style
32							### The result is likely to have the UTF8 flag ON.
33							$str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34
35							=head1 DESCRIPTION
36
37							WARNING: Most of this module is deprecated and may disappear. It
38							duplicates (badly) the function of the standard 'Encode' module. The
39							only function you should rely on is mime_to_perl_string.
40
41							A MIME::WordDecoder consists, fundamentally, of a hash which maps
42							a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43							knows how to take bytes in that character set and turn them into
44							the target string representation. Ideally, this target representation
45							would be Unicode, but we don't want to overspecify the translation
46							that takes place: if you want to convert MIME strings directly to Big5,
47							that's your own decision.
48
49							The subroutine will be invoked with two arguments: DATA (the data in
50							the given character set), and CHARSET (the upcased character set name).
51
52							For example:
53
54							### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55							sub keep7bit {
56							local $_ = shift;
57							tr/\x00-\x7F/#/c;
58							$_;
59							}
60
61							Here's a decoder which uses that:
62
63							### Construct a decoder:
64							$wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65							'ISO-8859-1' => \&keep7bit,
66							'ISO-8859-2' => \&keep7bit,
67							'Big5' => "WARN",
68							'*' => "DIE"});
69
70							### Convert some MIME text to a pure ASCII string...
71							$ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72
73							### ...which will now hold: "To: Keld J#rn Simonsen "
74
75							The UTF-8 built-in decoder decodes everything into Perl's internal
76							string format, possibly turning on the internal UTF8 flag. Use it like
77							this:
78
79							$wd = supported MIME::WordDecoder 'UTF-8';
80							$perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81							# perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82
83							Generally, you should use the UTF-8 decoder in preference to "unmime".
84
85							=head1 PUBLIC INTERFACE
86
87							=over
88
89							=cut
90
91	25			25		26995	use strict;
	25					56
	25					702
92	25			25		109	use Carp qw( carp croak );
	25					45
	25					1264
93	25			25		10844	use MIME::Words qw(decode_mimewords);
	25					56
	25					1366
94	25			25		130	use Exporter;
	25					136
	25					839
95	25			25		121	use vars qw(@ISA @EXPORT);
	25					40
	25					20762
96
97							@ISA = qw(Exporter);
98							@EXPORT = qw( unmime mime_to_perl_string );
99
100
101
102							#------------------------------
103							#
104							# Globals
105							#
106							#------------------------------
107
108							### Decoders.
109							my %DecoderFor = ();
110
111							### Standard handlers.
112							my %Handler =
113							(
114							KEEP => sub {$_[0]},
115							IGNORE => sub {''},
116							WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117							DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118							);
119
120							### Global default decoder. We init it below.
121							my $Default;
122
123							### Global UTF8 decoder.
124							my $DefaultUTF8;
125
126							#------------------------------
127
128							=item default [DECODER]
129
130							I
131							Get/set the default DECODER object.
132
133							=cut
134
135							sub default {
136	25			25	1	157	my $class = shift;
137	25	50				57	if (@_) {
138	25					45	$Default = shift;
139							}
140	25					132	$Default;
141							}
142
143							#------------------------------
144
145							=item supported CHARSET, [DECODER]
146
147							I
148							If just CHARSET is given, returns a decoder object which maps
149							data into that character set (the character set is forced to
150							all-uppercase).
151
152							$wd = supported MIME::WordDecoder "ISO-8859-1";
153
154							If DECODER is given, installs such an object:
155
156							MIME::WordDecoder->supported("ISO-8859-1" =>
157							(new MIME::WordDecoder::ISO_8859 "1"));
158
159							You should not override this method.
160
161							=cut
162
163							sub supported {
164	212			212	1	1253	my ($class, $charset, $decoder) = @_;
165	212	50				478	$DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166	212					705	$DecoderFor{uc($charset)};
167							}
168
169							#------------------------------
170
171							=item new [\@HANDLERS]
172
173							I
174							If \@HANDLERS is given, then @HANDLERS is passed to handler()
175							to initialize the internal map.
176
177							=cut
178
179							sub new {
180	476			476	1	686	my ($class, $h) = @_;
181	476					1252	my $self = bless { MWD_Map=>{} }, $class;
182
183							### Init the map:
184	476					1205	$self->handler(@$h);
185
186							### Add fallbacks:
187	476		66			2332	$self->{MWD_Map}{'*'} \|\|= $Handler{WARN};
188	476		66			1675	$self->{MWD_Map}{'raw'} \|\|= $self->{MWD_Map}{'US-ASCII'};
189	476					879	$self;
190							}
191
192							#------------------------------
193
194							=item handler CHARSET=>\&SUBREF, ...
195
196							I
197							Set the handler SUBREF for a given CHARSET, for as many pairs
198							as you care to supply.
199
200							When performing the translation of a MIME-encoded string, a
201							given SUBREF will be invoked when translating a block of text
202							in character set CHARSET. The subroutine will be invoked with
203							the following arguments:
204
205							DATA - the data in the given character set.
206							CHARSET - the upcased character set name, which may prove useful
207							if you are using the same SUBREF for multiple CHARSETs.
208							DECODER - the decoder itself, if it contains configuration information
209							that your handler function needs.
210
211							For example:
212
213							$wd = new MIME::WordDecoder;
214							$wd->handler('US-ASCII' => "KEEP");
215							$wd->handler('ISO-8859-1' => \&handle_latin1,
216							'ISO-8859-2' => \&handle_latin1,
217							'*' => "DIE");
218
219							Notice that, much as with %SIG, the SUBREF can also be taken from
220							a set of special keywords:
221
222							KEEP Pass data through unchanged.
223							IGNORE Ignore data in this character set, without warning.
224							WARN Ignore data in this character set, with warning.
225							DIE Fatal exception with "can't handle character set" message.
226
227							The subroutine for the special CHARSET of 'raw' is used for raw
228							(non-MIME-encoded) text, which is supposed to be US-ASCII.
229							The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230							at the time of construction.
231
232							The subroutine for the special CHARSET of '*' is used for any
233							unrecognized character set. The default action for '*' is WARN.
234
235							=cut
236
237							sub handler {
238	951			951	1	1095	my $self = shift;
239
240							### Copy the hash, and edit it:
241	951					2043	while (@_) {
242	928					1154	my $c = shift;
243	928					1088	my $sub = shift;
244	928					2038	$self->{MWD_Map}{$c} = $self->real_handler($sub);
245							}
246	951					1189	$self;
247							}
248
249							#------------------------------
250
251							=item decode STRING
252
253							I
254							Decode a STRING which might contain MIME-encoded components into a
255							local representation (e.g., UTF-8, etc.).
256
257							=cut
258
259							sub decode {
260	94			94	1	716	my ($self, $str) = @_;
261	94	50				215	defined($str) or return undef;
262							join('', map {
263							### Get the data and (upcased) charset:
264	94					410	my $data = $_->[0];
	119					228
265	119	100				315	my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266	119					219	$charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267
268							### Get the handler; guess if never seen before:
269							defined($self->{MWD_Map}{$charset}) or
270	119	100	100			602	$self->{MWD_Map}{$charset} =
271							($self->real_handler($self->guess_handler($charset)) \|\| 0);
272	119		66			422	my $subr = $self->{MWD_Map}{$charset} \|\| $self->{MWD_Map}{'*'};
273
274							### Map this chunk:
275	119					326	&$subr($data, $charset, $self);
276							} decode_mimewords($str));
277							}
278
279							#------------------------------
280							#
281							# guess_handler CHARSET
282							#
283							# Instance method.
284							# An unrecognized charset has been seen. Guess a handler subref
285							# for the given charset, returning false if there is none.
286							# Successful mappings will be cached in the main map.
287							#
288							sub guess_handler {
289	14			14	0	62	undef;
290							}
291
292							#------------------------------
293							#
294							# real_handler HANDLER
295							#
296							# Instance method.
297							# Translate the given handler, which might be a subref or a string.
298							#
299							sub real_handler {
300	944			944	0	1196	my ($self, $sub) = @_;
301							(!$sub) or
302							(ref($sub) eq 'CODE') or
303	944	100	33			4589	$sub = ($Handler{$sub} \|\| croak "bad named handler: $sub\n");
			100
304	944					3274	$sub;
305							}
306
307							#------------------------------
308
309							=item unmime STRING
310
311							I
312							Decode the given STRING using the default() decoder.
313							See L.
314
315							You should consider using the UTF-8 decoder instead. It decodes
316							MIME strings into Perl's internal string format.
317
318							=cut
319
320							sub unmime($) {
321	0			0	1	0	my $str = shift;
322	0					0	$Default->decode($str);
323							}
324
325							=item mime_to_perl_string
326
327							I
328							Decode the given STRING into an internal Perl Unicode string.
329							You should use this function in preference to all others.
330
331							The result of mime_to_perl_string is likely to have Perl's
332							UTF8 flag set.
333
334							=cut
335
336							sub mime_to_perl_string($) {
337	1			1	1	717	my $str = shift;
338	1					4	$DecoderFor{'UTF-8'}->decode($str);
339							}
340
341							=back
342
343							=cut
344
345
346
347
348
349							=head1 SUBCLASSES
350
351							=over
352
353							=cut
354
355							#------------------------------------------------------------
356							#------------------------------------------------------------
357
358							=item MIME::WordDecoder::ISO_8859
359
360							A simple decoder which keeps US-ASCII and the 7-bit characters
361							of ISO-8859 character sets and UTF8, and also keeps 8-bit
362							characters from the indicated character set.
363
364							### Construct:
365							$wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366
367							### What to translate unknown characters to (can also use empty):
368							### Default is "?".
369							$wd->unknown("?");
370
371							### Collapse runs of unknown characters to a single unknown()?
372							### Default is false.
373							$wd->collapse(1);
374
375
376							According to B
377							(ca. November 2000):
378
379							ISO 8859 is a full series of 10 (and soon even more) standardized
380							multilingual single-byte coded (8bit) graphic character sets for
381							writing in alphabetic languages:
382
383							1. Latin1 (West European)
384							2. Latin2 (East European)
385							3. Latin3 (South European)
386							4. Latin4 (North European)
387							5. Cyrillic
388							6. Arabic
389							7. Greek
390							8. Hebrew
391							9. Latin5 (Turkish)
392							10. Latin6 (Nordic)
393
394							The ISO 8859 charsets are not even remotely as complete as the truly
395							great Unicode but they have been around and usable for quite a while
396							(first registered Internet charsets for use with MIME) and have
397							already offered a major improvement over the plain 7bit US-ASCII.
398
399							Characters 0 to 127 are always identical with US-ASCII and the
400							positions 128 to 159 hold some less used control characters: the
401							so-called C1 set from ISO 6429.
402
403							=cut
404
405							package MIME::WordDecoder::ISO_8859;
406
407	25			25		154	use strict;
	25					47
	25					700
408	25			25		124	use vars qw(@ISA);
	25					39
	25					21135
409							@ISA = qw( MIME::WordDecoder );
410
411
412							#------------------------------
413							#
414							# HANDLERS
415							#
416							#------------------------------
417
418							### Keep 7bit characters.
419							### Turn all else to the special \x00.
420							sub h_keep7bit {
421	2			2		5	local $_ = $_[0];
422							# my $unknown = $_[2]->{MWDI_Unknown};
423
424	2					4	s{[\x80-\xFF]}{\x00}g;
425	2					8	$_;
426							}
427
428							### Note: should use Unicode::String, converting/manipulating
429							### everything into full Unicode form.
430
431							### Keep 7bit UTF8 characters (ASCII).
432							### Keep ISO-8859-1 if this decoder is for Latin-1.
433							### Turn all else to the special \x00.
434							sub h_utf8 {
435	0			0		0	local $_ = $_[0];
436							# my $unknown = $_[2]->{MWDI_Unknown};
437	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
438							#print STDERR "UTF8 in: <$_>\n";
439
440	0					0	local($1,$2,$3);
441	0					0	my $tgt = '';
442	0		0			0	while (m{\G(
443							([\x00-\x7F]) \| # 0xxxxxxx
444							([\xC0-\xDF] [\x80-\xBF]) \| # 110yyyyy 10xxxxxx
445							([\xE0-\xEF] [\x80-\xBF]{2}) \| # 1110zzzz 10yyyyyy 10xxxxxx
446							([\xF0-\xF7] [\x80-\xBF]{3}) \| # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447							. # error; synch
448							)}gcsx and ($1 ne '')) {
449
450	0	0	0			0	if (defined($2)) { $tgt .= $2 }
	0	0				0
451	0					0	elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452	0					0	else { $tgt .= "\x00" }
453							}
454
455							#print STDERR "UTF8 out: <$tgt>\n";
456	0					0	$tgt;
457							}
458
459							### Keep characters which are 7bit in UTF8 (ASCII).
460							### Keep ISO-8859-1 if this decoder is for Latin-1.
461							### Turn all else to the special \x00.
462							sub h_utf16 {
463	0			0		0	local $_ = $_[0];
464							# my $unknown = $_[2]->{MWDI_Unknown};
465	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
466							#print STDERR "UTF16 in: <$_>\n";
467
468	0					0	local($1,$2,$3,$4,$5);
469	0					0	my $tgt = '';
470	0		0			0	while (m{\G(
471							( \x00 ([\x00-\x7F])) \| # 00000000 0xxxxxxx
472							( \x00 ([\x80-\xFF])) \| # 00000000 1xxxxxxx
473							( [^\x00] [\x00-\xFF]) \| # etc
474							)
475							}gcsx and ($1 ne '')) {
476
477	0	0	0			0	if (defined($2)) { $tgt .= $3 }
	0	0				0
478	0					0	elsif (defined($4) && $latin1) { $tgt .= $5 }
479	0					0	else { $tgt .= "\x00" }
480							}
481
482							#print STDERR "UTF16 out: <$tgt>\n";
483	0					0	$tgt;
484							}
485
486
487							#------------------------------
488							#
489							# PUBLIC INTERFACE
490							#
491							#------------------------------
492
493							#------------------------------
494							#
495							# new NUMBER
496							#
497							sub new {
498	450			450		1116	my ($class, $num) = @_;
499
500	450					978	my $self = $class->SUPER::new();
501	450					931	$self->handler('raw' => 'KEEP',
502							'US-ASCII' => 'KEEP');
503
504	450					815	$self->{MWDI_Num} = $num;
505	450					683	$self->{MWDI_Unknown} = "?";
506	450					626	$self->{MWDI_Collapse} = 0;
507	450					2641	$self;
508							}
509
510							#------------------------------
511							#
512							# guess_handler CHARSET
513							#
514							sub guess_handler {
515	2			2		5	my ($self, $charset) = @_;
516							return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517	2	100	66			20	($1 eq $self->{MWDI_Num}));
518	1	50				7	return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519	0	0				0	return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520	0	0				0	return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521	0					0	undef;
522							}
523
524							#------------------------------
525							#
526							# unknown [REPLACEMENT]
527							#
528							sub unknown {
529	0			0		0	my $self = shift;
530	0	0				0	$self->{MWDI_Unknown} = shift if @_;
531	0					0	$self->{MWDI_Unknown};
532							}
533
534							#------------------------------
535							#
536							# collapse [YESNO]
537							#
538							sub collapse {
539	0			0		0	my $self = shift;
540	0	0				0	$self->{MWDI_Collapse} = shift if @_;
541	0					0	$self->{MWDI_Collapse};
542							}
543
544							#------------------------------
545							#
546							# decode STRING
547							#
548							sub decode {
549	10			10		5646	my $self = shift;
550
551							### Do inherited action:
552	10					26	my $basic = $self->SUPER::decode(@_);
553	10	50				33	defined($basic) or return undef;
554
555							### Translate/consolidate illegal characters:
556	10	50				24	$basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557	10					15	$basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558	10					21	$basic;
559							}
560
561							#------------------------------------------------------------
562							#------------------------------------------------------------
563
564							=item MIME::WordDecoder::US_ASCII
565
566							A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567							You're probably better off using ISO-8859-1.
568
569							=cut
570
571							package MIME::WordDecoder::US_ASCII;
572
573	25			25		138	use strict;
	25					45
	25					597
574	25			25		119	use vars qw(@ISA);
	25					44
	25					4310
575							@ISA = qw( MIME::WordDecoder::ISO_8859 );
576
577							sub new {
578	25			25		50	my ($class) = @_;
579	25					143	return $class->SUPER::new("1");
580							}
581
582							sub decode {
583	0			0		0	my $self = shift;
584
585							### Do inherited action:
586	0					0	my $basic = $self->SUPER::decode(@_);
587	0	0				0	defined($basic) or return undef;
588
589							### Translate/consolidate 8-bit characters:
590	0	0				0	$basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591	0					0	$basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592	0					0	$basic;
593							}
594
595							=back
596
597							=cut
598
599							package MIME::WordDecoder::UTF_8;
600	25			25		148	use strict;
	25					39
	25					543
601	25			25		22722	use Encode qw();
	25					274178
	25					711
602	25			25		175	use Carp qw( carp );
	25					48
	25					1380
603	25			25		127	use vars qw(@ISA);
	25					42
	25					7319
604
605							@ISA = qw( MIME::WordDecoder );
606
607							sub h_convert_to_utf8
608							{
609	96			96		171	my ($data, $charset, $decoder) = @_;
610	96	100				276	$charset = 'US-ASCII' if ($charset eq 'raw');
611	96					352	my $enc = Encode::find_encoding($charset);
612	96	50				30003	if (!$enc) {
613	0					0	carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614	0					0	return '';
615							}
616	96					709	my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617	96					801	return $ans;
618							}
619
620							sub new {
621	25			25		56	my ($class) = @_;
622	25					139	my $self = $class->SUPER::new();
623	25					85	$self->handler('*' => \&h_convert_to_utf8);
624							}
625
626
627							#------------------------------------------------------------
628							#------------------------------------------------------------
629
630							package MIME::WordDecoder;
631
632							### Now we can init the default handler.
633							$Default = (MIME::WordDecoder::ISO_8859->new('1'));
634
635
636							### Add US-ASCII handler:
637							$DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638
639							### Add ISO-8859-{1..15} handlers:
640							for (1..15) {
641							$DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642							}
643
644							### UTF-8
645							$DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646
647							1; # end the module
648							__END__