File Coverage

blib/lib/MIME/Words.pm

Criterion	Covered	Total	%
statement	73	76	96.0
branch	15	18	83.3
condition	5	8	62.5
subroutine	12	13	92.3
pod	3	3	100.0
total	108	118	91.5

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::Words;
2
3							=head1 NAME
4
5							MIME::Words - deal with RFC 2047 encoded words
6
7
8							=head1 SYNOPSIS
9
10							Before reading further, you should see L to make sure that
11							you understand where this module fits into the grand scheme of things.
12							Go on, do it now. I'll wait.
13
14							Ready? Ok...
15
16
17							use MIME::Words qw(:all);
18
19							### Decode the string into another string, forgetting the charsets:
20							$decoded = decode_mimewords(
21							'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
22							);
23
24							### Split string into array of decoded [DATA,CHARSET] pairs:
25							@decoded = decode_mimewords(
26							'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
27							);
28
29							### Encode a single unsafe word:
30							$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
31
32							### Encode a string, trying to find the unsafe words inside it:
33							$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
34
35
36
37							=head1 DESCRIPTION
38
39							Fellow Americans, you probably won't know what the hell this module
40							is for. Europeans, Russians, et al, you probably do. C<:-)>.
41
42							For example, here's a valid MIME header you might get:
43
44							From: =?US-ASCII?Q?Keith_Moore?=
45							To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=
46							CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard
47							Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
48							=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
49							=?US-ASCII?Q?.._cool!?=
50
51							The fields basically decode to (sorry, I can only approximate the
52							Latin characters with 7 bit sequences /o and 'e):
53
54							From: Keith Moore
55							To: Keld J/orn Simonsen
56							CC: Andr'e Pirard
57							Subject: If you can read this you understand the example... cool!
58
59
60							=head1 PUBLIC INTERFACE
61
62							=over 4
63
64							=cut
65
66							require 5.001;
67
68							### Pragmas:
69	29			29		151589	use strict;
	29					33
	29					701
70	29			29		98	use re 'taint';
	29					35
	29					1164
71	29			29		90	use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
	29					32
	29					1550
72
73							### Exporting:
74	29			29		101	use Exporter;
	29					31
	29					1557
75							%EXPORT_TAGS = (all => [qw(decode_mimewords
76							encode_mimeword
77							encode_mimewords
78							)]);
79							Exporter::export_ok_tags('all');
80
81							### Inheritance:
82							@ISA = qw(Exporter);
83
84							### Other modules:
85	29			29		11153	use MIME::Base64;
	29					13169
	29					1404
86	29			29		10445	use MIME::QuotedPrint;
	29					4382
	29					20586
87
88
89
90							#------------------------------
91							#
92							# Globals...
93							#
94							#------------------------------
95
96							### The package version, both in 1.23 style and usable by MakeMaker:
97							$VERSION = "5.508";
98
99							### Nonprintables (controls + x7F + 8bit):
100							my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
101
102
103							#------------------------------
104
105							# _decode_Q STRING
106							# Private: used by _decode_header() to decode "Q" encoding, which is
107							# almost, but not exactly, quoted-printable. :-P
108							sub _decode_Q {
109	54			54		50	my $str = shift;
110	54					75	local $1;
111	54					81	$str =~ s/_/\x20/g; # RFC-1522, Q rule 2
112	54					118	$str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
	272					514
113	54					101	$str;
114							}
115
116							# _encode_Q STRING
117							# Private: used by _encode_header() to decode "Q" encoding, which is
118							# almost, but not exactly, quoted-printable. :-P
119							sub _encode_Q {
120	25			25		22	my $str = shift;
121	25					31	local $1;
122	25					97	$str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
	238					342
123	25					90	$str;
124							}
125
126							# _decode_B STRING
127							# Private: used by _decode_header() to decode "B" encoding.
128							sub _decode_B {
129	12			12		25	my $str = shift;
130	12					54	decode_base64($str);
131							}
132
133							# _encode_B STRING
134							# Private: used by _decode_header() to decode "B" encoding.
135							sub _encode_B {
136	0			0		0	my $str = shift;
137	0					0	encode_base64($str, '');
138							}
139
140
141
142							#------------------------------
143
144							=item decode_mimewords ENCODED
145
146							I
147							Go through the string looking for RFC 2047-style "Q"
148							(quoted-printable, sort of) or "B" (base64) encoding, and decode them.
149
150							B splits the ENCODED string into a list of decoded
151							C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
152							data are returned in a 1-element array C<[DATA]>, giving an effective
153							CHARSET of C.
154
155							$enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ';
156							foreach (decode_mimewords($enc)) {
157							print "", ($_->[1] \|\| 'US-ASCII'), ": ", $_->[0], "\n";
158							}
159
160							B joins the "data" elements of the above
161							list together, and returns that. I
162							and probably I what you want, but if you know that all charsets
163							in the ENCODED string are identical, it might be useful to you.
164							(Before you use this, please see L,
165							which is probably what you want.)
166
167							In the event of a syntax error, $@ will be set to a description
168							of the error, but parsing will continue as best as possible (so as to
169							get I back when decoding headers).
170							$@ will be false if no error was detected.
171
172							Any arguments past the ENCODED string are taken to define a hash of options:
173
174							=cut
175
176							sub decode_mimewords {
177	131			131	1	4738	my $encstr = shift;
178	131					114	my @tokens;
179	131					284	local($1,$2,$3);
180	131					133	$@ = ''; ### error-return
181
182							### Collapse boundaries between adjacent encoded words:
183	131					234	$encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
184	131					216	pos($encstr) = 0;
185							### print STDOUT "ENC = [", $encstr, "]\n";
186
187							### Decode:
188	131					151	my ($charset, $encoding, $enc, $dec);
189	131					102	while (1) {
190	337	100				530	last if (pos($encstr) >= length($encstr));
191	206					164	my $pos = pos($encstr); ### save it
192
193							### Case 1: are we looking at "=?..?..?="?
194	206	100				487	if ($encstr =~ m{\G # from where we left off..
195							=\?([^?]*) # "=?" + charset +
196							\?([bq]) # "?" + encoding +
197							\?([^?]+) # "?" + data maybe with spcs +
198							\?= # "?="
199							}xgi) {
200	66					147	($charset, $encoding, $enc) = ($1, lc($2), $3);
201	66	100				164	$dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
202	66					106	push @tokens, [$dec, $charset];
203	66					60	next;
204							}
205
206							### Case 2: are we looking at a bad "=?..." prefix?
207							### We need this to detect problems for case 3, which stops at "=?":
208	140					154	pos($encstr) = $pos; # reset the pointer.
209	140	100				245	if ($encstr =~ m{\G=\?}xg) {
210	4					13	$@ .= qq\|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n\|;
211	4					6	push @tokens, ['=?'];
212	4					5	next;
213							}
214
215							### Case 3: are we looking at ordinary text?
216	136					130	pos($encstr) = $pos; # reset the pointer.
217	136	50				796	if ($encstr =~ m{\G # from where we left off...
218							(.*? # shortest possible string,
219							\n*) # followed by 0 or more NLs,
220							(?=(\Z\|=\?)) # terminated by "=?" or EOS
221							}sxg) {
222	136	50				267	length($1) or die "MIME::Words: internal logic err: empty token\n";
223	136					230	push @tokens, [$1];
224	136					139	next;
225							}
226
227							### Case 4: bug!
228	0					0	die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
229							"Please alert developer.\n";
230							}
231	131	100				410	return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
	86					221
232							}
233
234							#------------------------------
235
236							=item encode_mimeword RAW, [ENCODING], [CHARSET]
237
238							I
239							Encode a single RAW "word" that has unsafe characters.
240							The "word" will be encoded in its entirety.
241
242							### Encode "<>":
243							$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
244
245							You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
246							You may specify the CHARSET, which defaults to C.
247
248							=cut
249
250							sub encode_mimeword {
251	25			25	1	149	my $word = shift;
252	25		50			47	my $encoding = uc(shift \|\| 'Q');
253	25		50			37	my $charset = uc(shift \|\| 'ISO-8859-1');
254	25	50				36	my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
255	25					44	"=?$charset?$encoding?" . &$encfunc($word) . "?=";
256							}
257
258							#------------------------------
259
260							=item encode_mimewords RAW, [OPTS]
261
262							I
263							Given a RAW string, try to find and encode all "unsafe" sequences
264							of characters:
265
266							### Encode a string with some unsafe "words":
267							$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
268
269							Returns the encoded string.
270							Any arguments past the RAW string are taken to define a hash of options:
271
272							=over 4
273
274							=item Charset
275
276							Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
277							a.k.a. "Latin-1".
278
279							=item Encoding
280
281							The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
282
283							=back
284
285							B this is a quick-and-dirty solution, intended for character
286							sets which overlap ASCII. B
287							rules regarding the use of encoded words in message headers>.
288							You may want to roll your own variant,
289							using C, for your application.
290							I
291
292							=cut
293
294							sub encode_mimewords {
295	6			6	1	891	my ($rawstr, %params) = @_;
296	6		100			27	my $charset = $params{Charset} \|\| 'ISO-8859-1';
297	6		50			27	my $encoding = lc($params{Encoding} \|\| 'q');
298
299							### Encode any "words" with unsafe characters.
300							### We limit such words to 18 characters, to guarantee that the
301							### worst-case encoding give us no more than 54 + ~10 < 75 characters
302	6					7	my $word;
303	6					13	local $1;
304	6					32	$rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]+\s*)}{ ### get next "word"
305	51					54	$word = $1;
306	51	100				217	(($word !~ /(?:[$NONPRINT])\|(?:^\s+$)/o)
307							? $word ### no unsafe chars
308							: encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
309							}xeg;
310	6					25	$rawstr =~ s/\?==\?/?= =?/g;
311	6					23	$rawstr;
312							}
313
314							1;
315							__END__