File Coverage

blib/lib/MIME/Words.pm
Criterion Covered Total %
statement 73 76 96.0
branch 15 18 83.3
condition 5 8 62.5
subroutine 12 13 92.3
pod 3 3 100.0
total 108 118 91.5


line stmt bran cond sub pod time code
1             package MIME::Words;
2              
3             =head1 NAME
4              
5             MIME::Words - deal with RFC 2047 encoded words
6              
7              
8             =head1 SYNOPSIS
9              
10             Before reading further, you should see L to make sure that
11             you understand where this module fits into the grand scheme of things.
12             Go on, do it now. I'll wait.
13              
14             Ready? Ok...
15              
16              
17             use MIME::Words qw(:all);
18              
19             ### Decode the string into another string, forgetting the charsets:
20             $decoded = decode_mimewords(
21             'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
22             );
23              
24             ### Split string into array of decoded [DATA,CHARSET] pairs:
25             @decoded = decode_mimewords(
26             'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
27             );
28              
29             ### Encode a single unsafe word:
30             $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
31              
32             ### Encode a string, trying to find the unsafe words inside it:
33             $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
34              
35              
36              
37             =head1 DESCRIPTION
38              
39             Fellow Americans, you probably won't know what the hell this module
40             is for. Europeans, Russians, et al, you probably do. C<:-)>.
41              
42             For example, here's a valid MIME header you might get:
43              
44             From: =?US-ASCII?Q?Keith_Moore?=
45             To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=
46             CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard
47             Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
48             =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
49             =?US-ASCII?Q?.._cool!?=
50              
51             The fields basically decode to (sorry, I can only approximate the
52             Latin characters with 7 bit sequences /o and 'e):
53              
54             From: Keith Moore
55             To: Keld J/orn Simonsen
56             CC: Andr'e Pirard
57             Subject: If you can read this you understand the example... cool!
58              
59              
60             =head1 PUBLIC INTERFACE
61              
62             =over 4
63              
64             =cut
65              
66             require 5.001;
67              
68             ### Pragmas:
69 29     29   82125 use strict;
  29         57  
  29         773  
70 29     29   135 use re 'taint';
  29         51  
  29         1531  
71 29     29   135 use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
  29         44  
  29         1967  
72              
73             ### Exporting:
74 29     29   142 use Exporter;
  29         54  
  29         1978  
75             %EXPORT_TAGS = (all => [qw(decode_mimewords
76             encode_mimeword
77             encode_mimewords
78             )]);
79             Exporter::export_ok_tags('all');
80              
81             ### Inheritance:
82             @ISA = qw(Exporter);
83              
84             ### Other modules:
85 29     29   20662 use MIME::Base64;
  29         18345  
  29         1602  
86 29     29   40871 use MIME::QuotedPrint;
  29         5841  
  29         27916  
87              
88              
89              
90             #------------------------------
91             #
92             # Globals...
93             #
94             #------------------------------
95              
96             ### The package version, both in 1.23 style *and* usable by MakeMaker:
97             $VERSION = "5.507";
98              
99             ### Nonprintables (controls + x7F + 8bit):
100             my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
101              
102              
103             #------------------------------
104              
105             # _decode_Q STRING
106             # Private: used by _decode_header() to decode "Q" encoding, which is
107             # almost, but not exactly, quoted-printable. :-P
108             sub _decode_Q {
109 54     54   88 my $str = shift;
110 54         123 local $1;
111 54         126 $str =~ s/_/\x20/g; # RFC-1522, Q rule 2
112 54         205 $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
  272         1023  
113 54         175 $str;
114             }
115              
116             # _encode_Q STRING
117             # Private: used by _encode_header() to decode "Q" encoding, which is
118             # almost, but not exactly, quoted-printable. :-P
119             sub _encode_Q {
120 25     25   47 my $str = shift;
121 25         69 local $1;
122 25         172 $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
  238         860  
123 25         200 $str;
124             }
125              
126             # _decode_B STRING
127             # Private: used by _decode_header() to decode "B" encoding.
128             sub _decode_B {
129 12     12   39 my $str = shift;
130 12         71 decode_base64($str);
131             }
132              
133             # _encode_B STRING
134             # Private: used by _decode_header() to decode "B" encoding.
135             sub _encode_B {
136 0     0   0 my $str = shift;
137 0         0 encode_base64($str, '');
138             }
139              
140              
141              
142             #------------------------------
143              
144             =item decode_mimewords ENCODED
145              
146             I
147             Go through the string looking for RFC 2047-style "Q"
148             (quoted-printable, sort of) or "B" (base64) encoding, and decode them.
149              
150             B splits the ENCODED string into a list of decoded
151             C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
152             data are returned in a 1-element array C<[DATA]>, giving an effective
153             CHARSET of C.
154              
155             $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ';
156             foreach (decode_mimewords($enc)) {
157             print "", ($_->[1] || 'US-ASCII'), ": ", $_->[0], "\n";
158             }
159              
160             B joins the "data" elements of the above
161             list together, and returns that. I
162             and probably I what you want, but if you know that all charsets
163             in the ENCODED string are identical, it might be useful to you.
164             (Before you use this, please see L,
165             which is probably what you want.)
166              
167             In the event of a syntax error, $@ will be set to a description
168             of the error, but parsing will continue as best as possible (so as to
169             get I back when decoding headers).
170             $@ will be false if no error was detected.
171              
172             Any arguments past the ENCODED string are taken to define a hash of options:
173              
174             =cut
175              
176             sub decode_mimewords {
177 131     131 1 9432 my $encstr = shift;
178 131         177 my @tokens;
179 131         423 local($1,$2,$3);
180 131         214 $@ = ''; ### error-return
181              
182             ### Collapse boundaries between adjacent encoded words:
183 131         346 $encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
184 131         332 pos($encstr) = 0;
185             ### print STDOUT "ENC = [", $encstr, "]\n";
186              
187             ### Decode:
188 131         229 my ($charset, $encoding, $enc, $dec);
189 131         162 while (1) {
190 337 100       806 last if (pos($encstr) >= length($encstr));
191 206         283 my $pos = pos($encstr); ### save it
192              
193             ### Case 1: are we looking at "=?..?..?="?
194 206 100       769 if ($encstr =~ m{\G # from where we left off..
195             =\?([^?]*) # "=?" + charset +
196             \?([bq]) # "?" + encoding +
197             \?([^?]+) # "?" + data maybe with spcs +
198             \?= # "?="
199             }xgi) {
200 66         235 ($charset, $encoding, $enc) = ($1, lc($2), $3);
201 66 100       238 $dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
202 66         176 push @tokens, [$dec, $charset];
203 66         116 next;
204             }
205              
206             ### Case 2: are we looking at a bad "=?..." prefix?
207             ### We need this to detect problems for case 3, which stops at "=?":
208 140         266 pos($encstr) = $pos; # reset the pointer.
209 140 100       433 if ($encstr =~ m{\G=\?}xg) {
210 4         15 $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
211 4         9 push @tokens, ['=?'];
212 4         7 next;
213             }
214              
215             ### Case 3: are we looking at ordinary text?
216 136         225 pos($encstr) = $pos; # reset the pointer.
217 136 50       1167 if ($encstr =~ m{\G # from where we left off...
218             (.*? # shortest possible string,
219             \n*) # followed by 0 or more NLs,
220             (?=(\Z|=\?)) # terminated by "=?" or EOS
221             }sxg) {
222 136 50       380 length($1) or die "MIME::Words: internal logic err: empty token\n";
223 136         424 push @tokens, [$1];
224 136         222 next;
225             }
226              
227             ### Case 4: bug!
228 0         0 die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
229             "Please alert developer.\n";
230             }
231 131 100       594 return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
  86         399  
232             }
233              
234             #------------------------------
235              
236             =item encode_mimeword RAW, [ENCODING], [CHARSET]
237              
238             I
239             Encode a single RAW "word" that has unsafe characters.
240             The "word" will be encoded in its entirety.
241              
242             ### Encode "<>":
243             $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
244              
245             You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
246             You may specify the CHARSET, which defaults to C.
247              
248             =cut
249              
250             sub encode_mimeword {
251 25     25 1 62 my $word = shift;
252 25   50     100 my $encoding = uc(shift || 'Q');
253 25   50     81 my $charset = uc(shift || 'ISO-8859-1');
254 25 50       91 my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
255 25         98 "=?$charset?$encoding?" . &$encfunc($word) . "?=";
256             }
257              
258             #------------------------------
259              
260             =item encode_mimewords RAW, [OPTS]
261              
262             I
263             Given a RAW string, try to find and encode all "unsafe" sequences
264             of characters:
265              
266             ### Encode a string with some unsafe "words":
267             $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
268              
269             Returns the encoded string.
270             Any arguments past the RAW string are taken to define a hash of options:
271              
272             =over 4
273              
274             =item Charset
275              
276             Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
277             a.k.a. "Latin-1".
278              
279             =item Encoding
280              
281             The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
282              
283             =back
284              
285             B this is a quick-and-dirty solution, intended for character
286             sets which overlap ASCII. B
287             rules regarding the use of encoded words in message headers>.
288             You may want to roll your own variant,
289             using C, for your application.
290             I
291              
292             =cut
293              
294             sub encode_mimewords {
295 6     6 1 1808 my ($rawstr, %params) = @_;
296 6   100     50 my $charset = $params{Charset} || 'ISO-8859-1';
297 6   50     45 my $encoding = lc($params{Encoding} || 'q');
298              
299             ### Encode any "words" with unsafe characters.
300             ### We limit such words to 18 characters, to guarantee that the
301             ### worst-case encoding give us no more than 54 + ~10 < 75 characters
302 6         13 my $word;
303 6         22 local $1;
304 6         45 $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]+\s*)}{ ### get next "word"
305 51         126 $word = $1;
306 51 100       677 (($word !~ /(?:[$NONPRINT])|(?:^\s+$)/o)
307             ? $word ### no unsafe chars
308             : encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
309             }xeg;
310 6         160 $rawstr =~ s/\?==\?/?= =?/g;
311 6         113 $rawstr;
312             }
313              
314             1;
315             __END__