File Coverage

blib/lib/MIME/Words.pm
Criterion Covered Total %
statement 73 76 96.0
branch 15 18 83.3
condition 5 8 62.5
subroutine 12 13 92.3
pod 3 3 100.0
total 108 118 91.5


line stmt bran cond sub pod time code
1             package MIME::Words;
2              
3             =head1 NAME
4              
5             MIME::Words - deal with RFC 2047 encoded words
6              
7              
8             =head1 SYNOPSIS
9              
10             Before reading further, you should see L to make sure that
11             you understand where this module fits into the grand scheme of things.
12             Go on, do it now. I'll wait.
13              
14             Ready? Ok...
15              
16              
17             use MIME::Words qw(:all);
18              
19             ### Decode the string into another string, forgetting the charsets:
20             $decoded = decode_mimewords(
21             'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
22             );
23              
24             ### Split string into array of decoded [DATA,CHARSET] pairs:
25             @decoded = decode_mimewords(
26             'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
27             );
28              
29             ### Encode a single unsafe word:
30             $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
31              
32             ### Encode a string, trying to find the unsafe words inside it:
33             $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
34              
35              
36              
37             =head1 DESCRIPTION
38              
39             Fellow Americans, you probably won't know what the hell this module
40             is for. Europeans, Russians, et al, you probably do. C<:-)>.
41              
42             For example, here's a valid MIME header you might get:
43              
44             From: =?US-ASCII?Q?Keith_Moore?=
45             To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=
46             CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard
47             Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
48             =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
49             =?US-ASCII?Q?.._cool!?=
50              
51             The fields basically decode to (sorry, I can only approximate the
52             Latin characters with 7 bit sequences /o and 'e):
53              
54             From: Keith Moore
55             To: Keld J/orn Simonsen
56             CC: Andr'e Pirard
57             Subject: If you can read this you understand the example... cool!
58              
59              
60             =head1 PUBLIC INTERFACE
61              
62             =over 4
63              
64             =cut
65              
66             require 5.001;
67              
68             ### Pragmas:
69 31     31   168116 use strict;
  31         36  
  31         834  
70 31     31   107 use re 'taint';
  31         36  
  31         1383  
71 31     31   109 use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
  31         38  
  31         1796  
72              
73             ### Exporting:
74 31     31   117 use Exporter;
  31         39  
  31         1923  
75             %EXPORT_TAGS = (all => [qw(decode_mimewords
76             encode_mimeword
77             encode_mimewords
78             )]);
79             Exporter::export_ok_tags('all');
80              
81             ### Inheritance:
82             @ISA = qw(Exporter);
83              
84             ### Other modules:
85 31     31   12810 use MIME::Base64;
  31         16371  
  31         1589  
86 31     31   12444 use MIME::QuotedPrint;
  31         5028  
  31         23226  
87              
88              
89              
90             #------------------------------
91             #
92             # Globals...
93             #
94             #------------------------------
95              
96             ### The package version, both in 1.23 style *and* usable by MakeMaker:
97             $VERSION = "5.509";
98              
99             ### Nonprintables (controls + x7F + 8bit):
100             my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
101              
102              
103             #------------------------------
104              
105             # _decode_Q STRING
106             # Private: used by _decode_header() to decode "Q" encoding, which is
107             # almost, but not exactly, quoted-printable. :-P
108             sub _decode_Q {
109 54     54   46 my $str = shift;
110 54         73 local $1;
111 54         87 $str =~ s/_/\x20/g; # RFC-1522, Q rule 2
112 54         131 $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
  272         510  
113 54         102 $str;
114             }
115              
116             # _encode_Q STRING
117             # Private: used by _encode_header() to decode "Q" encoding, which is
118             # almost, but not exactly, quoted-printable. :-P
119             sub _encode_Q {
120 25     25   16 my $str = shift;
121 25         30 local $1;
122 25         103 $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
  238         336  
123 25         92 $str;
124             }
125              
126             # _decode_B STRING
127             # Private: used by _decode_header() to decode "B" encoding.
128             sub _decode_B {
129 12     12   21 my $str = shift;
130 12         64 decode_base64($str);
131             }
132              
133             # _encode_B STRING
134             # Private: used by _decode_header() to decode "B" encoding.
135             sub _encode_B {
136 0     0   0 my $str = shift;
137 0         0 encode_base64($str, '');
138             }
139              
140              
141              
142             #------------------------------
143              
144             =item decode_mimewords ENCODED
145              
146             I
147             Go through the string looking for RFC 2047-style "Q"
148             (quoted-printable, sort of) or "B" (base64) encoding, and decode them.
149              
150             B splits the ENCODED string into a list of decoded
151             C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
152             data are returned in a 1-element array C<[DATA]>, giving an effective
153             CHARSET of C.
154              
155             $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ';
156             foreach (decode_mimewords($enc)) {
157             print "", ($_->[1] || 'US-ASCII'), ": ", $_->[0], "\n";
158             }
159              
160             B joins the "data" elements of the above
161             list together, and returns that. I
162             and probably I what you want, but if you know that all charsets
163             in the ENCODED string are identical, it might be useful to you.
164             (Before you use this, please see L,
165             which is probably what you want.)
166              
167             In the event of a syntax error, $@ will be set to a description
168             of the error, but parsing will continue as best as possible (so as to
169             get I back when decoding headers).
170             $@ will be false if no error was detected.
171              
172             Any arguments past the ENCODED string are taken to define a hash of options:
173              
174             =cut
175              
176             sub decode_mimewords {
177 132     132 1 5312 my $encstr = shift;
178 132         112 my @tokens;
179 132         289 local($1,$2,$3);
180 132         132 $@ = ''; ### error-return
181              
182             ### Collapse boundaries between adjacent encoded words:
183 132         284 $encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
184 132         299 pos($encstr) = 0;
185             ### print STDOUT "ENC = [", $encstr, "]\n";
186              
187             ### Decode:
188 132         149 my ($charset, $encoding, $enc, $dec);
189 132         106 while (1) {
190 339 100       544 last if (pos($encstr) >= length($encstr));
191 207         184 my $pos = pos($encstr); ### save it
192              
193             ### Case 1: are we looking at "=?..?..?="?
194 207 100       524 if ($encstr =~ m{\G # from where we left off..
195             =\?([^?]*) # "=?" + charset +
196             \?([bq]) # "?" + encoding +
197             \?([^?]+) # "?" + data maybe with spcs +
198             \?= # "?="
199             }xgi) {
200 66         156 ($charset, $encoding, $enc) = ($1, lc($2), $3);
201 66 100       160 $dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
202 66         110 push @tokens, [$dec, $charset];
203 66         63 next;
204             }
205              
206             ### Case 2: are we looking at a bad "=?..." prefix?
207             ### We need this to detect problems for case 3, which stops at "=?":
208 141         175 pos($encstr) = $pos; # reset the pointer.
209 141 100       272 if ($encstr =~ m{\G=\?}xg) {
210 4         9 $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
211 4         8 push @tokens, ['=?'];
212 4         4 next;
213             }
214              
215             ### Case 3: are we looking at ordinary text?
216 137         139 pos($encstr) = $pos; # reset the pointer.
217 137 50       772 if ($encstr =~ m{\G # from where we left off...
218             (.*? # shortest possible string,
219             \n*) # followed by 0 or more NLs,
220             (?=(\Z|=\?)) # terminated by "=?" or EOS
221             }sxg) {
222 137 50       333 length($1) or die "MIME::Words: internal logic err: empty token\n";
223 137         284 push @tokens, [$1];
224 137         146 next;
225             }
226              
227             ### Case 4: bug!
228 0         0 die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
229             "Please alert developer.\n";
230             }
231 132 100       497 return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
  86         219  
232             }
233              
234             #------------------------------
235              
236             =item encode_mimeword RAW, [ENCODING], [CHARSET]
237              
238             I
239             Encode a single RAW "word" that has unsafe characters.
240             The "word" will be encoded in its entirety.
241              
242             ### Encode "<>":
243             $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
244              
245             You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
246             You may specify the CHARSET, which defaults to C.
247              
248             =cut
249              
250             sub encode_mimeword {
251 25     25 1 132 my $word = shift;
252 25   50     43 my $encoding = uc(shift || 'Q');
253 25   50     42 my $charset = uc(shift || 'ISO-8859-1');
254 25 50       39 my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
255 25         47 "=?$charset?$encoding?" . &$encfunc($word) . "?=";
256             }
257              
258             #------------------------------
259              
260             =item encode_mimewords RAW, [OPTS]
261              
262             I
263             Given a RAW string, try to find and encode all "unsafe" sequences
264             of characters:
265              
266             ### Encode a string with some unsafe "words":
267             $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
268              
269             Returns the encoded string.
270             Any arguments past the RAW string are taken to define a hash of options:
271              
272             =over 4
273              
274             =item Charset
275              
276             Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
277             a.k.a. "Latin-1".
278              
279             =item Encoding
280              
281             The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
282              
283             =back
284              
285             B this is a quick-and-dirty solution, intended for character
286             sets which overlap ASCII. B
287             rules regarding the use of encoded words in message headers>.
288             You may want to roll your own variant,
289             using C, for your application.
290             I
291              
292             =cut
293              
294             sub encode_mimewords {
295 6     6 1 937 my ($rawstr, %params) = @_;
296 6   100     28 my $charset = $params{Charset} || 'ISO-8859-1';
297 6   50     27 my $encoding = lc($params{Encoding} || 'q');
298              
299             ### Encode any "words" with unsafe characters.
300             ### We limit such words to 18 characters, to guarantee that the
301             ### worst-case encoding give us no more than 54 + ~10 < 75 characters
302 6         5 my $word;
303 6         14 local $1;
304 6         32 $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]+\s*)}{ ### get next "word"
305 51         53 $word = $1;
306 51 100       232 (($word !~ /(?:[$NONPRINT])|(?:^\s+$)/o)
307             ? $word ### no unsafe chars
308             : encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
309             }xeg;
310 6         22 $rawstr =~ s/\?==\?/?= =?/g;
311 6         26 $rawstr;
312             }
313              
314             1;
315             __END__