File Coverage

blib/lib/MIME/Words.pm
Criterion Covered Total %
statement 73 76 96.0
branch 15 18 83.3
condition 5 8 62.5
subroutine 12 13 92.3
pod 3 3 100.0
total 108 118 91.5


line stmt bran cond sub pod time code
1             package MIME::Words;
2              
3             =head1 NAME
4              
5             MIME::Words - deal with RFC 2047 encoded words
6              
7              
8             =head1 SYNOPSIS
9              
10             Before reading further, you should see L to make sure that
11             you understand where this module fits into the grand scheme of things.
12             Go on, do it now. I'll wait.
13              
14             Ready? Ok...
15              
16              
17             use MIME::Words qw(:all);
18              
19             ### Decode the string into another string, forgetting the charsets:
20             $decoded = decode_mimewords(
21             'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
22             );
23              
24             ### Split string into array of decoded [DATA,CHARSET] pairs:
25             @decoded = decode_mimewords(
26             'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
27             );
28              
29             ### Encode a single unsafe word:
30             $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
31              
32             ### Encode a string, trying to find the unsafe words inside it:
33             $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
34              
35              
36              
37             =head1 DESCRIPTION
38              
39             Fellow Americans, you probably won't know what the hell this module
40             is for. Europeans, Russians, et al, you probably do. C<:-)>.
41              
42             For example, here's a valid MIME header you might get:
43              
44             From: =?US-ASCII?Q?Keith_Moore?=
45             To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=
46             CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard
47             Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
48             =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
49             =?US-ASCII?Q?.._cool!?=
50              
51             The fields basically decode to (sorry, I can only approximate the
52             Latin characters with 7 bit sequences /o and 'e):
53              
54             From: Keith Moore
55             To: Keld J/orn Simonsen
56             CC: Andr'e Pirard
57             Subject: If you can read this you understand the example... cool!
58              
59              
60             =head1 PUBLIC INTERFACE
61              
62             =over 4
63              
64             =cut
65              
66             require 5.001;
67              
68             ### Pragmas:
69 29     29   151589 use strict;
  29         33  
  29         701  
70 29     29   98 use re 'taint';
  29         35  
  29         1164  
71 29     29   90 use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
  29         32  
  29         1550  
72              
73             ### Exporting:
74 29     29   101 use Exporter;
  29         31  
  29         1557  
75             %EXPORT_TAGS = (all => [qw(decode_mimewords
76             encode_mimeword
77             encode_mimewords
78             )]);
79             Exporter::export_ok_tags('all');
80              
81             ### Inheritance:
82             @ISA = qw(Exporter);
83              
84             ### Other modules:
85 29     29   11153 use MIME::Base64;
  29         13169  
  29         1404  
86 29     29   10445 use MIME::QuotedPrint;
  29         4382  
  29         20586  
87              
88              
89              
90             #------------------------------
91             #
92             # Globals...
93             #
94             #------------------------------
95              
96             ### The package version, both in 1.23 style *and* usable by MakeMaker:
97             $VERSION = "5.508";
98              
99             ### Nonprintables (controls + x7F + 8bit):
100             my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
101              
102              
103             #------------------------------
104              
105             # _decode_Q STRING
106             # Private: used by _decode_header() to decode "Q" encoding, which is
107             # almost, but not exactly, quoted-printable. :-P
108             sub _decode_Q {
109 54     54   50 my $str = shift;
110 54         75 local $1;
111 54         81 $str =~ s/_/\x20/g; # RFC-1522, Q rule 2
112 54         118 $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
  272         514  
113 54         101 $str;
114             }
115              
116             # _encode_Q STRING
117             # Private: used by _encode_header() to decode "Q" encoding, which is
118             # almost, but not exactly, quoted-printable. :-P
119             sub _encode_Q {
120 25     25   22 my $str = shift;
121 25         31 local $1;
122 25         97 $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
  238         342  
123 25         90 $str;
124             }
125              
126             # _decode_B STRING
127             # Private: used by _decode_header() to decode "B" encoding.
128             sub _decode_B {
129 12     12   25 my $str = shift;
130 12         54 decode_base64($str);
131             }
132              
133             # _encode_B STRING
134             # Private: used by _decode_header() to decode "B" encoding.
135             sub _encode_B {
136 0     0   0 my $str = shift;
137 0         0 encode_base64($str, '');
138             }
139              
140              
141              
142             #------------------------------
143              
144             =item decode_mimewords ENCODED
145              
146             I
147             Go through the string looking for RFC 2047-style "Q"
148             (quoted-printable, sort of) or "B" (base64) encoding, and decode them.
149              
150             B splits the ENCODED string into a list of decoded
151             C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
152             data are returned in a 1-element array C<[DATA]>, giving an effective
153             CHARSET of C.
154              
155             $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ';
156             foreach (decode_mimewords($enc)) {
157             print "", ($_->[1] || 'US-ASCII'), ": ", $_->[0], "\n";
158             }
159              
160             B joins the "data" elements of the above
161             list together, and returns that. I
162             and probably I what you want, but if you know that all charsets
163             in the ENCODED string are identical, it might be useful to you.
164             (Before you use this, please see L,
165             which is probably what you want.)
166              
167             In the event of a syntax error, $@ will be set to a description
168             of the error, but parsing will continue as best as possible (so as to
169             get I back when decoding headers).
170             $@ will be false if no error was detected.
171              
172             Any arguments past the ENCODED string are taken to define a hash of options:
173              
174             =cut
175              
176             sub decode_mimewords {
177 131     131 1 4738 my $encstr = shift;
178 131         114 my @tokens;
179 131         284 local($1,$2,$3);
180 131         133 $@ = ''; ### error-return
181              
182             ### Collapse boundaries between adjacent encoded words:
183 131         234 $encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
184 131         216 pos($encstr) = 0;
185             ### print STDOUT "ENC = [", $encstr, "]\n";
186              
187             ### Decode:
188 131         151 my ($charset, $encoding, $enc, $dec);
189 131         102 while (1) {
190 337 100       530 last if (pos($encstr) >= length($encstr));
191 206         164 my $pos = pos($encstr); ### save it
192              
193             ### Case 1: are we looking at "=?..?..?="?
194 206 100       487 if ($encstr =~ m{\G # from where we left off..
195             =\?([^?]*) # "=?" + charset +
196             \?([bq]) # "?" + encoding +
197             \?([^?]+) # "?" + data maybe with spcs +
198             \?= # "?="
199             }xgi) {
200 66         147 ($charset, $encoding, $enc) = ($1, lc($2), $3);
201 66 100       164 $dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
202 66         106 push @tokens, [$dec, $charset];
203 66         60 next;
204             }
205              
206             ### Case 2: are we looking at a bad "=?..." prefix?
207             ### We need this to detect problems for case 3, which stops at "=?":
208 140         154 pos($encstr) = $pos; # reset the pointer.
209 140 100       245 if ($encstr =~ m{\G=\?}xg) {
210 4         13 $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
211 4         6 push @tokens, ['=?'];
212 4         5 next;
213             }
214              
215             ### Case 3: are we looking at ordinary text?
216 136         130 pos($encstr) = $pos; # reset the pointer.
217 136 50       796 if ($encstr =~ m{\G # from where we left off...
218             (.*? # shortest possible string,
219             \n*) # followed by 0 or more NLs,
220             (?=(\Z|=\?)) # terminated by "=?" or EOS
221             }sxg) {
222 136 50       267 length($1) or die "MIME::Words: internal logic err: empty token\n";
223 136         230 push @tokens, [$1];
224 136         139 next;
225             }
226              
227             ### Case 4: bug!
228 0         0 die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
229             "Please alert developer.\n";
230             }
231 131 100       410 return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
  86         221  
232             }
233              
234             #------------------------------
235              
236             =item encode_mimeword RAW, [ENCODING], [CHARSET]
237              
238             I
239             Encode a single RAW "word" that has unsafe characters.
240             The "word" will be encoded in its entirety.
241              
242             ### Encode "<>":
243             $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
244              
245             You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
246             You may specify the CHARSET, which defaults to C.
247              
248             =cut
249              
250             sub encode_mimeword {
251 25     25 1 149 my $word = shift;
252 25   50     47 my $encoding = uc(shift || 'Q');
253 25   50     37 my $charset = uc(shift || 'ISO-8859-1');
254 25 50       36 my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
255 25         44 "=?$charset?$encoding?" . &$encfunc($word) . "?=";
256             }
257              
258             #------------------------------
259              
260             =item encode_mimewords RAW, [OPTS]
261              
262             I
263             Given a RAW string, try to find and encode all "unsafe" sequences
264             of characters:
265              
266             ### Encode a string with some unsafe "words":
267             $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
268              
269             Returns the encoded string.
270             Any arguments past the RAW string are taken to define a hash of options:
271              
272             =over 4
273              
274             =item Charset
275              
276             Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
277             a.k.a. "Latin-1".
278              
279             =item Encoding
280              
281             The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
282              
283             =back
284              
285             B this is a quick-and-dirty solution, intended for character
286             sets which overlap ASCII. B
287             rules regarding the use of encoded words in message headers>.
288             You may want to roll your own variant,
289             using C, for your application.
290             I
291              
292             =cut
293              
294             sub encode_mimewords {
295 6     6 1 891 my ($rawstr, %params) = @_;
296 6   100     27 my $charset = $params{Charset} || 'ISO-8859-1';
297 6   50     27 my $encoding = lc($params{Encoding} || 'q');
298              
299             ### Encode any "words" with unsafe characters.
300             ### We limit such words to 18 characters, to guarantee that the
301             ### worst-case encoding give us no more than 54 + ~10 < 75 characters
302 6         7 my $word;
303 6         13 local $1;
304 6         32 $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]+\s*)}{ ### get next "word"
305 51         54 $word = $1;
306 51 100       217 (($word !~ /(?:[$NONPRINT])|(?:^\s+$)/o)
307             ? $word ### no unsafe chars
308             : encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
309             }xeg;
310 6         25 $rawstr =~ s/\?==\?/?= =?/g;
311 6         23 $rawstr;
312             }
313              
314             1;
315             __END__