File Coverage

blib/lib/Email/MIME/ContentType.pm
Criterion Covered Total %
statement 214 243 88.0
branch 109 158 68.9
condition 30 48 62.5
subroutine 17 17 100.0
pod 4 4 100.0
total 374 470 79.5


line stmt bran cond sub pod time code
1 4     4   238024 use strict;
  4         38  
  4         113  
2 4     4   20 use warnings;
  4         8  
  4         210  
3             package Email::MIME::ContentType;
4             # ABSTRACT: Parse and build a MIME Content-Type or Content-Disposition Header
5             $Email::MIME::ContentType::VERSION = '1.026';
6 4     4   23 use Carp;
  4         7  
  4         300  
7 4     4   2091 use Encode 2.87 qw(encode find_mime_encoding);
  4         36615  
  4         270  
8 4     4   25 use Exporter 5.57 'import';
  4         46  
  4         97  
9 4     4   1774 use Text::Unidecode;
  4         8476  
  4         5036  
10              
11             our @EXPORT = qw(parse_content_type parse_content_disposition build_content_type build_content_disposition);
12              
13             #pod =head1 SYNOPSIS
14             #pod
15             #pod use Email::MIME::ContentType;
16             #pod
17             #pod # Content-Type: text/plain; charset="us-ascii"; format=flowed
18             #pod my $ct = 'text/plain; charset="us-ascii"; format=flowed';
19             #pod my $data = parse_content_type($ct);
20             #pod
21             #pod $data = {
22             #pod type => "text",
23             #pod subtype => "plain",
24             #pod attributes => {
25             #pod charset => "us-ascii",
26             #pod format => "flowed"
27             #pod }
28             #pod };
29             #pod
30             #pod my $ct_new = build_content_type($data);
31             #pod # text/plain; charset=us-ascii; format=flowed
32             #pod
33             #pod
34             #pod # Content-Type: application/x-stuff;
35             #pod # title*0*=us-ascii'en'This%20is%20even%20more%20;
36             #pod # title*1*=%2A%2A%2Afun%2A%2A%2A%20;
37             #pod # title*2="isn't it!"
38             #pod my $ct = q(application/x-stuff;
39             #pod title*0*=us-ascii'en'This%20is%20even%20more%20;
40             #pod title*1*=%2A%2A%2Afun%2A%2A%2A%20;
41             #pod title*2="isn't it!");
42             #pod my $data = parse_content_type($ct);
43             #pod
44             #pod $data = {
45             #pod type => "application",
46             #pod subtype => "x-stuff",
47             #pod attributes => {
48             #pod title => "This is even more ***fun*** isn't it!"
49             #pod }
50             #pod };
51             #pod
52             #pod
53             #pod # Content-Disposition: attachment; filename=genome.jpeg;
54             #pod # modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
55             #pod my $cd = q(attachment; filename=genome.jpeg;
56             #pod modification-date="Wed, 12 Feb 1997 16:29:51 -0500");
57             #pod my $data = parse_content_disposition($cd);
58             #pod
59             #pod $data = {
60             #pod type => "attachment",
61             #pod attributes => {
62             #pod filename => "genome.jpeg",
63             #pod "modification-date" => "Wed, 12 Feb 1997 16:29:51 -0500"
64             #pod }
65             #pod };
66             #pod
67             #pod my $cd_new = build_content_disposition($data);
68             #pod # attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
69             #pod
70             #pod =cut
71              
72             our $STRICT_PARAMS = 1;
73              
74             my $ct_default = 'text/plain; charset=us-ascii';
75              
76             my $re_token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/; # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
77             my $re_token_non_strict = qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/; # allow CTLs and above ASCII
78              
79             my $re_qtext = qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/; # US-ASCII except CR, LF, white space, backslash and quote
80             my $re_quoted_pair = qr/\\[\x00-\x7F]/;
81             my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
82              
83             my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
84             my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
85             my $re_quoted_string_non_strict = qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
86              
87             my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
88             my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
89             my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
90              
91             sub parse_content_type {
92 85     85 1 55758 my $ct = shift;
93              
94             # If the header isn't there or is empty, give default answer.
95 85 100 66     362 return parse_content_type($ct_default) unless defined $ct and length $ct;
96              
97 83         211 _unfold_lines($ct);
98 83         207 _clean_comments($ct);
99              
100             # It is also recommend (sic.) that this default be assumed when a
101             # syntactically invalid Content-Type header field is encountered.
102 83 50       682 unless ($ct =~ s/^($re_token)\/($re_token)//) {
103 0 0 0     0 unless ($STRICT_PARAMS and $ct =~ s/^($re_token_non_strict)\/($re_token_non_strict)//) {
104 0         0 carp "Invalid Content-Type '$ct'";
105 0         0 return parse_content_type($ct_default);
106             }
107             }
108              
109 83         385 my ($type, $subtype) = (lc $1, lc $2);
110              
111 83         181 _clean_comments($ct);
112 83         279 $ct =~ s/\s+$//;
113              
114 83         153 my $attributes = {};
115 83 50 100     368 if ($STRICT_PARAMS and length $ct and $ct !~ /^;/) {
      66        
116 0         0 carp "Missing semicolon before first Content-Type parameter '$ct'";
117             } else {
118 83         179 $attributes = _process_rfc2231(_parse_attributes($ct));
119             }
120              
121             return {
122 83         490 type => $type,
123             subtype => $subtype,
124             attributes => $attributes,
125              
126             # This is dumb. Really really dumb. For backcompat. -- rjbs,
127             # 2013-08-10
128             discrete => $type,
129             composite => $subtype,
130             };
131             }
132              
133             my $cd_default = 'attachment';
134              
135             sub parse_content_disposition {
136 34     34 1 16789 my $cd = shift;
137              
138 34 100 66     254 return parse_content_disposition($cd_default) unless defined $cd and length $cd;
139              
140 32         72 _unfold_lines($cd);
141 32         56 _clean_comments($cd);
142              
143 32 50       212 unless ($cd =~ s/^($re_token)//) {
144 0 0 0     0 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
145 0         0 carp "Invalid Content-Disposition '$cd'";
146 0         0 return parse_content_disposition($cd_default);
147             }
148             }
149              
150 32         91 my $type = lc $1;
151              
152 32         65 _clean_comments($cd);
153 32         88 $cd =~ s/\s+$//;
154              
155 32         46 my $attributes = {};
156 32 50 100     170 if ($STRICT_PARAMS and length $cd and $cd !~ /^;/) {
      66        
157 0         0 carp "Missing semicolon before first Content-Disposition parameter '$cd'";
158             } else {
159 32         73 $attributes = _process_rfc2231(_parse_attributes($cd));
160             }
161              
162             return {
163 32         139 type => $type,
164             attributes => $attributes,
165             };
166             }
167              
168             my $re_invalid_for_quoted_value = qr/[\x00-\x08\x0A-\x1F\x7F-\xFF]/; # non-US-ASCII and CTLs without SPACE and TAB
169             my $re_escape_extended_value = qr/[\x00-\x20\x7F-\xFF\*'%()<>@,;:\\"\/\[\]?=]/; # non-US-ASCII, SPACE, CTLs, *'% and tspecials ()<>@,;:\\"/[]?=
170              
171             sub build_content_type {
172 15     15 1 8410 my $ct = shift;
173              
174 15 50       37 croak 'Missing Content-Type \'type\' parameter' unless exists $ct->{type};
175 15 50       24 croak 'Missing Content-Type \'subtype\' parameter' unless exists $ct->{subtype};
176              
177 15 50       126 croak 'Invalid Content-Type \'type\' parameter' if $ct->{type} !~ /^(?:$re_token)*$/;
178 15 50       75 croak 'Invalid Content-Type \'subtype\' parameter' if $ct->{subtype} !~ /^(?:$re_token)*$/;
179              
180 15 50       39 croak 'Too long Content-Type \'type\' and \'subtype\' parameters' if length($ct->{type}) + length($ct->{subtype}) > 76;
181              
182 15         21 my ($extra) = grep !/(?:type|subtype|attributes)/, sort keys %{$ct};
  15         141  
183 15 50       36 croak "Extra Content-Type '$extra' parameter" if defined $extra;
184              
185 15         33 my $ret = $ct->{type} . '/' . $ct->{subtype};
186 15 50       40 my $attrs = exists $ct->{attributes} ? _build_attributes($ct->{attributes}) : '';
187 15 100       40 $ret .= "; $attrs" if length($attrs);
188 15         37 return $ret;
189             }
190              
191             sub build_content_disposition {
192 13     13 1 7300 my $cd = shift;
193              
194 13 50       32 croak 'Missing Content-Type \'type\' parameter' unless exists $cd->{type};
195              
196 13 50       114 croak 'Invalid Content-Type \'type\' parameter' if $cd->{type} !~ /^(?:$re_token)*$/;
197              
198 13 50       34 croak 'Too long Content-Type \'type\' parameter' if length($cd->{type}) > 77;
199              
200 13         19 my ($extra) = grep !/(?:type|attributes)/, sort keys %{$cd};
  13         106  
201 13 50       30 croak "Extra Content-Type '$extra' parameter" if defined $extra;
202              
203 13         22 my $ret = $cd->{type};
204 13 50       35 my $attrs = exists $cd->{attributes} ? _build_attributes($cd->{attributes}) : '';
205 13 100       31 $ret .= "; $attrs" if length($attrs);
206 13         32 return $ret;
207             }
208              
209             sub _build_attributes {
210 28     28   37 my $attributes = shift;
211              
212 28         35 my $ret = '';
213              
214 28         34 foreach my $key (sort keys %{$attributes}) {
  28         74  
215 38         57 my $value = $attributes->{$key};
216 38         49 my $ascii_value = $value;
217 38         56 my @continuous_value;
218             my $extended_value_charset;
219              
220 38 50       130 croak "Invalid attribute '$key'" if $key =~ /$re_escape_extended_value/; # complement to attribute-char in 8bit space
221 38 50       66 croak "Undefined attribute '$key'" unless defined $value;
222              
223 4 100   4   45 if ($value =~ /\P{ASCII}/) {
  4         21  
  4         54  
  38         84  
224 7         23 $ascii_value = unidecode($value);
225 7         2627 $ascii_value =~ s/\P{ASCII}/_/g;
226 7         41 @continuous_value = map { encode('UTF-8', $_) } split //, $value;
  197         5888  
227 7         227 $extended_value_charset = 'UTF-8';
228             }
229              
230 38 100 100     285 if ($ascii_value !~ /^(?:$re_token)*$/ or $ascii_value =~ /'/) {
231 20 50       64 if ($ascii_value =~ /$re_invalid_for_quoted_value/) {
232 0 0       0 @continuous_value = split //, $value unless @continuous_value;
233 0         0 $ascii_value =~ s/[\n\r]/ /g;
234 0         0 $ascii_value =~ s/$re_invalid_for_quoted_value/_/g;
235             }
236 20         52 $ascii_value =~ s/(["\\])/\\$1/g;
237 20         37 $ascii_value = "\"$ascii_value\"";
238             }
239              
240 38 100       81 if (length($key) + length($ascii_value) > 75) { # length(" $key=$ascii_value;") > 78
241 3 50       10 croak "Too long attribute '$key'" if length($key) > 71; # length(" $key=...;") > 78
242 3 100       11 my $pos = $ascii_value =~ /"$/ ? 71 : 72;
243 3         11 substr($ascii_value, $pos - length($key), length($ascii_value) + length($key) - 72, '...');
244 3 50       44 @continuous_value = split //, $value unless @continuous_value;
245             }
246              
247 38 100       66 if (@continuous_value) {
248 10         14 my $needs_quote;
249 10 100       21 unless (defined $extended_value_charset) {
250 3 100       6 $needs_quote = 1 if grep { $_ !~ /^(?:$re_token)*$/ or $_ =~ /'/ } @continuous_value;
  204 100       898  
251 3 50 66     69 $extended_value_charset = 'US-ASCII' if $needs_quote and grep /$re_invalid_for_quoted_value/, @continuous_value;
252             }
253              
254 10         13 my $add_param_len = 4; # for '; *='
255 10 100       25 if (defined $extended_value_charset) {
    100          
256 7         124 $_ =~ s/($re_escape_extended_value)/sprintf('%%%02X', ord($1))/eg foreach @continuous_value;
  71         345  
257 7         20 substr($continuous_value[0], 0, 0, "$extended_value_charset''");
258 7         12 $add_param_len += 1; # for '*' - charset
259             } elsif ($needs_quote) {
260 2         38 $_ =~ s/(["\\])/\\$1/g foreach @continuous_value;
261 2         5 $add_param_len += 2; # for quotes
262             }
263              
264 10 100 100     60 if ($value =~ /\P{ASCII}/ and length(my $oneparameter = "; $key*=" . join '', @continuous_value) <= 78) {
265 4         9 $ret .= $oneparameter;
266             } else {
267 6         11 my $buf = '';
268 6         7 my $count = 0;
269 6         13 foreach (@continuous_value) {
270 308 100       458 if (length($key) + length($count) + length($buf) + length($_) + $add_param_len > 78) {
271 6 100       13 $buf = "\"$buf\"" if $needs_quote;
272 6         13 my $parameter = "; $key*$count";
273 6 100       14 $parameter .= '*' if defined $extended_value_charset;
274 6         11 $parameter .= "=$buf";
275 6 50       14 croak "Too long attribute '$key'" if length($parameter) > 78;
276 6         8 $ret .= $parameter;
277 6         9 $buf = '';
278 6         9 $count++;
279             }
280 308         354 $buf .= $_;
281             }
282 6 50       13 if (length($buf)) {
283 6 100       12 $buf = "\"$buf\"" if $needs_quote;
284 6         15 my $parameter = "; $key*$count";
285 6 100       13 $parameter .= '*' if defined $extended_value_charset;
286 6         11 $parameter .= "=$buf";
287 6 50       11 croak "Too long attribute '$key'" if length($parameter) > 78;
288 6         11 $ret .= $parameter;
289             }
290             }
291             }
292              
293 38         114 $ret .= "; $key=$ascii_value";
294             }
295              
296 28 100       74 substr($ret, 0, 2, '') if length $ret;
297 28         65 return $ret;
298             }
299              
300             sub _unfold_lines {
301 115     115   1139 $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
302             }
303              
304             sub _clean_comments {
305 962     962   1939 my $ret = ($_[0] =~ s/^\s+//);
306 962         1695 while (length $_[0]) {
307 782 100       1448 last unless $_[0] =~ s/^\(//;
308 32         57 my $level = 1;
309 32         71 while (length $_[0]) {
310 408         666 my $ch = substr $_[0], 0, 1, '';
311 408 100       1016 if ($ch eq '(') {
    100          
    100          
312 10         20 $level++;
313             } elsif ($ch eq ')') {
314 42         55 $level--;
315 42 100       89 last if $level == 0;
316             } elsif ($ch eq '\\') {
317 12         23 substr $_[0], 0, 1, '';
318             }
319             }
320 32 0 33     62 carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
321 32         90 $ret |= ($_[0] =~ s/^\s+//);
322             }
323 962         1535 return $ret;
324             }
325              
326             sub _process_rfc2231 {
327 115     115   210 my ($attribs) = @_;
328 115         179 my %cont;
329             my %encoded;
330              
331 115         158 foreach (keys %{$attribs}) {
  115         357  
332 181 100       512 next unless $_ =~ m/^(.*)\*([0-9]+)\*?$/;
333 66         150 my ($attr, $sec) = ($1, $2);
334 66         149 $cont{$attr}->{$sec} = $attribs->{$_};
335 66 100       198 $encoded{$attr} = 1 if $_ =~ m/\*$/;
336 66         126 delete $attribs->{$_};
337             }
338              
339 115         253 foreach (keys %cont) {
340 26         38 my $key = $_;
341 26 100       72 $key .= '*' if $encoded{$_};
342 26         40 $attribs->{$key} = join '', @{$cont{$_}}{sort { $a <=> $b } keys %{$cont{$_}}};
  26         107  
  46         113  
  26         123  
343             }
344              
345 115         157 foreach (keys %{$attribs}) {
  115         214  
346 138 100       324 next unless $_ =~ m/^(.*)\*$/;
347 35         96 my $key = $1;
348 35 50 33     511 next unless defined $attribs->{$_} and $attribs->{$_} =~ m/^$re_exvalue$/;
349 35         113 my ($charset, $value) = ($1, $2);
350 35         138 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
  313         983  
351 35 100       121 if (length $charset) {
352 23         72 my $enc = find_mime_encoding($charset);
353 23 50       4280 if (defined $enc) {
354 23         145 $value = $enc->decode($value);
355             } else {
356 0         0 carp "Unknown charset '$charset' in attribute '$key' value";
357             }
358             }
359 35         117 $attribs->{$key} = $value;
360 35         102 delete $attribs->{$_};
361             }
362              
363 115         330 return $attribs;
364             }
365              
366             sub _parse_attributes {
367 115     115   212 local $_ = shift;
368 115 50 66     457 substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
369 115         173 my $attribs = {};
370              
371 115         229 while (length $_) {
372 185 100 33     587 s/^;// or $STRICT_PARAMS and do {
373 0         0 carp "Missing semicolon before parameter '$_'";
374 0         0 return $attribs;
375             };
376              
377 185         454 _clean_comments($_);
378              
379 185 100       308 unless (length $_) {
380             # Some mail software generates a Content-Type like this:
381             # "Content-Type: text/plain;"
382             # RFC 1521 section 3 says a parameter must exist if there is a
383             # semicolon.
384 4 50       13 carp "Extra semicolon after last parameter" if $STRICT_PARAMS;
385 4         12 return $attribs;
386             }
387              
388 181         231 my $attribute;
389 181 100       880 if (s/^($re_token)=//) {
390 180         412 $attribute = lc $1;
391             } else {
392 1 50       5 if ($STRICT_PARAMS) {
393 0         0 carp "Illegal parameter '$_'";
394 0         0 return $attribs;
395             }
396 1 50       41 if (s/^($re_token_non_strict)=//) {
397 0         0 $attribute = lc $1;
398             } else {
399 1 50       8 unless (s/^([^;=\s]+)\s*=//) {
400 0         0 carp "Cannot parse parameter '$_'";
401 0         0 return $attribs;
402             }
403 1         4 $attribute = lc $1;
404             }
405             }
406              
407 181         388 _clean_comments($_);
408 181         267 my $value = _extract_attribute_value();
409 181         398 $attribs->{$attribute} = $value;
410 181         336 _clean_comments($_);
411             }
412              
413 111         235 return $attribs;
414             }
415              
416             sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
417 181     181   259 my $value;
418 181         320 while (length $_) {
419 185 100       1648 if (s/^($re_token)//) {
    50          
    0          
    0          
    0          
420 112         264 $value .= $1;
421             } elsif (s/^$re_quoted_string//) {
422 73         187 my $sub = $1;
423 73         156 $sub =~ s/\\(.)/$1/g;
424 73         140 $value .= $sub;
425             } elsif ($STRICT_PARAMS) {
426 0         0 my $char = substr $_, 0, 1;
427 0         0 carp "Unquoted '$char' not allowed";
428 0         0 return;
429             } elsif (s/^($re_token_non_strict)//) {
430 0         0 $value .= $1;
431             } elsif (s/^$re_quoted_string_non_strict//) {
432 0         0 my $sub = $1;
433 0         0 $sub =~ s/\\(.)/$1/g;
434 0         0 $value .= $sub;
435             }
436              
437 185         322 my $erased = _clean_comments($_);
438 185 100 100     607 last if !length $_ or /^;/;
439 6 50       14 if ($STRICT_PARAMS) {
440 0         0 my $char = substr $_, 0, 1;
441 0         0 carp "Extra '$char' found after parameter";
442 0         0 return;
443             }
444              
445 6 50       13 if ($erased) {
446             # Sometimes semicolon is missing, so check for = char
447 6 100       79 last if m/^$re_token_non_strict=/;
448 4         11 $value .= ' ';
449             }
450              
451 4         12 $value .= substr $_, 0, 1, '';
452             }
453 181         313 return $value;
454             }
455              
456             1;
457              
458             #pod =func parse_content_type
459             #pod
460             #pod This routine is exported by default.
461             #pod
462             #pod This routine parses email content type headers according to section 5.1 of RFC
463             #pod 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
464             #pod a hash as above, with entries for the C, the C, and a hash of
465             #pod C.
466             #pod
467             #pod For backward compatibility with a really unfortunate misunderstanding of RFC
468             #pod 2045 by the early implementors of this module, C and C are
469             #pod also present in the returned hashref, with the values of C and C
470             #pod respectively.
471             #pod
472             #pod =func parse_content_disposition
473             #pod
474             #pod This routine is exported by default.
475             #pod
476             #pod This routine parses email Content-Disposition headers according to RFC 2183 and
477             #pod RFC 2231. It returns a hash as above, with entries for the C, and a hash
478             #pod of C.
479             #pod
480             #pod =func build_content_type
481             #pod
482             #pod This routine is exported by default.
483             #pod
484             #pod This routine builds email Content-Type header according to RFC 2045 and RFC 2231.
485             #pod It takes a hash as above, with entries for the C, the C, and
486             #pod optionally also a hash of C. It returns a string representing
487             #pod Content-Type header. Non-ASCII attributes are encoded to UTF-8 according to
488             #pod Character Set section of RFC 2231. Attribute which has more then 78 ASCII
489             #pod characters is split into more attributes accorrding to Parameter Continuations
490             #pod of RFC 2231. For compatibility reasons with clients which do not support
491             #pod RFC 2231, output string contains also truncated ASCII version of any too long or
492             #pod non-ASCII attribute. Encoding to ASCII is done via Text::Unidecode module.
493             #pod
494             #pod =func build_content_disposition
495             #pod
496             #pod This routine is exported by default.
497             #pod
498             #pod This routine builds email Content-Disposition header according to RFC 2182 and
499             #pod RFC 2231. It takes a hash as above, with entries for the C, and
500             #pod optionally also a hash of C. It returns a string representing
501             #pod Content-Disposition header. Non-ASCII or too long attributes are handled in
502             #pod the same way like in L.
503             #pod
504             #pod =head1 WARNINGS
505             #pod
506             #pod This is not a valid content-type header, according to both RFC 1521 and RFC
507             #pod 2045:
508             #pod
509             #pod Content-Type: type/subtype;
510             #pod
511             #pod If a semicolon appears, a parameter must. C will carp if
512             #pod it encounters a header of this type, but you can suppress this by setting
513             #pod C<$Email::MIME::ContentType::STRICT_PARAMS> to a false value. Please consider
514             #pod localizing this assignment!
515             #pod
516             #pod Same applies for C.
517             #pod
518             #pod =cut
519              
520             __END__