File Coverage

blib/lib/Email/MIME/ContentType.pm
Criterion Covered Total %
statement 215 244 88.1
branch 111 160 69.3
condition 33 51 64.7
subroutine 17 17 100.0
pod 4 4 100.0
total 380 476 79.8


line stmt bran cond sub pod time code
1 4     4   234845 use strict;
  4         36  
  4         102  
2 4     4   18 use warnings;
  4         5  
  4         161  
3             package Email::MIME::ContentType 1.027;
4             # ABSTRACT: Parse and build a MIME Content-Type or Content-Disposition Header
5              
6 4     4   19 use Carp;
  4         6  
  4         273  
7 4     4   1910 use Encode 2.87 qw(encode find_mime_encoding);
  4         35617  
  4         292  
8 4     4   28 use Exporter 5.57 'import';
  4         45  
  4         97  
9 4     4   1782 use Text::Unidecode;
  4         8292  
  4         5101  
10              
11             # If set, generate both foo*0=x and foo=x versions. -- rjbs, 2022-08-24
12             our $PRE_2231_FORM = 1;
13              
14             our @EXPORT = qw(parse_content_type parse_content_disposition build_content_type build_content_disposition);
15              
16             #pod =head1 SYNOPSIS
17             #pod
18             #pod use Email::MIME::ContentType;
19             #pod
20             #pod # Content-Type: text/plain; charset="us-ascii"; format=flowed
21             #pod my $ct = 'text/plain; charset="us-ascii"; format=flowed';
22             #pod my $data = parse_content_type($ct);
23             #pod
24             #pod $data = {
25             #pod type => "text",
26             #pod subtype => "plain",
27             #pod attributes => {
28             #pod charset => "us-ascii",
29             #pod format => "flowed"
30             #pod }
31             #pod };
32             #pod
33             #pod my $ct_new = build_content_type($data);
34             #pod # text/plain; charset=us-ascii; format=flowed
35             #pod
36             #pod
37             #pod # Content-Type: application/x-stuff;
38             #pod # title*0*=us-ascii'en'This%20is%20even%20more%20;
39             #pod # title*1*=%2A%2A%2Afun%2A%2A%2A%20;
40             #pod # title*2="isn't it!"
41             #pod my $ct = q(application/x-stuff;
42             #pod title*0*=us-ascii'en'This%20is%20even%20more%20;
43             #pod title*1*=%2A%2A%2Afun%2A%2A%2A%20;
44             #pod title*2="isn't it!");
45             #pod my $data = parse_content_type($ct);
46             #pod
47             #pod $data = {
48             #pod type => "application",
49             #pod subtype => "x-stuff",
50             #pod attributes => {
51             #pod title => "This is even more ***fun*** isn't it!"
52             #pod }
53             #pod };
54             #pod
55             #pod
56             #pod # Content-Disposition: attachment; filename=genome.jpeg;
57             #pod # modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
58             #pod my $cd = q(attachment; filename=genome.jpeg;
59             #pod modification-date="Wed, 12 Feb 1997 16:29:51 -0500");
60             #pod my $data = parse_content_disposition($cd);
61             #pod
62             #pod $data = {
63             #pod type => "attachment",
64             #pod attributes => {
65             #pod filename => "genome.jpeg",
66             #pod "modification-date" => "Wed, 12 Feb 1997 16:29:51 -0500"
67             #pod }
68             #pod };
69             #pod
70             #pod my $cd_new = build_content_disposition($data);
71             #pod # attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
72             #pod
73             #pod =cut
74              
75             our $STRICT_PARAMS = 1;
76              
77             my $ct_default = 'text/plain; charset=us-ascii';
78              
79             my $re_token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/; # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
80             my $re_token_non_strict = qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/; # allow CTLs and above ASCII
81              
82             my $re_qtext = qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/; # US-ASCII except CR, LF, white space, backslash and quote
83             my $re_quoted_pair = qr/\\[\x00-\x7F]/;
84             my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
85              
86             my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
87             my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
88             my $re_quoted_string_non_strict = qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
89              
90             my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
91             my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
92             my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
93              
94             sub parse_content_type {
95 100     100 1 55372 my $ct = shift;
96              
97             # If the header isn't there or is empty, give default answer.
98 100 100 66     409 return parse_content_type($ct_default) unless defined $ct and length $ct;
99              
100 98         228 _unfold_lines($ct);
101 98         201 _clean_comments($ct);
102              
103             # It is also recommend (sic.) that this default be assumed when a
104             # syntactically invalid Content-Type header field is encountered.
105 98 50       723 unless ($ct =~ s/^($re_token)\/($re_token)//) {
106 0 0 0     0 unless ($STRICT_PARAMS and $ct =~ s/^($re_token_non_strict)\/($re_token_non_strict)//) {
107 0         0 carp "Invalid Content-Type '$ct'";
108 0         0 return parse_content_type($ct_default);
109             }
110             }
111              
112 98         354 my ($type, $subtype) = (lc $1, lc $2);
113              
114 98         205 _clean_comments($ct);
115 98         259 $ct =~ s/\s+$//;
116              
117 98         146 my $attributes = {};
118 98 50 100     383 if ($STRICT_PARAMS and length $ct and $ct !~ /^;/) {
      66        
119 0         0 carp "Missing semicolon before first Content-Type parameter '$ct'";
120             } else {
121 98         175 $attributes = _process_rfc2231(_parse_attributes($ct));
122             }
123              
124             return {
125 98         513 type => $type,
126             subtype => $subtype,
127             attributes => $attributes,
128              
129             # This is dumb. Really really dumb. For backcompat. -- rjbs,
130             # 2013-08-10
131             discrete => $type,
132             composite => $subtype,
133             };
134             }
135              
136             my $cd_default = 'attachment';
137              
138             sub parse_content_disposition {
139 47     47 1 25470 my $cd = shift;
140              
141 47 100 66     208 return parse_content_disposition($cd_default) unless defined $cd and length $cd;
142              
143 45         138 _unfold_lines($cd);
144 45         120 _clean_comments($cd);
145              
146 45 50       321 unless ($cd =~ s/^($re_token)//) {
147 0 0 0     0 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
148 0         0 carp "Invalid Content-Disposition '$cd'";
149 0         0 return parse_content_disposition($cd_default);
150             }
151             }
152              
153 45         137 my $type = lc $1;
154              
155 45         96 _clean_comments($cd);
156 45         138 $cd =~ s/\s+$//;
157              
158 45         77 my $attributes = {};
159 45 50 100     285 if ($STRICT_PARAMS and length $cd and $cd !~ /^;/) {
      66        
160 0         0 carp "Missing semicolon before first Content-Disposition parameter '$cd'";
161             } else {
162 45         109 $attributes = _process_rfc2231(_parse_attributes($cd));
163             }
164              
165             return {
166 45         191 type => $type,
167             attributes => $attributes,
168             };
169             }
170              
171             my $re_invalid_for_quoted_value = qr/[\x00-\x08\x0A-\x1F\x7F-\xFF]/; # non-US-ASCII and CTLs without SPACE and TAB
172             my $re_escape_extended_value = qr/[\x00-\x20\x7F-\xFF\*'%()<>@,;:\\"\/\[\]?=]/; # non-US-ASCII, SPACE, CTLs, *'% and tspecials ()<>@,;:\\"/[]?=
173              
174             sub build_content_type {
175 30     30 1 41562 my $ct = shift;
176              
177 30 50       67 croak 'Missing Content-Type \'type\' parameter' unless exists $ct->{type};
178 30 50       59 croak 'Missing Content-Type \'subtype\' parameter' unless exists $ct->{subtype};
179              
180 30 50       245 croak 'Invalid Content-Type \'type\' parameter' if $ct->{type} !~ /^(?:$re_token)*$/;
181 30 50       132 croak 'Invalid Content-Type \'subtype\' parameter' if $ct->{subtype} !~ /^(?:$re_token)*$/;
182              
183 30 50       80 croak 'Too long Content-Type \'type\' and \'subtype\' parameters' if length($ct->{type}) + length($ct->{subtype}) > 76;
184              
185 30         36 my ($extra) = grep !/(?:type|subtype|attributes)/, sort keys %{$ct};
  30         262  
186 30 50       68 croak "Extra Content-Type '$extra' parameter" if defined $extra;
187              
188 30         67 my $ret = $ct->{type} . '/' . $ct->{subtype};
189 30 50       82 my $attrs = exists $ct->{attributes} ? _build_attributes($ct->{attributes}) : '';
190 30 100       76 $ret .= "; $attrs" if length($attrs);
191 30         68 return $ret;
192             }
193              
194             sub build_content_disposition {
195 26     26 1 39986 my $cd = shift;
196              
197 26 50       79 croak 'Missing Content-Type \'type\' parameter' unless exists $cd->{type};
198              
199 26 50       277 croak 'Invalid Content-Type \'type\' parameter' if $cd->{type} !~ /^(?:$re_token)*$/;
200              
201 26 50       82 croak 'Too long Content-Type \'type\' parameter' if length($cd->{type}) > 77;
202              
203 26         48 my ($extra) = grep !/(?:type|attributes)/, sort keys %{$cd};
  26         282  
204 26 50       96 croak "Extra Content-Type '$extra' parameter" if defined $extra;
205              
206 26         46 my $ret = $cd->{type};
207 26 50       92 my $attrs = exists $cd->{attributes} ? _build_attributes($cd->{attributes}) : '';
208 26 100       86 $ret .= "; $attrs" if length($attrs);
209 26         72 return $ret;
210             }
211              
212             sub _build_attributes {
213 56     56   78 my $attributes = shift;
214              
215 56         118 my $ret = '';
216              
217 56         81 foreach my $key (sort keys %{$attributes}) {
  56         148  
218 76         133 my $value = $attributes->{$key};
219 76         99 my $ascii_value = $value;
220 76         111 my @continuous_value;
221             my $extended_value_charset;
222              
223 76 50       281 croak "Invalid attribute '$key'" if $key =~ /$re_escape_extended_value/; # complement to attribute-char in 8bit space
224 76 50       145 croak "Undefined attribute '$key'" unless defined $value;
225              
226 4 100   4   32 if ($value =~ /\P{ASCII}/) {
  4         12  
  4         49  
  76         180  
227 14         77 $ascii_value = unidecode($value);
228 14         3214 $ascii_value =~ s/\P{ASCII}/_/g;
229 14         109 @continuous_value = map { encode('UTF-8', $_) } split //, $value;
  394         11749  
230 14         455 $extended_value_charset = 'UTF-8';
231             }
232              
233 76 100 100     633 if ($ascii_value !~ /^(?:$re_token)*$/ or $ascii_value =~ /'/) {
234 40 50       171 if ($ascii_value =~ /$re_invalid_for_quoted_value/) {
235 0 0       0 @continuous_value = split //, $value unless @continuous_value;
236 0         0 $ascii_value =~ s/[\n\r]/ /g;
237 0         0 $ascii_value =~ s/$re_invalid_for_quoted_value/_/g;
238             }
239 40         126 $ascii_value =~ s/(["\\])/\\$1/g;
240 40         88 $ascii_value = "\"$ascii_value\"";
241             }
242              
243 76 100       177 if (length($key) + length($ascii_value) > 75) { # length(" $key=$ascii_value;") > 78
244 6 50       17 croak "Too long attribute '$key'" if length($key) > 71; # length(" $key=...;") > 78
245 6 100       21 my $pos = $ascii_value =~ /"$/ ? 71 : 72;
246 6         20 substr($ascii_value, $pos - length($key), length($ascii_value) + length($key) - 72, '...');
247 6 50       87 @continuous_value = split //, $value unless @continuous_value;
248             }
249              
250 76 100       153 if (@continuous_value) {
251 20         26 my $needs_quote;
252 20 100       51 unless (defined $extended_value_charset) {
253 6 100       23 $needs_quote = 1 if grep { $_ !~ /^(?:$re_token)*$/ or $_ =~ /'/ } @continuous_value;
  408 100       1736  
254 6 50 66     141 $extended_value_charset = 'US-ASCII' if $needs_quote and grep /$re_invalid_for_quoted_value/, @continuous_value;
255             }
256              
257 20         32 my $add_param_len = 4; # for '; *='
258 20 100       54 if (defined $extended_value_charset) {
    100          
259 14         166 $_ =~ s/($re_escape_extended_value)/sprintf('%%%02X', ord($1))/eg foreach @continuous_value;
  142         725  
260 14         43 substr($continuous_value[0], 0, 0, "$extended_value_charset''");
261 14         20 $add_param_len += 1; # for '*' - charset
262             } elsif ($needs_quote) {
263 4         71 $_ =~ s/(["\\])/\\$1/g foreach @continuous_value;
264 4         10 $add_param_len += 2; # for quotes
265             }
266              
267 20 100 100     156 if ($value =~ /\P{ASCII}/ and length(my $oneparameter = "; $key*=" . join '', @continuous_value) <= 78) {
268 8         20 $ret .= $oneparameter;
269             } else {
270 12         25 my $buf = '';
271 12         19 my $count = 0;
272 12         24 foreach (@continuous_value) {
273 616 100       927 if (length($key) + length($count) + length($buf) + length($_) + $add_param_len > 78) {
274 12 100       29 $buf = "\"$buf\"" if $needs_quote;
275 12         34 my $parameter = "; $key*$count";
276 12 100       27 $parameter .= '*' if defined $extended_value_charset;
277 12         41 $parameter .= "=$buf";
278 12 50       60 croak "Too long attribute '$key'" if length($parameter) > 78;
279 12         21 $ret .= $parameter;
280 12         20 $buf = '';
281 12         18 $count++;
282             }
283 616         719 $buf .= $_;
284             }
285 12 50       30 if (length($buf)) {
286 12 100       27 $buf = "\"$buf\"" if $needs_quote;
287 12         28 my $parameter = "; $key*$count";
288 12 100       26 $parameter .= '*' if defined $extended_value_charset;
289 12         22 $parameter .= "=$buf";
290 12 50       26 croak "Too long attribute '$key'" if length($parameter) > 78;
291 12         20 $ret .= $parameter;
292             }
293             }
294             }
295              
296 76 100 100     224 if (! @continuous_value || $PRE_2231_FORM) {
297 66         216 $ret .= "; $key=$ascii_value";
298             }
299             }
300              
301 56 100       169 substr($ret, 0, 2, '') if length $ret;
302 56         126 return $ret;
303             }
304              
305             sub _unfold_lines {
306 143     143   1301 $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
307             }
308              
309             sub _clean_comments {
310 1194     1194   2181 my $ret = ($_[0] =~ s/^\s+//);
311 1194         1943 while (length $_[0]) {
312 962 100       1579 last unless $_[0] =~ s/^\(//;
313 32         45 my $level = 1;
314 32         60 while (length $_[0]) {
315 408         492 my $ch = substr $_[0], 0, 1, '';
316 408 100       832 if ($ch eq '(') {
    100          
    100          
317 10         17 $level++;
318             } elsif ($ch eq ')') {
319 42         58 $level--;
320 42 100       76 last if $level == 0;
321             } elsif ($ch eq '\\') {
322 12         31 substr $_[0], 0, 1, '';
323             }
324             }
325 32 0 33     50 carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
326 32         95 $ret |= ($_[0] =~ s/^\s+//);
327             }
328 1194         1619 return $ret;
329             }
330              
331             sub _process_rfc2231 {
332 143     143   227 my ($attribs) = @_;
333 143         221 my %cont;
334             my %encoded;
335              
336 143         183 foreach (keys %{$attribs}) {
  143         381  
337 225 100       618 next unless $_ =~ m/^(.*)\*([0-9]+)\*?$/;
338 78         200 my ($attr, $sec) = ($1, $2);
339 78         178 $cont{$attr}->{$sec} = $attribs->{$_};
340 78 100       206 $encoded{$attr} = 1 if $_ =~ m/\*$/;
341 78         143 delete $attribs->{$_};
342             }
343              
344 143         335 foreach (keys %cont) {
345 32         47 my $key = $_;
346 32 100       106 $key .= '*' if $encoded{$_};
347 32         58 $attribs->{$key} = join '', @{$cont{$_}}{sort { $a <=> $b } keys %{$cont{$_}}};
  32         131  
  58         136  
  32         171  
348             }
349              
350 143         173 foreach (keys %{$attribs}) {
  143         251  
351 176 100       460 next unless $_ =~ m/^(.*)\*$/;
352 42         80 my $key = $1;
353 42 50 33     579 next unless defined $attribs->{$_} and $attribs->{$_} =~ m/^$re_exvalue$/;
354 42         130 my ($charset, $value) = ($1, $2);
355 42         174 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
  384         1082  
356 42 100       101 if (length $charset) {
357 30         105 my $enc = find_mime_encoding($charset);
358 30 50       5720 if (defined $enc) {
359 30         197 $value = $enc->decode($value);
360             } else {
361 0         0 carp "Unknown charset '$charset' in attribute '$key' value";
362             }
363             }
364 42         168 $attribs->{$key} = $value;
365 42         112 delete $attribs->{$_};
366             }
367              
368 143         357 return $attribs;
369             }
370              
371             sub _parse_attributes {
372 143     143   227 local $_ = shift;
373 143 50 66     605 substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
374 143         217 my $attribs = {};
375              
376 143         271 while (length $_) {
377 229 100 33     682 s/^;// or $STRICT_PARAMS and do {
378 0         0 carp "Missing semicolon before parameter '$_'";
379 0         0 return $attribs;
380             };
381              
382 229         474 _clean_comments($_);
383              
384 229 100       419 unless (length $_) {
385             # Some mail software generates a Content-Type like this:
386             # "Content-Type: text/plain;"
387             # RFC 1521 section 3 says a parameter must exist if there is a
388             # semicolon.
389 4 50       12 carp "Extra semicolon after last parameter" if $STRICT_PARAMS;
390 4         12 return $attribs;
391             }
392              
393 225         259 my $attribute;
394 225 100       973 if (s/^($re_token)=//) {
395 224         447 $attribute = lc $1;
396             } else {
397 1 50       4 if ($STRICT_PARAMS) {
398 0         0 carp "Illegal parameter '$_'";
399 0         0 return $attribs;
400             }
401 1 50       31 if (s/^($re_token_non_strict)=//) {
402 0         0 $attribute = lc $1;
403             } else {
404 1 50       7 unless (s/^([^;=\s]+)\s*=//) {
405 0         0 carp "Cannot parse parameter '$_'";
406 0         0 return $attribs;
407             }
408 1         5 $attribute = lc $1;
409             }
410             }
411              
412 225         435 _clean_comments($_);
413 225         349 my $value = _extract_attribute_value();
414 225         485 $attribs->{$attribute} = $value;
415 225         324 _clean_comments($_);
416             }
417              
418 139         284 return $attribs;
419             }
420              
421             sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
422 225     225   305 my $value;
423 225         353 while (length $_) {
424 229 100       2101 if (s/^($re_token)//) {
    50          
    0          
    0          
    0          
425 136         296 $value .= $1;
426             } elsif (s/^$re_quoted_string//) {
427 93         221 my $sub = $1;
428 93         188 $sub =~ s/\\(.)/$1/g;
429 93         175 $value .= $sub;
430             } elsif ($STRICT_PARAMS) {
431 0         0 my $char = substr $_, 0, 1;
432 0         0 carp "Unquoted '$char' not allowed";
433 0         0 return;
434             } elsif (s/^($re_token_non_strict)//) {
435 0         0 $value .= $1;
436             } elsif (s/^$re_quoted_string_non_strict//) {
437 0         0 my $sub = $1;
438 0         0 $sub =~ s/\\(.)/$1/g;
439 0         0 $value .= $sub;
440             }
441              
442 229         365 my $erased = _clean_comments($_);
443 229 100 100     745 last if !length $_ or /^;/;
444 6 50       12 if ($STRICT_PARAMS) {
445 0         0 my $char = substr $_, 0, 1;
446 0         0 carp "Extra '$char' found after parameter";
447 0         0 return;
448             }
449              
450 6 50       13 if ($erased) {
451             # Sometimes semicolon is missing, so check for = char
452 6 100       69 last if m/^$re_token_non_strict=/;
453 4         8 $value .= ' ';
454             }
455              
456 4         11 $value .= substr $_, 0, 1, '';
457             }
458 225         360 return $value;
459             }
460              
461             1;
462              
463             #pod =func parse_content_type
464             #pod
465             #pod This routine is exported by default.
466             #pod
467             #pod This routine parses email content type headers according to section 5.1 of RFC
468             #pod 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
469             #pod a hash as above, with entries for the C, the C, and a hash of
470             #pod C.
471             #pod
472             #pod For backward compatibility with a really unfortunate misunderstanding of RFC
473             #pod 2045 by the early implementors of this module, C and C are
474             #pod also present in the returned hashref, with the values of C and C
475             #pod respectively.
476             #pod
477             #pod =func parse_content_disposition
478             #pod
479             #pod This routine is exported by default.
480             #pod
481             #pod This routine parses email Content-Disposition headers according to RFC 2183 and
482             #pod RFC 2231. It returns a hash as above, with entries for the C, and a hash
483             #pod of C.
484             #pod
485             #pod =func build_content_type
486             #pod
487             #pod This routine is exported by default.
488             #pod
489             #pod This routine builds email Content-Type header according to RFC 2045 and RFC 2231.
490             #pod It takes a hash as above, with entries for the C, the C, and
491             #pod optionally also a hash of C. It returns a string representing
492             #pod Content-Type header. Non-ASCII attributes are encoded to UTF-8 according to
493             #pod Character Set section of RFC 2231. Attribute which has more then 78 ASCII
494             #pod characters is split into more attributes accorrding to Parameter Continuations
495             #pod of RFC 2231.
496             #pod
497             #pod For compatibility reasons with clients which do not support RFC 2231, output
498             #pod string contains also truncated ASCII version of any too long or non-ASCII
499             #pod attribute. Encoding to ASCII is done via Text::Unidecode module. This
500             #pod behavior can cause confusion by 2231-compatible MIME implementations, and can
501             #pod be disabled by setting C<$Email::MIME::ContentType::STRICT> to true.
502             #pod
503             #pod =func build_content_disposition
504             #pod
505             #pod This routine is exported by default.
506             #pod
507             #pod This routine builds email Content-Disposition header according to RFC 2182 and
508             #pod RFC 2231. It takes a hash as above, with entries for the C, and
509             #pod optionally also a hash of C. It returns a string representing
510             #pod Content-Disposition header. Non-ASCII or too long attributes are handled in
511             #pod the same way like in L.
512             #pod
513             #pod =head1 WARNINGS
514             #pod
515             #pod This is not a valid content-type header, according to both RFC 1521 and RFC
516             #pod 2045:
517             #pod
518             #pod Content-Type: type/subtype;
519             #pod
520             #pod If a semicolon appears, a parameter must. C will carp if
521             #pod it encounters a header of this type, but you can suppress this by setting
522             #pod C<$Email::MIME::ContentType::STRICT_PARAMS> to a false value. Please consider
523             #pod localizing this assignment!
524             #pod
525             #pod Same applies for C.
526             #pod
527             #pod =cut
528              
529             __END__