File Coverage

blib/lib/Email/MIME/ContentType.pm

Criterion	Covered	Total	%
statement	215	244	88.1
branch	111	160	69.3
condition	33	51	64.7
subroutine	17	17	100.0
pod	4	4	100.0
total	380	476	79.8

line	stmt	bran	cond	sub	pod	time	code
1	4			4		234845	use strict;
	4					36
	4					102
2	4			4		18	use warnings;
	4					5
	4					161
3							package Email::MIME::ContentType 1.027;
4							# ABSTRACT: Parse and build a MIME Content-Type or Content-Disposition Header
5
6	4			4		19	use Carp;
	4					6
	4					273
7	4			4		1910	use Encode 2.87 qw(encode find_mime_encoding);
	4					35617
	4					292
8	4			4		28	use Exporter 5.57 'import';
	4					45
	4					97
9	4			4		1782	use Text::Unidecode;
	4					8292
	4					5101
10
11							# If set, generate both foo*0=x and foo=x versions. -- rjbs, 2022-08-24
12							our $PRE_2231_FORM = 1;
13
14							our @EXPORT = qw(parse_content_type parse_content_disposition build_content_type build_content_disposition);
15
16							#pod =head1 SYNOPSIS
17							#pod
18							#pod use Email::MIME::ContentType;
19							#pod
20							#pod # Content-Type: text/plain; charset="us-ascii"; format=flowed
21							#pod my $ct = 'text/plain; charset="us-ascii"; format=flowed';
22							#pod my $data = parse_content_type($ct);
23							#pod
24							#pod $data = {
25							#pod type => "text",
26							#pod subtype => "plain",
27							#pod attributes => {
28							#pod charset => "us-ascii",
29							#pod format => "flowed"
30							#pod }
31							#pod };
32							#pod
33							#pod my $ct_new = build_content_type($data);
34							#pod # text/plain; charset=us-ascii; format=flowed
35							#pod
36							#pod
37							#pod # Content-Type: application/x-stuff;
38							#pod # title0=us-ascii'en'This%20is%20even%20more%20;
39							#pod # title1=%2A%2A%2Afun%2A%2A%2A%20;
40							#pod # title*2="isn't it!"
41							#pod my $ct = q(application/x-stuff;
42							#pod title0=us-ascii'en'This%20is%20even%20more%20;
43							#pod title1=%2A%2A%2Afun%2A%2A%2A%20;
44							#pod title*2="isn't it!");
45							#pod my $data = parse_content_type($ct);
46							#pod
47							#pod $data = {
48							#pod type => "application",
49							#pod subtype => "x-stuff",
50							#pod attributes => {
51							#pod title => "This is even more *fun* isn't it!"
52							#pod }
53							#pod };
54							#pod
55							#pod
56							#pod # Content-Disposition: attachment; filename=genome.jpeg;
57							#pod # modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
58							#pod my $cd = q(attachment; filename=genome.jpeg;
59							#pod modification-date="Wed, 12 Feb 1997 16:29:51 -0500");
60							#pod my $data = parse_content_disposition($cd);
61							#pod
62							#pod $data = {
63							#pod type => "attachment",
64							#pod attributes => {
65							#pod filename => "genome.jpeg",
66							#pod "modification-date" => "Wed, 12 Feb 1997 16:29:51 -0500"
67							#pod }
68							#pod };
69							#pod
70							#pod my $cd_new = build_content_disposition($data);
71							#pod # attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
72							#pod
73							#pod =cut
74
75							our $STRICT_PARAMS = 1;
76
77							my $ct_default = 'text/plain; charset=us-ascii';
78
79							my $re_token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/; # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
80							my $re_token_non_strict = qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+\|$re_token)/; # allow CTLs and above ASCII
81
82							my $re_qtext = qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/; # US-ASCII except CR, LF, white space, backslash and quote
83							my $re_quoted_pair = qr/\\[\x00-\x7F]/;
84							my $re_quoted_string = qr/"((?:[ \t](?:$re_qtext\|$re_quoted_pair))[ \t]*)"/;
85
86							my $re_qtext_non_strict = qr/[\x80-\xFF]\|$re_qtext/;
87							my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
88							my $re_quoted_string_non_strict = qr/"((?:[ \t](?:$re_qtext_non_strict\|$re_quoted_pair_non_strict))[ \t]*)"/;
89
90							my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\\|\}~]+/;
91							my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
92							my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
93
94							sub parse_content_type {
95	100			100	1	55372	my $ct = shift;
96
97							# If the header isn't there or is empty, give default answer.
98	100	100	66			409	return parse_content_type($ct_default) unless defined $ct and length $ct;
99
100	98					228	_unfold_lines($ct);
101	98					201	_clean_comments($ct);
102
103							# It is also recommend (sic.) that this default be assumed when a
104							# syntactically invalid Content-Type header field is encountered.
105	98	50				723	unless ($ct =~ s/^($re_token)\/($re_token)//) {
106	0	0	0			0	unless ($STRICT_PARAMS and $ct =~ s/^($re_token_non_strict)\/($re_token_non_strict)//) {
107	0					0	carp "Invalid Content-Type '$ct'";
108	0					0	return parse_content_type($ct_default);
109							}
110							}
111
112	98					354	my ($type, $subtype) = (lc $1, lc $2);
113
114	98					205	_clean_comments($ct);
115	98					259	$ct =~ s/\s+$//;
116
117	98					146	my $attributes = {};
118	98	50	100			383	if ($STRICT_PARAMS and length $ct and $ct !~ /^;/) {
			66
119	0					0	carp "Missing semicolon before first Content-Type parameter '$ct'";
120							} else {
121	98					175	$attributes = _process_rfc2231(_parse_attributes($ct));
122							}
123
124							return {
125	98					513	type => $type,
126							subtype => $subtype,
127							attributes => $attributes,
128
129							# This is dumb. Really really dumb. For backcompat. -- rjbs,
130							# 2013-08-10
131							discrete => $type,
132							composite => $subtype,
133							};
134							}
135
136							my $cd_default = 'attachment';
137
138							sub parse_content_disposition {
139	47			47	1	25470	my $cd = shift;
140
141	47	100	66			208	return parse_content_disposition($cd_default) unless defined $cd and length $cd;
142
143	45					138	_unfold_lines($cd);
144	45					120	_clean_comments($cd);
145
146	45	50				321	unless ($cd =~ s/^($re_token)//) {
147	0	0	0			0	unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
148	0					0	carp "Invalid Content-Disposition '$cd'";
149	0					0	return parse_content_disposition($cd_default);
150							}
151							}
152
153	45					137	my $type = lc $1;
154
155	45					96	_clean_comments($cd);
156	45					138	$cd =~ s/\s+$//;
157
158	45					77	my $attributes = {};
159	45	50	100			285	if ($STRICT_PARAMS and length $cd and $cd !~ /^;/) {
			66
160	0					0	carp "Missing semicolon before first Content-Disposition parameter '$cd'";
161							} else {
162	45					109	$attributes = _process_rfc2231(_parse_attributes($cd));
163							}
164
165							return {
166	45					191	type => $type,
167							attributes => $attributes,
168							};
169							}
170
171							my $re_invalid_for_quoted_value = qr/[\x00-\x08\x0A-\x1F\x7F-\xFF]/; # non-US-ASCII and CTLs without SPACE and TAB
172							my $re_escape_extended_value = qr/[\x00-\x20\x7F-\xFF\'%()<>@,;:\\"\/\[\]?=]/; # non-US-ASCII, SPACE, CTLs, '% and tspecials ()<>@,;:\\"/[]?=
173
174							sub build_content_type {
175	30			30	1	41562	my $ct = shift;
176
177	30	50				67	croak 'Missing Content-Type \'type\' parameter' unless exists $ct->{type};
178	30	50				59	croak 'Missing Content-Type \'subtype\' parameter' unless exists $ct->{subtype};
179
180	30	50				245	croak 'Invalid Content-Type \'type\' parameter' if $ct->{type} !~ /^(?:$re_token)*$/;
181	30	50				132	croak 'Invalid Content-Type \'subtype\' parameter' if $ct->{subtype} !~ /^(?:$re_token)*$/;
182
183	30	50				80	croak 'Too long Content-Type \'type\' and \'subtype\' parameters' if length($ct->{type}) + length($ct->{subtype}) > 76;
184
185	30					36	my ($extra) = grep !/(?:type\|subtype\|attributes)/, sort keys %{$ct};
	30					262
186	30	50				68	croak "Extra Content-Type '$extra' parameter" if defined $extra;
187
188	30					67	my $ret = $ct->{type} . '/' . $ct->{subtype};
189	30	50				82	my $attrs = exists $ct->{attributes} ? _build_attributes($ct->{attributes}) : '';
190	30	100				76	$ret .= "; $attrs" if length($attrs);
191	30					68	return $ret;
192							}
193
194							sub build_content_disposition {
195	26			26	1	39986	my $cd = shift;
196
197	26	50				79	croak 'Missing Content-Type \'type\' parameter' unless exists $cd->{type};
198
199	26	50				277	croak 'Invalid Content-Type \'type\' parameter' if $cd->{type} !~ /^(?:$re_token)*$/;
200
201	26	50				82	croak 'Too long Content-Type \'type\' parameter' if length($cd->{type}) > 77;
202
203	26					48	my ($extra) = grep !/(?:type\|attributes)/, sort keys %{$cd};
	26					282
204	26	50				96	croak "Extra Content-Type '$extra' parameter" if defined $extra;
205
206	26					46	my $ret = $cd->{type};
207	26	50				92	my $attrs = exists $cd->{attributes} ? _build_attributes($cd->{attributes}) : '';
208	26	100				86	$ret .= "; $attrs" if length($attrs);
209	26					72	return $ret;
210							}
211
212							sub _build_attributes {
213	56			56		78	my $attributes = shift;
214
215	56					118	my $ret = '';
216
217	56					81	foreach my $key (sort keys %{$attributes}) {
	56					148
218	76					133	my $value = $attributes->{$key};
219	76					99	my $ascii_value = $value;
220	76					111	my @continuous_value;
221							my $extended_value_charset;
222
223	76	50				281	croak "Invalid attribute '$key'" if $key =~ /$re_escape_extended_value/; # complement to attribute-char in 8bit space
224	76	50				145	croak "Undefined attribute '$key'" unless defined $value;
225
226	4	100		4		32	if ($value =~ /\P{ASCII}/) {
	4					12
	4					49
	76					180
227	14					77	$ascii_value = unidecode($value);
228	14					3214	$ascii_value =~ s/\P{ASCII}/_/g;
229	14					109	@continuous_value = map { encode('UTF-8', $_) } split //, $value;
	394					11749
230	14					455	$extended_value_charset = 'UTF-8';
231							}
232
233	76	100	100			633	if ($ascii_value !~ /^(?:$re_token)*$/ or $ascii_value =~ /'/) {
234	40	50				171	if ($ascii_value =~ /$re_invalid_for_quoted_value/) {
235	0	0				0	@continuous_value = split //, $value unless @continuous_value;
236	0					0	$ascii_value =~ s/[\n\r]/ /g;
237	0					0	$ascii_value =~ s/$re_invalid_for_quoted_value/_/g;
238							}
239	40					126	$ascii_value =~ s/(["\\])/\\$1/g;
240	40					88	$ascii_value = "\"$ascii_value\"";
241							}
242
243	76	100				177	if (length($key) + length($ascii_value) > 75) { # length(" $key=$ascii_value;") > 78
244	6	50				17	croak "Too long attribute '$key'" if length($key) > 71; # length(" $key=...;") > 78
245	6	100				21	my $pos = $ascii_value =~ /"$/ ? 71 : 72;
246	6					20	substr($ascii_value, $pos - length($key), length($ascii_value) + length($key) - 72, '...');
247	6	50				87	@continuous_value = split //, $value unless @continuous_value;
248							}
249
250	76	100				153	if (@continuous_value) {
251	20					26	my $needs_quote;
252	20	100				51	unless (defined $extended_value_charset) {
253	6	100				23	$needs_quote = 1 if grep { $_ !~ /^(?:$re_token)*$/ or $_ =~ /'/ } @continuous_value;
	408	100				1736
254	6	50	66			141	$extended_value_charset = 'US-ASCII' if $needs_quote and grep /$re_invalid_for_quoted_value/, @continuous_value;
255							}
256
257	20					32	my $add_param_len = 4; # for '; *='
258	20	100				54	if (defined $extended_value_charset) {
		100
259	14					166	$_ =~ s/($re_escape_extended_value)/sprintf('%%%02X', ord($1))/eg foreach @continuous_value;
	142					725
260	14					43	substr($continuous_value[0], 0, 0, "$extended_value_charset''");
261	14					20	$add_param_len += 1; # for '*' - charset
262							} elsif ($needs_quote) {
263	4					71	$_ =~ s/(["\\])/\\$1/g foreach @continuous_value;
264	4					10	$add_param_len += 2; # for quotes
265							}
266
267	20	100	100			156	if ($value =~ /\P{ASCII}/ and length(my $oneparameter = "; $key*=" . join '', @continuous_value) <= 78) {
268	8					20	$ret .= $oneparameter;
269							} else {
270	12					25	my $buf = '';
271	12					19	my $count = 0;
272	12					24	foreach (@continuous_value) {
273	616	100				927	if (length($key) + length($count) + length($buf) + length($_) + $add_param_len > 78) {
274	12	100				29	$buf = "\"$buf\"" if $needs_quote;
275	12					34	my $parameter = "; $key*$count";
276	12	100				27	$parameter .= '*' if defined $extended_value_charset;
277	12					41	$parameter .= "=$buf";
278	12	50				60	croak "Too long attribute '$key'" if length($parameter) > 78;
279	12					21	$ret .= $parameter;
280	12					20	$buf = '';
281	12					18	$count++;
282							}
283	616					719	$buf .= $_;
284							}
285	12	50				30	if (length($buf)) {
286	12	100				27	$buf = "\"$buf\"" if $needs_quote;
287	12					28	my $parameter = "; $key*$count";
288	12	100				26	$parameter .= '*' if defined $extended_value_charset;
289	12					22	$parameter .= "=$buf";
290	12	50				26	croak "Too long attribute '$key'" if length($parameter) > 78;
291	12					20	$ret .= $parameter;
292							}
293							}
294							}
295
296	76	100	100			224	if (! @continuous_value \|\| $PRE_2231_FORM) {
297	66					216	$ret .= "; $key=$ascii_value";
298							}
299							}
300
301	56	100				169	substr($ret, 0, 2, '') if length $ret;
302	56					126	return $ret;
303							}
304
305							sub _unfold_lines {
306	143			143		1301	$_[0] =~ s/(?:\r\n\|[\r\n])(?=[ \t])//g;
307							}
308
309							sub _clean_comments {
310	1194			1194		2181	my $ret = ($_[0] =~ s/^\s+//);
311	1194					1943	while (length $_[0]) {
312	962	100				1579	last unless $_[0] =~ s/^\(//;
313	32					45	my $level = 1;
314	32					60	while (length $_[0]) {
315	408					492	my $ch = substr $_[0], 0, 1, '';
316	408	100				832	if ($ch eq '(') {
		100
		100
317	10					17	$level++;
318							} elsif ($ch eq ')') {
319	42					58	$level--;
320	42	100				76	last if $level == 0;
321							} elsif ($ch eq '\\') {
322	12					31	substr $_[0], 0, 1, '';
323							}
324							}
325	32	0	33			50	carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
326	32					95	$ret \|= ($_[0] =~ s/^\s+//);
327							}
328	1194					1619	return $ret;
329							}
330
331							sub _process_rfc2231 {
332	143			143		227	my ($attribs) = @_;
333	143					221	my %cont;
334							my %encoded;
335
336	143					183	foreach (keys %{$attribs}) {
	143					381
337	225	100				618	next unless $_ =~ m/^(.)\([0-9]+)\*?$/;
338	78					200	my ($attr, $sec) = ($1, $2);
339	78					178	$cont{$attr}->{$sec} = $attribs->{$_};
340	78	100				206	$encoded{$attr} = 1 if $_ =~ m/\*$/;
341	78					143	delete $attribs->{$_};
342							}
343
344	143					335	foreach (keys %cont) {
345	32					47	my $key = $_;
346	32	100				106	$key .= '*' if $encoded{$_};
347	32					58	$attribs->{$key} = join '', @{$cont{$_}}{sort { $a <=> $b } keys %{$cont{$_}}};
	32					131
	58					136
	32					171
348							}
349
350	143					173	foreach (keys %{$attribs}) {
	143					251
351	176	100				460	next unless $_ =~ m/^(.)\$/;
352	42					80	my $key = $1;
353	42	50	33			579	next unless defined $attribs->{$_} and $attribs->{$_} =~ m/^$re_exvalue$/;
354	42					130	my ($charset, $value) = ($1, $2);
355	42					174	$value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
	384					1082
356	42	100				101	if (length $charset) {
357	30					105	my $enc = find_mime_encoding($charset);
358	30	50				5720	if (defined $enc) {
359	30					197	$value = $enc->decode($value);
360							} else {
361	0					0	carp "Unknown charset '$charset' in attribute '$key' value";
362							}
363							}
364	42					168	$attribs->{$key} = $value;
365	42					112	delete $attribs->{$_};
366							}
367
368	143					357	return $attribs;
369							}
370
371							sub _parse_attributes {
372	143			143		227	local $_ = shift;
373	143	50	66			605	substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
374	143					217	my $attribs = {};
375
376	143					271	while (length $_) {
377	229	100	33			682	s/^;// or $STRICT_PARAMS and do {
378	0					0	carp "Missing semicolon before parameter '$_'";
379	0					0	return $attribs;
380							};
381
382	229					474	_clean_comments($_);
383
384	229	100				419	unless (length $_) {
385							# Some mail software generates a Content-Type like this:
386							# "Content-Type: text/plain;"
387							# RFC 1521 section 3 says a parameter must exist if there is a
388							# semicolon.
389	4	50				12	carp "Extra semicolon after last parameter" if $STRICT_PARAMS;
390	4					12	return $attribs;
391							}
392
393	225					259	my $attribute;
394	225	100				973	if (s/^($re_token)=//) {
395	224					447	$attribute = lc $1;
396							} else {
397	1	50				4	if ($STRICT_PARAMS) {
398	0					0	carp "Illegal parameter '$_'";
399	0					0	return $attribs;
400							}
401	1	50				31	if (s/^($re_token_non_strict)=//) {
402	0					0	$attribute = lc $1;
403							} else {
404	1	50				7	unless (s/^([^;=\s]+)\s*=//) {
405	0					0	carp "Cannot parse parameter '$_'";
406	0					0	return $attribs;
407							}
408	1					5	$attribute = lc $1;
409							}
410							}
411
412	225					435	_clean_comments($_);
413	225					349	my $value = _extract_attribute_value();
414	225					485	$attribs->{$attribute} = $value;
415	225					324	_clean_comments($_);
416							}
417
418	139					284	return $attribs;
419							}
420
421							sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
422	225			225		305	my $value;
423	225					353	while (length $_) {
424	229	100				2101	if (s/^($re_token)//) {
		50
		0
		0
		0
425	136					296	$value .= $1;
426							} elsif (s/^$re_quoted_string//) {
427	93					221	my $sub = $1;
428	93					188	$sub =~ s/\\(.)/$1/g;
429	93					175	$value .= $sub;
430							} elsif ($STRICT_PARAMS) {
431	0					0	my $char = substr $_, 0, 1;
432	0					0	carp "Unquoted '$char' not allowed";
433	0					0	return;
434							} elsif (s/^($re_token_non_strict)//) {
435	0					0	$value .= $1;
436							} elsif (s/^$re_quoted_string_non_strict//) {
437	0					0	my $sub = $1;
438	0					0	$sub =~ s/\\(.)/$1/g;
439	0					0	$value .= $sub;
440							}
441
442	229					365	my $erased = _clean_comments($_);
443	229	100	100			745	last if !length $_ or /^;/;
444	6	50				12	if ($STRICT_PARAMS) {
445	0					0	my $char = substr $_, 0, 1;
446	0					0	carp "Extra '$char' found after parameter";
447	0					0	return;
448							}
449
450	6	50				13	if ($erased) {
451							# Sometimes semicolon is missing, so check for = char
452	6	100				69	last if m/^$re_token_non_strict=/;
453	4					8	$value .= ' ';
454							}
455
456	4					11	$value .= substr $_, 0, 1, '';
457							}
458	225					360	return $value;
459							}
460
461							1;
462
463							#pod =func parse_content_type
464							#pod
465							#pod This routine is exported by default.
466							#pod
467							#pod This routine parses email content type headers according to section 5.1 of RFC
468							#pod 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
469							#pod a hash as above, with entries for the C, the C, and a hash of
470							#pod C.
471							#pod
472							#pod For backward compatibility with a really unfortunate misunderstanding of RFC
473							#pod 2045 by the early implementors of this module, C and C are
474							#pod also present in the returned hashref, with the values of C and C
475							#pod respectively.
476							#pod
477							#pod =func parse_content_disposition
478							#pod
479							#pod This routine is exported by default.
480							#pod
481							#pod This routine parses email Content-Disposition headers according to RFC 2183 and
482							#pod RFC 2231. It returns a hash as above, with entries for the C, and a hash
483							#pod of C.
484							#pod
485							#pod =func build_content_type
486							#pod
487							#pod This routine is exported by default.
488							#pod
489							#pod This routine builds email Content-Type header according to RFC 2045 and RFC 2231.
490							#pod It takes a hash as above, with entries for the C, the C, and
491							#pod optionally also a hash of C. It returns a string representing
492							#pod Content-Type header. Non-ASCII attributes are encoded to UTF-8 according to
493							#pod Character Set section of RFC 2231. Attribute which has more then 78 ASCII
494							#pod characters is split into more attributes accorrding to Parameter Continuations
495							#pod of RFC 2231.
496							#pod
497							#pod For compatibility reasons with clients which do not support RFC 2231, output
498							#pod string contains also truncated ASCII version of any too long or non-ASCII
499							#pod attribute. Encoding to ASCII is done via Text::Unidecode module. This
500							#pod behavior can cause confusion by 2231-compatible MIME implementations, and can
501							#pod be disabled by setting C<$Email::MIME::ContentType::STRICT> to true.
502							#pod
503							#pod =func build_content_disposition
504							#pod
505							#pod This routine is exported by default.
506							#pod
507							#pod This routine builds email Content-Disposition header according to RFC 2182 and
508							#pod RFC 2231. It takes a hash as above, with entries for the C, and
509							#pod optionally also a hash of C. It returns a string representing
510							#pod Content-Disposition header. Non-ASCII or too long attributes are handled in
511							#pod the same way like in L.
512							#pod
513							#pod =head1 WARNINGS
514							#pod
515							#pod This is not a valid content-type header, according to both RFC 1521 and RFC
516							#pod 2045:
517							#pod
518							#pod Content-Type: type/subtype;
519							#pod
520							#pod If a semicolon appears, a parameter must. C will carp if
521							#pod it encounters a header of this type, but you can suppress this by setting
522							#pod C<$Email::MIME::ContentType::STRICT_PARAMS> to a false value. Please consider
523							#pod localizing this assignment!
524							#pod
525							#pod Same applies for C.
526							#pod
527							#pod =cut
528
529							__END__