File Coverage

blib/lib/Email/MIME/ContentType.pm

Criterion	Covered	Total	%
statement	214	243	88.0
branch	111	160	69.3
condition	29	45	64.4
subroutine	17	17	100.0
pod	4	4	100.0
total	375	469	79.9

line	stmt	bran	cond	sub	pod	time	code
1	4			4		276331	use v5.12.0;
	4					65
2	4			4		25	use warnings;
	4					8
	4					194
3							package Email::MIME::ContentType 1.028;
4							# ABSTRACT: Parse and build a MIME Content-Type or Content-Disposition Header
5
6	4			4		22	use Carp;
	4					22
	4					320
7	4			4		2229	use Encode 2.87 qw(encode find_mime_encoding);
	4					42224
	4					327
8	4			4		30	use Exporter 5.57 'import';
	4					60
	4					131
9	4			4		2096	use Text::Unidecode;
	4					9874
	4					6528
10
11							# If set, generate both foo*0=x and foo=x versions. -- rjbs, 2022-08-24
12							our $PRE_2231_FORM = 1;
13
14							our @EXPORT = qw(parse_content_type parse_content_disposition build_content_type build_content_disposition);
15
16							#pod =head1 SYNOPSIS
17							#pod
18							#pod use Email::MIME::ContentType;
19							#pod
20							#pod # Content-Type: text/plain; charset="us-ascii"; format=flowed
21							#pod my $ct = 'text/plain; charset="us-ascii"; format=flowed';
22							#pod my $data = parse_content_type($ct);
23							#pod
24							#pod $data = {
25							#pod type => "text",
26							#pod subtype => "plain",
27							#pod attributes => {
28							#pod charset => "us-ascii",
29							#pod format => "flowed"
30							#pod }
31							#pod };
32							#pod
33							#pod my $ct_new = build_content_type($data);
34							#pod # text/plain; charset=us-ascii; format=flowed
35							#pod
36							#pod
37							#pod # Content-Type: application/x-stuff;
38							#pod # title*0*=us-ascii'en'This%20is%20even%20more%20;
39							#pod # title*1*=%2A%2A%2Afun%2A%2A%2A%20;
40							#pod # title*2="isn't it!"
41							#pod my $ct = q(application/x-stuff;
42							#pod title*0*=us-ascii'en'This%20is%20even%20more%20;
43							#pod title*1*=%2A%2A%2Afun%2A%2A%2A%20;
44							#pod title*2="isn't it!");
45							#pod my $data = parse_content_type($ct);
46							#pod
47							#pod $data = {
48							#pod type => "application",
49							#pod subtype => "x-stuff",
50							#pod attributes => {
51							#pod title => "This is even more ***fun*** isn't it!"
52							#pod }
53							#pod };
54							#pod
55							#pod
56							#pod # Content-Disposition: attachment; filename=genome.jpeg;
57							#pod # modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
58							#pod my $cd = q(attachment; filename=genome.jpeg;
59							#pod modification-date="Wed, 12 Feb 1997 16:29:51 -0500");
60							#pod my $data = parse_content_disposition($cd);
61							#pod
62							#pod $data = {
63							#pod type => "attachment",
64							#pod attributes => {
65							#pod filename => "genome.jpeg",
66							#pod "modification-date" => "Wed, 12 Feb 1997 16:29:51 -0500"
67							#pod }
68							#pod };
69							#pod
70							#pod my $cd_new = build_content_disposition($data);
71							#pod # attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
72							#pod
73							#pod =cut
74
75							our $STRICT_PARAMS = 1;
76
77							my $ct_default = 'text/plain; charset=us-ascii';
78
79							my $re_token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/; # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
80							my $re_token_non_strict = qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+\|$re_token)/; # allow CTLs and above ASCII
81
82							my $re_qtext = qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/; # US-ASCII except CR, LF, white space, backslash and quote
83							my $re_quoted_pair = qr/\\[\x00-\x7F]/;
84							my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext\|$re_quoted_pair))*[ \t]*)"/;
85
86							my $re_qtext_non_strict = qr/[\x80-\xFF]\|$re_qtext/;
87							my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
88							my $re_quoted_string_non_strict = qr/"((?:[ \t]*(?:$re_qtext_non_strict\|$re_quoted_pair_non_strict))*[ \t]*)"/;
89
90							my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\\|\}~]+/;
91							my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
92							my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
93
94							sub parse_content_type {
95	100			100	1	66632	my $ct = shift;
96
97							# If the header isn't there or is empty, give default answer.
98	100	100				272	return parse_content_type($ct_default) unless length $ct;
99
100	98					267	_unfold_lines($ct);
101	98					238	_clean_comments($ct);
102
103							# It is also recommend (sic.) that this default be assumed when a
104							# syntactically invalid Content-Type header field is encountered.
105	98	50				875	unless ($ct =~ s/^($re_token)\/($re_token)//) {
106	0	0	0			0	unless ($STRICT_PARAMS and $ct =~ s/^($re_token_non_strict)\/($re_token_non_strict)//) {
107	0					0	carp "Invalid Content-Type '$ct'";
108	0					0	return parse_content_type($ct_default);
109							}
110							}
111
112	98					431	my ($type, $subtype) = (lc $1, lc $2);
113
114	98					215	_clean_comments($ct);
115	98					317	$ct =~ s/\s+$//;
116
117	98					180	my $attributes = {};
118	98	50	100			508	if ($STRICT_PARAMS and length $ct and $ct !~ /^;/) {
			66
119	0					0	carp "Missing semicolon before first Content-Type parameter '$ct'";
120							} else {
121	98					215	$attributes = _process_rfc2231(_parse_attributes($ct));
122							}
123
124							return {
125	98					615	type => $type,
126							subtype => $subtype,
127							attributes => $attributes,
128
129							# This is dumb. Really really dumb. For backcompat. -- rjbs,
130							# 2013-08-10
131							discrete => $type,
132							composite => $subtype,
133							};
134							}
135
136							my $cd_default = 'attachment';
137
138							sub parse_content_disposition {
139	47			47	1	29615	my $cd = shift;
140
141	47	100				134	return parse_content_disposition($cd_default) unless length $cd;
142
143	45					122	_unfold_lines($cd);
144	45					108	_clean_comments($cd);
145
146	45	50				398	unless ($cd =~ s/^($re_token)//) {
147	0	0	0			0	unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
148	0					0	carp "Invalid Content-Disposition '$cd'";
149	0					0	return parse_content_disposition($cd_default);
150							}
151							}
152
153	45					153	my $type = lc $1;
154
155	45					107	_clean_comments($cd);
156	45					147	$cd =~ s/\s+$//;
157
158	45					81	my $attributes = {};
159	45	50	100			262	if ($STRICT_PARAMS and length $cd and $cd !~ /^;/) {
			66
160	0					0	carp "Missing semicolon before first Content-Disposition parameter '$cd'";
161							} else {
162	45					122	$attributes = _process_rfc2231(_parse_attributes($cd));
163							}
164
165							return {
166	45					204	type => $type,
167							attributes => $attributes,
168							};
169							}
170
171							my $re_invalid_for_quoted_value = qr/[\x00-\x08\x0A-\x1F\x7F-\xFF]/; # non-US-ASCII and CTLs without SPACE and TAB
172							my $re_escape_extended_value = qr/[\x00-\x20\x7F-\xFF\'%()<>@,;:\\"\/\[\]?=]/; # non-US-ASCII, SPACE, CTLs, '% and tspecials ()<>@,;:\\"/[]?=
173
174							sub build_content_type {
175	30			30	1	50381	my $ct = shift;
176
177	30	50				93	croak 'Missing Content-Type \'type\' parameter' unless exists $ct->{type};
178	30	50				70	croak 'Missing Content-Type \'subtype\' parameter' unless exists $ct->{subtype};
179
180	30	50				280	croak 'Invalid Content-Type \'type\' parameter' if $ct->{type} !~ /^(?:$re_token)*$/;
181	30	50				171	croak 'Invalid Content-Type \'subtype\' parameter' if $ct->{subtype} !~ /^(?:$re_token)*$/;
182
183	30	50				99	croak 'Too long Content-Type \'type\' and \'subtype\' parameters' if length($ct->{type}) + length($ct->{subtype}) > 76;
184
185	30					48	my ($extra) = grep !/(?:type\|subtype\|attributes)/, sort keys %{$ct};
	30					320
186	30	50				94	croak "Extra Content-Type '$extra' parameter" if defined $extra;
187
188	30					97	my $ret = $ct->{type} . '/' . $ct->{subtype};
189	30	50				95	my $attrs = exists $ct->{attributes} ? _build_attributes($ct->{attributes}) : '';
190	30	100				93	$ret .= "; $attrs" if length($attrs);
191	30					86	return $ret;
192							}
193
194							sub build_content_disposition {
195	26			26	1	44499	my $cd = shift;
196
197	26	50				107	croak 'Missing Content-Type \'type\' parameter' unless exists $cd->{type};
198
199	26	50				259	croak 'Invalid Content-Type \'type\' parameter' if $cd->{type} !~ /^(?:$re_token)*$/;
200
201	26	50				80	croak 'Too long Content-Type \'type\' parameter' if length($cd->{type}) > 77;
202
203	26					38	my ($extra) = grep !/(?:type\|attributes)/, sort keys %{$cd};
	26					246
204	26	50				74	croak "Extra Content-Type '$extra' parameter" if defined $extra;
205
206	26					52	my $ret = $cd->{type};
207	26	50				81	my $attrs = exists $cd->{attributes} ? _build_attributes($cd->{attributes}) : '';
208	26	100				80	$ret .= "; $attrs" if length($attrs);
209	26					79	return $ret;
210							}
211
212							sub _build_attributes {
213	56			56		84	my $attributes = shift;
214
215	56					95	my $ret = '';
216
217	56					79	foreach my $key (sort keys %{$attributes}) {
	56					187
218	76					154	my $value = $attributes->{$key};
219	76					120	my $ascii_value = $value;
220	76					125	my @continuous_value;
221							my $extended_value_charset;
222
223	76	50				331	croak "Invalid attribute '$key'" if $key =~ /$re_escape_extended_value/; # complement to attribute-char in 8bit space
224	76	50				166	croak "Undefined attribute '$key'" unless defined $value;
225
226	4	100		4		33	if ($value =~ /\P{ASCII}/) {
	4					9
	4					62
	76					227
227	14					51	$ascii_value = unidecode($value);
228	14					4080	$ascii_value =~ s/\P{ASCII}/_/g;
229	14					101	@continuous_value = map { encode('UTF-8', $_) } split //, $value;
	394					14242
230	14					547	$extended_value_charset = 'UTF-8';
231							}
232
233	76	100	100			627	if ($ascii_value !~ /^(?:$re_token)*$/ or $ascii_value =~ /'/) {
234	40	50				188	if ($ascii_value =~ /$re_invalid_for_quoted_value/) {
235	0	0				0	@continuous_value = split //, $value unless @continuous_value;
236	0					0	$ascii_value =~ s/[\n\r]/ /g;
237	0					0	$ascii_value =~ s/$re_invalid_for_quoted_value/_/g;
238							}
239	40					121	$ascii_value =~ s/(["\\])/\\$1/g;
240	40					96	$ascii_value = "\"$ascii_value\"";
241							}
242
243	76	100				206	if (length($key) + length($ascii_value) > 75) { # length(" $key=$ascii_value;") > 78
244	6	50				17	croak "Too long attribute '$key'" if length($key) > 71; # length(" $key=...;") > 78
245	6	100				27	my $pos = $ascii_value =~ /"$/ ? 71 : 72;
246	6					36	substr($ascii_value, $pos - length($key), length($ascii_value) + length($key) - 72, '...');
247	6	50				128	@continuous_value = split //, $value unless @continuous_value;
248							}
249
250	76	100				171	if (@continuous_value) {
251	20					35	my $needs_quote;
252	20	100				46	unless (defined $extended_value_charset) {
253	6	100				14	$needs_quote = 1 if grep { $_ !~ /^(?:$re_token)*$/ or $_ =~ /'/ } @continuous_value;
	408	100				2106
254	6	50	66			162	$extended_value_charset = 'US-ASCII' if $needs_quote and grep /$re_invalid_for_quoted_value/, @continuous_value;
255							}
256
257	20					34	my $add_param_len = 4; # for '; *='
258	20	100				53	if (defined $extended_value_charset) {
		100
259	14					200	$_ =~ s/($re_escape_extended_value)/sprintf('%%%02X', ord($1))/eg foreach @continuous_value;
	142					910
260	14					44	substr($continuous_value[0], 0, 0, "$extended_value_charset''");
261	14					26	$add_param_len += 1; # for '*' - charset
262							} elsif ($needs_quote) {
263	4					107	$_ =~ s/(["\\])/\\$1/g foreach @continuous_value;
264	4					12	$add_param_len += 2; # for quotes
265							}
266
267	20	100	100			142	if ($value =~ /\P{ASCII}/ and length(my $oneparameter = "; $key*=" . join '', @continuous_value) <= 78) {
268	8					19	$ret .= $oneparameter;
269							} else {
270	12					24	my $buf = '';
271	12					24	my $count = 0;
272	12					27	foreach (@continuous_value) {
273	616	100				1141	if (length($key) + length($count) + length($buf) + length($_) + $add_param_len > 78) {
274	12	100				47	$buf = "\"$buf\"" if $needs_quote;
275	12					29	my $parameter = "; $key*$count";
276	12	100				30	$parameter .= '*' if defined $extended_value_charset;
277	12					46	$parameter .= "=$buf";
278	12	50				38	croak "Too long attribute '$key'" if length($parameter) > 78;
279	12					21	$ret .= $parameter;
280	12					19	$buf = '';
281	12					22	$count++;
282							}
283	616					892	$buf .= $_;
284							}
285	12	50				31	if (length($buf)) {
286	12	100				32	$buf = "\"$buf\"" if $needs_quote;
287	12					28	my $parameter = "; $key*$count";
288	12	100				32	$parameter .= '*' if defined $extended_value_charset;
289	12					23	$parameter .= "=$buf";
290	12	50				29	croak "Too long attribute '$key'" if length($parameter) > 78;
291	12					23	$ret .= $parameter;
292							}
293							}
294							}
295
296	76	100	100			253	if (! @continuous_value \|\| $PRE_2231_FORM) {
297	66					239	$ret .= "; $key=$ascii_value";
298							}
299							}
300
301	56	100				183	substr($ret, 0, 2, '') if length $ret;
302	56					177	return $ret;
303							}
304
305							sub _unfold_lines {
306	143			143		1537	$_[0] =~ s/(?:\r\n\|[\r\n])(?=[ \t])//g;
307							}
308
309							sub _clean_comments {
310	1194			1194		2610	my $ret = ($_[0] =~ s/^\s+//);
311	1194					2364	while (length $_[0]) {
312	962	100				1941	last unless $_[0] =~ s/^\(//;
313	32					50	my $level = 1;
314	32					64	while (length $_[0]) {
315	408					592	my $ch = substr $_[0], 0, 1, '';
316	408	100				1085	if ($ch eq '(') {
		100
		100
317	10					20	$level++;
318							} elsif ($ch eq ')') {
319	42					53	$level--;
320	42	100				87	last if $level == 0;
321							} elsif ($ch eq '\\') {
322	12					26	substr $_[0], 0, 1, '';
323							}
324							}
325	32	0	33			60	carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
326	32					87	$ret \|= ($_[0] =~ s/^\s+//);
327							}
328	1194					2083	return $ret;
329							}
330
331							sub _process_rfc2231 {
332	143			143		271	my ($attribs) = @_;
333	143					227	my %cont;
334							my %encoded;
335
336	143					619	foreach (keys %{$attribs}) {
	143					456
337	225	100				701	next unless $_ =~ m/^(.*)\*([0-9]+)\*?$/;
338	78					222	my ($attr, $sec) = ($1, $2);
339	78					197	$cont{$attr}->{$sec} = $attribs->{$_};
340	78	100				238	$encoded{$attr} = 1 if $_ =~ m/\*$/;
341	78					200	delete $attribs->{$_};
342							}
343
344	143					348	foreach (keys %cont) {
345	32					63	my $key = $_;
346	32	100				108	$key .= '*' if $encoded{$_};
347	32					67	$attribs->{$key} = join '', @{$cont{$_}}{sort { $a <=> $b } keys %{$cont{$_}}};
	32					145
	56					149
	32					150
348							}
349
350	143					206	foreach (keys %{$attribs}) {
	143					281
351	176	100				488	next unless $_ =~ m/^(.*)\*$/;
352	42					93	my $key = $1;
353	42	50	33			676	next unless defined $attribs->{$_} and $attribs->{$_} =~ m/^$re_exvalue$/;
354	42					144	my ($charset, $value) = ($1, $2);
355	42					187	$value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
	384					1237
356	42	100				112	if (length $charset) {
357	30					111	my $enc = find_mime_encoding($charset);
358	30	50				5944	if (defined $enc) {
359	30					207	$value = $enc->decode($value);
360							} else {
361	0					0	carp "Unknown charset '$charset' in attribute '$key' value";
362							}
363							}
364	42					187	$attribs->{$key} = $value;
365	42					127	delete $attribs->{$_};
366							}
367
368	143					424	return $attribs;
369							}
370
371							sub _parse_attributes {
372	143			143		306	local $_ = shift;
373	143	50	66			621	substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
374	143					255	my $attribs = {};
375
376	143					316	while (length $_) {
377	229	100	33			847	s/^;// or $STRICT_PARAMS and do {
378	0					0	carp "Missing semicolon before parameter '$_'";
379	0					0	return $attribs;
380							};
381
382	229					548	_clean_comments($_);
383
384	229	100				485	unless (length $_) {
385							# Some mail software generates a Content-Type like this:
386							# "Content-Type: text/plain;"
387							# RFC 1521 section 3 says a parameter must exist if there is a
388							# semicolon.
389	4	50				12	carp "Extra semicolon after last parameter" if $STRICT_PARAMS;
390	4					12	return $attribs;
391							}
392
393	225					293	my $attribute;
394	225	100				1192	if (s/^($re_token)=//) {
395	224					565	$attribute = lc $1;
396							} else {
397	1	50				4	if ($STRICT_PARAMS) {
398	0					0	carp "Illegal parameter '$_'";
399	0					0	return $attribs;
400							}
401	1	50				38	if (s/^($re_token_non_strict)=//) {
402	0					0	$attribute = lc $1;
403							} else {
404	1	50				8	unless (s/^([^;=\s]+)\s*=//) {
405	0					0	carp "Cannot parse parameter '$_'";
406	0					0	return $attribs;
407							}
408	1					4	$attribute = lc $1;
409							}
410							}
411
412	225					535	_clean_comments($_);
413	225					397	my $value = _extract_attribute_value();
414	225					563	$attribs->{$attribute} = $value;
415	225					423	_clean_comments($_);
416							}
417
418	139					328	return $attribs;
419							}
420
421							sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
422	225			225		309	my $value;
423	225					466	while (length $_) {
424	229	100				2339	if (s/^($re_token)//) {
		50
		0
		0
		0
425	136					367	$value .= $1;
426							} elsif (s/^$re_quoted_string//) {
427	93					260	my $sub = $1;
428	93					218	$sub =~ s/\\(.)/$1/g;
429	93					205	$value .= $sub;
430							} elsif ($STRICT_PARAMS) {
431	0					0	my $char = substr $_, 0, 1;
432	0					0	carp "Unquoted '$char' not allowed";
433	0					0	return;
434							} elsif (s/^($re_token_non_strict)//) {
435	0					0	$value .= $1;
436							} elsif (s/^$re_quoted_string_non_strict//) {
437	0					0	my $sub = $1;
438	0					0	$sub =~ s/\\(.)/$1/g;
439	0					0	$value .= $sub;
440							}
441
442	229					451	my $erased = _clean_comments($_);
443	229	100	100			870	last if !length $_ or /^;/;
444	6	50				16	if ($STRICT_PARAMS) {
445	0					0	my $char = substr $_, 0, 1;
446	0					0	carp "Extra '$char' found after parameter";
447	0					0	return;
448							}
449
450	6	50				13	if ($erased) {
451							# Sometimes semicolon is missing, so check for = char
452	6	100				68	last if m/^$re_token_non_strict=/;
453	4					9	$value .= ' ';
454							}
455
456	4					12	$value .= substr $_, 0, 1, '';
457							}
458	225					445	return $value;
459							}
460
461							1;
462
463							#pod =func parse_content_type
464							#pod
465							#pod This routine is exported by default.
466							#pod
467							#pod This routine parses email content type headers according to section 5.1 of RFC
468							#pod 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
469							#pod a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
470							#pod C<attributes>.
471							#pod
472							#pod For backward compatibility with a really unfortunate misunderstanding of RFC
473							#pod 2045 by the early implementors of this module, C<discrete> and C<composite> are
474							#pod also present in the returned hashref, with the values of C<type> and C<subtype>
475							#pod respectively.
476							#pod
477							#pod =func parse_content_disposition
478							#pod
479							#pod This routine is exported by default.
480							#pod
481							#pod This routine parses email Content-Disposition headers according to RFC 2183 and
482							#pod RFC 2231. It returns a hash as above, with entries for the C<type>, and a hash
483							#pod of C<attributes>.
484							#pod
485							#pod =func build_content_type
486							#pod
487							#pod This routine is exported by default.
488							#pod
489							#pod This routine builds email Content-Type header according to RFC 2045 and RFC 2231.
490							#pod It takes a hash as above, with entries for the C<type>, the C<subtype>, and
491							#pod optionally also a hash of C<attributes>. It returns a string representing
492							#pod Content-Type header. Non-ASCII attributes are encoded to UTF-8 according to
493							#pod Character Set section of RFC 2231. An attribute which has more than 78 ASCII
494							#pod characters is split into several attributes according to Parameter Continuations
495							#pod of RFC 2231.
496							#pod
497							#pod For compatibility reasons with clients which do not support RFC 2231, output
498							#pod string contains also truncated ASCII version of any too long or non-ASCII
499							#pod attribute. Encoding to ASCII is done via Text::Unidecode module. This
500							#pod behavior can cause confusion by 2231-compatible MIME implementations, and can
501							#pod be disabled by setting C<$Email::MIME::ContentType::PRE_2231_FORM> to a false value.
502							#pod
503							#pod =func build_content_disposition
504							#pod
505							#pod This routine is exported by default.
506							#pod
507							#pod This routine builds email Content-Disposition header according to RFC 2183 and
508							#pod RFC 2231. It takes a hash as above, with entries for the C<type>, and
509							#pod optionally also a hash of C<attributes>. It returns a string representing
510							#pod Content-Disposition header. Non-ASCII or too long attributes are handled in
511							#pod the same way as in L</build_content_type>.
512							#pod
513							#pod =head1 WARNINGS
514							#pod
515							#pod This is not a valid content-type header, according to both RFC 1521 and RFC
516							#pod 2045:
517							#pod
518							#pod Content-Type: type/subtype;
519							#pod
520							#pod If a semicolon appears, a parameter must. C<parse_content_type> will carp if
521							#pod it encounters a header of this type, but you can suppress this by setting
522							#pod C<$Email::MIME::ContentType::STRICT_PARAMS> to a false value. Please consider
523							#pod localizing this assignment!
524							#pod
525							#pod Same applies for C<parse_content_disposition>.
526							#pod
527							#pod =cut
528
529							__END__