File Coverage

blib/lib/Email/MIME/ContentType.pm

Criterion	Covered	Total	%
statement	214	243	88.0
branch	109	158	68.9
condition	30	48	62.5
subroutine	17	17	100.0
pod	4	4	100.0
total	374	470	79.5

line	stmt	bran	cond	sub	pod	time	code
1	4			4		238024	use strict;
	4					38
	4					113
2	4			4		20	use warnings;
	4					8
	4					210
3							package Email::MIME::ContentType;
4							# ABSTRACT: Parse and build a MIME Content-Type or Content-Disposition Header
5							$Email::MIME::ContentType::VERSION = '1.026';
6	4			4		23	use Carp;
	4					7
	4					300
7	4			4		2091	use Encode 2.87 qw(encode find_mime_encoding);
	4					36615
	4					270
8	4			4		25	use Exporter 5.57 'import';
	4					46
	4					97
9	4			4		1774	use Text::Unidecode;
	4					8476
	4					5036
10
11							our @EXPORT = qw(parse_content_type parse_content_disposition build_content_type build_content_disposition);
12
13							#pod =head1 SYNOPSIS
14							#pod
15							#pod use Email::MIME::ContentType;
16							#pod
17							#pod # Content-Type: text/plain; charset="us-ascii"; format=flowed
18							#pod my $ct = 'text/plain; charset="us-ascii"; format=flowed';
19							#pod my $data = parse_content_type($ct);
20							#pod
21							#pod $data = {
22							#pod type => "text",
23							#pod subtype => "plain",
24							#pod attributes => {
25							#pod charset => "us-ascii",
26							#pod format => "flowed"
27							#pod }
28							#pod };
29							#pod
30							#pod my $ct_new = build_content_type($data);
31							#pod # text/plain; charset=us-ascii; format=flowed
32							#pod
33							#pod
34							#pod # Content-Type: application/x-stuff;
35							#pod # title0=us-ascii'en'This%20is%20even%20more%20;
36							#pod # title1=%2A%2A%2Afun%2A%2A%2A%20;
37							#pod # title*2="isn't it!"
38							#pod my $ct = q(application/x-stuff;
39							#pod title0=us-ascii'en'This%20is%20even%20more%20;
40							#pod title1=%2A%2A%2Afun%2A%2A%2A%20;
41							#pod title*2="isn't it!");
42							#pod my $data = parse_content_type($ct);
43							#pod
44							#pod $data = {
45							#pod type => "application",
46							#pod subtype => "x-stuff",
47							#pod attributes => {
48							#pod title => "This is even more *fun* isn't it!"
49							#pod }
50							#pod };
51							#pod
52							#pod
53							#pod # Content-Disposition: attachment; filename=genome.jpeg;
54							#pod # modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
55							#pod my $cd = q(attachment; filename=genome.jpeg;
56							#pod modification-date="Wed, 12 Feb 1997 16:29:51 -0500");
57							#pod my $data = parse_content_disposition($cd);
58							#pod
59							#pod $data = {
60							#pod type => "attachment",
61							#pod attributes => {
62							#pod filename => "genome.jpeg",
63							#pod "modification-date" => "Wed, 12 Feb 1997 16:29:51 -0500"
64							#pod }
65							#pod };
66							#pod
67							#pod my $cd_new = build_content_disposition($data);
68							#pod # attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
69							#pod
70							#pod =cut
71
72							our $STRICT_PARAMS = 1;
73
74							my $ct_default = 'text/plain; charset=us-ascii';
75
76							my $re_token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/; # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
77							my $re_token_non_strict = qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+\|$re_token)/; # allow CTLs and above ASCII
78
79							my $re_qtext = qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/; # US-ASCII except CR, LF, white space, backslash and quote
80							my $re_quoted_pair = qr/\\[\x00-\x7F]/;
81							my $re_quoted_string = qr/"((?:[ \t](?:$re_qtext\|$re_quoted_pair))[ \t]*)"/;
82
83							my $re_qtext_non_strict = qr/[\x80-\xFF]\|$re_qtext/;
84							my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
85							my $re_quoted_string_non_strict = qr/"((?:[ \t](?:$re_qtext_non_strict\|$re_quoted_pair_non_strict))[ \t]*)"/;
86
87							my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\\|\}~]+/;
88							my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
89							my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
90
91							sub parse_content_type {
92	85			85	1	55758	my $ct = shift;
93
94							# If the header isn't there or is empty, give default answer.
95	85	100	66			362	return parse_content_type($ct_default) unless defined $ct and length $ct;
96
97	83					211	_unfold_lines($ct);
98	83					207	_clean_comments($ct);
99
100							# It is also recommend (sic.) that this default be assumed when a
101							# syntactically invalid Content-Type header field is encountered.
102	83	50				682	unless ($ct =~ s/^($re_token)\/($re_token)//) {
103	0	0	0			0	unless ($STRICT_PARAMS and $ct =~ s/^($re_token_non_strict)\/($re_token_non_strict)//) {
104	0					0	carp "Invalid Content-Type '$ct'";
105	0					0	return parse_content_type($ct_default);
106							}
107							}
108
109	83					385	my ($type, $subtype) = (lc $1, lc $2);
110
111	83					181	_clean_comments($ct);
112	83					279	$ct =~ s/\s+$//;
113
114	83					153	my $attributes = {};
115	83	50	100			368	if ($STRICT_PARAMS and length $ct and $ct !~ /^;/) {
			66
116	0					0	carp "Missing semicolon before first Content-Type parameter '$ct'";
117							} else {
118	83					179	$attributes = _process_rfc2231(_parse_attributes($ct));
119							}
120
121							return {
122	83					490	type => $type,
123							subtype => $subtype,
124							attributes => $attributes,
125
126							# This is dumb. Really really dumb. For backcompat. -- rjbs,
127							# 2013-08-10
128							discrete => $type,
129							composite => $subtype,
130							};
131							}
132
133							my $cd_default = 'attachment';
134
135							sub parse_content_disposition {
136	34			34	1	16789	my $cd = shift;
137
138	34	100	66			254	return parse_content_disposition($cd_default) unless defined $cd and length $cd;
139
140	32					72	_unfold_lines($cd);
141	32					56	_clean_comments($cd);
142
143	32	50				212	unless ($cd =~ s/^($re_token)//) {
144	0	0	0			0	unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
145	0					0	carp "Invalid Content-Disposition '$cd'";
146	0					0	return parse_content_disposition($cd_default);
147							}
148							}
149
150	32					91	my $type = lc $1;
151
152	32					65	_clean_comments($cd);
153	32					88	$cd =~ s/\s+$//;
154
155	32					46	my $attributes = {};
156	32	50	100			170	if ($STRICT_PARAMS and length $cd and $cd !~ /^;/) {
			66
157	0					0	carp "Missing semicolon before first Content-Disposition parameter '$cd'";
158							} else {
159	32					73	$attributes = _process_rfc2231(_parse_attributes($cd));
160							}
161
162							return {
163	32					139	type => $type,
164							attributes => $attributes,
165							};
166							}
167
168							my $re_invalid_for_quoted_value = qr/[\x00-\x08\x0A-\x1F\x7F-\xFF]/; # non-US-ASCII and CTLs without SPACE and TAB
169							my $re_escape_extended_value = qr/[\x00-\x20\x7F-\xFF\'%()<>@,;:\\"\/\[\]?=]/; # non-US-ASCII, SPACE, CTLs, '% and tspecials ()<>@,;:\\"/[]?=
170
171							sub build_content_type {
172	15			15	1	8410	my $ct = shift;
173
174	15	50				37	croak 'Missing Content-Type \'type\' parameter' unless exists $ct->{type};
175	15	50				24	croak 'Missing Content-Type \'subtype\' parameter' unless exists $ct->{subtype};
176
177	15	50				126	croak 'Invalid Content-Type \'type\' parameter' if $ct->{type} !~ /^(?:$re_token)*$/;
178	15	50				75	croak 'Invalid Content-Type \'subtype\' parameter' if $ct->{subtype} !~ /^(?:$re_token)*$/;
179
180	15	50				39	croak 'Too long Content-Type \'type\' and \'subtype\' parameters' if length($ct->{type}) + length($ct->{subtype}) > 76;
181
182	15					21	my ($extra) = grep !/(?:type\|subtype\|attributes)/, sort keys %{$ct};
	15					141
183	15	50				36	croak "Extra Content-Type '$extra' parameter" if defined $extra;
184
185	15					33	my $ret = $ct->{type} . '/' . $ct->{subtype};
186	15	50				40	my $attrs = exists $ct->{attributes} ? _build_attributes($ct->{attributes}) : '';
187	15	100				40	$ret .= "; $attrs" if length($attrs);
188	15					37	return $ret;
189							}
190
191							sub build_content_disposition {
192	13			13	1	7300	my $cd = shift;
193
194	13	50				32	croak 'Missing Content-Type \'type\' parameter' unless exists $cd->{type};
195
196	13	50				114	croak 'Invalid Content-Type \'type\' parameter' if $cd->{type} !~ /^(?:$re_token)*$/;
197
198	13	50				34	croak 'Too long Content-Type \'type\' parameter' if length($cd->{type}) > 77;
199
200	13					19	my ($extra) = grep !/(?:type\|attributes)/, sort keys %{$cd};
	13					106
201	13	50				30	croak "Extra Content-Type '$extra' parameter" if defined $extra;
202
203	13					22	my $ret = $cd->{type};
204	13	50				35	my $attrs = exists $cd->{attributes} ? _build_attributes($cd->{attributes}) : '';
205	13	100				31	$ret .= "; $attrs" if length($attrs);
206	13					32	return $ret;
207							}
208
209							sub _build_attributes {
210	28			28		37	my $attributes = shift;
211
212	28					35	my $ret = '';
213
214	28					34	foreach my $key (sort keys %{$attributes}) {
	28					74
215	38					57	my $value = $attributes->{$key};
216	38					49	my $ascii_value = $value;
217	38					56	my @continuous_value;
218							my $extended_value_charset;
219
220	38	50				130	croak "Invalid attribute '$key'" if $key =~ /$re_escape_extended_value/; # complement to attribute-char in 8bit space
221	38	50				66	croak "Undefined attribute '$key'" unless defined $value;
222
223	4	100		4		45	if ($value =~ /\P{ASCII}/) {
	4					21
	4					54
	38					84
224	7					23	$ascii_value = unidecode($value);
225	7					2627	$ascii_value =~ s/\P{ASCII}/_/g;
226	7					41	@continuous_value = map { encode('UTF-8', $_) } split //, $value;
	197					5888
227	7					227	$extended_value_charset = 'UTF-8';
228							}
229
230	38	100	100			285	if ($ascii_value !~ /^(?:$re_token)*$/ or $ascii_value =~ /'/) {
231	20	50				64	if ($ascii_value =~ /$re_invalid_for_quoted_value/) {
232	0	0				0	@continuous_value = split //, $value unless @continuous_value;
233	0					0	$ascii_value =~ s/[\n\r]/ /g;
234	0					0	$ascii_value =~ s/$re_invalid_for_quoted_value/_/g;
235							}
236	20					52	$ascii_value =~ s/(["\\])/\\$1/g;
237	20					37	$ascii_value = "\"$ascii_value\"";
238							}
239
240	38	100				81	if (length($key) + length($ascii_value) > 75) { # length(" $key=$ascii_value;") > 78
241	3	50				10	croak "Too long attribute '$key'" if length($key) > 71; # length(" $key=...;") > 78
242	3	100				11	my $pos = $ascii_value =~ /"$/ ? 71 : 72;
243	3					11	substr($ascii_value, $pos - length($key), length($ascii_value) + length($key) - 72, '...');
244	3	50				44	@continuous_value = split //, $value unless @continuous_value;
245							}
246
247	38	100				66	if (@continuous_value) {
248	10					14	my $needs_quote;
249	10	100				21	unless (defined $extended_value_charset) {
250	3	100				6	$needs_quote = 1 if grep { $_ !~ /^(?:$re_token)*$/ or $_ =~ /'/ } @continuous_value;
	204	100				898
251	3	50	66			69	$extended_value_charset = 'US-ASCII' if $needs_quote and grep /$re_invalid_for_quoted_value/, @continuous_value;
252							}
253
254	10					13	my $add_param_len = 4; # for '; *='
255	10	100				25	if (defined $extended_value_charset) {
		100
256	7					124	$_ =~ s/($re_escape_extended_value)/sprintf('%%%02X', ord($1))/eg foreach @continuous_value;
	71					345
257	7					20	substr($continuous_value[0], 0, 0, "$extended_value_charset''");
258	7					12	$add_param_len += 1; # for '*' - charset
259							} elsif ($needs_quote) {
260	2					38	$_ =~ s/(["\\])/\\$1/g foreach @continuous_value;
261	2					5	$add_param_len += 2; # for quotes
262							}
263
264	10	100	100			60	if ($value =~ /\P{ASCII}/ and length(my $oneparameter = "; $key*=" . join '', @continuous_value) <= 78) {
265	4					9	$ret .= $oneparameter;
266							} else {
267	6					11	my $buf = '';
268	6					7	my $count = 0;
269	6					13	foreach (@continuous_value) {
270	308	100				458	if (length($key) + length($count) + length($buf) + length($_) + $add_param_len > 78) {
271	6	100				13	$buf = "\"$buf\"" if $needs_quote;
272	6					13	my $parameter = "; $key*$count";
273	6	100				14	$parameter .= '*' if defined $extended_value_charset;
274	6					11	$parameter .= "=$buf";
275	6	50				14	croak "Too long attribute '$key'" if length($parameter) > 78;
276	6					8	$ret .= $parameter;
277	6					9	$buf = '';
278	6					9	$count++;
279							}
280	308					354	$buf .= $_;
281							}
282	6	50				13	if (length($buf)) {
283	6	100				12	$buf = "\"$buf\"" if $needs_quote;
284	6					15	my $parameter = "; $key*$count";
285	6	100				13	$parameter .= '*' if defined $extended_value_charset;
286	6					11	$parameter .= "=$buf";
287	6	50				11	croak "Too long attribute '$key'" if length($parameter) > 78;
288	6					11	$ret .= $parameter;
289							}
290							}
291							}
292
293	38					114	$ret .= "; $key=$ascii_value";
294							}
295
296	28	100				74	substr($ret, 0, 2, '') if length $ret;
297	28					65	return $ret;
298							}
299
300							sub _unfold_lines {
301	115			115		1139	$_[0] =~ s/(?:\r\n\|[\r\n])(?=[ \t])//g;
302							}
303
304							sub _clean_comments {
305	962			962		1939	my $ret = ($_[0] =~ s/^\s+//);
306	962					1695	while (length $_[0]) {
307	782	100				1448	last unless $_[0] =~ s/^\(//;
308	32					57	my $level = 1;
309	32					71	while (length $_[0]) {
310	408					666	my $ch = substr $_[0], 0, 1, '';
311	408	100				1016	if ($ch eq '(') {
		100
		100
312	10					20	$level++;
313							} elsif ($ch eq ')') {
314	42					55	$level--;
315	42	100				89	last if $level == 0;
316							} elsif ($ch eq '\\') {
317	12					23	substr $_[0], 0, 1, '';
318							}
319							}
320	32	0	33			62	carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
321	32					90	$ret \|= ($_[0] =~ s/^\s+//);
322							}
323	962					1535	return $ret;
324							}
325
326							sub _process_rfc2231 {
327	115			115		210	my ($attribs) = @_;
328	115					179	my %cont;
329							my %encoded;
330
331	115					158	foreach (keys %{$attribs}) {
	115					357
332	181	100				512	next unless $_ =~ m/^(.)\([0-9]+)\*?$/;
333	66					150	my ($attr, $sec) = ($1, $2);
334	66					149	$cont{$attr}->{$sec} = $attribs->{$_};
335	66	100				198	$encoded{$attr} = 1 if $_ =~ m/\*$/;
336	66					126	delete $attribs->{$_};
337							}
338
339	115					253	foreach (keys %cont) {
340	26					38	my $key = $_;
341	26	100				72	$key .= '*' if $encoded{$_};
342	26					40	$attribs->{$key} = join '', @{$cont{$_}}{sort { $a <=> $b } keys %{$cont{$_}}};
	26					107
	46					113
	26					123
343							}
344
345	115					157	foreach (keys %{$attribs}) {
	115					214
346	138	100				324	next unless $_ =~ m/^(.)\$/;
347	35					96	my $key = $1;
348	35	50	33			511	next unless defined $attribs->{$_} and $attribs->{$_} =~ m/^$re_exvalue$/;
349	35					113	my ($charset, $value) = ($1, $2);
350	35					138	$value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
	313					983
351	35	100				121	if (length $charset) {
352	23					72	my $enc = find_mime_encoding($charset);
353	23	50				4280	if (defined $enc) {
354	23					145	$value = $enc->decode($value);
355							} else {
356	0					0	carp "Unknown charset '$charset' in attribute '$key' value";
357							}
358							}
359	35					117	$attribs->{$key} = $value;
360	35					102	delete $attribs->{$_};
361							}
362
363	115					330	return $attribs;
364							}
365
366							sub _parse_attributes {
367	115			115		212	local $_ = shift;
368	115	50	66			457	substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
369	115					173	my $attribs = {};
370
371	115					229	while (length $_) {
372	185	100	33			587	s/^;// or $STRICT_PARAMS and do {
373	0					0	carp "Missing semicolon before parameter '$_'";
374	0					0	return $attribs;
375							};
376
377	185					454	_clean_comments($_);
378
379	185	100				308	unless (length $_) {
380							# Some mail software generates a Content-Type like this:
381							# "Content-Type: text/plain;"
382							# RFC 1521 section 3 says a parameter must exist if there is a
383							# semicolon.
384	4	50				13	carp "Extra semicolon after last parameter" if $STRICT_PARAMS;
385	4					12	return $attribs;
386							}
387
388	181					231	my $attribute;
389	181	100				880	if (s/^($re_token)=//) {
390	180					412	$attribute = lc $1;
391							} else {
392	1	50				5	if ($STRICT_PARAMS) {
393	0					0	carp "Illegal parameter '$_'";
394	0					0	return $attribs;
395							}
396	1	50				41	if (s/^($re_token_non_strict)=//) {
397	0					0	$attribute = lc $1;
398							} else {
399	1	50				8	unless (s/^([^;=\s]+)\s*=//) {
400	0					0	carp "Cannot parse parameter '$_'";
401	0					0	return $attribs;
402							}
403	1					4	$attribute = lc $1;
404							}
405							}
406
407	181					388	_clean_comments($_);
408	181					267	my $value = _extract_attribute_value();
409	181					398	$attribs->{$attribute} = $value;
410	181					336	_clean_comments($_);
411							}
412
413	111					235	return $attribs;
414							}
415
416							sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
417	181			181		259	my $value;
418	181					320	while (length $_) {
419	185	100				1648	if (s/^($re_token)//) {
		50
		0
		0
		0
420	112					264	$value .= $1;
421							} elsif (s/^$re_quoted_string//) {
422	73					187	my $sub = $1;
423	73					156	$sub =~ s/\\(.)/$1/g;
424	73					140	$value .= $sub;
425							} elsif ($STRICT_PARAMS) {
426	0					0	my $char = substr $_, 0, 1;
427	0					0	carp "Unquoted '$char' not allowed";
428	0					0	return;
429							} elsif (s/^($re_token_non_strict)//) {
430	0					0	$value .= $1;
431							} elsif (s/^$re_quoted_string_non_strict//) {
432	0					0	my $sub = $1;
433	0					0	$sub =~ s/\\(.)/$1/g;
434	0					0	$value .= $sub;
435							}
436
437	185					322	my $erased = _clean_comments($_);
438	185	100	100			607	last if !length $_ or /^;/;
439	6	50				14	if ($STRICT_PARAMS) {
440	0					0	my $char = substr $_, 0, 1;
441	0					0	carp "Extra '$char' found after parameter";
442	0					0	return;
443							}
444
445	6	50				13	if ($erased) {
446							# Sometimes semicolon is missing, so check for = char
447	6	100				79	last if m/^$re_token_non_strict=/;
448	4					11	$value .= ' ';
449							}
450
451	4					12	$value .= substr $_, 0, 1, '';
452							}
453	181					313	return $value;
454							}
455
456							1;
457
458							#pod =func parse_content_type
459							#pod
460							#pod This routine is exported by default.
461							#pod
462							#pod This routine parses email content type headers according to section 5.1 of RFC
463							#pod 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
464							#pod a hash as above, with entries for the C, the C, and a hash of
465							#pod C.
466							#pod
467							#pod For backward compatibility with a really unfortunate misunderstanding of RFC
468							#pod 2045 by the early implementors of this module, C and C are
469							#pod also present in the returned hashref, with the values of C and C
470							#pod respectively.
471							#pod
472							#pod =func parse_content_disposition
473							#pod
474							#pod This routine is exported by default.
475							#pod
476							#pod This routine parses email Content-Disposition headers according to RFC 2183 and
477							#pod RFC 2231. It returns a hash as above, with entries for the C, and a hash
478							#pod of C.
479							#pod
480							#pod =func build_content_type
481							#pod
482							#pod This routine is exported by default.
483							#pod
484							#pod This routine builds email Content-Type header according to RFC 2045 and RFC 2231.
485							#pod It takes a hash as above, with entries for the C, the C, and
486							#pod optionally also a hash of C. It returns a string representing
487							#pod Content-Type header. Non-ASCII attributes are encoded to UTF-8 according to
488							#pod Character Set section of RFC 2231. Attribute which has more then 78 ASCII
489							#pod characters is split into more attributes accorrding to Parameter Continuations
490							#pod of RFC 2231. For compatibility reasons with clients which do not support
491							#pod RFC 2231, output string contains also truncated ASCII version of any too long or
492							#pod non-ASCII attribute. Encoding to ASCII is done via Text::Unidecode module.
493							#pod
494							#pod =func build_content_disposition
495							#pod
496							#pod This routine is exported by default.
497							#pod
498							#pod This routine builds email Content-Disposition header according to RFC 2182 and
499							#pod RFC 2231. It takes a hash as above, with entries for the C, and
500							#pod optionally also a hash of C. It returns a string representing
501							#pod Content-Disposition header. Non-ASCII or too long attributes are handled in
502							#pod the same way like in L.
503							#pod
504							#pod =head1 WARNINGS
505							#pod
506							#pod This is not a valid content-type header, according to both RFC 1521 and RFC
507							#pod 2045:
508							#pod
509							#pod Content-Type: type/subtype;
510							#pod
511							#pod If a semicolon appears, a parameter must. C will carp if
512							#pod it encounters a header of this type, but you can suppress this by setting
513							#pod C<$Email::MIME::ContentType::STRICT_PARAMS> to a false value. Please consider
514							#pod localizing this assignment!
515							#pod
516							#pod Same applies for C.
517							#pod
518							#pod =cut
519
520							__END__