File Coverage

lib/Sisimai/MIME.pm
Criterion Covered Total %
statement 203 208 97.6
branch 105 124 84.6
condition 33 48 68.7
subroutine 15 15 100.0
pod 7 7 100.0
total 363 402 90.3


line stmt bran cond sub pod time code
1             package Sisimai::MIME;
2 81     81   59170 use feature ':5.10';
  81         145  
  81         5297  
3 81     81   394 use strict;
  81         127  
  81         1681  
4 81     81   353 use warnings;
  81         129  
  81         1865  
5 81     81   817 use Encode;
  81         8690  
  81         6862  
6 81     81   32935 use MIME::Base64 ();
  81         45907  
  81         1705  
7 81     81   29465 use MIME::QuotedPrint ();
  81         16962  
  81         1543  
8 81     81   729 use Sisimai::String;
  81         135  
  81         12767  
9 81         218644 use constant ReE => {
10             '7bit-encoded' => qr/^content-transfer-encoding:[ ]*7bit/m,
11             'quoted-print' => qr/^content-transfer-encoding:[ ]*quoted-printable/m,
12             'some-iso2022' => qr/^content-type:[ ]*.+;[ ]*charset=["']?(iso-2022-[-a-z0-9]+)['"]?\b/m,
13             'with-charset' => qr/^content[-]type:[ ]*.+[;][ ]*charset=['"]?([-0-9a-z]+)['"]?\b/,
14             'only-charset' => qr/^[\s\t]+charset=['"]?([-0-9a-z]+)['"]?\b/,
15             'html-message' => qr|^content-type:[ ]*text/html;|m,
16 81     81   488 };
  81         167  
17              
18             sub is_mimeencoded {
19             # Check that the argument is MIME-Encoded string or not
20             # @param [String] argv1 String to be checked
21             # @return [Integer] 0: Not MIME encoded string
22             # 1: MIME encoded string
23 8698     8698 1 10860 my $class = shift;
24 8698   50     13439 my $argv1 = shift || return undef;
25 8698 50       14042 return undef unless ref $argv1 eq 'SCALAR';
26              
27 8698         9698 my $text1 = $$argv1; $text1 =~ y/"//d;
  8698         10676  
28 8698         8864 my $mime1 = 0;
29 8698         8718 my @piece;
30              
31 8698 100       15833 if( rindex($text1, ' ') > -1 ) {
32             # Multiple MIME-Encoded strings in a line
33 3470         11042 @piece = split(' ', $text1);
34             } else {
35 5228         7369 push @piece, $text1;
36             }
37              
38 8698         11809 for my $e ( @piece ) {
39             # Check all the string in the array
40 20652 100       32602 next unless $e =~ /[ \t]*=[?][-_0-9A-Za-z]+[?][BbQq][?].+[?]=?[ \t]*/;
41 1276         1879 $mime1 = 1;
42             }
43 8698         22110 return $mime1;
44             }
45              
46             sub mimedecode {
47             # Decode MIME-Encoded string
48             # @param [Array] argvs Reference to an array including MIME-Encoded text
49             # @return [String] MIME-Decoded text
50 2667     2667 1 7322 my $class = shift;
51 2667         3856 my $argvs = shift;
52 2667 50       5159 return undef unless ref $argvs eq 'ARRAY';
53              
54 2667         3320 my $characterset = '';
55 2667         3022 my $encodingname = '';
56 2667         2687 my @decodedtext0;
57              
58 2667         3871 for my $e ( @$argvs ) {
59             # Check and decode each element
60 2884         4933 $e =~ s/\A[ \t]+//g;
61 2884         4456 $e =~ s/[ \t]+\z//g;
62 2884         4211 $e =~ y/"//d;
63              
64 2884 100       4930 if( __PACKAGE__->is_mimeencoded(\$e) ) {
65             # =?utf-8?B?55m954yr44Gr44KD44KT44GT?=
66 490 100       2295 next unless $e =~ m{\A(.*)=[?]([-_0-9A-Za-z]+)[?]([BbQq])[?](.+)[?]=?(.*)\z};
67 468   66     2306 $characterset ||= lc $2;
68 468   66     1667 $encodingname ||= uc $3;
69              
70 468         863 push @decodedtext0, $1;
71 468 100       2711 push @decodedtext0, $encodingname eq 'B'
72             ? MIME::Base64::decode($4)
73             : MIME::QuotedPrint::decode($4);
74 468         842 $decodedtext0[-1] =~ y/\r\n//d;
75 468         1141 push @decodedtext0, $5;
76              
77             } else {
78 2394 100       5351 push @decodedtext0, scalar @decodedtext0 ? ' '.$e : $e;
79             }
80             }
81 2667 50       4545 return '' unless scalar @decodedtext0;
82              
83 2667         5082 my $decodedtext1 = join('', @decodedtext0);
84 2667 100 66     6682 if( $characterset && $encodingname ) {
85             # utf-8 => utf8
86 403 100       893 $characterset = 'utf8' if $characterset eq 'utf-8';
87              
88 403 100       878 if( $characterset ne 'utf8' ) {
89             # Characterset is not UTF-8
90 138         177 eval { Encode::from_to($decodedtext1, $characterset, 'utf8') };
  138         564  
91 138 50       29138 $decodedtext1 = 'FAILED TO CONVERT THE SUBJECT' if $@;
92             }
93             }
94 2667         7433 return $decodedtext1;
95             }
96              
97             sub qprintd {
98             # Decode MIME Quoted-Printable Encoded string
99             # @param [String] argv1 MIME Encoded text
100             # @param [Hash] heads Email header
101             # @return [String] MIME Decoded text
102 266     266 1 2295 my $class = shift;
103 266   50     585 my $argv1 = shift // return undef;
104 266   100     995 my $heads = shift // {};
105 266         447 my $plain = '';
106 266 50       691 return \'' unless ref $argv1 eq 'SCALAR';
107              
108 266 100 66     815 if( ! exists $heads->{'content-type'} || ! $heads->{'content-type'} ) {
109             # There is no Content-Type: field
110 265         10438 $plain = MIME::QuotedPrint::decode($$argv1);
111 265         2371 return \$plain;
112             }
113              
114             # Quoted-printable encoded part is the part of the text
115 1         3 my $boundary00 = __PACKAGE__->boundary($heads->{'content-type'}, 0);
116 1 50 33     16 if( ! $boundary00 || lc($$argv1) !~ ReE->{'quoted-print'} ) {
117             # There is no boundary string or no
118             # Content-Transfer-Encoding: quoted-printable field.
119 0         0 $plain = MIME::QuotedPrint::decode($$argv1);
120 0         0 return \$plain;
121             }
122              
123 1         3 my $boundary01 = Sisimai::MIME->boundary($heads->{'content-type'}, 1);
124 1         2 my $bodystring = '';
125 1         1 my $notdecoded = '';
126              
127 1         2 my $encodename = undef;
128 1         1 my $ctencoding = undef;
129 1         2 my $mimeinside = 0;
130              
131 1         8 for my $e ( split("\n", $$argv1) ) {
132             # This is a multi-part message in MIME format. Your mail reader does not
133             # understand MIME message format.
134             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
135             # Content-Type: text/plain; charset=iso-8859-15
136             # Content-Transfer-Encoding: quoted-printable
137 31 100       34 if( $mimeinside ) {
138             # Quoted-Printable encoded text block
139 21 100       20 if( $e eq $boundary00 ) {
140             # The next boundary string has appeared
141             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
142 1         7 my $hasdecoded = MIME::QuotedPrint::decode($notdecoded);
143 1         14 $hasdecoded = Sisimai::String->to_utf8(\$hasdecoded, $encodename);
144 1         4 $bodystring .= $$hasdecoded;
145 1         3 $bodystring .= $e . "\n";
146              
147 1         2 $notdecoded = '';
148 1         1 $mimeinside = 0;
149 1         1 $ctencoding = undef;
150 1         2 $encodename = undef;
151             } else {
152             # Inside of Queoted printable encoded text
153 20         29 $notdecoded .= $e . "\n";
154             }
155             } else {
156             # NOT Quoted-Printable encoded text block
157 10 100 66     88 if( (my $lowercased = lc $e) =~ /\A[-]{2}[^\s]+[^-]\z/ ) {
    100          
    100          
    50          
158             # Start of the boundary block
159             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
160 1 50       3 unless( $e eq $boundary00 ) {
161             # New boundary string has appeared
162 0         0 $boundary00 = $e;
163 0         0 $boundary01 = $e . '--';
164             }
165             } elsif( $lowercased =~ ReE->{'with-charset'} || $lowercased =~ ReE->{'only-charset'} ) {
166             # Content-Type: text/plain; charset=ISO-2022-JP
167 1         3 $encodename = $1;
168 1 50       2 $mimeinside = 1 if $ctencoding;
169              
170             } elsif( $lowercased =~ ReE->{'quoted-print'} ) {
171             # Content-Transfer-Encoding: quoted-printable
172 1         2 $ctencoding = $e;
173 1 50       2 $mimeinside = 1 if $encodename;
174              
175             } elsif( $e eq $boundary01 ) {
176             # The end of boundary block
177             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ--
178 0         0 $mimeinside = 0;
179             }
180 10         22 $bodystring .= $e . "\n";
181             }
182             }
183              
184 1 50       4 $bodystring .= $notdecoded if length $notdecoded;
185 1         4 return \$bodystring;
186             }
187              
188             sub base64d {
189             # Decode MIME BASE64 Encoded string
190             # @param [String] argv1 MIME Encoded text
191             # @return [String] MIME-Decoded text
192 35     35 1 323 my $class = shift;
193 35   50     105 my $argv1 = shift // return undef;
194 35 50       116 return \'' unless ref $argv1 eq 'SCALAR';
195              
196             # Decode BASE64
197 35 50       375 my $plain = $$argv1 =~ m|([+/=0-9A-Za-z\r\n]+)| ? MIME::Base64::decode($1) : '';
198 35         114 return \$plain;
199             }
200              
201             sub boundary {
202             # Get boundary string
203             # @param [String] argv1 The value of Content-Type header
204             # @param [Integer] start -1: boundary string itself
205             # 0: Start of boundary
206             # 1: End of boundary
207             # @return [String] Boundary string
208 2317     2317 1 3469 my $class = shift;
209 2317   50     4183 my $argv1 = shift || return undef;
210 2317   100     3819 my $start = shift // -1;
211 2317         2524 my $value = '';
212              
213 2317 100       11450 if( lc $argv1 =~ /\bboundary=([^ ]+)/ ) {
214             # Content-Type: multipart/mixed; boundary=Apple-Mail-5--931376066
215             # Content-Type: multipart/report; report-type=delivery-status;
216             # boundary="n6H9lKZh014511.1247824040/mx.example.jp"
217 2272         5466 $value = $1;
218 2272         3575 $value =~ y/"';\\//d;
219 2272 100       6739 $value = '--'.$value if $start > -1;
220 2272 100       4573 $value = $value.'--' if $start > 0;
221             }
222 2317         5414 return $value;
223             }
224              
225             sub breaksup {
226             # Breaks up each multipart/* block
227             # @param [String] argv0 Text block of multipart/*
228             # @param [String] argv1 MIME type of the outside part
229             # @return [String] Decoded part as a plain text(text part only)
230 5041     5041 1 8188 my $class = shift;
231 5041   100     7524 my $argv0 = shift || return undef;
232 5040   50     7339 my $argv1 = shift || '';
233              
234 5040         4974 state $alsoappend = qr{\A(?:text/rfc822-headers|message/)};
235 5040         4524 state $thisformat = qr/\A(?:Content-Transfer-Encoding:\s*.+\n)?Content-Type:\s*([^ ;\s]+)/;
236 5040         4717 state $leavesonly = qr{\A(?>
237             text/(?:plain|html|rfc822-headers)
238             |message/(?:x?delivery-status|rfc822|partial|feedback-report)
239             |multipart/(?:report|alternative|mixed|related|partial)
240             )
241             }x;
242              
243 5040 50       27950 my $mimeformat = $$argv0 =~ $thisformat ? lc($1) : '';
244 5040 100       9502 my $alternates = index($argv1, 'multipart/alternative') == 0 ? 1 : 0;
245 5040         6078 my $hasflatten = ''; # Message body including only text/plain and message/*
246              
247             # Sisimai require only MIME types defined in $leavesonly variable
248 5040 100       20857 return \'' unless $mimeformat =~ $leavesonly;
249 4999 50 66     9843 return \'' if $alternates && $mimeformat eq 'text/html';
250              
251 4999         21007 my ($upperchunk, $lowerchunk) = split(/^$/m, $$argv0, 2);
252 4999         8228 $upperchunk =~ y/\n/ /;
253 4999         7056 $upperchunk =~ y/ //s;
254              
255             # Content-Description: Undelivered Message
256             # Content-Type: message/rfc822
257             #
258 4999   50     7690 $lowerchunk ||= '';
259              
260 4999 100       8348 if( index($mimeformat, 'multipart/') == 0 ) {
261             # Content-Type: multipart/*
262 269         704 my $mpboundary = __PACKAGE__->boundary($upperchunk, 0);
263 269         5166 my @innerparts = split(/\Q$mpboundary\E\n/, $lowerchunk);
264 269 50       876 shift @innerparts unless length $innerparts[0];
265 269 100       669 shift @innerparts if $innerparts[0] eq "\n";
266              
267 269         473 for my $e ( @innerparts ) {
268             # Find internal multipart/* blocks and decode
269 561 100       2860 if( $e =~ $thisformat ) {
270             # Found Content-Type field at the first or second line of this
271             # split part
272 522         1143 my $nextformat = lc $1;
273 522 100       2071 next unless $nextformat =~ $leavesonly;
274 455 100       1050 next if $nextformat eq 'text/html';
275 285         325 $hasflatten .= ${ __PACKAGE__->breaksup(\$e, $mimeformat) };
  285         863  
276              
277             } else {
278             # The content of this part is almost '--': a part of boundary
279             # string which is used for splitting multipart/* blocks.
280 39         74 $hasflatten .= "\n";
281             }
282             }
283             } else {
284             # Is not "Content-Type: multipart/*"
285 4730 100       9585 if( $upperchunk =~ /Content-Transfer-Encoding: ([^\s;]+)/ ) {
286             # Content-Transfer-Encoding: quoted-printable|base64|7bit|...
287 1297         2233 my $getdecoded = '';
288              
289 1297 100       5047 if( (my $ctencoding = lc $1) eq 'quoted-printable' ) {
    100          
    100          
290             # Content-Transfer-Encoding: quoted-printable
291 218         281 $getdecoded = ${ __PACKAGE__->qprintd(\$lowerchunk) };
  218         880  
292              
293             } elsif( $ctencoding eq 'base64' ) {
294             # Content-Transfer-Encoding: base64
295 29         69 $getdecoded = ${ __PACKAGE__->base64d(\$lowerchunk) };
  29         103  
296              
297             } elsif( $ctencoding eq '7bit' ) {
298             # Content-Transfer-Encoding: 7bit
299 788 100       3504 if( lc($upperchunk) =~ ReE->{'some-iso2022'} ) {
300             # Content-Type: text/plain; charset=ISO-2022-JP
301 59         112 $getdecoded = ${ Sisimai::String->to_utf8(\$lowerchunk, $1) };
  59         396  
302              
303             } else {
304             # No "charset" parameter in Content-Type: field
305 729         1210 $getdecoded = $lowerchunk;
306             }
307             } else {
308             # Content-Transfer-Encoding: 8bit, binary, and so on
309 262         425 $getdecoded = $lowerchunk;
310             }
311 1297 100       3470 $getdecoded =~ s|\r\n|\n|g if index($getdecoded, "\r\n") > -1; # Convert CRLF to LF
312              
313 1297 100       6096 if( $mimeformat =~ $alsoappend ) {
    100          
314             # Append field when the value of Content-Type: begins with
315             # message/ or equals text/rfc822-headers.
316 608         2421 $upperchunk =~ s/Content-Transfer-Encoding:\s*[^\s]+//;
317 608 100       1979 $upperchunk =~ s/\A[ ]//g if substr($upperchunk, 0, 1) eq ' ';
318 608 50       2491 $upperchunk =~ s/[ ]\z//g if substr($upperchunk, -1, 1) eq ' ';
319 608         1233 $hasflatten .= $upperchunk;
320              
321             } elsif( $mimeformat eq 'text/html' ) {
322             # Delete HTML tags inside of text/html part whenever possible
323 21         42 $getdecoded = ${ Sisimai::String->to_plain(\$getdecoded) };
  21         190  
324             }
325 1297 50       4657 $hasflatten .= $getdecoded."\n\n" if length $getdecoded;
326             } else {
327             # Content-Type: text/plain OR text/rfc822-headers OR message/*
328 3433 100 100     8423 if( index($mimeformat, 'message/') == 0 || $mimeformat eq 'text/rfc822-headers' ) {
329             # Append headers of multipart/* when the value of "Content-Type"
330             # is inlucded in the following MIME types:
331             # - message/delivery-status
332             # - message/rfc822
333             # - text/rfc822-headers
334 2706         4618 $hasflatten .= $upperchunk;
335             }
336 3433         6740 $lowerchunk =~ s/^--\z//m;
337 3433 100       9299 $lowerchunk .= "\n" unless $lowerchunk =~ /\n\z/;
338 3433         7472 $hasflatten .= $lowerchunk;
339             }
340             }
341 4999         18405 return \$hasflatten;
342             }
343              
344             sub makeflat {
345             # MIME decode entire message body
346             # @param [String] argv0 Content-Type header
347             # @param [String] argv1 Entire message body
348             # @return [String] Decoded message body
349 1930     1930 1 6149 my $class = shift;
350 1930   100     3769 my $argv0 = shift // return undef;
351 1929   50     3567 my $argv1 = shift // return undef;
352              
353 1929         4293 my $ehboundary = __PACKAGE__->boundary($argv0, 0);
354 1929 100       17496 my $mimeformat = $argv0 =~ qr|\A([0-9a-z]+/[^ ;]+)| ? $1 : '';
355 1929         4365 my $bodystring = '';
356              
357 1929 100       4808 return \'' unless index($mimeformat, 'multipart/') > -1;
358 1898 100       3032 return \'' unless $ehboundary;
359              
360             # Some bounce messages include lower-cased "content-type:" field such as
361             # content-type: message/delivery-status
362             # content-transfer-encoding: quoted-printable
363 1885         28856 $$argv1 =~ s/[Cc]ontent-[Tt]ype:/Content-Type:/g;
364 1885         7820 $$argv1 =~ s/[Cc]ontent-[Tt]ransfer-[Ee]ncodeing:/Content-Transfer-Encoding:/g;
365              
366             # 1. Some bounce messages include upper-cased "Content-Transfer-Encoding",
367             # and "Content-Type" value such as
368             # - Content-Type: multipart/RELATED;
369             # - Content-Transfer-Encoding: 7BIT
370             # 2. Unused fields inside of mutipart/* block should be removed
371 1885         9818 $$argv1 =~ s/(Content-[A-Za-z-]+?):[ ]*([^\s]+)/$1.': '.lc($2)/eg;
  13964         57698  
372 1885         13960 $$argv1 =~ s/^Content-(?:Description|Disposition):.+?\n//gm;
373              
374 1885         45146 my @multiparts = split(/\Q$ehboundary\E\n?/, $$argv1);
375 1885 100       6249 shift @multiparts unless length $multiparts[0];
376 1885         3709 for my $e ( @multiparts ) {
377             # Find internal multipart blocks and decode
378             XCCT: {
379             # Remove fields except Content-Type, Content-Transfer-Encoding in
380             # each part such as the following:
381             # Date: Thu, 29 Apr 2018 22:22:22 +0900
382             # MIME-Version: 1.0
383             # Message-ID: ...
384             # Content-Transfer-Encoding: quoted-printable
385             # Content-Type: text/plain; charset=us-ascii
386             #
387             # Fields before "Content-Type:" in each part should have been removed
388             # and "Content-Type:" should be exist at the first line of each part.
389             # The field works as a delimiter to decode contents of each part.
390             #
391 8286 100       8709 last(XCCT) if $e =~ /\AContent-T[ry]/; # The first field is "Content-Type:"
  8286         16760  
392 3561 50 100     7299 my $p = $1 if $e =~ /\A(.+?)Content-Type:/s || last(XCCT);
393 51 100       176 last(XCCT) if $p =~ /\n\n/m; # There is no field before "Content-Type:"
394 29         205 $e =~ s/\A.+?(Content-T[ry].+)\z/$1/s; # Remove fields before "Content-Type:"
395             }
396              
397 8286 100       30542 if( $e =~ /\A(?:Content-[A-Za-z-]+:.+?\r?\n)?Content-Type:[ ]*[^\s]+/ ) {
398             # Content-Type: multipart/*
399 4754         5164 $bodystring .= ${ __PACKAGE__->breaksup(\$e, $mimeformat) };
  4754         9474  
400              
401             } else {
402             # Is not multipart/* block
403 3532         6143 $e =~ s|^Content-Transfer-Encoding:.+?\n||sim;
404 3532         4529 $e =~ s|^Content-Type:\s*text/plain.+?\n||sim;
405 3532         6001 $bodystring .= $e;
406             }
407             }
408              
409             # Remove entire message body of the original message beginning from
410             # Content-Type: message/rfc822 field so Sisimai does not read the message
411             # body for detecting a bounce reason, for getting email header fields of
412             # the original message.
413 1885         18246 $bodystring =~ s{^(Content-Type:\s*message/(?:rfc822|delivery-status)).+$}{$1}gm;
414 1885         16259 $bodystring =~ s|^\n{2,}|\n|gm;
415              
416 1885         6602 return \$bodystring;
417             }
418              
419             1;
420             __END__