File Coverage

lib/Sisimai/MIME.pm
Criterion Covered Total %
statement 201 208 96.6
branch 104 124 83.8
condition 33 48 68.7
subroutine 15 15 100.0
pod 7 7 100.0
total 360 402 89.5


line stmt bran cond sub pod time code
1             package Sisimai::MIME;
2 83     83   72636 use feature ':5.10';
  83         217  
  83         5987  
3 83     83   573 use strict;
  83         190  
  83         1904  
4 83     83   433 use warnings;
  83         243  
  83         2349  
5 83     83   1070 use Encode;
  83         11047  
  83         7750  
6 83     83   39902 use MIME::Base64 ();
  83         54954  
  83         2068  
7 83     83   35413 use MIME::QuotedPrint ();
  83         20825  
  83         1889  
8 83     83   869 use Sisimai::String;
  83         171  
  83         15995  
9 83         268486 use constant ReE => {
10             '7bit-encoded' => qr/^content-transfer-encoding:[ ]*7bit/m,
11             'quoted-print' => qr/^content-transfer-encoding:[ ]*quoted-printable/m,
12             'some-iso2022' => qr/^content-type:[ ]*.+;[ ]*charset=["']?(iso-2022-[-a-z0-9]+)['"]?\b/m,
13             'with-charset' => qr/^content[-]type:[ ]*.+[;][ ]*charset=['"]?([-0-9a-z]+)['"]?\b/,
14             'only-charset' => qr/^[\s\t]+charset=['"]?([-0-9a-z]+)['"]?\b/,
15             'html-message' => qr|^content-type:[ ]*text/html;|m,
16 83     83   712 };
  83         213  
17              
18             sub is_mimeencoded {
19             # Check that the argument is MIME-Encoded string or not
20             # @param [String] argv1 String to be checked
21             # @return [Integer] 0: Not MIME encoded string
22             # 1: MIME encoded string
23 8778     8778 1 13060 my $class = shift;
24 8778   50     15036 my $argv1 = shift || return undef;
25 8778 50       16855 return undef unless ref $argv1 eq 'SCALAR';
26              
27 8778         12698 my $text1 = $$argv1; $text1 =~ y/"//d;
  8778         11161  
28 8778         10442 my $mime1 = 0;
29 8778         9334 my @piece;
30              
31 8778 100       18040 if( rindex($text1, ' ') > -1 ) {
32             # Multiple MIME-Encoded strings in a line
33 3514         10713 @piece = split(' ', $text1);
34             } else {
35 5264         8581 push @piece, $text1;
36             }
37              
38 8778         12038 for my $e ( @piece ) {
39             # Check all the string in the array
40 20922 100       39082 next unless $e =~ /[ \t]*=[?][-_0-9A-Za-z]+[?][BbQq][?].+[?]=?[ \t]*/;
41 1276         2095 $mime1 = 1;
42             }
43 8778         25499 return $mime1;
44             }
45              
46             sub mimedecode {
47             # Decode MIME-Encoded string
48             # @param [Array] argvs Reference to an array including MIME-Encoded text
49             # @return [String] MIME-Decoded text
50 2692     2692 1 8302 my $class = shift;
51 2692         3492 my $argvs = shift;
52 2692 50       5528 return undef unless ref $argvs eq 'ARRAY';
53              
54 2692         3357 my $characterset = '';
55 2692         4259 my $encodingname = '';
56 2692         3210 my @decodedtext0;
57              
58 2692         4292 for my $e ( @$argvs ) {
59             # Check and decode each element
60 2909         6181 $e =~ s/\A[ \t]+//g;
61 2909         5322 $e =~ s/[ \t]+\z//g;
62 2909         4497 $e =~ y/"//d;
63              
64 2909 100       5355 if( __PACKAGE__->is_mimeencoded(\$e) ) {
65             # =?utf-8?B?55m954yr44Gr44KD44KT44GT?=
66 490 100       3181 next unless $e =~ m{\A(.*)=[?]([-_0-9A-Za-z]+)[?]([BbQq])[?](.+)[?]=?(.*)\z};
67 468   66     2585 $characterset ||= lc $2;
68 468   66     1830 $encodingname ||= uc $3;
69              
70 468         973 push @decodedtext0, $1;
71 468 100       3815 push @decodedtext0, $encodingname eq 'B'
72             ? MIME::Base64::decode($4)
73             : MIME::QuotedPrint::decode($4);
74 468         924 $decodedtext0[-1] =~ y/\r\n//d;
75 468         1231 push @decodedtext0, $5;
76              
77             } else {
78 2419 100       6489 push @decodedtext0, scalar @decodedtext0 ? ' '.$e : $e;
79             }
80             }
81 2692 50       5617 return '' unless scalar @decodedtext0;
82              
83 2692         5985 my $decodedtext1 = join('', @decodedtext0);
84 2692 100 66     6980 if( $characterset && $encodingname ) {
85             # utf-8 => utf8
86 403 100       995 $characterset = 'utf8' if $characterset eq 'utf-8';
87              
88 403 100       1131 if( $characterset ne 'utf8' ) {
89             # Characterset is not UTF-8
90 138         254 eval { Encode::from_to($decodedtext1, $characterset, 'utf8') };
  138         668  
91 138 50       29377 $decodedtext1 = 'FAILED TO CONVERT THE SUBJECT' if $@;
92             }
93             }
94 2692         7842 return $decodedtext1;
95             }
96              
97             sub qprintd {
98             # Decode MIME Quoted-Printable Encoded string
99             # @param [String] argv1 MIME Encoded text
100             # @param [Hash] heads Email header
101             # @return [String] MIME Decoded text
102 245     245 1 2774 my $class = shift;
103 245   50     748 my $argv1 = shift // return undef;
104 245   100     1008 my $heads = shift // {};
105 245         488 my $plain = '';
106 245 50       817 return \'' unless ref $argv1 eq 'SCALAR';
107              
108 245 100 66     909 if( ! exists $heads->{'content-type'} || ! $heads->{'content-type'} ) {
109             # There is no Content-Type: field
110 244         4811 $plain = MIME::QuotedPrint::decode($$argv1);
111 244         1459 return \$plain;
112             }
113              
114             # Quoted-printable encoded part is the part of the text
115 1         5 my $boundary00 = __PACKAGE__->boundary($heads->{'content-type'}, 0);
116 1 50 33     19 if( ! $boundary00 || lc($$argv1) !~ ReE->{'quoted-print'} ) {
117             # There is no boundary string or no
118             # Content-Transfer-Encoding: quoted-printable field.
119 0         0 $plain = MIME::QuotedPrint::decode($$argv1);
120 0         0 return \$plain;
121             }
122              
123 1         3 my $boundary01 = Sisimai::MIME->boundary($heads->{'content-type'}, 1);
124 1         4 my $bodystring = '';
125 1         2 my $notdecoded = '';
126              
127 1         1 my $encodename = undef;
128 1         1 my $ctencoding = undef;
129 1         2 my $mimeinside = 0;
130              
131 1         14 for my $e ( split("\n", $$argv1) ) {
132             # This is a multi-part message in MIME format. Your mail reader does not
133             # understand MIME message format.
134             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
135             # Content-Type: text/plain; charset=iso-8859-15
136             # Content-Transfer-Encoding: quoted-printable
137 31 100       44 if( $mimeinside ) {
138             # Quoted-Printable encoded text block
139 21 100       28 if( $e eq $boundary00 ) {
140             # The next boundary string has appeared
141             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
142 1         16 my $hasdecoded = MIME::QuotedPrint::decode($notdecoded);
143 1         11 $hasdecoded = Sisimai::String->to_utf8(\$hasdecoded, $encodename);
144 1         4 $bodystring .= $$hasdecoded;
145 1         4 $bodystring .= $e . "\n";
146              
147 1         1 $notdecoded = '';
148 1         2 $mimeinside = 0;
149 1         1 $ctencoding = undef;
150 1         2 $encodename = undef;
151             } else {
152             # Inside of Queoted printable encoded text
153 20         38 $notdecoded .= $e . "\n";
154             }
155             } else {
156             # NOT Quoted-Printable encoded text block
157 10 100 66     116 if( (my $lowercased = lc $e) =~ /\A[-]{2}[^\s]+[^-]\z/ ) {
    100          
    100          
    50          
158             # Start of the boundary block
159             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ
160 1 50       5 unless( $e eq $boundary00 ) {
161             # New boundary string has appeared
162 0         0 $boundary00 = $e;
163 0         0 $boundary01 = $e . '--';
164             }
165             } elsif( $lowercased =~ ReE->{'with-charset'} || $lowercased =~ ReE->{'only-charset'} ) {
166             # Content-Type: text/plain; charset=ISO-2022-JP
167 1         4 $encodename = $1;
168 1 50       3 $mimeinside = 1 if $ctencoding;
169              
170             } elsif( $lowercased =~ ReE->{'quoted-print'} ) {
171             # Content-Transfer-Encoding: quoted-printable
172 1         2 $ctencoding = $e;
173 1 50       3 $mimeinside = 1 if $encodename;
174              
175             } elsif( $e eq $boundary01 ) {
176             # The end of boundary block
177             # --=_gy7C4Gpes0RP4V5Bs9cK4o2Us2ZT57b-3OLnRN+4klS8dTmQ--
178 0         0 $mimeinside = 0;
179             }
180 10         29 $bodystring .= $e . "\n";
181             }
182             }
183              
184 1 50       10 $bodystring .= $notdecoded if length $notdecoded;
185 1         4 return \$bodystring;
186             }
187              
188             sub base64d {
189             # Decode MIME BASE64 Encoded string
190             # @param [String] argv1 MIME Encoded text
191             # @return [String] MIME-Decoded text
192 35     35 1 377 my $class = shift;
193 35   50     115 my $argv1 = shift // return undef;
194 35 50       139 return \'' unless ref $argv1 eq 'SCALAR';
195              
196             # Decode BASE64
197 35 50       446 my $plain = $$argv1 =~ m|([+/=0-9A-Za-z\r\n]+)| ? MIME::Base64::decode($1) : '';
198 35         132 return \$plain;
199             }
200              
201             sub boundary {
202             # Get boundary string
203             # @param [String] argv1 The value of Content-Type header
204             # @param [Integer] start -1: boundary string itself
205             # 0: Start of boundary
206             # 1: End of boundary
207             # @return [String] Boundary string
208 2347     2347 1 3923 my $class = shift;
209 2347   50     4167 my $argv1 = shift || return undef;
210 2347   100     4233 my $start = shift // -1;
211 2347         2955 my $value = '';
212              
213 2347 100       13092 if( lc $argv1 =~ /\bboundary=([^ ]+)/ ) {
214             # Content-Type: multipart/mixed; boundary=Apple-Mail-5--931376066
215             # Content-Type: multipart/report; report-type=delivery-status;
216             # boundary="n6H9lKZh014511.1247824040/mx.example.jp"
217 2302         6366 $value = $1;
218 2302         4251 $value =~ y/"';\\//d;
219 2302 100       6763 $value = '--'.$value if $start > -1;
220 2302 100       4965 $value = $value.'--' if $start > 0;
221             }
222 2347         5196 return $value;
223             }
224              
225             sub breaksup {
226             # Breaks up each multipart/* block
227             # @param [String] argv0 Text block of multipart/*
228             # @param [String] argv1 MIME type of the outside part
229             # @return [String] Decoded part as a plain text(text part only)
230 5068     5068 1 9386 my $class = shift;
231 5068   100     8924 my $argv0 = shift || return undef;
232 5067   50     8164 my $argv1 = shift || '';
233              
234 5067         5485 state $alsoappend = qr{\A(?:text/rfc822-headers|message/)};
235 5067         5062 state $thisformat = qr/\A(?:Content-Transfer-Encoding:\s*.+\n)?Content-Type:\s*([^ ;\s]+)/;
236 5067         5136 state $leavesonly = qr{\A(?>
237             text/(?:plain|html|rfc822-headers)
238             |message/(?:x?delivery-status|rfc822|partial|feedback-report)
239             |multipart/(?:report|alternative|mixed|related|partial)
240             )
241             }x;
242              
243 5067 50       32344 my $mimeformat = $$argv0 =~ $thisformat ? lc($1) : '';
244 5067 100       11157 my $alternates = index($argv1, 'multipart/alternative') == 0 ? 1 : 0;
245 5067         6098 my $hasflatten = ''; # Message body including only text/plain and message/*
246              
247             # Sisimai require only MIME types defined in $leavesonly variable
248 5067 100       22659 return \'' unless $mimeformat =~ $leavesonly;
249 5062 50 66     10454 return \'' if $alternates && $mimeformat eq 'text/html';
250              
251 5062         25899 my ($upperchunk, $lowerchunk) = split(/^$/m, $$argv0, 2);
252 5062         9187 $upperchunk =~ y/\n/ /;
253 5062         7410 $upperchunk =~ y/ //s;
254              
255             # Content-Description: Undelivered Message
256             # Content-Type: message/rfc822
257             #
258 5062   50     8463 $lowerchunk ||= '';
259              
260 5062 100       9011 if( index($mimeformat, 'multipart/') == 0 ) {
261             # Content-Type: multipart/*
262 269         997 my $mpboundary = __PACKAGE__->boundary($upperchunk, 0);
263 269         5566 my @innerparts = split(/\Q$mpboundary\E\n/, $lowerchunk);
264 269 50       1092 shift @innerparts unless length $innerparts[0];
265 269 100       769 shift @innerparts if $innerparts[0] eq "\n";
266              
267 269         573 for my $e ( @innerparts ) {
268             # Find internal multipart/* blocks and decode
269 546 100       2653 if( $e =~ $thisformat ) {
270             # Found Content-Type field at the first or second line of this
271             # split part
272 309         915 my $nextformat = lc $1;
273 309 100       1508 next unless $nextformat =~ $leavesonly;
274 308 100       740 next if $nextformat eq 'text/html';
275 285         371 $hasflatten .= ${ __PACKAGE__->breaksup(\$e, $mimeformat) };
  285         987  
276              
277             } else {
278             # The content of this part is almost '--': a part of boundary
279             # string which is used for splitting multipart/* blocks.
280 237         591 $hasflatten .= "\n";
281             }
282             }
283             } else {
284             # Is not "Content-Type: multipart/*"
285 4793 100       11009 if( $upperchunk =~ /Content-Transfer-Encoding: ([^\s;]+)/ ) {
286             # Content-Transfer-Encoding: quoted-printable|base64|7bit|...
287 1288         2349 my $getdecoded = '';
288              
289 1288 100       5674 if( (my $ctencoding = lc $1) eq 'quoted-printable' ) {
    100          
    100          
290             # Content-Transfer-Encoding: quoted-printable
291 197         359 $getdecoded = ${ __PACKAGE__->qprintd(\$lowerchunk) };
  197         751  
292              
293             } elsif( $ctencoding eq 'base64' ) {
294             # Content-Transfer-Encoding: base64
295 29         147 $getdecoded = ${ __PACKAGE__->base64d(\$lowerchunk) };
  29         169  
296              
297             } elsif( $ctencoding eq '7bit' ) {
298             # Content-Transfer-Encoding: 7bit
299 788 100       4154 if( lc($upperchunk) =~ ReE->{'some-iso2022'} ) {
300             # Content-Type: text/plain; charset=ISO-2022-JP
301 59         144 $getdecoded = ${ Sisimai::String->to_utf8(\$lowerchunk, $1) };
  59         476  
302              
303             } else {
304             # No "charset" parameter in Content-Type: field
305 729         1254 $getdecoded = $lowerchunk;
306             }
307             } else {
308             # Content-Transfer-Encoding: 8bit, binary, and so on
309 274         616 $getdecoded = $lowerchunk;
310             }
311 1288 100       3312 $getdecoded =~ s|\r\n|\n|g if index($getdecoded, "\r\n") > -1; # Convert CRLF to LF
312              
313 1288 100       6145 if( $mimeformat =~ $alsoappend ) {
    50          
314             # Append field when the value of Content-Type: begins with
315             # message/ or equals text/rfc822-headers.
316 614         2641 $upperchunk =~ s/Content-Transfer-Encoding:\s*[^\s]+//;
317 614 100       2371 $upperchunk =~ s/\A[ ]//g if substr($upperchunk, 0, 1) eq ' ';
318 614 50       2656 $upperchunk =~ s/[ ]\z//g if substr($upperchunk, -1, 1) eq ' ';
319 614         1243 $hasflatten .= $upperchunk;
320              
321             } elsif( $mimeformat eq 'text/html' ) {
322             # Delete HTML tags inside of text/html part whenever possible
323 0         0 $getdecoded = ${ Sisimai::String->to_plain(\$getdecoded) };
  0         0  
324             }
325 1288 50       5640 $hasflatten .= $getdecoded."\n\n" if length $getdecoded;
326             } else {
327             # Content-Type: text/plain OR text/rfc822-headers OR message/*
328 3505 100 100     9411 if( index($mimeformat, 'message/') == 0 || $mimeformat eq 'text/rfc822-headers' ) {
329             # Append headers of multipart/* when the value of "Content-Type"
330             # is inlucded in the following MIME types:
331             # - message/delivery-status
332             # - message/rfc822
333             # - text/rfc822-headers
334 2760         5729 $hasflatten .= $upperchunk;
335             }
336 3505         7911 $lowerchunk =~ s/^--\z//m;
337 3505 100       10448 $lowerchunk .= "\n" unless $lowerchunk =~ /\n\z/;
338 3505         10133 $hasflatten .= $lowerchunk;
339             }
340             }
341 5062         22490 return \$hasflatten;
342             }
343              
344             sub makeflat {
345             # MIME decode entire message body
346             # @param [String] argv0 Content-Type header
347             # @param [String] argv1 Entire message body
348             # @return [String] Decoded message body
349 1960     1960 1 6788 my $class = shift;
350 1960   100     4221 my $argv0 = shift // return undef;
351 1959   50     3631 my $argv1 = shift // return undef;
352              
353 1959         3918 my $ehboundary = __PACKAGE__->boundary($argv0, 0);
354 1959 100       18801 my $mimeformat = $argv0 =~ qr|\A([0-9a-z]+/[^ ;]+)| ? $1 : '';
355 1959         4707 my $bodystring = '';
356              
357 1959 100       5092 return \'' unless index($mimeformat, 'multipart/') > -1;
358 1928 100       3979 return \'' unless $ehboundary;
359              
360             # Some bounce messages include lower-cased "content-type:" field such as
361             # content-type: message/delivery-status
362             # content-transfer-encoding: quoted-printable
363 1915         32102 $$argv1 =~ s/[Cc]ontent-[Tt]ype:/Content-Type:/g;
364 1915         7653 $$argv1 =~ s/[Cc]ontent-[Tt]ransfer-[Ee]ncodeing:/Content-Transfer-Encoding:/g;
365              
366             # 1. Some bounce messages include upper-cased "Content-Transfer-Encoding",
367             # and "Content-Type" value such as
368             # - Content-Type: multipart/RELATED;
369             # - Content-Transfer-Encoding: 7BIT
370             # 2. Unused fields inside of mutipart/* block should be removed
371 1915         11570 $$argv1 =~ s/(Content-[A-Za-z-]+?):[ ]*([^\s]+)/$1.': '.lc($2)/eg;
  13063         65148  
372 1915         14758 $$argv1 =~ s/^Content-(?:Description|Disposition):.+?\n//gm;
373              
374 1915         49428 my @multiparts = split(/\Q$ehboundary\E\n?/, $$argv1);
375 1915 100       6652 shift @multiparts unless length $multiparts[0];
376 1915         3698 for my $e ( @multiparts ) {
377             # Find internal multipart blocks and decode
378             XCCT: {
379             # Remove fields except Content-Type, Content-Transfer-Encoding in
380             # each part such as the following:
381             # Date: Thu, 29 Apr 2018 22:22:22 +0900
382             # MIME-Version: 1.0
383             # Message-ID: ...
384             # Content-Transfer-Encoding: quoted-printable
385             # Content-Type: text/plain; charset=us-ascii
386             #
387             # Fields before "Content-Type:" in each part should have been removed
388             # and "Content-Type:" should be exist at the first line of each part.
389             # The field works as a delimiter to decode contents of each part.
390             #
391 8400 100       10207 last(XCCT) if $e =~ /\AContent-T[ry]/; # The first field is "Content-Type:"
  8400         20298  
392 3648 50 100     8901 my $p = $1 if $e =~ /\A(.+?)Content-Type:/s || last(XCCT);
393 51 100       278 last(XCCT) if $p =~ /\n\n/m; # There is no field before "Content-Type:"
394 29         246 $e =~ s/\A.+?(Content-T[ry].+)\z/$1/s; # Remove fields before "Content-Type:"
395             }
396              
397 8400 100       34537 if( $e =~ /\A(?:Content-[A-Za-z-]+:.+?\r?\n)?Content-Type:[ ]*[^\s]+/ ) {
398             # Content-Type: multipart/*
399 4781         6150 $bodystring .= ${ __PACKAGE__->breaksup(\$e, $mimeformat) };
  4781         10164  
400              
401             } else {
402             # Is not multipart/* block
403 3619         6522 $e =~ s|^Content-Transfer-Encoding:.+?\n||sim;
404 3619         4952 $e =~ s|^Content-Type:\s*text/plain.+?\n||sim;
405 3619         8155 $bodystring .= $e;
406             }
407             }
408              
409             # Remove entire message body of the original message beginning from
410             # Content-Type: message/rfc822 field so Sisimai does not read the message
411             # body for detecting a bounce reason, for getting email header fields of
412             # the original message.
413 1915         23278 $bodystring =~ s{^(Content-Type:\s*message/(?:rfc822|delivery-status)).+$}{$1}gm;
414 1915         19790 $bodystring =~ s|^\n{2,}|\n|gm;
415              
416 1915         8405 return \$bodystring;
417             }
418              
419             1;
420             __END__