File Coverage

blib/lib/Mail/SpamAssassin/Message/Node.pm
Criterion Covered Total %
statement 263 420 62.6
branch 97 212 45.7
condition 40 85 47.0
subroutine 25 30 83.3
pod 16 18 88.8
total 441 765 57.6


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts
21              
22             =head1 DESCRIPTION
23              
24             This module will encapsulate an email message and allow access to
25             the various MIME message parts.
26              
27             =head1 PUBLIC METHODS
28              
29             =over 4
30              
31             =cut
32              
33              
34             use strict;
35 41     41   253 use warnings;
  41         72  
  41         1195  
36 41     41   219 use re 'taint';
  41         77  
  41         1325  
37 41     41   228  
  41         94  
  41         1607  
38             require 5.008001; # needs utf8::is_utf8()
39              
40             use Mail::SpamAssassin;
41 41     41   237 use Mail::SpamAssassin::Constants qw(:sa);
  41         74  
  41         1047  
42 41     41   212 use Mail::SpamAssassin::HTML;
  41         77  
  41         6042  
43 41     41   11179 use Mail::SpamAssassin::Logger;
  41         140  
  41         1490  
44 41     41   248  
  41         73  
  41         6026  
45             our($enc_utf8, $enc_w1252, $have_encode_detector);
46             BEGIN {
47             eval { require Encode }
48 41         20823 and do { $enc_utf8 = Encode::find_encoding('UTF-8');
49 41 50   41   124 $enc_w1252 = Encode::find_encoding('Windows-1252') };
  41         340379  
50 41         5746 eval { require Encode::Detect::Detector }
51 41         190808 and do { $have_encode_detector = 1 };
52 41 50       12972 };
  0         0  
53              
54             =item new()
55              
56             Generates an empty Node object and returns it. Typically only called
57             by functions in Message.
58              
59             =cut
60              
61             my $class = shift;
62             $class = ref($class) || $class;
63 196     196 1 438  
64 196   33     802 my $self = {
65             headers => {},
66 196         1136 raw_headers => {},
67             header_order => []
68             };
69              
70             # deal with any parameters
71             my($opts) = @_;
72             $self->{normalize} = $opts->{'normalize'} || 0;
73 196         438  
74 196   50     928 bless($self,$class);
75             $self;
76 196         544 }
77 196         534  
78             =item find_parts()
79              
80             Used to search the tree for specific MIME parts. An array of matching
81             Node objects (pointers into the tree) is returned. The parameters that
82             can be passed in are (in order, all scalars):
83              
84             Regexp - Used to match against each part's Content-Type header,
85             specifically the type and not the rest of the header. ie: "Content-type:
86             text/html; encoding=quoted-printable" has a type of "text/html". If no
87             regexp is specified, find_parts() will return an empty array.
88              
89             Only_leaves - By default, find_parts() will return any part that matches
90             the regexp, including multipart. If you only want to see leaves of the
91             tree (ie: parts that aren't multipart), set this to true (1).
92              
93             Recursive - By default, when find_parts() finds a multipart which has
94             parts underneath it, it will recurse through all sub-children. If set to 0,
95             only look at the part and any direct children of the part.
96              
97             =cut
98              
99             # Used to find any MIME parts whose simple content-type matches a given regexp
100             # Searches it's own and any children parts. Returns an array of MIME
101             # objects which match. Our callers may expect the default behavior which is a
102             # depth-first array of parts.
103             #
104             my ($self, $re, $onlyleaves, $recursive) = @_;
105              
106             # Didn't pass an RE? Just abort.
107 256     256 1 649 return () unless defined $re && $re ne '';
108              
109             $onlyleaves = 0 unless defined $onlyleaves;
110 256 50 33     1363  
111             my $depth;
112 256 50       558 if (defined $recursive && $recursive == 0) {
113             $depth = 1;
114 256         420 }
115 256 50 33     637
116 0         0 my @ret;
117             my @search = ( $self );
118              
119 256         390 while (my $part = shift @search) {
120 256         541 # If this object matches, mark it for return.
121             my $amialeaf = $part->is_leaf();
122 256         676  
123             if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) {
124 290         897 push(@ret, $part);
125             }
126 290 100 66     2909
      100        
127 271         517 if ( !$amialeaf && (!defined $depth || $depth > 0)) {
128             $depth-- if defined $depth;
129             unshift(@search, @{$part->{'body_parts'}});
130 290 50 33     1059 }
      66        
131 19 50       41 }
132 19         25  
  19         64  
133             return @ret;
134             }
135              
136 256         995 =item header()
137              
138             Stores and retrieves headers from a specific MIME part. The first
139             parameter is the header name. If there is no other parameter, the header
140             is retrieved. If there is a second parameter, the header is stored.
141              
142             Header names are case-insensitive and are stored in both raw and
143             decoded form. Using header(), only the decoded form is retrievable.
144              
145             For retrieval, if header() is called in an array context, an array will
146             be returned with each header entry in a different element. In a scalar
147             context, the last specific header is returned.
148              
149             ie: If 'Subject' is specified as the header, and there are 2 Subject
150             headers in a message, the last/bottom one in the message is returned in
151             scalar context or both are returned in array context.
152              
153             =cut
154              
155             # Store or retrieve headers from a given MIME object
156             #
157             my $self = shift;
158             my $rawkey = shift;
159              
160             return unless defined $rawkey;
161 5663     5663 1 6921  
162 5663         7399 # we're going to do things case insensitively
163             my $key = lc($rawkey);
164 5663 50       11217  
165             # Trim whitespace off of the header keys
166             $key =~ s/^\s+//;
167 5663         10021 $key =~ s/\s+$//;
168              
169             if (@_) {
170 5663         11025 my $raw_value = shift;
171 5663         8979 return unless defined $raw_value;
172              
173 5663 100       9572 push @{ $self->{'header_order'} }, $rawkey;
174 909         1578 if ( !exists $self->{'headers'}->{$key} ) {
175 909 50       1797 $self->{'headers'}->{$key} = [];
176             $self->{'raw_headers'}->{$key} = [];
177 909         1123 }
  909         2059  
178 909 100       2317  
179 835         2315 my $dec_value = $raw_value;
180 835         1764 $dec_value =~ s/\n[ \t]+/ /gs;
181             $dec_value =~ s/\s+$//s;
182             $dec_value =~ s/^\s+//s;
183 909         1494 push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key);
184 909         2282  
185 909         3755 push @{ $self->{'raw_headers'}->{$key} }, $raw_value;
186 909         2371  
187 909         1258 return $self->{'headers'}->{$key}->[-1];
  909         2431  
188             }
189 909         1390  
  909         2051  
190             if (wantarray) {
191 909         4000 return unless exists $self->{'headers'}->{$key};
192             return @{ $self->{'headers'}->{$key} };
193             }
194 4754 100       7358 else {
195 4441 100       14736 return '' unless exists $self->{'headers'}->{$key};
196 1002         1210 return $self->{'headers'}->{$key}->[-1];
  1002         5415  
197             }
198             }
199 313 100       2295  
200 48         318 =item raw_header()
201              
202             Retrieves the raw version of headers from a specific MIME part. The only
203             parameter is the header name. Header names are case-insensitive.
204              
205             For retrieval, if raw_header() is called in an array context, an array
206             will be returned with each header entry in a different element. In a
207             scalar context, the last specific header is returned.
208              
209             ie: If 'Subject' is specified as the header, and there are 2 Subject
210             headers in a message, the last/bottom one in the message is returned in
211             scalar context or both are returned in array context.
212              
213             =cut
214              
215             # Retrieve raw headers from a given MIME object
216             #
217             my $self = shift;
218             my $key = lc(shift);
219              
220             # Trim whitespace off of the header keys
221             $key =~ s/^\s+//;
222 633     633 1 855 $key =~ s/\s+$//;
223 633         992  
224             if (wantarray) {
225             return unless exists $self->{'raw_headers'}->{$key};
226 633         1248 return @{ $self->{'raw_headers'}->{$key} };
227 633         1015 }
228             else {
229 633 50       1133 return '' unless exists $self->{'raw_headers'}->{$key};
230 633 100       1948 return $self->{'raw_headers'}->{$key}->[-1];
231 139         237 }
  139         807  
232             }
233              
234 0 0       0 =item add_body_part()
235 0         0  
236             Adds a Node child object to the current node object.
237              
238             =cut
239              
240             # Add a MIME child part to ourselves
241             my($self, $part) = @_;
242              
243             dbg("message: added part, type: ".$part->{'type'});
244             push @{ $self->{'body_parts'} }, $part;
245             }
246              
247 50     50 1 93 =item is_leaf()
248              
249 50         164 Returns true if the tree node in question is a leaf of the tree (ie:
250 50         58 has no children of its own). Note: This function may return odd results
  50         116  
251             unless the message has been mime parsed via _do_parse()!
252              
253             =cut
254              
255             my($self) = @_;
256             return !exists $self->{'body_parts'};
257             }
258              
259             =item raw()
260              
261             Return a reference to the raw array. Treat this as READ ONLY.
262 290     290 1 509  
263 290         787 =cut
264              
265             my $self = shift;
266              
267             # Ok, if we're called we are expected to return an array.
268             # so if it's a file reference, read in the message into an array...
269             #
270             # NOTE: that "ref undef" works, so don't bother checking for a defined var
271             # first.
272             if (ref $self->{'raw'} eq 'GLOB') {
273 0     0 1 0 my $fd = $self->{'raw'};
274             seek($fd, 0, 0) or die "message: cannot rewind file: $!";
275              
276             # dbg("message: (raw) reading mime part from a temporary file");
277             my($nread,$raw_str); $raw_str = '';
278             while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { }
279             defined $nread or die "error reading: $!";
280 0 0       0 my @array = split(/^/m, $raw_str, -1);
281 0         0  
282 0 0       0 dbg("message: empty message read") if $raw_str eq '';
283             return \@array;
284             }
285 0         0  
  0         0  
286 0         0 return $self->{'raw'};
287 0 0       0 }
288 0         0  
289             =item decode()
290 0 0       0  
291 0         0 If necessary, decode the part text as base64 or quoted-printable.
292             The decoded text will be returned as a scalar string. An optional length
293             parameter can be passed in which limits how much decoded data is returned.
294 0         0 If the scalar isn't needed, call with "0" as a parameter.
295              
296             =cut
297              
298             my($self, $bytes) = @_;
299              
300             if ( !exists $self->{'decoded'} ) {
301             # Someone is looking for a decoded part where there is no raw data
302             # (multipart or subparsed message, etc.) Just return undef.
303             return if !exists $self->{'raw'};
304              
305             my $raw;
306              
307 254     254 1 3692 # if the part is held in a temp file, read it into the scalar
308             if (ref $self->{'raw'} eq 'GLOB') {
309 254 100       678 my $fd = $self->{'raw'};
310             seek($fd, 0, 0) or die "message: cannot rewind file: $!";
311              
312 146 50       414 # dbg("message: (decode) reading mime part from a temporary file");
313             my($nread,$raw_str); $raw = '';
314 146         234 while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { }
315             defined $nread or die "error reading: $!";
316              
317 146 100       473 dbg("message: empty message read from a temp file") if $raw eq '';
318 4         5 }
319 4 50       36 else {
320             # create a new scalar from the raw array in memory
321             $raw = join('', @{$self->{'raw'}});
322 4         9 }
  4         5  
323 4         234  
324 4 50       16 my $encoding = lc $self->header('content-transfer-encoding') || '';
325              
326 4 50       16 if ( $encoding eq 'quoted-printable' ) {
327             dbg("message: decoding quoted-printable");
328             $self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
329             $self->{'decoded'} =~ s/\015\012/\012/gs;
330 142         260 }
  142         1450  
331             elsif ( $encoding eq 'base64' ) {
332             dbg("message: decoding base64");
333 146   100     559  
334             # if it's not defined or is 0, do the whole thing, otherwise only decode
335 146 100       711 # a portion
    100          
336 6         25 if ($bytes) {
337 6         39 return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
338 6         36 }
339             else {
340             # Generate the decoded output
341 5         17 $self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
342             }
343              
344             if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
345 5 50       7 $self->{'decoded'} =~ s/\015\012/\012/gs;
346 0         0 }
347             }
348             else {
349             # Encoding is one of 7bit, 8bit, binary or x-something
350 5         19 if ( $encoding ) {
351             dbg("message: decoding other encoding type ($encoding), ignoring");
352             }
353 5 100       26 else {
354 1         21 dbg("message: no encoding detected");
355             }
356             $self->{'decoded'} = $raw;
357             }
358             }
359 135 100       316  
360 33         197 if ( !defined $bytes || $bytes ) {
361             if ( !defined $bytes ) {
362             # force a copy
363 102         351 return '' . $self->{'decoded'};
364             }
365 135         727 else {
366             return substr($self->{'decoded'}, 0, $bytes);
367             }
368             }
369 254 100 66     1244 }
370 250 50       599  
371             # Detect endianness of UTF-16 encoded data
372 250         2079 my $data = $_[0]; # could not avoid copying large strings
373             my $utf16le_clues = 0;
374             my $utf16be_clues = 0;
375 0         0 my $sum_h_e = 0;
376             my $sum_h_o = 0;
377             my $sum_l_e = 0;
378             my $sum_l_o = 0;
379             my $decoder = undef;
380              
381             my @msg_h = unpack 'H' x length( $data ), $data;
382 0     0 0 0 my @msg_l = unpack 'h' x length( $data ), $data;
383 0         0  
384 0         0 for( my $i = 0; $i < length( $data ); $i+=2 ) {
385 0         0 my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
386 0         0 $sum_h_e += hex $msg_h[$i];
387 0         0 $sum_h_o += hex $msg_h[$i+1];
388 0         0 $sum_l_e += hex $msg_l[$i];
389 0         0 $sum_l_o += hex $msg_l[$i+1];
390             if( $check_char =~ /20 00/ ) {
391 0         0 # UTF-16LE space char detected
392 0         0 $utf16le_clues++;
393             }
394 0         0 if( $check_char =~ /00 20/ ) {
395 0         0 # UTF-16BE space char detected
396 0         0 $utf16be_clues++;
397 0         0 }
398 0         0 }
399 0         0  
400 0 0       0 # If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
401             $utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
402 0         0  
403             # If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
404 0 0       0 $utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
405              
406 0         0 if( $utf16le_clues > $utf16be_clues ) {
407             dbg( "message: detect_utf16: UTF-16LE" );
408             $decoder = Encode::find_encoding("UTF-16LE");
409             } elsif( $utf16le_clues > $utf16be_clues ) {
410             dbg( "message: detect_utf16: UTF-16BE" );
411 0 0       0 $decoder = Encode::find_encoding("UTF-16BE");
412             } else {
413             dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
414 0 0       0 }
415              
416 0 0       0 return $decoder;
    0          
417 0         0 }
418 0         0  
419             # Look at a text scalar and determine whether it should be rendered
420 0         0 # as text/html.
421 0         0 #
422             # This is not a public function.
423 0         0 #
424             if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
425             {
426 0         0 my $pad = $1;
427             my $count = 0;
428             $count += ($pad =~ tr/\n//d) * 2;
429             $count += ($pad =~ tr/\n//cd);
430             return ($count < 24);
431             }
432             return 0;
433             }
434              
435 0 0   0   0 # Decode character set of a given text to perl characters (Unicode),
436             # then encode into UTF-8 octets if requested.
437 0         0 #
438 0         0 # my $data = $_[0]; # avoid copying large strings
439 0         0 my $charset_declared = $_[1];
440 0         0 my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets
441 0         0  
442             warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
443 0         0 if utf8::is_utf8($_[0]);
444              
445             # workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
446             my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
447              
448             if (!defined $charset_declared || $charset_declared eq '') {
449             $charset_declared = 'us-ascii';
450             }
451 10     10   19  
452 10         13 # number of characters with code above 127
453             my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
454 10 50       35  
455             if (!$cnt_8bits &&
456             $charset_declared =~
457             /^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
458 10         39 ISO646-US )\z/xsi)
459             { # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
460 10 50 33     51 dbg("message: kept, charset is US-ASCII as declared");
461 0         0 return $_[0]; # is all-ASCII, no need for decoding
462             }
463              
464             if (!$cnt_8bits &&
465 10         23 $charset_declared =~
466             /^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
467 10 50 33     68 UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} |
468             Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
469             { # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
470             dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared);
471             return $_[0]; # is all-ASCII, no need for decoding
472 0         0 }
473 0         0  
474             # Try first to strictly decode based on a declared character set.
475              
476 10 100 66     74 my $rv;
477             if ($charset_declared =~ /^UTF-?8\z/i) {
478             # attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC)
479             if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
480             dbg("message: decoded as declared charset UTF-8");
481             return $_[0] if !$return_decoded;
482 8         24 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
483 8         58 return $rv; # decoded
484             } else {
485             my $err = '';
486             if ($@) {
487             $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
488 2         9 $err = " ($err)";
489 2 50 33     20 }
    50          
    50          
    50          
490             dbg("message: failed decoding as declared charset UTF-8 ($err)");
491 0 0       0 }
  0         0  
  0         0  
492 0         0  
493 0 0       0 } elsif ($charset_declared =~ /^UTF[ -]?16/i) {
494 0         0 # Handle cases where spammers use UTF-16 encoding without including a BOM
495 0         0 # or declaring endianness as reported at:
496             # https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
497 0         0  
498 0 0       0 my $decoder = detect_utf16( $_[0] );
499 0         0 if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) {
  0         0  
  0         0  
500 0         0 dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name);
501             return $_[0] if !$return_decoded;
502 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
503             return $rv; # decoded
504             } else {
505             my $err = '';
506             if ($@) {
507             $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
508             $err = " ($err)";
509             }
510 0         0 dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err);
511 0 0       0 };
  0         0  
  0         0  
512 0         0  
513 0 0       0 } elsif ($cnt_8bits &&
514 0         0 eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
515 0         0 dbg("message: decoded as charset UTF-8, declared %s", $charset_declared);
516             return $_[0] if !$return_decoded;
517 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
518 0 0       0 return $rv; # decoded
519 0         0  
  0         0  
  0         0  
520 0         0 } elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) {
521             # declared as US-ASCII but contains 8-bit characters, makes no sense
522 0         0 # to attempt decoding first as strict US-ASCII as we know it would fail
523              
524             } else {
525             # try decoding as a declared character set
526 0         0  
  0         0  
527 0         0 # -> http://en.wikipedia.org/wiki/Windows-1252
528 0 0       0 # Windows-1252 character encoding is a superset of ISO 8859-1, but differs
529 0         0 # from the IANA's ISO-8859-1 by using displayable characters rather than
530 0         0 # control characters in the 80 to 9F (hex) range. [...]
531             # It is very common to mislabel Windows-1252 text with the charset label
532             # ISO-8859-1. A common result was that all the quotes and apostrophes
533             # (produced by "smart quotes" in word-processing software) were replaced
534             # with question marks or boxes on non-Windows operating systems, making
535             # text difficult to read. Most modern web browsers and e-mail clients
536             # treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
537             # such mislabeling. This is now standard behavior in the draft HTML 5
538             # specification, which requires that documents advertised as ISO-8859-1
539             # actually be parsed with the Windows-1252 encoding.
540             #
541             my($chset, $decoder);
542             if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
543             $chset = 'Windows-1252'; $decoder = $enc_w1252;
544             } else {
545             $chset = $charset_declared; $decoder = Encode::find_encoding($chset);
546             if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
547             $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
548             dbg("message: no decoder for a declared charset %s, using GBK",
549             $chset) if $decoder;
550             }
551             }
552             if (!$decoder) {
553 2         7 dbg("message: failed decoding, no decoder for a declared charset %s",
554 2 50       13 $chset);
555 0         0 } else {
  0         0  
556             my $err = '';
557 2         4 eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  2         14  
558 2 50 33     6805 if ($@) {
559 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
560 0 0       0 $err = " ($err)";
561             }
562             if (lc $chset eq lc $charset_declared) {
563             dbg("message: %s as declared charset %s%s",
564 2 50       5 defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err);
565 0         0 } else {
566             dbg("message: %s as charset %s, declared %s%s",
567             defined $rv ? 'decoded' : 'failed decoding',
568 2         4 $chset, $charset_declared, $err);
569 2         4 }
  2         7  
570 2 50       196 }
571 0         0 }
  0         0  
  0         0  
572 0         0  
573             # If the above failed, check if it is US-ASCII, possibly extended by few
574 2 50       9 # NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
575 2 50       9 # some popular punctuation or special characters from Windows-1252 in
576             # the \x80-\x9F range (which is unassigned in ISO-8859-*).
577             # Note that Windows-1252 is a proper superset of ISO-8859-1.
578 0 0       0 #
579             if (!defined $rv && !$cnt_8bits) {
580             dbg("message: kept, guessed charset is US-ASCII, declared %s",
581             $charset_declared);
582             return $_[0]; # is all-ASCII, no need for decoding
583              
584             } elsif (!defined $rv && $enc_w1252 &&
585             # ASCII NBSP (c) SHY ' " ... '".- TM
586             #$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
587             # Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection,
588             # Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!)
589             $_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c)
590             { # ASCII + NBSP + SHY + some punctuation characters
591 2 50 33     11 # NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
    50 33        
      0        
592 0         0 # consider also: AE (r), 80 Euro
593             my $err = '';
594 0         0 eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
595             if ($@) {
596             $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
597             $err = " ($err)";
598             }
599             # the above can't fail, but keep code general just in case
600             dbg("message: %s as guessed charset %s, declared %s%s",
601             defined $rv ? 'decoded' : 'failed decoding',
602             'Windows-1252', $charset_declared, $err);
603             }
604              
605 0         0 # If we were unsuccessful so far, try some guesswork
606 0         0 # based on Encode::Detect::Detector .
  0         0  
607 0 0       0  
608 0         0 if (defined $rv) {
  0         0  
  0         0  
609 0         0 # done, no need for guesswork
610             } elsif (!$have_encode_detector) {
611             dbg("message: Encode::Detect::Detector not available, declared %s failed",
612 0 0       0 $charset_declared);
613             } else {
614             my $charset_detected = Encode::Detect::Detector::detect($_[0]);
615             if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
616             my $decoder = Encode::find_encoding($charset_detected);
617             if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
618             $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
619             dbg("message: no decoder for a detected charset %s, using GBK",
620 2 50       5 $charset_detected) if $decoder;
    0          
621             }
622             if (!$decoder) {
623 0         0 dbg("message: failed decoding, no decoder for a detected charset %s",
624             $charset_detected);
625             } else {
626 0         0 my $err = '';
627 0 0 0     0 eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
628 0         0 if ($@) {
629 0 0 0     0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
630 0         0 $err = " ($err)";
631 0 0       0 }
632             dbg("message: %s as detected charset %s, declared %s%s",
633             defined $rv ? 'decoded' : 'failed decoding',
634 0 0       0 $charset_detected, $charset_declared, $err);
635 0         0 }
636             }
637             }
638 0         0  
639 0         0 if (!defined $rv) { # all decoding attempts failed so far, probably garbage
  0         0  
640 0 0       0 # go for Windows-1252 which can't fail
641 0         0 my $err = '';
  0         0  
  0         0  
642 0         0 eval { $rv = $enc_w1252->decode($_[0]) };
643             if ($@) {
644 0 0       0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
645             $err = " ($err)";
646             }
647             dbg("message: %s as last-resort charset %s, declared %s%s",
648             defined $rv ? 'decoded' : 'failed decoding',
649             'Windows-1252', $charset_declared, $err);
650             }
651 2 50       5  
652             if (!defined $rv) { # just in case - all decoding attempts failed so far
653 0         0 return $_[0]; # garbage-in / garbage-out, return unchanged octets
654 0         0 }
  0         0  
655 0 0       0 # decoding octets to characters was successful
656 0         0 if (!$return_decoded) {
  0         0  
  0         0  
657 0         0 # utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
658             utf8::encode($rv); # encode Unicode characters to UTF-8 octets
659 0 0       0 }
660             $rv .= $data_taint; # carry taintedness over, avoid Encode bug
661             return $rv;
662             }
663              
664 2 50       3 =item rendered()
665 0         0  
666             render_text() takes the given text/* type MIME part, and attempts to
667             render it into a text scalar. It will always render text/html, and will
668 2 50       5 use a heuristic to determine if other text/* parts should be considered
669             text/html. Two scalars are returned: the rendered type (either text/html
670 2         4 or whatever the original type was), and the rendered text.
671              
672 2         3 =cut
673 2         14  
674             my ($self) = @_;
675              
676             if (!exists $self->{rendered}) {
677             # We only know how to render text/plain and text/html ...
678             # Note: for bug 4843, make sure to skip text/calendar parts
679             # we also want to skip things like text/x-vcard
680             # text/x-aol is ignored here, but looks like text/html ...
681             return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
682              
683             my $text = $self->decode; # QP and Base64 decoding, bytes
684             my $text_len = length($text); # num of bytes in original charset encoding
685              
686             # render text/html always
687 143     143 1 442 if ($text ne '' && $self->{'type'} =~ m{^text/html$}i)
688             {
689 143 100       515 $self->{rendered_type} = 'text/html';
690              
691             # will input text to HTML::Parser be provided as Unicode characters?
692             my $character_semantics = 0; # $text is in bytes
693             if ($self->{normalize} && $enc_utf8) { # charset decoding requested
694 127 50       1160 # Provide input to HTML::Parser as Unicode characters
695             # which avoids a HTML::Parser bug in utf8_mode
696 127         615 # https://rt.cpan.org/Public/Bug/Display.html?id=99755
697 127         522 # Note: the above bug was fixed in HTML-Parser 3.72, January 2016.
698             # Avoid unnecessary step of encoding-then-decoding by telling
699             # subroutine _normalize() to return Unicode text. See Bug 7133
700 127 100 100     1385 #
701             $character_semantics = 1; # $text will be in characters
702 8         37 $text = _normalize($text, $self->{charset}, 1); # bytes to chars
703             } elsif (!defined $self->{charset} ||
704             $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
705 8         23 # With some luck input can be interpreted as UTF-8, do not warn.
706 8 50 33     178 # It is still possible to hit the HTML::Parses utf8_mode bug however.
    100 100        
707             } else {
708             dbg("message: 'normalize_charset' is off, encoding will likely ".
709             "be misinterpreted; declared charset: %s", $self->{charset});
710             }
711             # the 0 requires decoded HTML results to be in bytes (not characters)
712             my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
713              
714 0         0 $html->parse($text); # parse+render text
715 0         0  
716             # resulting HTML-decoded text is in bytes, likely encoded as UTF-8
717             $self->{rendered} = $html->get_rendered_text();
718             $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
719             $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
720             $self->{html_results} = $html->get_results();
721              
722 1         13 # end-of-document result values that require looking at the text
723             my $r = $self->{html_results}; # temporary reference for brevity
724              
725 8         185 # count the number of spaces in the rendered text (likely UTF-8 octets)
726             my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//;
727 8         70 # we may want to add the count of other Unicode whitespace characters
728              
729             $r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8)
730 8         40 $r->{non_space_len} = $r->{html_length} - $space;
731 8         49 $r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len;
732 8         34 }
733 8         45  
734             else { # plain text
735             if ($self->{normalize} && $enc_utf8) {
736 8         19 # request transcoded result as UTF-8 octets!
737             $text = _normalize($text, $self->{charset}, 0);
738             }
739 8         42 $self->{rendered_type} = $self->{type};
740             $self->{rendered} = $self->{'visible_rendered'} = $text;
741             $self->{'invisible_rendered'} = '';
742 8         31 }
743 8         26 }
744 8 50       185  
745             return ($self->{rendered_type}, $self->{rendered});
746             }
747              
748 119 0 33     451 =item set_rendered($text, $type)
749              
750 0         0 Set the rendered text and type for the given part. If type is not
751             specified, and text is a defined value, a default of 'text/plain' is used.
752 119         426 This can be used, for instance, to render non-text parts using plugins.
753 119         532  
754 119         422 =cut
755              
756             my ($self, $text, $type) = @_;
757              
758 143         599 $type = 'text/plain' if (!defined $type && defined $text);
759              
760             $self->{'rendered_type'} = $type;
761             $self->{'rendered'} = $self->{'visible_rendered'} = $text;
762             $self->{'invisible_rendered'} = defined $text ? '' : undef;
763             }
764              
765             =item visible_rendered()
766              
767             Render and return the visible text in this part.
768              
769             =cut
770 0     0 1 0  
771             my ($self) = @_;
772 0 0 0     0 $self->rendered(); # ignore return, we want just this:
773             return ($self->{rendered_type}, $self->{visible_rendered});
774 0         0 }
775 0         0  
776 0 0       0 =item invisible_rendered()
777              
778             Render and return the invisible text in this part.
779              
780             =cut
781              
782             my ($self) = @_;
783             $self->rendered(); # ignore return, we want just this:
784             return ($self->{rendered_type}, $self->{invisible_rendered});
785             }
786 8     8 1 21  
787 8         55 =item content_summary()
788 8         38  
789             Returns an array of scalars describing the mime parts of the message.
790             Note: This function requires that the message be parsed first!
791              
792             =cut
793              
794             # return an array with scalars describing mime parts
795             my($self) = @_;
796              
797             my @ret = ( [ $self->{'type'} ] );
798 8     8 1 20 my @search;
799 8         26  
800 8         24 if (exists $self->{'body_parts'}) {
801             my $count = @{$self->{'body_parts'}};
802             for(my $i=0; $i<$count; $i++) {
803             push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]);
804             }
805             }
806              
807             while(my $part = shift @search) {
808             my($index, $part) = @{$part};
809             push(@{$ret[$index]}, $part->{'type'});
810             if (exists $part->{'body_parts'}) {
811             unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}});
812 13     13 1 211 }
813             }
814 13         48  
815 13         14 return map { join(",", @{$_}) } @ret;
816             }
817 13 100       27  
818 11         13 =item delete_header()
  11         15  
819 11         24  
820 24         64 Delete the specified header (decoded and raw) from the Node information.
821              
822             =cut
823              
824 13         27 my($self, $hdr) = @_;
825 45         48  
  45         68  
826 45         41 foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) {
  45         83  
827 45 100       125 delete $self->{'headers'}->{$_};
828 11         13 delete $self->{'raw_headers'}->{$_};
  21         48  
  11         18  
829             }
830            
831             my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}});
832 13         18 $self->{'header_order'} = \@neworder;
  37         49  
  37         178  
833             }
834              
835             # decode a header appropriately. don't bother adding it to the pod documents.
836             my ( $encoding, $cte, $data ) = @_;
837              
838             if ( $cte eq 'B' ) {
839             # base 64 encoded
840             $data = Mail::SpamAssassin::Util::base64_decode($data);
841             }
842 505     505 1 929 elsif ( $cte eq 'Q' ) {
843             # quoted printable
844 505         641  
  505         6907  
845 0         0 # the RFC states that in the encoded text, "_" is equal to "=20"
846 0         0 $data =~ s/_/=20/g;
847              
848             $data = Mail::SpamAssassin::Util::qp_decode($data);
849 505         1028 }
  505         5010  
850 505         1678 else {
851             # not possible since the input has already been limited to 'B' and 'Q'
852             die "message: unknown encoding type '$cte' in RFC2047 header";
853             }
854             return _normalize($data, $encoding, 0); # transcode to UTF-8 octets
855 10     10   43 }
856              
857 10 100       39 # Decode base64 and quoted-printable in headers according to RFC2047.
    50          
858             #
859 2         13 my($header_field_body, $header_field_name) = @_;
860              
861             return '' unless defined $header_field_body && $header_field_body ne '';
862              
863             # deal with folding and cream the newlines and such
864             $header_field_body =~ s/\n[ \t]+/\n /g;
865 8         16 $header_field_body =~ s/\015?\012//gs;
866              
867 8         18 if ($header_field_name =~
868             /^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) |
869             MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) {
870             # Bug 6945: some header fields must not be processed for MIME encoding
871 0         0 # Bug 7466: leave out the Content-*
872              
873 10         28 } else {
874             local($1,$2,$3);
875              
876             # Multiple encoded sections must ignore the interim whitespace.
877             # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
878             # separated by whitespace.
879 909     909   2543 1 while $header_field_body =~
880             s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+
881 909 100 66     4498 ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) }
882             {$1$2}xsg;
883              
884 859         1550 # transcode properly encoded RFC 2047 substrings into UTF-8 octets,
885 859         1097 # leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532)
886             # or plain US-ASCII
887 859 100       3009 $header_field_body =~
888             s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) }
889             { __decode_header($1, uc($2), $3) }xsge;
890             }
891              
892             # dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body);
893             return $header_field_body;
894 523         1777 }
895              
896             =item get_header()
897              
898             Retrieve a specific header. Will have a newline at the end and will be
899 523         1245 unfolded. The first parameter is the header name (case-insensitive),
900             and the second parameter (optional) is whether or not to return the
901             raw header.
902              
903             If get_header() is called in an array context, an array will be returned
904             with each header entry in a different element. In a scalar context,
905             the last specific header is returned.
906 523         1201  
907 10         39 ie: If 'Subject' is specified as the header, and there are 2 Subject
908             headers in a message, the last/bottom one in the message is returned in
909             scalar context or both are returned in array context.
910              
911 859         2691 Btw, returning the last header field (not the first) happens to be consistent
912             with DKIM signatures, which search for and cover multiple header fields
913             bottom-up according to the 'h' tag. Let's keep it this way.
914              
915             =cut
916              
917             my ($self, $hdr, $raw) = @_;
918             $raw ||= 0;
919              
920             # And now pick up all the entries into a list
921             # This is assumed to include a newline at the end ...
922             # This is also assumed to have removed continuation bits ...
923              
924             # Deal with the possibility that header() or raw_header() returns undef
925             my @hdrs;
926             if ( $raw ) {
927             if (@hdrs = $self->raw_header($hdr)) {
928             s/\015?\012\s+/ /gs for @hdrs;
929             }
930             }
931             else {
932             if (@hdrs = $self->header($hdr)) {
933             $_ .= "\n" for @hdrs;
934             }
935             }
936 4078     4078 1 6951  
937 4078   50     12097 if (wantarray) {
938             return @hdrs;
939             }
940             else {
941             return @hdrs ? $hdrs[-1] : undef;
942             }
943             }
944 4078         4404  
945 4078 50       6029 =item get_all_headers()
946 0 0       0  
947 0         0 Retrieve all headers. Each header will have a newline at the end and
948             will be unfolded. The first parameter (optional) is whether or not to
949             return the raw headers, and the second parameter (optional) is whether
950             or not to include the mbox separator.
951 4078 100       7423  
952 834         3018 If get_all_header() is called in an array context, an array will be
953             returned with each header entry in a different element. In a scalar
954             context, the headers are returned in a single scalar.
955              
956 4078 100       6112 =back
957 3528         8352  
958             =cut
959              
960 550 100       2265 # build it and it will not bomb
961             my ($self, $raw, $include_mbox) = @_;
962             $raw ||= 0;
963             $include_mbox ||= 0;
964              
965             my @lines;
966              
967             # precalculate destination positions based on order of appearance
968             my $i = 0;
969             my %locations;
970             for my $k (@{$self->{header_order}}) {
971             push(@{$locations{lc($k)}}, $i++);
972             }
973              
974             # process headers in order of first appearance
975             my $header;
976             my $size = 0;
977             HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] }
978             keys %locations)
979             {
980             # get all same-name headers and poke into correct position
981 45     45 1 135 my $positions = $locations{$name};
982 45   50     290 for my $contents ($self->get_header($name, $raw)) {
983 45   100     192 my $position = shift @{$positions};
984             $size += length($name) + length($contents) + 2;
985 45         62 if ($size > MAX_HEADER_LENGTH) {
986             $self->{'truncated_header'} = 1;
987             last HEADER;
988 45         70 }
989 45         65 $lines[$position] = $self->{header_order}->[$position].": ".$contents;
990 45         97 }
  45         227  
991 520         589 }
  520         1400  
992              
993             # skip undefined lines if we truncated
994             @lines = grep { defined $_ } @lines if $self->{'truncated_header'};
995 45         83  
996 45         80 splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} );
997 45         408  
  880         1525  
998             return wantarray ? @lines : join ('', @lines);
999             }
1000              
1001 396         600 # legacy public API; now a no-op.
1002 396         653  
1003 520         636 # ---------------------------------------------------------------------------
  520         700  
1004 520         1024  
1005 520 50       1022 1;