File Coverage

blib/lib/Mail/SpamAssassin/Message/Node.pm
Criterion Covered Total %
statement 263 420 62.6
branch 97 212 45.7
condition 40 85 47.0
subroutine 25 30 83.3
pod 16 18 88.8
total 441 765 57.6


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts
21              
22             =head1 DESCRIPTION
23              
24             This module will encapsulate an email message and allow access to
25             the various MIME message parts.
26              
27             =head1 PUBLIC METHODS
28              
29             =over 4
30              
31             =cut
32              
33             package Mail::SpamAssassin::Message::Node;
34              
35 40     40   286 use strict;
  40         90  
  40         1435  
36 40     40   241 use warnings;
  40         95  
  40         1337  
37 40     40   261 use re 'taint';
  40         125  
  40         1669  
38              
39             require 5.008001; # needs utf8::is_utf8()
40              
41 40     40   250 use Mail::SpamAssassin;
  40         95  
  40         1235  
42 40     40   285 use Mail::SpamAssassin::Constants qw(:sa);
  40         80  
  40         6421  
43 40     40   12762 use Mail::SpamAssassin::HTML;
  40         130  
  40         1544  
44 40     40   274 use Mail::SpamAssassin::Logger;
  40         81  
  40         5604  
45              
46             our($enc_utf8, $enc_w1252, $have_encode_detector);
47             BEGIN {
48 40         23905 eval { require Encode }
49 40 50   40   157 and do { $enc_utf8 = Encode::find_encoding('UTF-8');
  40         383246  
50 40         6379 $enc_w1252 = Encode::find_encoding('Windows-1252') };
51 40         213008 eval { require Encode::Detect::Detector }
52 40 50       15296 and do { $have_encode_detector = 1 };
  0         0  
53             };
54              
55             =item new()
56              
57             Generates an empty Node object and returns it. Typically only called
58             by functions in Message.
59              
60             =cut
61              
62             sub new {
63 193     193 1 536 my $class = shift;
64 193   33     908 $class = ref($class) || $class;
65              
66 193         1249 my $self = {
67             headers => {},
68             raw_headers => {},
69             header_order => []
70             };
71              
72             # deal with any parameters
73 193         544 my($opts) = @_;
74 193   50     1053 $self->{normalize} = $opts->{'normalize'} || 0;
75              
76 193         595 bless($self,$class);
77 193         630 $self;
78             }
79              
80             =item find_parts()
81              
82             Used to search the tree for specific MIME parts. An array of matching
83             Node objects (pointers into the tree) is returned. The parameters that
84             can be passed in are (in order, all scalars):
85              
86             Regexp - Used to match against each part's Content-Type header,
87             specifically the type and not the rest of the header. ie: "Content-type:
88             text/html; encoding=quoted-printable" has a type of "text/html". If no
89             regexp is specified, find_parts() will return an empty array.
90              
91             Only_leaves - By default, find_parts() will return any part that matches
92             the regexp, including multipart. If you only want to see leaves of the
93             tree (ie: parts that aren't multipart), set this to true (1).
94              
95             Recursive - By default, when find_parts() finds a multipart which has
96             parts underneath it, it will recurse through all sub-children. If set to 0,
97             only look at the part and any direct children of the part.
98              
99             =cut
100              
101             # Used to find any MIME parts whose simple content-type matches a given regexp
102             # Searches it's own and any children parts. Returns an array of MIME
103             # objects which match. Our callers may expect the default behavior which is a
104             # depth-first array of parts.
105             #
106             sub find_parts {
107 255     255 1 729 my ($self, $re, $onlyleaves, $recursive) = @_;
108              
109             # Didn't pass an RE? Just abort.
110 255 50 33     1544 return () unless defined $re && $re ne '';
111              
112 255 50       761 $onlyleaves = 0 unless defined $onlyleaves;
113              
114 255         432 my $depth;
115 255 50 33     733 if (defined $recursive && $recursive == 0) {
116 0         0 $depth = 1;
117             }
118            
119 255         426 my @ret;
120 255         577 my @search = ( $self );
121              
122 255         773 while (my $part = shift @search) {
123             # If this object matches, mark it for return.
124 287         1010 my $amialeaf = $part->is_leaf();
125              
126 287 100 66     3148 if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) {
      100        
127 269         685 push(@ret, $part);
128             }
129            
130 287 50 33     1209 if ( !$amialeaf && (!defined $depth || $depth > 0)) {
      66        
131 18 50       34 $depth-- if defined $depth;
132 18         29 unshift(@search, @{$part->{'body_parts'}});
  18         64  
133             }
134             }
135              
136 255         978 return @ret;
137             }
138              
139             =item header()
140              
141             Stores and retrieves headers from a specific MIME part. The first
142             parameter is the header name. If there is no other parameter, the header
143             is retrieved. If there is a second parameter, the header is stored.
144              
145             Header names are case-insensitive and are stored in both raw and
146             decoded form. Using header(), only the decoded form is retrievable.
147              
148             For retrieval, if header() is called in an array context, an array will
149             be returned with each header entry in a different element. In a scalar
150             context, the last specific header is returned.
151              
152             ie: If 'Subject' is specified as the header, and there are 2 Subject
153             headers in a message, the last/bottom one in the message is returned in
154             scalar context or both are returned in array context.
155              
156             =cut
157              
158             # Store or retrieve headers from a given MIME object
159             #
160             sub header {
161 5970     5970 1 8455 my $self = shift;
162 5970         9585 my $rawkey = shift;
163              
164 5970 50       13904 return unless defined $rawkey;
165              
166             # we're going to do things case insensitively
167 5970         12288 my $key = lc($rawkey);
168              
169             # Trim whitespace off of the header keys
170 5970         13844 $key =~ s/^\s+//;
171 5970         11090 $key =~ s/\s+$//;
172              
173 5970 100       11531 if (@_) {
174 898         1657 my $raw_value = shift;
175 898 50       2127 return unless defined $raw_value;
176              
177 898         1336 push @{ $self->{'header_order'} }, $rawkey;
  898         2422  
178 898 100       2601 if ( !exists $self->{'headers'}->{$key} ) {
179 824         2632 $self->{'headers'}->{$key} = [];
180 824         1975 $self->{'raw_headers'}->{$key} = [];
181             }
182              
183 898         1836 my $dec_value = $raw_value;
184 898         2646 $dec_value =~ s/\n[ \t]+/ /gs;
185 898         4431 $dec_value =~ s/\s+$//s;
186 898         2832 $dec_value =~ s/^\s+//s;
187 898         1392 push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key);
  898         2927  
188              
189 898         1593 push @{ $self->{'raw_headers'}->{$key} }, $raw_value;
  898         2461  
190              
191 898         4727 return $self->{'headers'}->{$key}->[-1];
192             }
193              
194 5072 100       8944 if (wantarray) {
195 4763 100       19029 return unless exists $self->{'headers'}->{$key};
196 1007         1522 return @{ $self->{'headers'}->{$key} };
  1007         6202  
197             }
198             else {
199 309 100       2500 return '' unless exists $self->{'headers'}->{$key};
200 46         363 return $self->{'headers'}->{$key}->[-1];
201             }
202             }
203              
204             =item raw_header()
205              
206             Retrieves the raw version of headers from a specific MIME part. The only
207             parameter is the header name. Header names are case-insensitive.
208              
209             For retrieval, if raw_header() is called in an array context, an array
210             will be returned with each header entry in a different element. In a
211             scalar context, the last specific header is returned.
212              
213             ie: If 'Subject' is specified as the header, and there are 2 Subject
214             headers in a message, the last/bottom one in the message is returned in
215             scalar context or both are returned in array context.
216              
217             =cut
218              
219             # Retrieve raw headers from a given MIME object
220             #
221             sub raw_header {
222 493     493 1 859 my $self = shift;
223 493         1029 my $key = lc(shift);
224              
225             # Trim whitespace off of the header keys
226 493         1327 $key =~ s/^\s+//;
227 493         1076 $key =~ s/\s+$//;
228              
229 493 50       985 if (wantarray) {
230 493 100       2042 return unless exists $self->{'raw_headers'}->{$key};
231 128         253 return @{ $self->{'raw_headers'}->{$key} };
  128         893  
232             }
233             else {
234 0 0       0 return '' unless exists $self->{'raw_headers'}->{$key};
235 0         0 return $self->{'raw_headers'}->{$key}->[-1];
236             }
237             }
238              
239             =item add_body_part()
240              
241             Adds a Node child object to the current node object.
242              
243             =cut
244              
245             # Add a MIME child part to ourselves
246             sub add_body_part {
247 48     48 1 100 my($self, $part) = @_;
248              
249 48         178 dbg("message: added part, type: ".$part->{'type'});
250 48         81 push @{ $self->{'body_parts'} }, $part;
  48         132  
251             }
252              
253             =item is_leaf()
254              
255             Returns true if the tree node in question is a leaf of the tree (ie:
256             has no children of its own). Note: This function may return odd results
257             unless the message has been mime parsed via _do_parse()!
258              
259             =cut
260              
261             sub is_leaf {
262 287     287 1 744 my($self) = @_;
263 287         903 return !exists $self->{'body_parts'};
264             }
265              
266             =item raw()
267              
268             Return a reference to the the raw array. Treat this as READ ONLY.
269              
270             =cut
271              
272             sub raw {
273 0     0 1 0 my $self = shift;
274              
275             # Ok, if we're called we are expected to return an array.
276             # so if it's a file reference, read in the message into an array...
277             #
278             # NOTE: that "ref undef" works, so don't bother checking for a defined var
279             # first.
280 0 0       0 if (ref $self->{'raw'} eq 'GLOB') {
281 0         0 my $fd = $self->{'raw'};
282 0 0       0 seek($fd, 0, 0) or die "message: cannot rewind file: $!";
283              
284             # dbg("message: (raw) reading mime part from a temporary file");
285 0         0 my($nread,$raw_str); $raw_str = '';
  0         0  
286 0         0 while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { }
287 0 0       0 defined $nread or die "error reading: $!";
288 0         0 my @array = split(/^/m, $raw_str, -1);
289              
290 0 0       0 dbg("message: empty message read") if $raw_str eq '';
291 0         0 return \@array;
292             }
293              
294 0         0 return $self->{'raw'};
295             }
296              
297             =item decode()
298              
299             If necessary, decode the part text as base64 or quoted-printable.
300             The decoded text will be returned as a scalar string. An optional length
301             parameter can be passed in which limits how much decoded data is returned.
302             If the scalar isn't needed, call with "0" as a parameter.
303              
304             =cut
305              
306             sub decode {
307 252     252 1 3385 my($self, $bytes) = @_;
308              
309 252 100       741 if ( !exists $self->{'decoded'} ) {
310             # Someone is looking for a decoded part where there is no raw data
311             # (multipart or subparsed message, etc.) Just return undef.
312 144 50       564 return if !exists $self->{'raw'};
313              
314 144         264 my $raw;
315              
316             # if the part is held in a temp file, read it into the scalar
317 144 100       624 if (ref $self->{'raw'} eq 'GLOB') {
318 4         9 my $fd = $self->{'raw'};
319 4 50       43 seek($fd, 0, 0) or die "message: cannot rewind file: $!";
320              
321             # dbg("message: (decode) reading mime part from a temporary file");
322 4         10 my($nread,$raw_str); $raw = '';
  4         9  
323 4         210 while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { }
324 4 50       18 defined $nread or die "error reading: $!";
325              
326 4 50       18 dbg("message: empty message read from a temp file") if $raw eq '';
327             }
328             else {
329             # create a new scalar from the raw array in memory
330 140         306 $raw = join('', @{$self->{'raw'}});
  140         1484  
331             }
332              
333 144   100     658 my $encoding = lc $self->header('content-transfer-encoding') || '';
334              
335 144 100       753 if ( $encoding eq 'quoted-printable' ) {
    100          
336 6         30 dbg("message: decoding quoted-printable");
337 6         40 $self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
338 6         39 $self->{'decoded'} =~ s/\015\012/\012/gs;
339             }
340             elsif ( $encoding eq 'base64' ) {
341 5         21 dbg("message: decoding base64");
342              
343             # if it's not defined or is 0, do the whole thing, otherwise only decode
344             # a portion
345 5 50       10 if ($bytes) {
346 0         0 return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
347             }
348             else {
349             # Generate the decoded output
350 5         20 $self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
351             }
352              
353 5 100       32 if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
354 1         35 $self->{'decoded'} =~ s/\015\012/\012/gs;
355             }
356             }
357             else {
358             # Encoding is one of 7bit, 8bit, binary or x-something
359 133 100       370 if ( $encoding ) {
360 31         176 dbg("message: decoding other encoding type ($encoding), ignoring");
361             }
362             else {
363 102         371 dbg("message: no encoding detected");
364             }
365 133         838 $self->{'decoded'} = $raw;
366             }
367             }
368              
369 252 100 66     1442 if ( !defined $bytes || $bytes ) {
370 248 50       747 if ( !defined $bytes ) {
371             # force a copy
372 248         2470 return '' . $self->{'decoded'};
373             }
374             else {
375 0         0 return substr($self->{'decoded'}, 0, $bytes);
376             }
377             }
378             }
379              
380             # Detect endianness of UTF-16 encoded data
381             sub detect_utf16 {
382 0     0 0 0 my $data = $_[0]; # could not avoid copying large strings
383 0         0 my $utf16le_clues = 0;
384 0         0 my $utf16be_clues = 0;
385 0         0 my $sum_h_e = 0;
386 0         0 my $sum_h_o = 0;
387 0         0 my $sum_l_e = 0;
388 0         0 my $sum_l_o = 0;
389 0         0 my $decoder = undef;
390              
391 0         0 my @msg_h = unpack 'H' x length( $data ), $data;
392 0         0 my @msg_l = unpack 'h' x length( $data ), $data;
393              
394 0         0 for( my $i = 0; $i < length( $data ); $i+=2 ) {
395 0         0 my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
396 0         0 $sum_h_e += hex $msg_h[$i];
397 0         0 $sum_h_o += hex $msg_h[$i+1];
398 0         0 $sum_l_e += hex $msg_l[$i];
399 0         0 $sum_l_o += hex $msg_l[$i+1];
400 0 0       0 if( $check_char =~ /20 00/ ) {
401             # UTF-16LE space char detected
402 0         0 $utf16le_clues++;
403             }
404 0 0       0 if( $check_char =~ /00 20/ ) {
405             # UTF-16BE space char detected
406 0         0 $utf16be_clues++;
407             }
408             }
409              
410             # If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
411 0 0       0 $utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
412              
413             # If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
414 0 0       0 $utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
415              
416 0 0       0 if( $utf16le_clues > $utf16be_clues ) {
    0          
417 0         0 dbg( "message: detect_utf16: UTF-16LE" );
418 0         0 $decoder = Encode::find_encoding("UTF-16LE");
419             } elsif( $utf16le_clues > $utf16be_clues ) {
420 0         0 dbg( "message: detect_utf16: UTF-16BE" );
421 0         0 $decoder = Encode::find_encoding("UTF-16BE");
422             } else {
423 0         0 dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
424             }
425              
426 0         0 return $decoder;
427             }
428              
429             # Look at a text scalar and determine whether it should be rendered
430             # as text/html.
431             #
432             # This is not a public function.
433             #
434             sub _html_render {
435 0 0   0   0 if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
436             {
437 0         0 my $pad = $1;
438 0         0 my $count = 0;
439 0         0 $count += ($pad =~ tr/\n//d) * 2;
440 0         0 $count += ($pad =~ tr/\n//cd);
441 0         0 return ($count < 24);
442             }
443 0         0 return 0;
444             }
445              
446             # Decode character set of a given text to perl characters (Unicode),
447             # then encode into UTF-8 octets if requested.
448             #
449             sub _normalize {
450             # my $data = $_[0]; # avoid copying large strings
451 10     10   23 my $charset_declared = $_[1];
452 10         20 my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets
453              
454 10 50       38 warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
455             if utf8::is_utf8($_[0]);
456              
457             # workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
458 10         29 my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
459              
460 10 50 33     128 if (!defined $charset_declared || $charset_declared eq '') {
461 0         0 $charset_declared = 'us-ascii';
462             }
463              
464             # number of characters with code above 127
465 10         31 my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
466              
467 10 50 33     95 if (!$cnt_8bits &&
468             $charset_declared =~
469             /^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
470             ISO646-US )\z/xsi)
471             { # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
472 0         0 dbg("message: kept, charset is US-ASCII as declared");
473 0         0 return $_[0]; # is all-ASCII, no need for decoding
474             }
475              
476 10 100 66     93 if (!$cnt_8bits &&
477             $charset_declared =~
478             /^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
479             UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} |
480             Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
481             { # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
482 8         26 dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared);
483 8         71 return $_[0]; # is all-ASCII, no need for decoding
484             }
485              
486             # Try first to strictly decode based on a declared character set.
487              
488 2         5 my $rv;
489 2 50 33     20 if ($charset_declared =~ /^UTF-?8\z/i) {
    50          
    50          
    50          
490             # attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC)
491 0 0       0 if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
  0         0  
  0         0  
492 0         0 dbg("message: decoded as declared charset UTF-8");
493 0 0       0 return $_[0] if !$return_decoded;
494 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
495 0         0 return $rv; # decoded
496             } else {
497 0         0 my $err = '';
498 0 0       0 if ($@) {
499 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
  0         0  
  0         0  
500 0         0 $err = " ($err)";
501             }
502 0         0 dbg("message: failed decoding as declared charset UTF-8 ($err)");
503             }
504              
505             } elsif ($charset_declared =~ /^UTF[ -]?16/i) {
506             # Handle cases where spammers use UTF-16 encoding without including a BOM
507             # or declaring endianness as reported at:
508             # https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
509              
510 0         0 my $decoder = detect_utf16( $_[0] );
511 0 0       0 if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) {
  0         0  
  0         0  
512 0         0 dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name);
513 0 0       0 return $_[0] if !$return_decoded;
514 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
515 0         0 return $rv; # decoded
516             } else {
517 0         0 my $err = '';
518 0 0       0 if ($@) {
519 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
  0         0  
  0         0  
520 0         0 $err = " ($err)";
521             }
522 0         0 dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err);
523             };
524              
525             } elsif ($cnt_8bits &&
526 0         0 eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
  0         0  
527 0         0 dbg("message: decoded as charset UTF-8, declared %s", $charset_declared);
528 0 0       0 return $_[0] if !$return_decoded;
529 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
530 0         0 return $rv; # decoded
531              
532             } elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) {
533             # declared as US-ASCII but contains 8-bit characters, makes no sense
534             # to attempt decoding first as strict US-ASCII as we know it would fail
535              
536             } else {
537             # try decoding as a declared character set
538              
539             # -> http://en.wikipedia.org/wiki/Windows-1252
540             # Windows-1252 character encoding is a superset of ISO 8859-1, but differs
541             # from the IANA's ISO-8859-1 by using displayable characters rather than
542             # control characters in the 80 to 9F (hex) range. [...]
543             # It is very common to mislabel Windows-1252 text with the charset label
544             # ISO-8859-1. A common result was that all the quotes and apostrophes
545             # (produced by "smart quotes" in word-processing software) were replaced
546             # with question marks or boxes on non-Windows operating systems, making
547             # text difficult to read. Most modern web browsers and e-mail clients
548             # treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
549             # such mislabeling. This is now standard behavior in the draft HTML 5
550             # specification, which requires that documents advertised as ISO-8859-1
551             # actually be parsed with the Windows-1252 encoding.
552             #
553 2         4 my($chset, $decoder);
554 2 50       15 if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
555 0         0 $chset = 'Windows-1252'; $decoder = $enc_w1252;
  0         0  
556             } else {
557 2         7 $chset = $charset_declared; $decoder = Encode::find_encoding($chset);
  2         14  
558 2 50 33     8317 if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
559 0         0 $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
560 0 0       0 dbg("message: no decoder for a declared charset %s, using GBK",
561             $chset) if $decoder;
562             }
563             }
564 2 50       7 if (!$decoder) {
565 0         0 dbg("message: failed decoding, no decoder for a declared charset %s",
566             $chset);
567             } else {
568 2         5 my $err = '';
569 2         3 eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  2         9  
570 2 50       232 if ($@) {
571 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
  0         0  
  0         0  
572 0         0 $err = " ($err)";
573             }
574 2 50       11 if (lc $chset eq lc $charset_declared) {
575 2 50       12 dbg("message: %s as declared charset %s%s",
576             defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err);
577             } else {
578 0 0       0 dbg("message: %s as charset %s, declared %s%s",
579             defined $rv ? 'decoded' : 'failed decoding',
580             $chset, $charset_declared, $err);
581             }
582             }
583             }
584              
585             # If the above failed, check if it is US-ASCII, possibly extended by few
586             # NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
587             # some popular punctuation or special characters from Windows-1252 in
588             # the \x80-\x9F range (which is unassigned in ISO-8859-*).
589             # Note that Windows-1252 is a proper superset of ISO-8859-1.
590             #
591 2 50 33     13 if (!defined $rv && !$cnt_8bits) {
    50 33        
      0        
592 0         0 dbg("message: kept, guessed charset is US-ASCII, declared %s",
593             $charset_declared);
594 0         0 return $_[0]; # is all-ASCII, no need for decoding
595              
596             } elsif (!defined $rv && $enc_w1252 &&
597             # ASCII NBSP (c) SHY ' " ... '".- TM
598             #$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
599             # Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection,
600             # Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!)
601             $_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c)
602             { # ASCII + NBSP + SHY + some punctuation characters
603             # NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
604             # consider also: AE (r), 80 Euro
605 0         0 my $err = '';
606 0         0 eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  0         0  
607 0 0       0 if ($@) {
608 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
  0         0  
  0         0  
609 0         0 $err = " ($err)";
610             }
611             # the above can't fail, but keep code general just in case
612 0 0       0 dbg("message: %s as guessed charset %s, declared %s%s",
613             defined $rv ? 'decoded' : 'failed decoding',
614             'Windows-1252', $charset_declared, $err);
615             }
616              
617             # If we were unsuccessful so far, try some guesswork
618             # based on Encode::Detect::Detector .
619              
620 2 50       6 if (defined $rv) {
    0          
621             # done, no need for guesswork
622             } elsif (!$have_encode_detector) {
623 0         0 dbg("message: Encode::Detect::Detector not available, declared %s failed",
624             $charset_declared);
625             } else {
626 0         0 my $charset_detected = Encode::Detect::Detector::detect($_[0]);
627 0 0 0     0 if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
628 0         0 my $decoder = Encode::find_encoding($charset_detected);
629 0 0 0     0 if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
630 0         0 $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
631 0 0       0 dbg("message: no decoder for a detected charset %s, using GBK",
632             $charset_detected) if $decoder;
633             }
634 0 0       0 if (!$decoder) {
635 0         0 dbg("message: failed decoding, no decoder for a detected charset %s",
636             $charset_detected);
637             } else {
638 0         0 my $err = '';
639 0         0 eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  0         0  
640 0 0       0 if ($@) {
641 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
  0         0  
  0         0  
642 0         0 $err = " ($err)";
643             }
644 0 0       0 dbg("message: %s as detected charset %s, declared %s%s",
645             defined $rv ? 'decoded' : 'failed decoding',
646             $charset_detected, $charset_declared, $err);
647             }
648             }
649             }
650              
651 2 50       7 if (!defined $rv) { # all decoding attempts failed so far, probably garbage
652             # go for Windows-1252 which can't fail
653 0         0 my $err = '';
654 0         0 eval { $rv = $enc_w1252->decode($_[0]) };
  0         0  
655 0 0       0 if ($@) {
656 0         0 $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
  0         0  
  0         0  
657 0         0 $err = " ($err)";
658             }
659 0 0       0 dbg("message: %s as last-resort charset %s, declared %s%s",
660             defined $rv ? 'decoded' : 'failed decoding',
661             'Windows-1252', $charset_declared, $err);
662             }
663              
664 2 50       5 if (!defined $rv) { # just in case - all decoding attempts failed so far
665 0         0 return $_[0]; # garbage-in / garbage-out, return unchanged octets
666             }
667             # decoding octets to characters was successful
668 2 50       6 if (!$return_decoded) {
669             # utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
670 2         6 utf8::encode($rv); # encode Unicode characters to UTF-8 octets
671             }
672 2         5 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
673 2         15 return $rv;
674             }
675              
676             =item rendered()
677              
678             render_text() takes the given text/* type MIME part, and attempts to
679             render it into a text scalar. It will always render text/html, and will
680             use a heuristic to determine if other text/* parts should be considered
681             text/html. Two scalars are returned: the rendered type (either text/html
682             or whatever the original type was), and the rendered text.
683              
684             =cut
685              
686             sub rendered {
687 141     141 1 387 my ($self) = @_;
688              
689 141 100       550 if (!exists $self->{rendered}) {
690             # We only know how to render text/plain and text/html ...
691             # Note: for bug 4843, make sure to skip text/calendar parts
692             # we also want to skip things like text/x-vcard
693             # text/x-aol is ignored here, but looks like text/html ...
694 125 50       1254 return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
695              
696 125         718 my $text = $self->decode; # QP and Base64 decoding, bytes
697 125         537 my $text_len = length($text); # num of bytes in original charset encoding
698              
699             # render text/html always
700 125 100 100     1479 if ($text ne '' && $self->{'type'} =~ m{^text/html$}i)
701             {
702 7         48 $self->{rendered_type} = 'text/html';
703              
704             # will input text to HTML::Parser be provided as Unicode characters?
705 7         19 my $character_semantics = 0; # $text is in bytes
706 7 50 33     113 if ($self->{normalize} && $enc_utf8) { # charset decoding requested
    100 100        
707             # Provide input to HTML::Parser as Unicode characters
708             # which avoids a HTML::Parser bug in utf8_mode
709             # https://rt.cpan.org/Public/Bug/Display.html?id=99755
710             # Note: the above bug was fixed in HTML-Parser 3.72, January 2016.
711             # Avoid unnecessary step of encoding-then-decoding by telling
712             # subroutine _normalize() to return Unicode text. See Bug 7133
713             #
714 0         0 $character_semantics = 1; # $text will be in characters
715 0         0 $text = _normalize($text, $self->{charset}, 1); # bytes to chars
716             } elsif (!defined $self->{charset} ||
717             $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
718             # With some luck input can be interpreted as UTF-8, do not warn.
719             # It is still possible to hit the HTML::Parses utf8_mode bug however.
720             } else {
721             dbg("message: 'normalize_charset' is off, encoding will likely ".
722 1         11 "be misinterpreted; declared charset: %s", $self->{charset});
723             }
724             # the 0 requires decoded HTML results to be in bytes (not characters)
725 7         135 my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
726              
727 7         63 $html->parse($text); # parse+render text
728              
729             # resulting HTML-decoded text is in bytes, likely encoded as UTF-8
730 7         38 $self->{rendered} = $html->get_rendered_text();
731 7         40 $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
732 7         31 $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
733 7         42 $self->{html_results} = $html->get_results();
734              
735             # end-of-document result values that require looking at the text
736 7         25 my $r = $self->{html_results}; # temporary reference for brevity
737              
738             # count the number of spaces in the rendered text (likely UTF-8 octets)
739 7         45 my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//;
740             # we may want to add the count of other Unicode whitespace characters
741              
742 7         27 $r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8)
743 7         31 $r->{non_space_len} = $r->{html_length} - $space;
744 7 50       200 $r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len;
745             }
746              
747             else { # plain text
748 118 0 33     529 if ($self->{normalize} && $enc_utf8) {
749             # request transcoded result as UTF-8 octets!
750 0         0 $text = _normalize($text, $self->{charset}, 0);
751             }
752 118         510 $self->{rendered_type} = $self->{type};
753 118         615 $self->{rendered} = $self->{'visible_rendered'} = $text;
754 118         543 $self->{'invisible_rendered'} = '';
755             }
756             }
757              
758 141         711 return ($self->{rendered_type}, $self->{rendered});
759             }
760              
761             =item set_rendered($text, $type)
762              
763             Set the rendered text and type for the given part. If type is not
764             specified, and text is a defined value, a default of 'text/plain' is used.
765             This can be used, for instance, to render non-text parts using plugins.
766              
767             =cut
768              
769             sub set_rendered {
770 0     0 1 0 my ($self, $text, $type) = @_;
771              
772 0 0 0     0 $type = 'text/plain' if (!defined $type && defined $text);
773              
774 0         0 $self->{'rendered_type'} = $type;
775 0         0 $self->{'rendered'} = $self->{'visible_rendered'} = $text;
776 0 0       0 $self->{'invisible_rendered'} = defined $text ? '' : undef;
777             }
778              
779             =item visible_rendered()
780              
781             Render and return the visible text in this part.
782              
783             =cut
784              
785             sub visible_rendered {
786 8     8 1 27 my ($self) = @_;
787 8         68 $self->rendered(); # ignore return, we want just this:
788 8         42 return ($self->{rendered_type}, $self->{visible_rendered});
789             }
790              
791             =item invisible_rendered()
792              
793             Render and return the invisible text in this part.
794              
795             =cut
796              
797             sub invisible_rendered {
798 8     8 1 25 my ($self) = @_;
799 8         31 $self->rendered(); # ignore return, we want just this:
800 8         29 return ($self->{rendered_type}, $self->{invisible_rendered});
801             }
802              
803             =item content_summary()
804              
805             Returns an array of scalars describing the mime parts of the message.
806             Note: This function requires that the message be parsed first!
807              
808             =cut
809              
810             # return an array with scalars describing mime parts
811             sub content_summary {
812 13     13 1 231 my($self) = @_;
813              
814 13         53 my @ret = ( [ $self->{'type'} ] );
815 13         21 my @search;
816              
817 13 100       32 if (exists $self->{'body_parts'}) {
818 11         15 my $count = @{$self->{'body_parts'}};
  11         21  
819 11         34 for(my $i=0; $i<$count; $i++) {
820 24         73 push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]);
821             }
822             }
823              
824 13         32 while(my $part = shift @search) {
825 45         53 my($index, $part) = @{$part};
  45         81  
826 45         63 push(@{$ret[$index]}, $part->{'type'});
  45         102  
827 45 100       147 if (exists $part->{'body_parts'}) {
828 11         16 unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}});
  21         57  
  11         19  
829             }
830             }
831              
832 13         27 return map { join(",", @{$_}) } @ret;
  37         45  
  37         183  
833             }
834              
835             =item delete_header()
836              
837             Delete the specified header (decoded and raw) from the Node information.
838              
839             =cut
840              
841             sub delete_header {
842 505     505 1 1125 my($self, $hdr) = @_;
843              
844 505         719 foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) {
  505         7514  
845 0         0 delete $self->{'headers'}->{$_};
846 0         0 delete $self->{'raw_headers'}->{$_};
847             }
848            
849 505         1217 my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}});
  505         5841  
850 505         1971 $self->{'header_order'} = \@neworder;
851             }
852              
853             # decode a header appropriately. don't bother adding it to the pod documents.
854             sub __decode_header {
855 10     10   50 my ( $encoding, $cte, $data ) = @_;
856              
857 10 100       51 if ( $cte eq 'B' ) {
    50          
858             # base 64 encoded
859 2         11 $data = Mail::SpamAssassin::Util::base64_decode($data);
860             }
861             elsif ( $cte eq 'Q' ) {
862             # quoted printable
863              
864             # the RFC states that in the encoded text, "_" is equal to "=20"
865 8         20 $data =~ s/_/=20/g;
866              
867 8         22 $data = Mail::SpamAssassin::Util::qp_decode($data);
868             }
869             else {
870             # not possible since the input has already been limited to 'B' and 'Q'
871 0         0 die "message: unknown encoding type '$cte' in RFC2047 header";
872             }
873 10         40 return _normalize($data, $encoding, 0); # transcode to UTF-8 octets
874             }
875              
876             # Decode base64 and quoted-printable in headers according to RFC2047.
877             #
878             sub _decode_header {
879 898     898   2843 my($header_field_body, $header_field_name) = @_;
880              
881 898 100 66     5232 return '' unless defined $header_field_body && $header_field_body ne '';
882              
883             # deal with folding and cream the newlines and such
884 848         1944 $header_field_body =~ s/\n[ \t]+/\n /g;
885 848         1381 $header_field_body =~ s/\015?\012//gs;
886              
887 848 100       3487 if ($header_field_name =~
888             /^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) |
889             MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) {
890             # Bug 6945: some header fields must not be processed for MIME encoding
891             # Bug 7466: leave out the Content-*
892              
893             } else {
894 515         1909 local($1,$2,$3);
895              
896             # Multiple encoded sections must ignore the interim whitespace.
897             # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
898             # separated by whitespace.
899 515         1438 1 while $header_field_body =~
900             s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+
901             ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) }
902             {$1$2}xsg;
903              
904             # transcode properly encoded RFC 2047 substrings into UTF-8 octets,
905             # leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532)
906 515         1432 # or plain US-ASCII
907 10         54 $header_field_body =~
908             s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) }
909             { __decode_header($1, uc($2), $3) }xsge;
910             }
911 848         3059  
912             # dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body);
913             return $header_field_body;
914             }
915              
916             =item get_header()
917              
918             Retrieve a specific header. Will have a newline at the end and will be
919             unfolded. The first parameter is the header name (case-insensitive),
920             and the second parameter (optional) is whether or not to return the
921             raw header.
922              
923             If get_header() is called in an array context, an array will be returned
924             with each header entry in a different element. In a scalar context,
925             the last specific header is returned.
926              
927             ie: If 'Subject' is specified as the header, and there are 2 Subject
928             headers in a message, the last/bottom one in the message is returned in
929             scalar context or both are returned in array context.
930              
931             Btw, returning the last header field (not the first) happens to be consistent
932             with DKIM signatures, which search for and cover multiple header fields
933             bottom-up according to the 'h' tag. Let's keep it this way.
934              
935             =cut
936 4405     4405 1 9261  
937 4405   50     16140 sub get_header {
938             my ($self, $hdr, $raw) = @_;
939             $raw ||= 0;
940              
941             # And now pick up all the entries into a list
942             # This is assumed to include a newline at the end ...
943             # This is also assumed to have removed continuation bits ...
944 4405         5674  
945 4405 50       7683 # Deal with the possibility that header() or raw_header() returns undef
946 0 0       0 my @hdrs;
947 0         0 if ( $raw ) {
948             if (@hdrs = $self->raw_header($hdr)) {
949             s/\015?\012\s+/ /gs for @hdrs;
950             }
951 4405 100       9102 }
952 844         3549 else {
953             if (@hdrs = $self->header($hdr)) {
954             $_ .= "\n" for @hdrs;
955             }
956 4405 100       8703 }
957 3856         10981  
958             if (wantarray) {
959             return @hdrs;
960 549 100       2819 }
961             else {
962             return @hdrs ? $hdrs[-1] : undef;
963             }
964             }
965              
966             =item get_all_headers()
967              
968             Retrieve all headers. Each header will have a newline at the end and
969             will be unfolded. The first parameter (optional) is whether or not to
970             return the raw headers, and the second parameter (optional) is whether
971             or not to include the mbox separator.
972              
973             If get_all_header() is called in an array context, an array will be
974             returned with each header entry in a different element. In a scalar
975             context, the headers are returned in a single scalar.
976              
977             =back
978              
979             =cut
980              
981 45     45 1 154 # build it and it will not bomb
982 45   50     280 sub get_all_headers {
983 45   100     191 my ($self, $raw, $include_mbox) = @_;
984             $raw ||= 0;
985 45         106 $include_mbox ||= 0;
986              
987             my @lines;
988 45         84  
989 45         78 # precalculate destination positions based on order of appearance
990 45         75 my $i = 0;
  45         181  
991 520         674 my %locations;
  520         1722  
992             for my $k (@{$self->{header_order}}) {
993             push(@{$locations{lc($k)}}, $i++);
994             }
995 45         102  
996 45         91 # process headers in order of first appearance
997 45         390 my $header;
  902         1476  
998             my $size = 0;
999             HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] }
1000             keys %locations)
1001 396         703 {
1002 396         782 # get all same-name headers and poke into correct position
1003 520         741 my $positions = $locations{$name};
  520         807  
1004 520         1248 for my $contents ($self->get_header($name, $raw)) {
1005 520 50       1241 my $position = shift @{$positions};
1006 0         0 $size += length($name) + length($contents) + 2;
1007 0         0 if ($size > MAX_HEADER_LENGTH) {
1008             $self->{'truncated_header'} = 1;
1009 520         2328 last HEADER;
1010             }
1011             $lines[$position] = $self->{header_order}->[$position].": ".$contents;
1012             }
1013             }
1014 45 50       193  
  0         0  
1015             # skip undefined lines if we truncated
1016 45 50 66     191 @lines = grep { defined $_ } @lines if $self->{'truncated_header'};
1017              
1018 45 100       731 splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} );
1019              
1020             return wantarray ? @lines : join ('', @lines);
1021             }
1022       0 0    
1023             # legacy public API; now a no-op.
1024             sub finish { }
1025              
1026             # ---------------------------------------------------------------------------
1027              
1028             1;
1029             __END__