File Coverage

blib/lib/Mail/SpamAssassin/Message/Node.pm
Criterion Covered Total %
statement 266 387 68.7
branch 97 200 48.5
condition 42 88 47.7
subroutine 27 31 87.1
pod 16 18 88.8
total 448 724 61.8


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts
21              
22             =head1 DESCRIPTION
23              
24             This module will encapsulate an email message and allow access to
25             the various MIME message parts.
26              
27             =head1 PUBLIC METHODS
28              
29             =over 4
30              
31             =cut
32              
33             package Mail::SpamAssassin::Message::Node;
34              
35 40     40   256 use strict;
  40         94  
  40         1385  
36 40     40   227 use warnings;
  40         109  
  40         1371  
37 40     40   224 use re 'taint';
  40         74  
  40         1645  
38              
39             require 5.008001; # needs utf8::is_utf8()
40              
41 40     40   227 use Mail::SpamAssassin;
  40         89  
  40         1148  
42 40     40   229 use Mail::SpamAssassin::Constants qw(:sa);
  40         103  
  40         5924  
43 40     40   12376 use Mail::SpamAssassin::HTML;
  40         162  
  40         1577  
44 40     40   276 use Mail::SpamAssassin::Logger;
  40         73  
  40         6102  
45              
46             our($enc_utf8, $enc_w1252, $have_encode_detector);
47             BEGIN {
48 40         1553 eval { require Encode }
49 40 50   40   156 and do { $enc_utf8 = Encode::find_encoding('UTF-8');
  40         31445  
50 40         9538 $enc_w1252 = Encode::find_encoding('Windows-1252') };
51 40         61914 eval { require Encode::Detect::Detector }
52 40 50       17540 and do { $have_encode_detector = 1 };
  0         0  
53             };
54              
55             =item new()
56              
57             Generates an empty Node object and returns it. Typically only called
58             by functions in Message.
59              
60             =cut
61              
62             sub new {
63 181     181 1 438 my $class = shift;
64 181   33     804 $class = ref($class) || $class;
65              
66 181         1457 my $self = {
67             headers => {},
68             raw_headers => {},
69             header_order => []
70             };
71              
72             # deal with any parameters
73 181         448 my($opts) = @_;
74 181   50     828 $self->{normalize} = $opts->{'normalize'} || 0;
75              
76 181         511 bless($self,$class);
77 181         569 $self;
78             }
79              
80             =item find_parts()
81              
82             Used to search the tree for specific MIME parts. An array of matching
83             Node objects (pointers into the tree) is returned. The parameters that
84             can be passed in are (in order, all scalars):
85              
86             Regexp - Used to match against each part's Content-Type header,
87             specifically the type and not the rest of the header. ie: "Content-type:
88             text/html; encoding=quoted-printable" has a type of "text/html". If no
89             regexp is specified, find_parts() will return an empty array.
90              
91             Only_leaves - By default, find_parts() will return any part that matches
92             the regexp, including multipart. If you only want to see leaves of the
93             tree (ie: parts that aren't multipart), set this to true (1).
94              
95             Recursive - By default, when find_parts() finds a multipart which has
96             parts underneath it, it will recurse through all sub-children. If set to 0,
97             only look at the part and any direct children of the part.
98              
99             =cut
100              
101             # Used to find any MIME parts whose simple content-type matches a given regexp
102             # Searches it's own and any children parts. Returns an array of MIME
103             # objects which match. Our callers may expect the default behavior which is a
104             # depth-first array of parts.
105             #
106             sub find_parts {
107 231     231 1 741 my ($self, $re, $onlyleaves, $recursive) = @_;
108              
109             # Didn't pass an RE? Just abort.
110 231 50 33     1402 return () unless defined $re && $re ne '';
111              
112 231 50       679 $onlyleaves = 0 unless defined $onlyleaves;
113              
114 231         412 my $depth;
115 231 50 33     717 if (defined $recursive && $recursive == 0) {
116 0         0 $depth = 1;
117             }
118            
119 231         380 my @ret;
120 231         495 my @search = ( $self );
121              
122 231         737 while (my $part = shift @search) {
123             # If this object matches, mark it for return.
124 263         925 my $amialeaf = $part->is_leaf();
125              
126 263 100 66     2847 if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) {
      100        
127 245         605 push(@ret, $part);
128             }
129            
130 263 50 33     1005 if ( !$amialeaf && (!defined $depth || $depth > 0)) {
      66        
131 18 50       105 $depth-- if defined $depth;
132 18         27 unshift(@search, @{$part->{'body_parts'}});
  18         48  
133             }
134             }
135              
136 231         848 return @ret;
137             }
138              
139             =item header()
140              
141             Stores and retrieves headers from a specific MIME part. The first
142             parameter is the header name. If there is no other parameter, the header
143             is retrieved. If there is a second parameter, the header is stored.
144              
145             Header names are case-insensitive and are stored in both raw and
146             decoded form. Using header(), only the decoded form is retrievable.
147              
148             For retrieval, if header() is called in an array context, an array will
149             be returned with each header entry in a different element. In a scalar
150             context, the last specific header is returned.
151              
152             ie: If 'Subject' is specified as the header, and there are 2 Subject
153             headers in a message, the last/bottom one in the message is returned in
154             scalar context or both are returned in array context.
155              
156             =cut
157              
158             # Store or retrieve headers from a given MIME object
159             #
160             sub header {
161 5494     5494 1 7094 my $self = shift;
162 5494         6955 my $rawkey = shift;
163              
164 5494 50       9074 return unless defined $rawkey;
165              
166             # we're going to do things case insensitively
167 5494         8236 my $key = lc($rawkey);
168              
169             # Trim whitespace off of the header keys
170 5494         11073 $key =~ s/^\s+//;
171 5494         9003 $key =~ s/\s+$//;
172              
173 5494 100       9428 if (@_) {
174 800         1069 my $raw_value = shift;
175 800 50       1278 return unless defined $raw_value;
176              
177 800         1003 push @{ $self->{'header_order'} }, $rawkey;
  800         1591  
178 800 100       1821 if ( !exists $self->{'headers'}->{$key} ) {
179 726         2150 $self->{'headers'}->{$key} = [];
180 726         1429 $self->{'raw_headers'}->{$key} = [];
181             }
182              
183 800         1125 my $dec_value = $raw_value;
184 800         1774 $dec_value =~ s/\n[ \t]+/ /gs;
185 800         3214 $dec_value =~ s/\s+$//s;
186 800         2059 $dec_value =~ s/^\s+//s;
187 800         1072 push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key);
  800         2340  
188              
189 800         1088 push @{ $self->{'raw_headers'}->{$key} }, $raw_value;
  800         1580  
190              
191 800         2175 return $self->{'headers'}->{$key}->[-1];
192             }
193              
194 4694 100       7500 if (wantarray) {
195 4409 100       13714 return unless exists $self->{'headers'}->{$key};
196 797         1082 return @{ $self->{'headers'}->{$key} };
  797         3359  
197             }
198             else {
199 285 100       2431 return '' unless exists $self->{'headers'}->{$key};
200 34         217 return $self->{'headers'}->{$key}->[-1];
201             }
202             }
203              
204             =item raw_header()
205              
206             Retrieves the raw version of headers from a specific MIME part. The only
207             parameter is the header name. Header names are case-insensitive.
208              
209             For retrieval, if raw_header() is called in an array context, an array
210             will be returned with each header entry in a different element. In a
211             scalar context, the last specific header is returned.
212              
213             ie: If 'Subject' is specified as the header, and there are 2 Subject
214             headers in a message, the last/bottom one in the message is returned in
215             scalar context or both are returned in array context.
216              
217             =cut
218              
219             # Retrieve raw headers from a given MIME object
220             #
221             sub raw_header {
222 475     475 1 787 my $self = shift;
223 475         831 my $key = lc(shift);
224              
225             # Trim whitespace off of the header keys
226 475         996 $key =~ s/^\s+//;
227 475         958 $key =~ s/\s+$//;
228              
229 475 50       1008 if (wantarray) {
230 475 100       1716 return unless exists $self->{'raw_headers'}->{$key};
231 122         245 return @{ $self->{'raw_headers'}->{$key} };
  122         595  
232             }
233             else {
234 0 0       0 return '' unless exists $self->{'raw_headers'}->{$key};
235 0         0 return $self->{'raw_headers'}->{$key}->[-1];
236             }
237             }
238              
239             =item add_body_part()
240              
241             Adds a Node child object to the current node object.
242              
243             =cut
244              
245             # Add a MIME child part to ourselves
246             sub add_body_part {
247 48     48 1 69 my($self, $part) = @_;
248              
249 48         125 dbg("message: added part, type: ".$part->{'type'});
250 48         50 push @{ $self->{'body_parts'} }, $part;
  48         96  
251             }
252              
253             =item is_leaf()
254              
255             Returns true if the tree node in question is a leaf of the tree (ie:
256             has no children of its own). Note: This function may return odd results
257             unless the message has been mime parsed via _do_parse()!
258              
259             =cut
260              
261             sub is_leaf {
262 263     263 1 568 my($self) = @_;
263 263         777 return !exists $self->{'body_parts'};
264             }
265              
266             =item raw()
267              
268             Return a reference to the the raw array. Treat this as READ ONLY.
269              
270             =cut
271              
272             sub raw {
273 0     0 1 0 my $self = shift;
274              
275             # Ok, if we're called we are expected to return an array.
276             # so if it's a file reference, read in the message into an array...
277             #
278             # NOTE: that "ref undef" works, so don't bother checking for a defined var
279             # first.
280 0 0       0 if (ref $self->{'raw'} eq 'GLOB') {
281 0         0 my $fd = $self->{'raw'};
282 0 0       0 seek($fd, 0, 0) or die "message: cannot rewind file: $!";
283              
284             # dbg("message: (raw) reading mime part from a temporary file");
285 0         0 my($nread,$raw_str); $raw_str = '';
  0         0  
286 0         0 while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { }
287 0 0       0 defined $nread or die "error reading: $!";
288 0         0 my @array = split(/^/m, $raw_str, -1);
289              
290 0 0       0 dbg("message: empty message read") if $raw_str eq '';
291 0         0 return \@array;
292             }
293              
294 0         0 return $self->{'raw'};
295             }
296              
297             =item decode()
298              
299             If necessary, decode the part text as base64 or quoted-printable.
300             The decoded text will be returned as a scalar string. An optional length
301             parameter can be passed in which limits how much decoded data is returned.
302             If the scalar isn't needed, call with "0" as a parameter.
303              
304             =cut
305              
306             sub decode {
307 228     228 1 2932 my($self, $bytes) = @_;
308              
309 228 100       760 if ( !exists $self->{'decoded'} ) {
310             # Someone is looking for a decoded part where there is no raw data
311             # (multipart or subparsed message, etc.) Just return undef.
312 132 50       478 return if !exists $self->{'raw'};
313              
314 132         262 my $raw;
315              
316             # if the part is held in a temp file, read it into the scalar
317 132 100       527 if (ref $self->{'raw'} eq 'GLOB') {
318 4         9 my $fd = $self->{'raw'};
319 4 50       44 seek($fd, 0, 0) or die "message: cannot rewind file: $!";
320              
321             # dbg("message: (decode) reading mime part from a temporary file");
322 4         10 my($nread,$raw_str); $raw = '';
  4         8  
323 4         197 while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { }
324 4 50       14 defined $nread or die "error reading: $!";
325              
326 4 50       11 dbg("message: empty message read from a temp file") if $raw eq '';
327             }
328             else {
329             # create a new scalar from the raw array in memory
330 128         282 $raw = join('', @{$self->{'raw'}});
  128         931  
331             }
332              
333 132   100     448 my $encoding = lc $self->header('content-transfer-encoding') || '';
334              
335 132 100       668 if ( $encoding eq 'quoted-printable' ) {
    100          
336 6         29 dbg("message: decoding quoted-printable");
337 6         42 $self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
338 6         36 $self->{'decoded'} =~ s/\015\012/\012/gs;
339             }
340             elsif ( $encoding eq 'base64' ) {
341 5         19 dbg("message: decoding base64");
342              
343             # if it's not defined or is 0, do the whole thing, otherwise only decode
344             # a portion
345 5 50       13 if ($bytes) {
346 0         0 return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
347             }
348             else {
349             # Generate the decoded output
350 5         20 $self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
351             }
352              
353 5 100       32 if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) {
354 1         23 $self->{'decoded'} =~ s/\015\012/\012/gs;
355             }
356             }
357             else {
358             # Encoding is one of 7bit, 8bit, binary or x-something
359 121 100       328 if ( $encoding ) {
360 19         101 dbg("message: decoding other encoding type ($encoding), ignoring");
361             }
362             else {
363 102         365 dbg("message: no encoding detected");
364             }
365 121         619 $self->{'decoded'} = $raw;
366             }
367             }
368              
369 228 100 66     1491 if ( !defined $bytes || $bytes ) {
370 224 50       577 if ( !defined $bytes ) {
371             # force a copy
372 224         2207 return '' . $self->{'decoded'};
373             }
374             else {
375 0         0 return substr($self->{'decoded'}, 0, $bytes);
376             }
377             }
378             }
379              
380             # Detect endianness of UTF-16 encoded data
381             sub detect_utf16 {
382 40     40   22692 use Data::Dumper;
  40         201504  
  40         132897  
383 0     0 0 0 my $data = $_[0]; # could not avoid copying large strings
384 0         0 my $utf16le_clues = 0;
385 0         0 my $utf16be_clues = 0;
386 0         0 my $sum_h_e = 0;
387 0         0 my $sum_h_o = 0;
388 0         0 my $sum_l_e = 0;
389 0         0 my $sum_l_o = 0;
390 0         0 my $decoder = undef;
391              
392 0         0 my @msg_h = unpack 'H' x length( $data ), $data;
393 0         0 my @msg_l = unpack 'h' x length( $data ), $data;
394              
395 0         0 for( my $i = 0; $i < length( $data ); $i+=2 ) {
396 0         0 my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
397 0         0 $sum_h_e += hex $msg_h[$i];
398 0         0 $sum_h_o += hex $msg_h[$i+1];
399 0         0 $sum_l_e += hex $msg_l[$i];
400 0         0 $sum_l_o += hex $msg_l[$i+1];
401 0 0       0 if( $check_char =~ /20 00/ ) {
402             # UTF-16LE space char detected
403 0         0 $utf16le_clues++;
404             }
405 0 0       0 if( $check_char =~ /00 20/ ) {
406             # UTF-16BE space char detected
407 0         0 $utf16be_clues++;
408             }
409             }
410              
411             # If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
412 0 0       0 $utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
413              
414             # If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
415 0 0       0 $utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
416              
417 0 0       0 if( $utf16le_clues > $utf16be_clues ) {
    0          
418 0         0 dbg( "message: detect_utf16: UTF-16LE" );
419 0         0 $decoder = Encode::find_encoding("UTF-16LE");
420             } elsif( $utf16le_clues > $utf16be_clues ) {
421 0         0 dbg( "message: detect_utf16: UTF-16BE" );
422 0         0 $decoder = Encode::find_encoding("UTF-16BE");
423             } else {
424 0         0 dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
425             }
426              
427 0         0 return $decoder;
428             }
429              
430             # Look at a text scalar and determine whether it should be rendered
431             # as text/html.
432             #
433             # This is not a public function.
434             #
435             sub _html_render {
436 105 50   105   783 if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is)
437             {
438 0         0 my $pad = $1;
439 0         0 my $count = 0;
440 0         0 $count += ($pad =~ tr/\n//d) * 2;
441 0         0 $count += ($pad =~ tr/\n//cd);
442 0         0 return ($count < 24);
443             }
444 105         558 return 0;
445             }
446              
447             # Decode character set of a given text to perl characters (Unicode),
448             # then encode into UTF-8 octets if requested.
449             #
450             sub _normalize {
451             # my $data = $_[0]; # avoid copying large strings
452 10     10   20 my $charset_declared = $_[1];
453 10         12 my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets
454              
455 10 50       35 warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
456             if utf8::is_utf8($_[0]);
457              
458             # workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
459 10         29 my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
460              
461 10 50 33     54 if (!defined $charset_declared || $charset_declared eq '') {
462 0         0 $charset_declared = 'us-ascii';
463             }
464              
465             # number of characters with code above 127
466 10         25 my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
467              
468 10 50 33     74 if (!$cnt_8bits &&
469             $charset_declared =~
470             /^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) |
471             ISO646-US )\z/xsi)
472             { # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
473 0         0 dbg("message: kept, charset is US-ASCII as declared");
474 0         0 return $_[0]; # is all-ASCII, no need for decoding
475             }
476              
477 10 100 66     103 if (!$cnt_8bits &&
478             $charset_declared =~
479             /^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} |
480             UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} |
481             Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
482             { # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
483 8         24 dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared);
484 8         36 return $_[0]; # is all-ASCII, no need for decoding
485             }
486              
487             # Try first to strictly decode based on a declared character set.
488              
489 2         5 my $rv;
490 2 50 33     29 if ($charset_declared =~ /^UTF-?8\z/i) {
    50          
    50          
    50          
491             # attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC)
492 0 0       0 if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
  0         0  
  0         0  
493 0         0 dbg("message: decoded as declared charset UTF-8");
494 0 0       0 return $_[0] if !$return_decoded;
495 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
496 0         0 return $rv; # decoded
497             } else {
498 0         0 dbg("message: failed decoding as declared charset UTF-8");
499             };
500              
501             } elsif ($charset_declared =~ /^UTF[ -]?16/i) {
502             # Handle cases where spammers use UTF-16 encoding without including a BOM
503             # or declaring endianness as reported at:
504             # https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
505              
506 0         0 my $decoder = detect_utf16( $_[0] );
507 0 0       0 if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) {
  0         0  
  0         0  
508 0         0 dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name);
509 0 0       0 return $_[0] if !$return_decoded;
510 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
511 0         0 return $rv; # decoded
512             } else {
513 0         0 dbg("message: failed decoding as declared charset $charset_declared");
514             };
515              
516             } elsif ($cnt_8bits &&
517 0         0 eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
  0         0  
518 0         0 dbg("message: decoded as charset UTF-8, declared %s", $charset_declared);
519 0 0       0 return $_[0] if !$return_decoded;
520 0         0 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
521 0         0 return $rv; # decoded
522              
523             } elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) {
524             # declared as US-ASCII but contains 8-bit characters, makes no sense
525             # to attempt decoding first as strict US-ASCII as we know it would fail
526              
527             } else {
528             # try decoding as a declared character set
529              
530             # -> http://en.wikipedia.org/wiki/Windows-1252
531             # Windows-1252 character encoding is a superset of ISO 8859-1, but differs
532             # from the IANA's ISO-8859-1 by using displayable characters rather than
533             # control characters in the 80 to 9F (hex) range. [...]
534             # It is very common to mislabel Windows-1252 text with the charset label
535             # ISO-8859-1. A common result was that all the quotes and apostrophes
536             # (produced by "smart quotes" in word-processing software) were replaced
537             # with question marks or boxes on non-Windows operating systems, making
538             # text difficult to read. Most modern web browsers and e-mail clients
539             # treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
540             # such mislabeling. This is now standard behavior in the draft HTML 5
541             # specification, which requires that documents advertised as ISO-8859-1
542             # actually be parsed with the Windows-1252 encoding.
543             #
544 2         7 my($chset, $decoder);
545 2 50       12 if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) {
546 0         0 $chset = 'Windows-1252'; $decoder = $enc_w1252;
  0         0  
547             } else {
548 2         5 $chset = $charset_declared; $decoder = Encode::find_encoding($chset);
  2         31  
549 2 50 33     17108 if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
550 0         0 $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
551 0 0       0 dbg("message: no decoder for a declared charset %s, using GBK",
552             $chset) if $decoder;
553             }
554             }
555 2 50       9 if (!$decoder) {
556 0         0 dbg("message: failed decoding, no decoder for a declared charset %s",
557             $chset);
558             } else {
559 2         3 eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  2         14  
560 2 50       254 if (lc $chset eq lc $charset_declared) {
561 2 50       14 dbg("message: %s as declared charset %s",
562             defined $rv ? 'decoded' : 'failed decoding', $charset_declared);
563             } else {
564 0 0       0 dbg("message: %s as charset %s, declared %s",
565             defined $rv ? 'decoded' : 'failed decoding',
566             $chset, $charset_declared);
567             }
568             }
569             }
570              
571             # If the above failed, check if it is US-ASCII, possibly extended by few
572             # NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
573             # some popular punctuation or special characters from Windows-1252 in
574             # the \x80-\x9F range (which is unassigned in ISO-8859-*).
575             # Note that Windows-1252 is a proper superset of ISO-8859-1.
576             #
577 2 50 33     18 if (!defined $rv && !$cnt_8bits) {
    50 33        
      0        
578 0         0 dbg("message: kept, guessed charset is US-ASCII, declared %s",
579             $charset_declared);
580 0         0 return $_[0]; # is all-ASCII, no need for decoding
581              
582             } elsif (!defined $rv && $enc_w1252 &&
583             # ASCII NBSP (c) SHY ' " ... '".- TM
584             $_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
585             { # ASCII + NBSP + SHY + some punctuation characters
586             # NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
587             # consider also: AE (r), 80 Euro
588 0         0 eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  0         0  
589             # the above can't fail, but keep code general just in case
590 0 0       0 dbg("message: %s as guessed charset %s, declared %s",
591             defined $rv ? 'decoded' : 'failed decoding',
592             'Windows-1252', $charset_declared);
593             }
594              
595             # If we were unsuccessful so far, try some guesswork
596             # based on Encode::Detect::Detector .
597              
598 2 50       8 if (defined $rv) {
    0          
599             # done, no need for guesswork
600             } elsif (!$have_encode_detector) {
601 0         0 dbg("message: Encode::Detect::Detector not available, declared %s failed",
602             $charset_declared);
603             } else {
604 0         0 my $charset_detected = Encode::Detect::Detector::detect($_[0]);
605 0 0 0     0 if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
606 0         0 my $decoder = Encode::find_encoding($charset_detected);
607 0 0 0     0 if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
608 0         0 $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
609 0 0       0 dbg("message: no decoder for a detected charset %s, using GBK",
610             $charset_detected) if $decoder;
611             }
612 0 0       0 if (!$decoder) {
613 0         0 dbg("message: failed decoding, no decoder for a detected charset %s",
614             $charset_detected);
615             } else {
616 0         0 eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
  0         0  
617 0 0       0 dbg("message: %s as detected charset %s, declared %s",
618             defined $rv ? 'decoded' : 'failed decoding',
619             $charset_detected, $charset_declared);
620             }
621             }
622             }
623              
624 2 50       8 if (!defined $rv) { # all decoding attempts failed so far, probably garbage
625             # go for Windows-1252 which can't fail
626 0         0 eval { $rv = $enc_w1252->decode($_[0]) };
  0         0  
627 0 0       0 dbg("message: %s as last-resort charset %s, declared %s",
628             defined $rv ? 'decoded' : 'failed decoding',
629             'Windows-1252', $charset_declared);
630             }
631              
632 2 50       7 if (!defined $rv) { # just in case - all decoding attempts failed so far
633 0         0 return $_[0]; # garbage-in / garbage-out, return unchanged octets
634             }
635             # decoding octets to characters was successful
636 2 50       7 if (!$return_decoded) {
637             # utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
638 2         7 utf8::encode($rv); # encode Unicode characters to UTF-8 octets
639             }
640 2         6 $rv .= $data_taint; # carry taintedness over, avoid Encode bug
641 2         17 return $rv;
642             }
643              
644             =item rendered()
645              
646             render_text() takes the given text/* type MIME part, and attempts to
647             render it into a text scalar. It will always render text/html, and will
648             use a heuristic to determine if other text/* parts should be considered
649             text/html. Two scalars are returned: the rendered type (either text/html
650             or whatever the original type was), and the rendered text.
651              
652             =cut
653              
654             sub rendered {
655 129     129 1 398 my ($self) = @_;
656              
657 129 100       545 if (!exists $self->{rendered}) {
658             # We only know how to render text/plain and text/html ...
659             # Note: for bug 4843, make sure to skip text/calendar parts
660             # we also want to skip things like text/x-vcard
661             # text/x-aol is ignored here, but looks like text/html ...
662 113 50       1065 return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
663              
664 113         649 my $text = $self->decode; # QP and Base64 decoding, bytes
665 113         398 my $text_len = length($text); # num of bytes in original charset encoding
666              
667             # render text/html always, or any other text|text/plain part as text/html
668             # based on a heuristic which simulates a certain common mail client
669 113 100 66     2466 if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i ||
      100        
670             ($self->{'type'} =~ m{^text/plain$}i &&
671             _html_render(substr($text, 0, 23)))))
672             {
673 7         62 $self->{rendered_type} = 'text/html';
674              
675             # will input text to HTML::Parser be provided as Unicode characters?
676 7         38 my $character_semantics = 0; # $text is in bytes
677 7 50 33     90 if ($self->{normalize} && $enc_utf8) { # charset decoding requested
    100 100        
678             # Provide input to HTML::Parser as Unicode characters
679             # which avoids a HTML::Parser bug in utf8_mode
680             # https://rt.cpan.org/Public/Bug/Display.html?id=99755
681             # Note: the above bug was fixed in HTML-Parser 3.72, January 2016.
682             # Avoid unnecessary step of encoding-then-decoding by telling
683             # subroutine _normalize() to return Unicode text. See Bug 7133
684             #
685 0         0 $character_semantics = 1; # $text will be in characters
686 0         0 $text = _normalize($text, $self->{charset}, 1); # bytes to chars
687             } elsif (!defined $self->{charset} ||
688             $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
689             # With some luck input can be interpreted as UTF-8, do not warn.
690             # It is still possible to hit the HTML::Parses utf8_mode bug however.
691             } else {
692             dbg("message: 'normalize_charset' is off, encoding will likely ".
693 1         6 "be misinterpreted; declared charset: %s", $self->{charset});
694             }
695             # the 0 requires decoded HTML results to be in bytes (not characters)
696 7         166 my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
697              
698 7         74 $html->parse($text); # parse+render text
699              
700             # resulting HTML-decoded text is in bytes, likely encoded as UTF-8
701 7         52 $self->{rendered} = $html->get_rendered_text();
702 7         31 $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
703 7         38 $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
704 7         37 $self->{html_results} = $html->get_results();
705              
706             # end-of-document result values that require looking at the text
707 7         19 my $r = $self->{html_results}; # temporary reference for brevity
708              
709             # count the number of spaces in the rendered text (likely UTF-8 octets)
710 7         40 my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//;
711             # we may want to add the count of other Unicode whitespace characters
712              
713 7         27 $r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8)
714 7         24 $r->{non_space_len} = $r->{html_length} - $space;
715 7 50       174 $r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len;
716             }
717              
718             else { # plain text
719 106 0 33     412 if ($self->{normalize} && $enc_utf8) {
720             # request transcoded result as UTF-8 octets!
721 0         0 $text = _normalize($text, $self->{charset}, 0);
722             }
723 106         462 $self->{rendered_type} = $self->{type};
724 106         493 $self->{rendered} = $self->{'visible_rendered'} = $text;
725 106         434 $self->{'invisible_rendered'} = '';
726             }
727             }
728              
729 129         665 return ($self->{rendered_type}, $self->{rendered});
730             }
731              
732             =item set_rendered($text, $type)
733              
734             Set the rendered text and type for the given part. If type is not
735             specified, and text is a defined value, a default of 'text/plain' is used.
736             This can be used, for instance, to render non-text parts using plugins.
737              
738             =cut
739              
740             sub set_rendered {
741 0     0 1 0 my ($self, $text, $type) = @_;
742              
743 0 0 0     0 $type = 'text/plain' if (!defined $type && defined $text);
744              
745 0         0 $self->{'rendered_type'} = $type;
746 0         0 $self->{'rendered'} = $self->{'visible_rendered'} = $text;
747 0 0       0 $self->{'invisible_rendered'} = defined $text ? '' : undef;
748             }
749              
750             =item visible_rendered()
751              
752             Render and return the visible text in this part.
753              
754             =cut
755              
756             sub visible_rendered {
757 8     8 1 30 my ($self) = @_;
758 8         78 $self->rendered(); # ignore return, we want just this:
759 8         39 return ($self->{rendered_type}, $self->{visible_rendered});
760             }
761              
762             =item invisible_rendered()
763              
764             Render and return the invisible text in this part.
765              
766             =cut
767              
768             sub invisible_rendered {
769 8     8 1 23 my ($self) = @_;
770 8         44 $self->rendered(); # ignore return, we want just this:
771 8         23 return ($self->{rendered_type}, $self->{invisible_rendered});
772             }
773              
774             =item content_summary()
775              
776             Returns an array of scalars describing the mime parts of the message.
777             Note: This function requires that the message be parsed first!
778              
779             =cut
780              
781             # return an array with scalars describing mime parts
782             sub content_summary {
783 13     13 1 203 my($self) = @_;
784              
785 13         49 my @ret = ( [ $self->{'type'} ] );
786 13         17 my @search;
787              
788 13 100       32 if (exists $self->{'body_parts'}) {
789 11         15 my $count = @{$self->{'body_parts'}};
  11         22  
790 11         28 for(my $i=0; $i<$count; $i++) {
791 24         62 push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]);
792             }
793             }
794              
795 13         33 while(my $part = shift @search) {
796 45         40 my($index, $part) = @{$part};
  45         57  
797 45         41 push(@{$ret[$index]}, $part->{'type'});
  45         70  
798 45 100       94 if (exists $part->{'body_parts'}) {
799 11         13 unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}});
  21         44  
  11         20  
800             }
801             }
802              
803 13         22 return map { join(",", @{$_}) } @ret;
  37         39  
  37         104  
804             }
805              
806             =item delete_header()
807              
808             Delete the specified header (decoded and raw) from the Node information.
809              
810             =cut
811              
812             sub delete_header {
813 505     505 1 1057 my($self, $hdr) = @_;
814              
815 505         692 foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) {
  505         7528  
816 0         0 delete $self->{'headers'}->{$_};
817 0         0 delete $self->{'raw_headers'}->{$_};
818             }
819            
820 505         1167 my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}});
  505         5474  
821 505         1783 $self->{'header_order'} = \@neworder;
822             }
823              
824             # decode a header appropriately. don't bother adding it to the pod documents.
825             sub __decode_header {
826 10     10   30 my ( $encoding, $cte, $data ) = @_;
827              
828 10 100       27 if ( $cte eq 'B' ) {
    50          
829             # base 64 encoded
830 2         14 $data = Mail::SpamAssassin::Util::base64_decode($data);
831             }
832             elsif ( $cte eq 'Q' ) {
833             # quoted printable
834              
835             # the RFC states that in the encoded text, "_" is equal to "=20"
836 8         16 $data =~ s/_/=20/g;
837              
838 8         17 $data = Mail::SpamAssassin::Util::qp_decode($data);
839             }
840             else {
841             # not possible since the input has already been limited to 'B' and 'Q'
842 0         0 die "message: unknown encoding type '$cte' in RFC2047 header";
843             }
844 10         53 return _normalize($data, $encoding, 0); # transcode to UTF-8 octets
845             }
846              
847             # Decode base64 and quoted-printable in headers according to RFC2047.
848             #
849             sub _decode_header {
850 800     800   1667 my($header_field_body, $header_field_name) = @_;
851              
852 800 100 66     3151 return '' unless defined $header_field_body && $header_field_body ne '';
853              
854             # deal with folding and cream the newlines and such
855 750         1322 $header_field_body =~ s/\n[ \t]+/\n /g;
856 750         1044 $header_field_body =~ s/\015?\012//gs;
857              
858 750 100       3543 if ($header_field_name =~
859             /^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) |
860             MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) {
861             # Bug 6945: some header fields must not be processed for MIME encoding
862             # Bug 7466: leave out the Content-*
863              
864             } else {
865 453         1420 local($1,$2,$3);
866              
867             # Multiple encoded sections must ignore the interim whitespace.
868             # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
869             # separated by whitespace.
870 453         1063 1 while $header_field_body =~
871             s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+
872             ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) }
873             {$1$2}xsg;
874              
875             # transcode properly encoded RFC 2047 substrings into UTF-8 octets,
876             # leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532)
877 453         1055 # or plain US-ASCII
878 10         38 $header_field_body =~
879             s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) }
880             { __decode_header($1, uc($2), $3) }xsge;
881             }
882 750         1596  
883             # dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body);
884             return $header_field_body;
885             }
886              
887             =item get_header()
888              
889             Retrieve a specific header. Will have a newline at the end and will be
890             unfolded. The first parameter is the header name (case-insensitive),
891             and the second parameter (optional) is whether or not to return the
892             raw header.
893              
894             If get_header() is called in an array context, an array will be returned
895             with each header entry in a different element. In a scalar context,
896             the last specific header is returned.
897              
898             ie: If 'Subject' is specified as the header, and there are 2 Subject
899             headers in a message, the last/bottom one in the message is returned in
900             scalar context or both are returned in array context.
901              
902             Btw, returning the last header field (not the first) happens to be consistent
903             with DKIM signatures, which search for and cover multiple header fields
904             bottom-up according to the 'h' tag. Let's keep it this way.
905              
906             =cut
907 4075     4075 1 7161  
908 4075   50     13154 sub get_header {
909             my ($self, $hdr, $raw) = @_;
910             $raw ||= 0;
911              
912             # And now pick up all the entries into a list
913             # This is assumed to include a newline at the end ...
914             # This is also assumed to have removed continuation bits ...
915 4075         4715  
916 4075 50       6280 # Deal with the possibility that header() or raw_header() returns undef
917 0 0       0 my @hdrs;
918 0         0 if ( $raw ) {
919             if (@hdrs = $self->raw_header($hdr)) {
920             s/\015?\012\s+/ /gs for @hdrs;
921             }
922 4075 100       7554 }
923 658         2285 else {
924             if (@hdrs = $self->header($hdr)) {
925             $_ .= "\n" for @hdrs;
926             }
927 4075 100       6151 }
928 3586         8295  
929             if (wantarray) {
930             return @hdrs;
931 489 100       1838 }
932             else {
933             return @hdrs ? $hdrs[-1] : undef;
934             }
935             }
936              
937             =item get_all_headers()
938              
939             Retrieve all headers. Each header will have a newline at the end and
940             will be unfolded. The first parameter (optional) is whether or not to
941             return the raw headers, and the second parameter (optional) is whether
942             or not to include the mbox separator.
943              
944             If get_all_header() is called in an array context, an array will be
945             returned with each header entry in a different element. In a scalar
946             context, the headers are returned in a single scalar.
947              
948             =back
949              
950             =cut
951              
952 26     26 1 66 # build it and it will not bomb
953 26   50     155 sub get_all_headers {
954 26   100     85 my ($self, $raw, $include_mbox) = @_;
955             $raw ||= 0;
956 26         41 $include_mbox ||= 0;
957              
958             my @lines;
959 26         41  
960 26         37 # precalculate destination positions based on order of appearance
961 26         39 my $i = 0;
  26         108  
962 332         365 my %locations;
  332         804  
963             for my $k (@{$self->{header_order}}) {
964             push(@{$locations{lc($k)}}, $i++);
965             }
966 26         42  
967 26         45 # process headers in order of first appearance
968 26         215 my $header;
  572         784  
969             my $size = 0;
970             HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] }
971             keys %locations)
972 250         339 {
973 250         428 # get all same-name headers and poke into correct position
974 332         375 my $positions = $locations{$name};
  332         409  
975 332         451 for my $contents ($self->get_header($name, $raw)) {
976 332 50       552 my $position = shift @{$positions};
977 0         0 $size += length($name) + length($contents) + 2;
978 0         0 if ($size > MAX_HEADER_LENGTH) {
979             $self->{'truncated_header'} = 1;
980 332         878 last HEADER;
981             }
982             $lines[$position] = $self->{header_order}->[$position].":".$contents;
983             }
984             }
985 26 50       103  
  0         0  
986             # skip undefined lines if we truncated
987 26 50 66     141 @lines = grep { defined $_ } @lines if $self->{'truncated_header'};
988              
989 26 100       327 splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} );
990              
991             return wantarray ? @lines : join ('', @lines);
992             }
993       0 0    
994             # legacy public API; now a no-op.
995             sub finish { }
996              
997             # ---------------------------------------------------------------------------
998              
999             1;
1000             __END__