File Coverage

blib/lib/Mail/SpamAssassin/Message.pm
Criterion Covered Total %
statement 427 499 85.5
branch 181 256 70.7
condition 74 115 64.3
subroutine 33 36 91.6
pod 16 24 66.6
total 731 930 78.6


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Message - decode, render, and hold an RFC-2822 message
21              
22             =head1 DESCRIPTION
23              
24             This module encapsulates an email message and allows access to the various MIME
25             message parts and message metadata.
26              
27             The message structure, after initiating a parse() cycle, looks like this:
28              
29             Message object, also top-level node in Message::Node tree
30             |
31             +---> Message::Node for other parts in MIME structure
32             | |---> [ more Message::Node parts ... ]
33             | [ others ... ]
34             |
35             +---> Message::Metadata object to hold metadata
36              
37             =head1 PUBLIC METHODS
38              
39             =over 4
40              
41             =cut
42              
43             package Mail::SpamAssassin::Message;
44              
45 40     40   295 use strict;
  40         94  
  40         5215  
46 40     40   220 use warnings;
  40         90  
  40         1320  
47 40     40   217 use re 'taint';
  40         74  
  40         10993  
48              
49             BEGIN {
50 40         23062 eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
  40         126759  
  40         1309  
51 40 50   40   143 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
  0         0  
  0         0  
52             }
53              
54 40     40   301 use Mail::SpamAssassin;
  40         90  
  40         955  
55 40     40   13945 use Mail::SpamAssassin::Message::Node;
  40         131  
  40         1572  
56 40     40   14449 use Mail::SpamAssassin::Message::Metadata;
  40         563  
  40         4594  
57 40     40   377 use Mail::SpamAssassin::Constants qw(:sa);
  40         133  
  40         9088  
58 40     40   390 use Mail::SpamAssassin::Logger;
  40         176  
  40         248989  
59              
60             our @ISA = qw(Mail::SpamAssassin::Message::Node);
61              
62             # ---------------------------------------------------------------------------
63              
64             =item new()
65              
66             Creates a Mail::SpamAssassin::Message object. Takes a hash reference
67             as a parameter. The used hash key/value pairs are as follows:
68              
69             C<message> is either undef (which will use STDIN), a scalar - a string
70             containing an entire message, a reference to such string, an array reference
71             of the message with one line per array element, or either a file glob
72             or an IO::File object which holds the entire contents of the message.
73              
74             Note: The message is expected to generally be in RFC 2822 format, optionally
75             including an mbox message separator line (the "From " line) as the first line.
76              
77             C<parse_now> specifies whether or not to create the MIME tree
78             at object-creation time or later as necessary.
79              
80             The I<parse_now> option, by default, is set to false (0).
81             This allows SpamAssassin to not have to generate the tree of
82             Mail::SpamAssassin::Message::Node objects and their related data if the
83             tree is not going to be used. This is handy, for instance, when running
84             C<spamassassin -d>, which only needs the pristine header and body which
85             is always handled when the object is created.
86              
87             C<subparse> specifies how many MIME recursion levels should be parsed.
88             Defaults to 20.
89              
90             =cut
91              
92             # month mappings (ripped from Util.pm)
93             my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
94             jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);
95              
96             # day of week mapping (starting from zero)
97             my @DAY_OF_WEEK = qw/Sun Mon Tue Wed Thu Fri Sat/ ;
98              
99             sub new {
100 149     149 1 473 my $class = shift;
101 149   33     1043 $class = ref($class) || $class;
102              
103 149         422 my($opts) = @_;
104 149 50       719 my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
105 149   100     694 my $parsenow = $opts->{'parsenow'} || 0;
106 149   50     1004 my $normalize = $opts->{'normalize'} || 0;
107              
108             # Specifies whether or not to parse message/rfc822 parts into its own tree.
109             # If the # > 0, it'll subparse, otherwise it won't. By default, do twenty
110             # levels deep.
111 149 100       539 my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;
112              
113 149         1668 my $self = $class->SUPER::new({normalize=>$normalize});
114              
115 149         659 $self->{tmpfiles} = [];
116 149         590 $self->{pristine_msg} = '';
117 149         567 $self->{pristine_headers} = '';
118 149         685 $self->{pristine_body} = '';
119 149         420 $self->{mime_boundary_state} = {};
120 149         553 $self->{line_ending} = "\012";
121 149         456 $self->{master_deadline} = $opts->{'master_deadline'};
122 149         342 $self->{suppl_attrib} = $opts->{'suppl_attrib'};
123 149   100     624 $self->{body_part_scan_size} = $opts->{'body_part_scan_size'} || 0;
124 149   100     563 $self->{rawbody_part_scan_size} = $opts->{'rawbody_part_scan_size'} || 0;
125              
126 149 100       509 if ($self->{suppl_attrib}) { # caller-provided additional information
127             # pristine_body_length is currently used by an eval test check_body_length
128             # Possible To-Do: Base the length on the @message array later down?
129 49 50       173 if (defined $self->{suppl_attrib}{body_size}) {
130             # Optional info provided by a caller; should reflect the original
131             # message body size if provided, and as such it may differ from the
132             # $self->{pristine_body} size, e.g. when the caller passed a truncated
133             # message to SpamAssassin, or when counting line-endings differently.
134 0         0 $self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
135             }
136 49 50       195 if (ref $self->{suppl_attrib}{mimepart_digests}) {
137             # Optional info provided by a caller: an array of digest codes (e.g. SHA1)
138             # of each MIME part. Should reflect the original message if provided.
139             # As such it may differ from digests calculated by get_mimepart_digests(),
140             # e.g. when the caller passed a truncated message to SpamAssassin.
141 0         0 $self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
142             }
143             }
144              
145 149         366 bless($self,$class);
146              
147             # create the metadata holder class
148 149         1545 $self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);
149              
150             # Ok, go ahead and do the message "parsing"
151              
152             # protect it from abuse ...
153 149         371 local $_;
154              
155             # Figure out how the message was passed to us, and deal with it.
156 149         282 my @message;
157 149 100 66     1297 if (ref $message eq 'ARRAY') {
    100          
    50          
    50          
    50          
158 63         150 @message = @{$message};
  63         365  
159             }
160             elsif (ref($message) eq 'GLOB' || ref($message) =~ /^IO::/) {
161 18 50       66 if (defined fileno $message) {
162              
163             # sysread+split avoids a Perl I/O bug (Bug 5985)
164             # and is faster than (<$message>) by 10..25 %
165             # (a drawback is a short-term double storage of a text in $raw_str)
166             #
167 18         36 my($nread,$raw_str); $raw_str = '';
  18         31  
168 18         784 while ( $nread=sysread($message, $raw_str, 16384, length $raw_str) ) { }
169 18 50       74 defined $nread or die "error reading: $!";
170 18         1203 @message = split(/^/m, $raw_str, -1);
171              
172 18 50       248 if ($raw_str eq '') {
    50          
173 0         0 dbg("message: empty message read");
174             } elsif (length($raw_str) > 128*1024) {
175             # ditch rarely used large chunks of allocated memory, Bug 6514
176             # http://www.perlmonks.org/?node_id=803515
177             # about 97% of mail messages are below 128 kB,
178             # about 98% of mail messages are below 256 kB (2010 statistics)
179             # dbg("message: deallocating %.2f MB", length($raw_str)/1024/1024);
180 0         0 undef $raw_str;
181             }
182             }
183             }
184             elsif (ref $message eq 'SCALAR') {
185 0         0 @message = split(/^/m, $$message, -1);
186             }
187             elsif (ref $message) {
188 0         0 dbg("message: Input is a reference of unknown type!");
189             }
190             elsif (defined $message) {
191 68         1828 @message = split(/^/m, $message, -1);
192             }
193              
194             # Deal with null message
195 149 50       549 if (!@message) {
196             # bug 4884:
197             # if we get here, it means that the input was null, so fake the message
198             # content as a single newline...
199 0         0 @message = ("\n");
200             }
201              
202             # Bug 7648:
203             # Make sure the message is tainted. When linting, @testmsg is not, so this
204             # handles that. Perhaps 3rd party tools could call this with untainted
205             # messages? Tainting the message is important because it prevents certain
206             # exploits later.
207 149 100 66     815 if (Mail::SpamAssassin::Util::am_running_in_taint_mode() &&
208 4859         10259 grep { !Scalar::Util::tainted($_) } @message) {
209 119         220 local($_);
210             # To preserve newlines, no joining and splitting here, process each line
211             # directly as is.
212 119         331 foreach (@message) {
213 1070         2163 $_ = Mail::SpamAssassin::Util::taint_var($_);
214             }
215 119 50       303 if (grep { !Scalar::Util::tainted($_) } @message) {
  1070         2534  
216 0         0 die "Mail::SpamAssassin::Message failed to enforce message taintness";
217             }
218             }
219              
220             # Pull off mbox and mbx separators
221 149 100       1472 if ($message[0] =~ /^From\s+(?!:)/) {
    50          
222             # careful not to confuse with obsolete syntax which allowed WSP before ':'
223             # mbox formatted mailbox
224 17         115 $self->{'mbox_sep'} = shift @message;
225             } elsif ($message[0] =~ MBX_SEPARATOR) {
226 0         0 $_ = shift @message;
227              
228             # Munge the mbx message separator into mbox format as a sort of
229             # de facto portability standard in SA's internals. We need to
230             # to this so that Mail::SpamAssassin::Util::parse_rfc822_date
231             # can parse the date string...
232 0 0       0 if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
233             # $1 = day of month
234             # $2 = month (text)
235             # $3 = year
236             # $4 = hour
237             # $5 = min
238             # $6 = sec
239 0         0 my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
240 0         0 my $address;
241 0         0 foreach (@message) {
242 0 0       0 if (/^From:[^<]*<([^>]+)>/) {
    0          
243 0         0 $address = $1;
244 0         0 last;
245             } elsif (/^From:\s*([^<> ]+)/) {
246 0         0 $address = $1;
247 0         0 last;
248             }
249             }
250 0         0 $self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
251             }
252             }
253              
254             # bug 4363
255             # Check to see if we should do CRLF instead of just LF
256             # For now, just check the first and last line and do whatever it does
257 149 50 33     1662 if (@message && ($message[0] =~ /\015\012/ || $message[-1] =~ /\015\012/)) {
      33        
258 0         0 $self->{line_ending} = "\015\012";
259 0         0 dbg("message: line ending changed to CRLF");
260             }
261              
262             # Is a CRLF -> LF line endings conversion necessary?
263 149         596 my $squash_crlf = $self->{line_ending} eq "\015\012";
264              
265             # Go through all the header fields of the message
266 149         309 my $hdr_errors = 0;
267 149         330 my $header;
268 149         277 for (;;) {
269             # make sure not to lose the last header field when there is no body
270 1166         2055 my $eof = !@message;
271 1166 100       2971 my $current = $eof ? "\n" : shift @message;
272              
273 1166 100       3817 if ( $current =~ /^[ \t]/ ) {
274             # This wasn't useful in terms of a rule, but we may want to treat it
275             # specially at some point. Perhaps ignore it?
276             #unless ($current =~ /\S/) {
277             # $self->{'obsolete_folding_whitespace'} = 1;
278             #}
279              
280 200 50       575 $header = '' if !defined $header; # header starts with a continuation!?
281 200         439 $header .= $current; # append continuations, no matter what
282 200         597 $self->{'pristine_headers'} .= $current;
283             }
284             else { # not a continuation
285             # Ok, there's a header here, let's go ahead and add it in.
286 966 100       2518 if (defined $header) { # deal with a previous header field
287 817         3854 my ($key, $value) = split (/:/s, $header, 2);
288              
289             # If it's not a valid header (aka: not in the form "foo:bar"), skip it.
290 817 100       2318 if (defined $value) {
291             # CRLF -> LF line-endings conversion if necessary
292 811 50       1762 $value =~ s/\015\012/\012/sg if $squash_crlf;
293 811         1702 $key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn
294             # limit the length of the pairs we store
295 811 50       2078 if (length($key) > MAX_HEADER_KEY_LENGTH) {
296 0         0 $key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
297 0         0 $self->{'truncated_header'} = 1;
298             }
299 811 50       1960 if (length($value) > MAX_HEADER_VALUE_LENGTH) {
300 0         0 $value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
301 0         0 $self->{'truncated_header'} = 1;
302             }
303 811         2749 $self->header($key, $value);
304             }
305             }
306              
307 966 100       4308 if ($current =~ /^\r?$/) { # a regular end of a header section
    100          
308 144 100       488 if ($eof) {
309 1         4 $self->{'missing_head_body_separator'} = 1;
310             } else {
311 143         484 $self->{'pristine_headers'} .= $current;
312             }
313 144         449 last;
314             }
315             elsif ($current =~ /^--/) { # mime boundary encountered, bail out
316 4         22 $self->{'missing_head_body_separator'} = 1;
317 4         15 unshift(@message, $current);
318 4         14 last;
319             }
320             # should we assume entering a body on encountering invalid header field?
321             else {
322             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
323 818 100       3606 if ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
324             # A field name MUST be composed of printable US-ASCII characters
325             # (i.e., characters that have values between 33 (041) and 126 (176),
326             # inclusive), except colon (072). Obsolete header field syntax
327             # allowed WSP before a colon.
328 7 100       19 if (++$hdr_errors <= 3) {
329             # just consume but ignore a few invalid header fields
330             } else { # enough is enough...
331 1         5 $self->{'missing_head_body_separator'} = 1;
332 1         4 unshift(@message, $current);
333 1         3 last;
334             }
335             }
336             }
337              
338             # start collecting a new header field
339 817         1796 $header = $current;
340 817         3145 $self->{'pristine_headers'} .= $current;
341             }
342             }
343 149         400 undef $header;
344              
345             # Store the pristine body for later -- store as a copy since @message
346             # will get modified below
347 149         1826 $self->{'pristine_body'} = join('', @message);
348              
349 149 50       654 if (!defined $self->{pristine_body_length}) {
350 149         902 $self->{'pristine_body_length'} = length $self->{'pristine_body'};
351             }
352              
353             # Store complete message, get_pristine() is used a lot, avoid making copies
354 149         1267 $self->{'pristine_msg'} = $self->{'pristine_headers'} . $self->{'pristine_body'};
355              
356             # iterate over lines in reverse order
357             # merge multiple blank lines into a single one
358 149         366 my $start;
359 149         760 for (my $cnt=$#message; $cnt>=0; $cnt--) {
360             # CRLF -> LF line-endings conversion if necessary
361 3682 50       5724 $message[$cnt] =~ s/\015\012\z/\012/ if $squash_crlf;
362              
363             # line is blank
364 3682 100       7970 if ($message[$cnt] =~ /^\s*$/) {
365             # /^\s*$/ is about 5% faster then !/\S/, but still expensive here
366 414 100       934 if (!defined $start) {
367 346         473 $start=$cnt;
368             }
369 414 100       1160 next unless $cnt == 0;
370             }
371              
372             # line is not blank, or we've reached the beginning
373              
374             # if we've got a series of blank lines, get rid of them
375 3303 100       8220 if (defined $start) {
376 346         521 my $max_blank_lines = 20;
377 346         568 my $num = $start-$cnt;
378 346 50       657 if ($num > $max_blank_lines) {
379 0         0 splice @message, $cnt+2, $num-$max_blank_lines;
380             }
381 346         751 undef $start;
382             }
383             }
384              
385             # Figure out the boundary
386 149         320 my ($boundary);
387 149         697 ($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
388 149         992 dbg("message: main message type: ".$self->{'type'});
389              
390             # dbg("message: \$message[0]: \"" . $message[0] . "\"");
391              
392             # bug 6845: if main message type is multipart and the message body does not begin with
393             # either a blank line or the boundary (if defined), insert a blank line
394             # to ensure proper parsing - do not consider MIME headers at the beginning of the body
395             # to be part of the message headers.
396 149 100 66     939 if ($self->{'type'} =~ /^multipart\//i && $#message > 0 && $message[0] =~ /\S/)
      100        
397             {
398 14 100 66     332 if (!defined $boundary || $message[0] !~ /^--\Q$boundary\E/)
399             {
400 4         23 dbg("message: Inserting blank line at top of body to ensure correct multipart MIME parsing");
401 4         11 unshift(@message, "\012");
402             }
403             }
404              
405             # dbg("message: \$message[0]: \"" . $message[0] . "\"");
406             # dbg("message: \$message[1]: \"" . $message[1] . "\"");
407              
408             # parse queue, simple array of parts to parse:
409             # 0: part object, already in the tree
410             # 1: boundary used to focus body parsing
411             # 2: message content
412             # 3: how many MIME subparts to parse down
413             #
414 149         761 $self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];
415              
416             # If the message does need to get parsed, save off a copy of the body
417             # in a format we can easily parse later so we don't have to rip from
418             # pristine_body ... If we do want to parse now, go ahead and do so ...
419             #
420 149 100       549 if ($parsenow) {
421 101         566 $self->parse_body();
422             }
423              
424 149         1179 $self;
425             }
426              
427             # ---------------------------------------------------------------------------
428              
429             =item find_parts()
430              
431             Used to search the tree for specific MIME parts. See
432             I<Mail::SpamAssassin::Message::Node> for more details.
433              
434             =cut
435              
436             # Used to find any MIME parts whose simple content-type matches a given regexp
437             # Searches it's own and any children parts. Returns an array of MIME
438             # objects which match.
439             #
440             sub find_parts {
441 255     255 1 3323 my $self = shift;
442              
443             # ok, we need to do the parsing now...
444 255 100       903 $self->parse_body() if (exists $self->{'parse_queue'});
445              
446             # and pass through to the Message::Node version of the method
447 255         1317 return $self->SUPER::find_parts(@_);
448             }
449              
450             # ---------------------------------------------------------------------------
451              
452             =item get_pristine_header()
453              
454             Returns pristine headers of the message. If no specific header name
455             is given as a parameter (case-insensitive), then all headers will be
456             returned as a scalar, including the blank line at the end of the headers.
457              
458             If called in an array context, an array will be returned with each
459             specific header in a different element. In a scalar context, the last
460             specific header is returned.
461              
462             ie: If 'Subject' is specified as the header, and there are 2 Subject
463             headers in a message, the last/bottom one in the message is returned in
464             scalar context or both are returned in array context.
465              
466             Btw, returning the last header field (not the first) happens to be consistent
467             with DKIM signatures, which search for and cover multiple header fields
468             bottom-up according to the 'h' tag. Let's keep it this way.
469              
470             Note: the returned header will include the ending newline and any embedded
471             whitespace folding.
472              
473             =cut
474              
475             sub get_pristine_header {
476 76     76 1 205 my ($self, $hdr) = @_;
477            
478 76 100 66     555 return $self->{pristine_headers} if !defined $hdr || $hdr eq '';
479             my(@ret) =
480 44         1064 $self->{pristine_headers} =~ /^\Q$hdr\E[ \t]*:[ \t]*(.*?\n(?![ \t]))/smgi;
481             # taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)
482 44 100       181 if (!@ret) {
    50          
483 34         188 return $self->get_header($hdr);
484             } elsif (wantarray) {
485 0         0 return @ret;
486             } else {
487 10         60 return $ret[-1];
488             }
489             }
490              
491             =item get_mbox_separator()
492              
493             Returns the mbox separator found in the message, or undef if there
494             wasn't one.
495              
496             =cut
497              
498             sub get_mbox_separator {
499 34     34 1 208 return $_[0]->{mbox_sep};
500             }
501              
502             =item get_body()
503              
504             Returns an array of the pristine message body, one line per array element.
505              
506             =cut
507              
508             sub get_body {
509 0     0 1 0 my ($self) = @_;
510 0         0 my @ret = split(/^/m, $self->{pristine_body});
511 0         0 return \@ret;
512             }
513              
514             # ---------------------------------------------------------------------------
515              
516             =item get_pristine()
517              
518             Returns a scalar of the entire pristine message.
519              
520             =cut
521              
522             sub get_pristine {
523 102     102 1 300 my ($self) = @_;
524 102         404 return $self->{pristine_msg};
525             }
526              
527             =item get_pristine_body()
528              
529             Returns a scalar of the pristine message body.
530              
531             =cut
532              
533             sub get_pristine_body {
534 46     46 1 140 my ($self) = @_;
535 46         468 return $self->{pristine_body};
536             }
537              
538             # ---------------------------------------------------------------------------
539              
540             =item extract_message_metadata($permsgstatus)
541              
542             =cut
543              
544             sub extract_message_metadata {
545 116     116 1 360 my ($self, $permsgstatus) = @_;
546              
547             # do this only once per message, it can be expensive
548 116 100       388 return if $self->{already_extracted_metadata};
549 102         362 $self->{already_extracted_metadata} = 1;
550              
551 102         683 $self->{metadata}->extract ($self, $permsgstatus);
552             }
553              
554             # ---------------------------------------------------------------------------
555              
556             =item $str = get_metadata($hdr)
557              
558             =cut
559              
560             sub get_metadata {
561 2942     2942 1 5458 my ($self, $hdr) = @_;
562 2942 50       6168 if (!$self->{metadata}) {
563 0         0 warn "metadata: oops! get_metadata() called after finish_metadata()"; return;
  0         0  
564             }
565             # dbg("message: get_metadata - %s: %s", $hdr, defined $_ ? $_ : '<undef>')
566             # for $self->{metadata}->{strings}->{lc $hdr};
567              
568 2942         8326 $self->{metadata}->{strings}->{lc $hdr};
569             }
570              
571             =item put_metadata($hdr, $text)
572              
573             =cut
574              
575             sub put_metadata {
576 408     408 1 1063 my ($self, $hdr, $text) = @_;
577 408 50       937 if (!$self->{metadata}) {
578 0         0 warn "metadata: oops! put_metadata() called after finish_metadata()"; return;
  0         0  
579             }
580             # dbg("message: put_metadata - %s: %s", $hdr, $text);
581 408         1608 $self->{metadata}->{strings}->{lc $hdr} = $text;
582             }
583              
584             =item delete_metadata($hdr)
585              
586             =cut
587              
588             sub delete_metadata {
589 0     0 1 0 my ($self, $hdr) = @_;
590 0 0       0 if (!$self->{metadata}) {
591 0         0 warn "metadata: oops! delete_metadata() called after finish_metadata()"; return;
  0         0  
592             }
593 0         0 delete $self->{metadata}->{strings}->{lc $hdr};
594             }
595              
596             =item $str = get_all_metadata()
597              
598             =cut
599              
600             sub get_all_metadata {
601 16     16 1 49 my ($self) = @_;
602              
603 16 50       60 if (!$self->{metadata}) {
604 0         0 warn "metadata: oops! get_all_metadata() called after finish_metadata()"; return;
  0         0  
605             }
606 16         31 my @ret;
607 16         38 my $keys_ref = $self->{metadata}->{strings};
608 16         121 foreach my $key (sort keys %$keys_ref) {
609 64         151 my $val = $keys_ref->{$key};
610 64 50       151 $val = '' if !defined $val;
611 64         281 push (@ret, "$key: $val\n");
612             }
613 16 50       118 return (wantarray ? @ret : join('', @ret));
614             }
615              
616             # ---------------------------------------------------------------------------
617              
618             =item finish_metadata()
619              
620             Destroys the metadata for this message. Once a message has been
621             scanned fully, the metadata is no longer required. Destroying
622             this will free up some memory.
623              
624             =cut
625              
626             sub finish_metadata {
627 124     124 1 305 my ($self) = @_;
628 124 50       446 if (defined ($self->{metadata})) {
629 124         906 $self->{metadata}->finish();
630 124         472 delete $self->{metadata};
631             }
632             }
633              
634             =item finish()
635              
636             Clean up an object so that it can be destroyed.
637              
638             =cut
639              
640             sub finish {
641 92     92 1 6809 my ($self) = @_;
642              
643             # Clean ourself up
644 92         520 $self->finish_metadata();
645              
646             # These will only be in the root Message node
647 92         246 delete $self->{'mime_boundary_state'};
648 92         240 delete $self->{'mbox_sep'};
649 92         196 delete $self->{'normalize'};
650 92         205 delete $self->{'body_part_scan_size'};
651 92         246 delete $self->{'rawbody_part_scan_size'};
652 92         252 delete $self->{'pristine_msg'};
653 92         244 delete $self->{'pristine_body'};
654 92         246 delete $self->{'pristine_headers'};
655 92         205 delete $self->{'line_ending'};
656 92         169 delete $self->{'missing_head_body_separator'};
657              
658             # Remove the queue variable, in case the body has not been parsed
659 92         171 delete $self->{'parse_queue'};
660              
661 92         212 my @toclean = ( $self );
662              
663             # Go ahead and clean up all of the Message::Node parts
664 92         357 while (my $part = shift @toclean) {
665             # bug 5557: windows requires tmp file be closed before it can be rm'd
666 138 100       547 if (ref $part->{'raw'} eq 'GLOB') {
667 20 50       237 close($part->{'raw'}) or die "error closing input file: $!";
668             }
669              
670             # bug 5858: avoid memory leak with deep MIME structure
671 138 100       393 if (defined ($part->{metadata})) {
672 4         16 $part->{metadata}->finish();
673 4         7 delete $part->{metadata};
674             }
675              
676 138         621 delete $part->{'headers'};
677 138         545 delete $part->{'raw_headers'};
678 138         415 delete $part->{'header_order'};
679 138         419 delete $part->{'raw'};
680 138         322 delete $part->{'decoded'};
681 138         251 delete $part->{'rendered'};
682 138         231 delete $part->{'visible_rendered'};
683 138         229 delete $part->{'invisible_rendered'};
684 138         255 delete $part->{'type'};
685 138         227 delete $part->{'rendered_type'};
686              
687             # if there are children nodes, add them to the queue of nodes to clean up
688 138 100       653 if (exists $part->{'body_parts'}) {
689 23         35 push(@toclean, @{$part->{'body_parts'}});
  23         48  
690 23         81 delete $part->{'body_parts'};
691             }
692             }
693              
694             # delete temporary files
695 92 50       286 if ($self->{'tmpfiles'}) {
696 92         172 for my $fn (@{$self->{'tmpfiles'}}) {
  92         295  
697 20 50       818 unlink($fn) or warn "cannot unlink $fn: $!";
698             }
699 92         369 delete $self->{'tmpfiles'};
700             }
701             }
702              
703             # also use a DESTROY method, just to ensure (as much as possible) that
704             # temporary files are deleted even if the finish() method is omitted
705             sub DESTROY {
706 128     128   604 my $self = shift;
707             # best practices: prevent potential calls to eval and to system routines
708             # in code of a DESTROY method from clobbering global variables $@ and $!
709 128         1120 local($@,$!); # keep outer error handling unaffected by DESTROY
710 128 100       2358 if ($self->{'tmpfiles'}) {
711 36         92 for my $fn (@{$self->{'tmpfiles'}}) {
  36         1854  
712 0 0       0 unlink($fn) or dbg("message: cannot unlink $fn: $!");
713             }
714             }
715             }
716              
717             # ---------------------------------------------------------------------------
718              
719             =item receive_date()
720              
721             Return a time_t value with the received date of the current message,
722             or current time if received time couldn't be determined.
723              
724             =cut
725              
726             sub receive_date {
727 10     10 1 31 my($self) = @_;
728              
729 10         55 return Mail::SpamAssassin::Util::receive_date(scalar $self->get_all_headers(0,1));
730             }
731              
732             # ---------------------------------------------------------------------------
733              
734             =back
735              
736             =head1 PARSING METHODS, NON-PUBLIC
737              
738             These methods take a RFC2822-esque formatted message and create a tree
739             with all of the MIME body parts included. Those parts will be decoded
740             as necessary, and text/html parts will be rendered into a standard text
741             format, suitable for use in SpamAssassin.
742              
743             =over 4
744              
745             =item parse_body()
746              
747             parse_body() passes the body part that was passed in onto the
748             correct part parser, either _parse_multipart() for multipart/* parts,
749             or _parse_normal() for everything else. Multipart sections become the
750             root of sub-trees, while everything else becomes a leaf in the tree.
751              
752             For multipart messages, the first call to parse_body() doesn't create a
753             new sub-tree and just uses the parent node to contain children. All other
754             calls to parse_body() will cause a new sub-tree root to be created and
755             children will exist underneath that root. (this is just so the tree
756             doesn't have a root node which points at the actual root node ...)
757              
758             =cut
759              
760             sub parse_body {
761 137     137 1 356 my($self) = @_;
762              
763             # This shouldn't happen, but just in case, abort.
764 137 50       479 return unless (exists $self->{'parse_queue'});
765              
766 137         524 dbg("message: ---- MIME PARSER START ----");
767              
768 137         239 while (my $toparse = shift @{$self->{'parse_queue'}}) {
  322         1244  
769             # multipart sections are required to have a boundary set ... If this
770             # one doesn't, assume it's malformed and send it to be parsed as a
771             # non-multipart section
772             #
773 185         510 my ($msg, $boundary, $body, $subparse) = @$toparse;
774              
775 185 100 66     1254 if ($msg->{'type'} =~ m{^multipart/}i && defined $boundary && $subparse > 0) {
      66        
776 20         73 $self->_parse_multipart($toparse);
777             }
778             else {
779             # If it's not multipart, go ahead and just deal with it.
780 165         1001 $self->_parse_normal($toparse);
781              
782             # bug 5041: process message/*, but exclude message/partial content types
783 165 100 66     1222 if ($msg->{'type'} =~ m{^message/(?!partial\z)}i && $subparse > 0)
784             {
785             # Just decode the part, but we don't need the resulting string here.
786 4         20 $msg->decode(0);
787              
788             # bug 7125: decode and parse only message/rfc822 or message/global,
789             # but do not treat other message/* content types (like the ones listed
790             # here) as a message consisting of a header and a body, as they are not:
791             # message/delivery-status, message/global-delivery-status,
792             # message/feedback-report, message/global-headers,
793             # message/global-disposition-notification,
794             # message/disposition-notification, (and message/partial)
795              
796             # bug 5051, bug 3748: check $msg->{decoded}: sometimes message/* parts
797             # have no content, and we get stuck waiting for STDIN, which is bad. :(
798              
799 4 50 33     49 if ($msg->{'type'} =~ m{^message/(?:rfc822|global)\z}i &&
      33        
800             defined $msg->{'decoded'} && $msg->{'decoded'} ne '')
801             {
802             # Ok, so this part is still semi-recursive, since M::SA::Message
803             # calls M::SA::Message, but we don't subparse the new message,
804             # and pull a sneaky "steal our child's queue" maneuver to deal
805             # with it on our own time. Reference the decoded array directly
806             # since it's faster.
807             #
808             my $msg_obj = Mail::SpamAssassin::Message->new({
809             message => $msg->{'decoded'},
810             parsenow => 0,
811             normalize => $self->{normalize},
812 4         43 subparse => $subparse - 1,
813             });
814              
815             # Add the new message to the current node
816 4         20 $msg->add_body_part($msg_obj);
817              
818             # now this is the sneaky bit ... steal the sub-message's parse_queue
819             # and add it to ours. then we'll handle the sub-message in our
820             # normal loop and get all the glory. muhaha. :)
821 4         5 push(@{$self->{'parse_queue'}}, @{$msg_obj->{'parse_queue'}});
  4         10  
  4         9  
822 4         9 delete $msg_obj->{'parse_queue'};
823              
824             # Ok, we've subparsed, so go ahead and remove the raw and decoded
825             # data because we won't need them anymore (the tree under this part
826             # will have that data)
827 4 50       11 if (ref $msg->{'raw'} eq 'GLOB') {
828             # Make sure we close it if it's a temp file -- Bug 5166
829 0 0       0 close($msg->{'raw'})
830             or die "error closing input file: $!";
831             }
832              
833 4         10 delete $msg->{'raw'};
834            
835 4         90 delete $msg->{'decoded'};
836             }
837             }
838             }
839             }
840              
841 137         642 dbg("message: ---- MIME PARSER END ----");
842              
843             # we're done parsing, so remove the queue variable
844 137         485 delete $self->{'parse_queue'};
845             }
846              
847             =item _parse_multipart()
848              
849             Generate a root node, and for each child part call parse_body()
850             to generate the tree.
851              
852             =cut
853              
854             sub _parse_multipart {
855 20     20   48 my($self, $toparse) = @_;
856              
857 20         31 my ($msg, $boundary, $body, $subparse) = @{$toparse};
  20         52  
858              
859             # we're not supposed to be a leaf, so prep ourselves
860 20         51 $msg->{'body_parts'} = [];
861              
862             # the next set of objects will be one level deeper
863 20         26 $subparse--;
864              
865 20 50       108 dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));
866              
867             # NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
868             # compliant, but also allow possible malformations to still work. Please
869             # see Bugzilla bug 3749 for more information before making any changes!
870              
871             # ignore preamble per RFC 1521, unless there's no boundary ...
872 20 50       54 if ( defined $boundary ) {
873 20         30 my $line;
874 20         29 my $tmp_line = @{$body};
  20         31  
875 20         65 for ($line=0; $line < $tmp_line; $line++) {
876             # dbg("message: multipart line $line: \"" . $body->[$line] . "\"");
877             # specifically look for an opening boundary
878 120 100 100     813 if (substr($body->[$line],0,2) eq '--' # triage
879             && $body->[$line] =~ /^--\Q$boundary\E\s*$/) {
880             # Make note that we found the opening boundary
881 19         86 $self->{mime_boundary_state}->{$boundary} = 1;
882              
883             # if the line after the opening boundary isn't a header, flag it.
884             # we need to make sure that there's actually another line though.
885             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
886 19 100 66     135 if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
887 5         12 $self->{'missing_mime_headers'} = 1;
888             }
889              
890 19         55 last;
891             }
892             }
893              
894             # Found a boundary, ignore the preamble
895 20 100       48 if ( $line < $tmp_line ) {
896 19         43 splice @{$body}, 0, $line+1;
  19         57  
897             }
898              
899             # Else, there's no boundary, so leave the whole part...
900             }
901              
902             # prepare a new tree node
903 20         149 my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
904 20         49 my $in_body = 0;
905 20         44 my $header;
906             my $part_array;
907 20         0 my $found_end_boundary;
908 20         26 my $partcnt = 0;
909              
910 20         33 my $line_count = @{$body};
  20         34  
911 20         26 foreach ( @{$body} ) {
  20         45  
912             # if we're on the last body line, or we find any boundary marker,
913             # deal with the mime part;
914             # a triage before an unlikely-to-match regexp avoids a CPU hotspot
915 2260   100     10253 $found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
916             && /^--\Q$boundary\E(?:--)?\s*$/;
917 2260 100 100     6458 if ( --$line_count == 0 || $found_end_boundary ) {
918 44         99 my $line = $_; # remember the last line
919              
920             # If at last line and no end boundary found, the line belongs to body
921             # TODO:
922             # Is $self->{mime_boundary_state}->{$boundary}-- needed here?
923             # Could "missing end boundary" be a useful rule? Mark it somewhere?
924             # If SA processed truncated message from amavis etc, this could also
925             # be hit legimately..
926 44 50       139 if (!$found_end_boundary) {
    50          
927             # TODO: This is duplicate code from few pages down below..
928 0         0 while (length ($_) > MAX_BODY_LINE_LENGTH) {
929 0         0 push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
  0         0  
930 0         0 substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
931             }
932 0         0 push ( @{$part_array}, $_ );
  0         0  
933             }
934             # per rfc 1521, the CRLF before the boundary is part of the boundary:
935             # NOTE: The CRLF preceding the encapsulation line is conceptually
936             # attached to the boundary so that it is possible to have a part
937             # that does not end with a CRLF (line break). Body parts that must
938             # be considered to end with line breaks, therefore, must have two
939             # CRLFs preceding the encapsulation line, the first of which is part
940             # of the preceding body part, and the second of which is part of the
941             # encapsulation boundary.
942             elsif ($part_array) {
943 44         116 chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
944 44 100       144 splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
  24         66  
945             }
946             else {
947             # Invalid parts can have no body, so fake in a blank body
948             # in that case.
949 0         0 $part_array = [];
950             }
951              
952 44         85 my($p_boundary);
953 44         148 ($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
954 44   66     268 $p_boundary ||= $boundary;
955 44 50       297 dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
956              
957             # we've created a new node object, so add it to the queue along with the
958             # text that belongs to that part, then add the new part to the current
959             # node to create the tree.
960 44         70 push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
  44         164  
961 44         196 $msg->add_body_part($part_msg);
962              
963             # rfc 1521 says /^--boundary--$/, some MUAs may just require /^--boundary--/
964             # but this causes problems with horizontal lines when the boundary is
965             # made up of dashes as well, etc.
966 44 50       112 if (defined $boundary) {
967             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
968 44 100 33     671 if ($line =~ /^--\Q${boundary}\E--\s*$/) {
    50          
969             # Make a note that we've seen the end boundary
970 20         57 $self->{mime_boundary_state}->{$boundary}--;
971 20         79 last;
972             }
973             elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
974             # if we aren't on an end boundary and there are still lines left, it
975             # means we hit a new start boundary. therefore, the next line ought
976             # to be a mime header. if it's not, mark it.
977 0         0 $self->{'missing_mime_headers'} = 1;
978             }
979             }
980              
981             # Maximum parts to process
982 24 50       149 if (++$partcnt == 1000) {
983 0         0 dbg("message: mimepart limit exceeded, stopping parsing");
984 0         0 $self->{'mimepart_limit_exceeded'} = 1;
985 0         0 return;
986             }
987              
988             # make sure we start with a new clean node
989 24         33 $in_body = 0;
990 24         107 $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
991 24         59 undef $part_array;
992 24         43 undef $header;
993              
994 24         102 next;
995             }
996              
997 2216 100       3495 if (!$in_body) {
998             # s/\s+$//; # bug 5127: don't clean this up (yet)
999             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
1000 132 100 66     509 if (/^[\041-\071\073-\176]+[ \t]*:/) {
    100          
1001 87 100       250 if ($header) {
1002 49         267 my ( $key, $value ) = split ( /:\s*/, $header, 2 );
1003 49         150 $part_msg->header( $key, $value );
1004             }
1005 87         170 $header = $_;
1006 87         278 next;
1007             }
1008             elsif (/^[ \t]/ && $header) {
1009             # $_ =~ s/^\s*//; # bug 5127, again
1010 1         5 $header .= $_;
1011 1         3 next;
1012             }
1013             else {
1014 44 100       117 if ($header) {
1015 38         246 my ( $key, $value ) = split ( /:\s*/, $header, 2 );
1016 38         124 $part_msg->header( $key, $value );
1017             }
1018 44         80 $in_body = 1;
1019              
1020             # if there's a blank line separator, that's good. if there isn't,
1021             # it's a body line, so drop through.
1022 44 100       158 if (/^\r?$/) {
1023 43         117 next;
1024             }
1025             else {
1026 1         4 $self->{'missing_mime_head_body_separator'} = 1;
1027             }
1028             }
1029             }
1030              
1031             # we run into a perl bug if the lines are astronomically long (probably
1032             # due to lots of regexp backtracking); so split any individual line
1033             # over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML
1034             # totally -- but IMHO the only reason a luser would use
1035             # MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
1036 2085         4787 while (length ($_) > MAX_BODY_LINE_LENGTH) {
1037 0         0 push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
  0         0  
1038 0         0 substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
1039             }
1040 2085         2649 push ( @{$part_array}, $_ );
  2085         5409  
1041             }
1042              
1043             # Look for a message epilogue
1044             # originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB
1045             # ham FPs were all "." on a line by itself.
1046             # spams seem to only have NULL chars afterwards ?
1047 20 100       146 if ($line_count) {
1048 8         19 for(; $line_count > 0; $line_count--) {
1049 15 100       81 if ($body->[-$line_count] =~ /[^\s.]/) {
1050 2         8 $self->{mime_epilogue_exists} = 1;
1051 2         14 last;
1052             }
1053             }
1054             }
1055              
1056             }
1057              
1058             =item _parse_normal()
1059              
1060             Generate a leaf node and add it to the parent.
1061              
1062             =cut
1063              
1064             sub _parse_normal {
1065 165     165   504 my($self, $toparse) = @_;
1066              
1067 165         299 my ($msg, $boundary, $body) = @{$toparse};
  165         459  
1068              
1069 165         561 dbg("message: parsing normal part");
1070              
1071             # 0: content-type, 1: boundary, 2: charset, 3: filename
1072 165         568 my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
1073              
1074             # multipart sections are required to have a boundary set ... If this
1075             # one doesn't, assume it's malformed and revert to text/plain
1076 165 50 33     1262 $msg->{'type'} = ($ct[0] !~ m@^multipart/@i || defined $boundary ) ? $ct[0] : 'text/plain';
1077 165         647 $msg->{'charset'} = $ct[2];
1078              
1079             # attempt to figure out a name for this attachment if there is one ...
1080 165   100     661 my $disp = $msg->header('content-disposition') || '';
1081 165 100       848 if ($disp =~ /name=\s*"?([^";]+)"?/i) {
    100          
1082 1         4 $msg->{'name'} = $1;
1083             }
1084             elsif ($ct[3]) {
1085 2         7 $msg->{'name'} = $ct[3];
1086             }
1087 165 100       557 if ($msg->{'name'}) {
1088 3         18 $msg->{'name'} = Encode::decode("MIME-Header", $msg->{'name'});
1089             }
1090              
1091 165         7208 $msg->{'boundary'} = $boundary;
1092              
1093             # If the part type is not one that we're likely to want to use, go
1094             # ahead and write the part data out to a temp file -- why keep sucking
1095             # up RAM with something we're not going to use?
1096             #
1097 165 100       1100 if ($msg->{'type'} !~ m@^(?:text/(?:plain|html)$|message\b)@) {
1098 20         39 my($filepath, $fh);
1099             eval {
1100 20         77 ($filepath, $fh) = Mail::SpamAssassin::Util::secure_tmpfile(); 1;
  20         63  
1101 20 50       29 } or do {
1102 0 0       0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
1103 0         0 info("message: failed to create a temp file: %s", $eval_stat);
1104             };
1105 20 50       46 if ($fh) {
1106             # The temp file was created, add it to the list of pending deletions
1107             # we cannot just delete immediately in the POSIX idiom, as this is
1108             # unportable (to win32 at least)
1109 20         25 push @{$self->{tmpfiles}}, $filepath;
  20         58  
1110 20         47 dbg("message: storing a message part to file %s", $filepath);
1111 20 50       33 $fh->print(@{$body}) or die "error writing to $filepath: $!";
  20         84  
1112 20 50       1090 $fh->flush or die "error writing (flush) to $filepath: $!";
1113 20         117 $msg->{'raw'} = $fh;
1114             }
1115             }
1116              
1117             # if the part didn't get a temp file, go ahead and store the data in memory
1118 165 100       659 if (!defined $msg->{'raw'}) {
1119 145         530 dbg("message: storing a body to memory");
1120 145         869 $msg->{'raw'} = $body;
1121             }
1122             }
1123              
1124             # ---------------------------------------------------------------------------
1125              
1126             sub get_mimepart_digests {
1127 0     0 0 0 my ($self) = @_;
1128              
1129 0 0       0 if (!exists $self->{mimepart_digests}) {
1130             # traverse all parts which are leaves, recursively
1131             $self->{mimepart_digests} =
1132 0   0     0 [ map(sha1_hex($_->decode) . ':' . lc($_->{type}||''),
1133             $self->find_parts(qr/^/,1,1)) ];
1134             }
1135 0         0 return $self->{mimepart_digests};
1136             }
1137              
1138             # ---------------------------------------------------------------------------
1139              
1140             # common code for get_rendered_body_text_array,
1141             # get_visible_rendered_body_text_array, get_invisible_rendered_body_text_array
1142             #
1143             sub get_body_text_array_common {
1144 387     387 0 1093 my ($self, $method_name) = @_;
1145              
1146 387         967 my $key = 'text_' . $method_name;
1147 387 100       1084 if (exists $self->{$key}) { return $self->{$key} }
  247         954  
1148              
1149 140         522 $self->{$key} = [];
1150              
1151 140         335 my $scansize = $self->{body_part_scan_size};
1152              
1153             # Find all parts which are leaves
1154 140         1086 my @parts = $self->find_parts(qr/./,1);
1155 140 50       595 return $self->{$key} unless @parts;
1156              
1157             # the html metadata may have already been set, so let's not bother if it's
1158             # already been done.
1159 140         494 my $html_needs_setting = !exists $self->{metadata}->{html};
1160              
1161 140 100 100     799 my $text = $method_name eq 'invisible_rendered' ? ''
1162             : ($self->get_header('subject') || "\n");
1163              
1164             # Go through each part
1165 140         699 for (my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1166 141         387 my $p = $parts[$pt];
1167              
1168             # put a blank line between parts ...
1169 141 100       507 $text .= "\n" if $text ne '';
1170              
1171 141         1023 my($type, $rnd) = $p->$method_name(); # decode this part
1172             # Only text/* types are rendered ...
1173 141 50       552 if (defined $rnd) {
1174             # Truncate to body_part_scan_size
1175 141 100 100     1097 if ($scansize && length($rnd) > $scansize) {
1176             # Try truncating at boundary, nearest 1k block is fine
1177             # Newline first
1178 8         43 my $idx = index($rnd, "\n", $scansize);
1179 8 100 100     37 if ($idx >= 0 && $idx - $scansize <= 1024) {
1180 2         10 substr($rnd, $idx+1) = ''; # truncate
1181             }
1182             else {
1183             # Space is fine too for rendered
1184 6         18 $idx = index($rnd, " ", $scansize);
1185 6 100 66     24 if ($idx >= 0 && $idx - $scansize <= 1024) {
1186 1         6 substr($rnd, $idx+1) = ''; # truncate
1187             }
1188             else {
1189 5         19 substr($rnd, $scansize) = ''; # truncate
1190             }
1191             }
1192             }
1193              
1194             # Add to rendered text
1195 141         608 $text .= $rnd;
1196              
1197             # TVD - if there are multiple parts, what should we do?
1198             # right now, just use the last one. we may need to give some priority
1199             # at some point, ie: use text/html rendered if it exists, or
1200             # text/plain rendered as html otherwise.
1201 141 100 100     1306 if ($html_needs_setting && $type eq 'text/html') {
1202 7         51 $self->{metadata}->{html} = $p->{html_results};
1203             }
1204             }
1205             }
1206              
1207             # whitespace handling (warning: small changes have large effects!)
1208 140         2020 $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1209             # $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space
1210 140         879 $text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space
1211 140         579 $text =~ tr/\f/\n/; # form feeds => newline
1212              
1213 140         658 my @textary = split_into_array_of_short_lines($text);
1214 140         490 $self->{$key} = \@textary;
1215              
1216 140         959 return $self->{$key};
1217             }
1218              
1219             # ---------------------------------------------------------------------------
1220              
1221             sub get_rendered_body_text_array {
1222 339     339 0 702 my ($self) = @_;
1223 339         1008 return $self->get_body_text_array_common('rendered');
1224             }
1225              
1226             sub get_visible_rendered_body_text_array {
1227 24     24 0 68 my ($self) = @_;
1228 24         134 return $self->get_body_text_array_common('visible_rendered');
1229             }
1230              
1231             sub get_invisible_rendered_body_text_array {
1232 24     24 0 64 my ($self) = @_;
1233 24         57 return $self->get_body_text_array_common('invisible_rendered');
1234             }
1235              
1236             # ---------------------------------------------------------------------------
1237              
1238             sub get_decoded_body_text_array {
1239 110     110 0 278 my ($self) = @_;
1240              
1241 110 100       349 if (defined $self->{text_decoded}) { return $self->{text_decoded}; }
  2         18  
1242 108         427 $self->{text_decoded} = [ ];
1243              
1244 108         261 my $scansize = $self->{rawbody_part_scan_size};
1245              
1246             # Find all parts which are leaves
1247 108         649 my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
1248 108 50       484 return $self->{text_decoded} unless @parts;
1249              
1250             # Go through each part
1251 108         520 for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1252             # bug 4843: skip text/calendar parts since they're usually an attachment
1253             # and not displayed
1254 108 50       534 next if ($parts[$pt]->{'type'} eq 'text/calendar');
1255              
1256 108         390 my $rawbody = $parts[$pt]->decode();
1257 108         479 my $rawlen = length($rawbody);
1258              
1259             # Truncate to rawbody_part_scan_size
1260 108 100 100     824 if ($scansize && $rawlen > $scansize) {
1261             # Try truncating at boundary, nearest 1k block is fine
1262             # Newline first
1263 8         32 my $idx = index($rawbody, "\n", $scansize);
1264 8 100 100     52 if ($idx >= 0 && $idx - $scansize <= 1024) {
1265 2         8 substr($rawbody, $idx+1) = ''; # truncate
1266             }
1267             else {
1268             # Try > next, might be html
1269 6         32 $idx = index($rawbody, '>', $scansize);
1270 6 100 66     27 if ($idx >= 0 && $idx - $scansize <= 1024) {
1271 1         26 substr($rawbody, $idx+1) = ''; # truncate
1272             }
1273             else {
1274             # Space as last effort
1275 5         13 $idx = index($rawbody, ' ', $scansize);
1276 5 50 33     18 if ($idx >= 0 && $idx - $scansize <= 1024) {
1277 0         0 substr($rawbody, $idx+1) = ''; # truncate
1278             }
1279             else {
1280 5         18 substr($rawbody, $scansize) = ''; # truncate
1281             }
1282             }
1283             }
1284             }
1285              
1286 108         250 push(@{$self->{text_decoded}},
  108         547  
1287             split_into_array_of_short_paragraphs($rawbody));
1288             }
1289              
1290 108         466 return $self->{text_decoded};
1291             }
1292              
1293             # ---------------------------------------------------------------------------
1294              
1295             sub split_into_array_of_short_lines {
1296 140     140 0 303 my @result;
1297 140         1129 foreach my $line (split (/^/m, $_[0])) {
1298 512         1523 while (length ($line) > MAX_BODY_LINE_LENGTH) {
1299             # try splitting "nicely" so that we don't chop a url in half or
1300             # something. if there's no space, then just split at max length.
1301 1         8 my $length = rindex($line, ' ', MAX_BODY_LINE_LENGTH) + 1;
1302 1   50     12 $length ||= MAX_BODY_LINE_LENGTH;
1303 1         8 push (@result, substr($line, 0, $length, ''));
1304             }
1305 512         1405 push (@result, $line);
1306             }
1307 140         862 @result;
1308             }
1309              
1310             # ---------------------------------------------------------------------------
1311              
1312             # Split a text into array of paragraphs of sizes between
1313             # 1-2 * MAX_BODY_LINE_LENGTH, returning the resulting array.
1314             # Mainly used for rawbody splitting purposes
1315              
1316             sub split_into_array_of_short_paragraphs {
1317 108     108 0 365 my $text_l = length($_[0]);
1318 108 100       958 return ($_[0]) if $text_l <= MAX_BODY_LINE_LENGTH;
1319 5         14 my(@result,$j,$ofs);
1320             # Try to split in order \n, > (html?), space
1321 5         26 for ($ofs = 0; $text_l - $ofs > 2 * MAX_BODY_LINE_LENGTH; $ofs = $j+1) {
1322 1         14 $j = index($_[0], "\n", $ofs + MAX_BODY_LINE_LENGTH);
1323 1 50 33     9 if ($j < 0 || $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1324 0         0 $j = index($_[0], ">", $ofs + MAX_BODY_LINE_LENGTH);
1325 0 0 0     0 if ($j < 0 || $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1326 0         0 $j = index($_[0], " ", $ofs + MAX_BODY_LINE_LENGTH);
1327 0 0 0     0 if ($j < 0 || $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1328 0         0 $j = $ofs + MAX_BODY_LINE_LENGTH;
1329             }
1330             }
1331             }
1332 1         9 push(@result, substr($_[0], $ofs, $j-$ofs+1));
1333             }
1334 5 50       64 push(@result, substr($_[0], $ofs)) if $ofs < $text_l;
1335 5         36 @result;
1336             }
1337              
1338             # ---------------------------------------------------------------------------
1339              
1340             1;
1341              
1342             =back
1343              
1344             =cut