File Coverage

blib/lib/Mail/SpamAssassin/Message.pm
Criterion Covered Total %
statement 427 501 85.2
branch 181 256 70.7
condition 74 115 64.3
subroutine 33 36 91.6
pod 16 24 66.6
total 731 932 78.4


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Message - decode, render, and hold an RFC-2822 message
21              
22             =head1 DESCRIPTION
23              
24             This module encapsulates an email message and allows access to the various MIME
25             message parts and message metadata.
26              
27             The message structure, after initiating a parse() cycle, looks like this:
28              
29             Message object, also top-level node in Message::Node tree
30             |
31             +---> Message::Node for other parts in MIME structure
32             | |---> [ more Message::Node parts ... ]
33             | [ others ... ]
34             |
35             +---> Message::Metadata object to hold metadata
36              
37             =head1 PUBLIC METHODS
38              
39             =over 4
40              
41             =cut
42              
43              
44             use strict;
45 41     41   280 use warnings;
  41         72  
  41         1188  
46 41     41   199 use re 'taint';
  41         67  
  41         1307  
47 41     41   203  
  41         65  
  41         2834  
48             BEGIN {
49             eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
50 41         19225 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
  41         113335  
  41         1297  
51 41 50   41   115 }
  0         0  
  0         0  
52              
53             use Mail::SpamAssassin;
54 41     41   267 use Mail::SpamAssassin::Message::Node;
  41         73  
  41         862  
55 41     41   12792 use Mail::SpamAssassin::Message::Metadata;
  41         113  
  41         1591  
56 41     41   13191 use Mail::SpamAssassin::Constants qw(:sa);
  41         415  
  41         3659  
57 41     41   336 use Mail::SpamAssassin::Logger;
  41         140  
  41         8030  
58 41     41   298  
  41         121  
  41         220543  
59             our @ISA = qw(Mail::SpamAssassin::Message::Node);
60              
61             # ---------------------------------------------------------------------------
62              
63             =item new()
64              
65             Creates a Mail::SpamAssassin::Message object. Takes a hash reference
66             as a parameter. The used hash key/value pairs are as follows:
67              
68             C<message> is either undef (which will use STDIN), a scalar - a string
69             containing an entire message, a reference to such string, an array reference
70             of the message with one line per array element, or either a file glob
71             or an IO::File object which holds the entire contents of the message.
72              
73             Note: The message is expected to generally be in RFC 2822 format, optionally
74             including an mbox message separator line (the "From " line) as the first line.
75              
76             C<parse_now> specifies whether or not to create the MIME tree
77             at object-creation time or later as necessary.
78              
79             The I<parse_now> option, by default, is set to false (0).
80             This allows SpamAssassin to not have to generate the tree of
81             Mail::SpamAssassin::Message::Node objects and their related data if the
82             tree is not going to be used. This is handy, for instance, when running
83             C<spamassassin -d>, which only needs the pristine header and body which
84             is always handled when the object is created.
85              
86             C<subparse> specifies how many MIME recursion levels should be parsed.
87             Defaults to 20.
88              
89             =cut
90              
91             # month mappings (ripped from Util.pm)
92             my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
93             jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);
94              
95             # day of week mapping (starting from zero)
96             my @DAY_OF_WEEK = qw/Sun Mon Tue Wed Thu Fri Sat/ ;
97              
98             my $class = shift;
99             $class = ref($class) || $class;
100 150     150 1 379  
101 150   33     993 my($opts) = @_;
102             my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
103 150         402 my $parsenow = $opts->{'parsenow'} || 0;
104 150 50       614 my $normalize = $opts->{'normalize'} || 0;
105 150   100     603  
106 150   50     814 # Specifies whether or not to parse message/rfc822 parts into its own tree.
107             # If the # > 0, it'll subparse, otherwise it won't. By default, do twenty
108             # levels deep.
109             my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;
110              
111 150 100       538 my $self = $class->SUPER::new({normalize=>$normalize});
112              
113 150         1626 $self->{tmpfiles} = [];
114             $self->{pristine_msg} = '';
115 150         664 $self->{pristine_headers} = '';
116 150         499 $self->{pristine_body} = '';
117 150         512 $self->{mime_boundary_state} = {};
118 150         609 $self->{line_ending} = "\012";
119 150         390 $self->{master_deadline} = $opts->{'master_deadline'};
120 150         538 $self->{suppl_attrib} = $opts->{'suppl_attrib'};
121 150         421 $self->{body_part_scan_size} = $opts->{'body_part_scan_size'} || 0;
122 150         318 $self->{rawbody_part_scan_size} = $opts->{'rawbody_part_scan_size'} || 0;
123 150   100     516  
124 150   100     567 if ($self->{suppl_attrib}) { # caller-provided additional information
125             # pristine_body_length is currently used by an eval test check_body_length
126 150 100       488 # Possible To-Do: Base the length on the @message array later down?
127             if (defined $self->{suppl_attrib}{body_size}) {
128             # Optional info provided by a caller; should reflect the original
129 49 50       206 # message body size if provided, and as such it may differ from the
130             # $self->{pristine_body} size, e.g. when the caller passed a truncated
131             # message to SpamAssassin, or when counting line-endings differently.
132             $self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
133             dbg("message: set pristine_body_length from suppl_attrib: %s", $self->{pristine_body_length});
134 0         0 }
135 0         0 if (ref $self->{suppl_attrib}{mimepart_digests}) {
136             # Optional info provided by a caller: an array of digest codes (e.g. SHA1)
137 49 50       217 # of each MIME part. Should reflect the original message if provided.
138             # As such it may differ from digests calculated by get_mimepart_digests(),
139             # e.g. when the caller passed a truncated message to SpamAssassin.
140             $self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
141             dbg("message: set mimepart_digests from suppl_attrib");
142 0         0 }
143 0         0 }
144              
145             bless($self,$class);
146              
147 150         344 # create the metadata holder class
148             $self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);
149              
150 150         1608 # Ok, go ahead and do the message "parsing"
151              
152             # protect it from abuse ...
153             local $_;
154              
155 150         284 # Figure out how the message was passed to us, and deal with it.
156             my @message;
157             if (ref $message eq 'ARRAY') {
158 150         247 @message = @{$message};
159 150 100 66     1349 }
    100          
    50          
    50          
    50          
160 63         127 elsif (ref($message) eq 'GLOB' || ref($message) =~ /^IO::/) {
  63         340  
161             if (defined fileno $message) {
162              
163 18 50       61 # sysread+split avoids a Perl I/O bug (Bug 5985)
164             # and is faster than (<$message>) by 10..25 %
165             # (a drawback is a short-term double storage of a text in $raw_str)
166             #
167             my($nread,$raw_str); $raw_str = '';
168             while ( $nread=sysread($message, $raw_str, 16384, length $raw_str) ) { }
169 18         25 defined $nread or die "error reading: $!";
  18         24  
170 18         721 @message = split(/^/m, $raw_str, -1);
171 18 50       71  
172 18         904 if ($raw_str eq '') {
173             dbg("message: empty message read");
174 18 50       194 } elsif (length($raw_str) > 128*1024) {
    50          
175 0         0 # ditch rarely used large chunks of allocated memory, Bug 6514
176             # http://www.perlmonks.org/?node_id=803515
177             # about 97% of mail messages are below 128 kB,
178             # about 98% of mail messages are below 256 kB (2010 statistics)
179             # dbg("message: deallocating %.2f MB", length($raw_str)/1024/1024);
180             undef $raw_str;
181             }
182 0         0 }
183             }
184             elsif (ref $message eq 'SCALAR') {
185             @message = split(/^/m, $$message, -1);
186             }
187 0         0 elsif (ref $message) {
188             dbg("message: Input is a reference of unknown type!");
189             }
190 0         0 elsif (defined $message) {
191             @message = split(/^/m, $message, -1);
192             }
193 69         1605  
194             # Deal with null message
195             if (!@message) {
196             # bug 4884:
197 150 50       533 # if we get here, it means that the input was null, so fake the message
198             # content as a single newline...
199             @message = ("\n");
200             }
201 0         0  
202             # Bug 7648:
203             # Make sure the message is tainted. When linting, @testmsg is not, so this
204             # handles that. Perhaps 3rd party tools could call this with untainted
205             # messages? Tainting the message is important because it prevents certain
206             # exploits later.
207             if (Mail::SpamAssassin::Util::am_running_in_taint_mode() &&
208             grep { !Scalar::Util::tainted($_) } @message) {
209 150 100 66     713 local($_);
210 4973         7886 # To preserve newlines, no joining and splitting here, process each line
211 120         209 # directly as is.
212             foreach (@message) {
213             $_ = Mail::SpamAssassin::Util::taint_var($_);
214 120         315 }
215 1184         2236 if (grep { !Scalar::Util::tainted($_) } @message) {
216             die "Mail::SpamAssassin::Message failed to enforce message taintness";
217 120 50       299 }
  1184         2526  
218 0         0 }
219              
220             # Pull off mbox and mbx separators
221             if ($message[0] =~ /^From\s+(?!:)/) {
222             # careful not to confuse with obsolete syntax which allowed WSP before ':'
223 150 100       1415 # mbox formatted mailbox
    50          
224             $self->{'mbox_sep'} = shift @message;
225             } elsif ($message[0] =~ MBX_SEPARATOR) {
226 17         107 $_ = shift @message;
227              
228 0         0 # Munge the mbx message separator into mbox format as a sort of
229             # de facto portability standard in SA's internals. We need to
230             # to this so that Mail::SpamAssassin::Util::parse_rfc822_date
231             # can parse the date string...
232             if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
233             # $1 = day of month
234 0 0       0 # $2 = month (text)
235             # $3 = year
236             # $4 = hour
237             # $5 = min
238             # $6 = sec
239             my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
240             my $address;
241 0         0 foreach (@message) {
242 0         0 if (/^From:[^<]*<([^>]+)>/) {
243 0         0 $address = $1;
244 0 0       0 last;
    0          
245 0         0 } elsif (/^From:\s*([^<> ]+)/) {
246 0         0 $address = $1;
247             last;
248 0         0 }
249 0         0 }
250             $self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
251             }
252 0         0 }
253              
254             # bug 4363
255             # Check to see if we should do CRLF instead of just LF
256             # For now, just check the first and last line and do whatever it does
257             if (@message && ($message[0] =~ /\015\012/ || $message[-1] =~ /\015\012/)) {
258             $self->{line_ending} = "\015\012";
259 150 50 33     1315 dbg("message: line ending changed to CRLF");
      33        
260 0         0 }
261 0         0  
262             # Is a CRLF -> LF line endings conversion necessary?
263             my $squash_crlf = $self->{line_ending} eq "\015\012";
264              
265 150         498 # Go through all the header fields of the message
266             my $hdr_errors = 0;
267             my $header;
268 150         245 for (;;) {
269 150         234 # make sure not to lose the last header field when there is no body
270 150         246 my $eof = !@message;
271             my $current = $eof ? "\n" : shift @message;
272 1175         1779  
273 1175 100       2439 if ( $current =~ /^[ \t]/ ) {
274             # This wasn't useful in terms of a rule, but we may want to treat it
275 1175 100       3050 # specially at some point. Perhaps ignore it?
276             #unless ($current =~ /\S/) {
277             # $self->{'obsolete_folding_whitespace'} = 1;
278             #}
279              
280             $header = '' if !defined $header; # header starts with a continuation!?
281             $header .= $current; # append continuations, no matter what
282 201 50       439 $self->{'pristine_headers'} .= $current;
283 201         433 }
284 201         509 else { # not a continuation
285             # Ok, there's a header here, let's go ahead and add it in.
286             if (defined $header) { # deal with a previous header field
287             my ($key, $value) = split (/:/s, $header, 2);
288 974 100       2087  
289 824         3473 # If it's not a valid header (aka: not in the form "foo:bar"), skip it.
290             if (defined $value) {
291             # CRLF -> LF line-endings conversion if necessary
292 824 100       1937 $value =~ s/\015\012/\012/sg if $squash_crlf;
293             $key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn
294 818 50       1502 # limit the length of the pairs we store
295 818         1458 if (length($key) > MAX_HEADER_KEY_LENGTH) {
296             $key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
297 818 50       1823 $self->{'truncated_header'} = 1;
298 0         0 }
299 0         0 if (length($value) > MAX_HEADER_VALUE_LENGTH) {
300             $value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
301 818 50       1661 $self->{'truncated_header'} = 1;
302 0         0 }
303 0         0 $self->header($key, $value);
304             }
305 818         2453 }
306              
307             if ($current =~ /^\r?$/) { # a regular end of a header section
308             if ($eof) {
309 974 100       3679 $self->{'missing_head_body_separator'} = 1;
    100          
310 145 100       443 } else {
311 1         4 $self->{'pristine_headers'} .= $current;
312             }
313 144         379 last;
314             }
315 145         395 elsif ($current =~ /^--/) { # mime boundary encountered, bail out
316             $self->{'missing_head_body_separator'} = 1;
317             unshift(@message, $current);
318 4         16 last;
319 4         12 }
320 4         11 # should we assume entering a body on encountering invalid header field?
321             else {
322             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
323             if ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
324             # A field name MUST be composed of printable US-ASCII characters
325 825 100       3075 # (i.e., characters that have values between 33 (041) and 126 (176),
326             # inclusive), except colon (072). Obsolete header field syntax
327             # allowed WSP before a colon.
328             if (++$hdr_errors <= 3) {
329             # just consume but ignore a few invalid header fields
330 7 100       18 } else { # enough is enough...
331             $self->{'missing_head_body_separator'} = 1;
332             unshift(@message, $current);
333 1         4 last;
334 1         2 }
335 1         4 }
336             }
337              
338             # start collecting a new header field
339             $header = $current;
340             $self->{'pristine_headers'} .= $current;
341 824         1505 }
342 824         2614 }
343             undef $header;
344              
345 150         315 # Store the pristine body for later -- store as a copy since @message
346             # will get modified below
347             $self->{'pristine_body'} = join('', @message);
348              
349 150         1635 if (!defined $self->{pristine_body_length}) {
350             $self->{'pristine_body_length'} = length $self->{'pristine_body'};
351 150 50       541 }
352 150         833  
353             # Store complete message, get_pristine() is used a lot, avoid making copies
354             $self->{'pristine_msg'} = $self->{'pristine_headers'} . $self->{'pristine_body'};
355              
356 150         1164 # iterate over lines in reverse order
357             # merge multiple blank lines into a single one
358             my $start;
359             for (my $cnt=$#message; $cnt>=0; $cnt--) {
360 150         400 # CRLF -> LF line-endings conversion if necessary
361 150         652 $message[$cnt] =~ s/\015\012\z/\012/ if $squash_crlf;
362              
363 3787 50       4895 # line is blank
364             if ($message[$cnt] =~ /^\s*$/) {
365             # /^\s*$/ is about 5% faster then !/\S/, but still expensive here
366 3787 100       6882 if (!defined $start) {
367             $start=$cnt;
368 418 100       760 }
369 350         403 next unless $cnt == 0;
370             }
371 418 100       910  
372             # line is not blank, or we've reached the beginning
373              
374             # if we've got a series of blank lines, get rid of them
375             if (defined $start) {
376             my $max_blank_lines = 20;
377 3404 100       7003 my $num = $start-$cnt;
378 350         427 if ($num > $max_blank_lines) {
379 350         411 splice @message, $cnt+2, $num-$max_blank_lines;
380 350 50       564 }
381 0         0 undef $start;
382             }
383 350         646 }
384              
385             # Figure out the boundary
386             my ($boundary);
387             ($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
388 150         262 dbg("message: main message type: ".$self->{'type'});
389 150         541  
390 150         912 # dbg("message: \$message[0]: \"" . $message[0] . "\"");
391              
392             # bug 6845: if main message type is multipart and the message body does not begin with
393             # either a blank line or the boundary (if defined), insert a blank line
394             # to ensure proper parsing - do not consider MIME headers at the beginning of the body
395             # to be part of the message headers.
396             if ($self->{'type'} =~ /^multipart\//i && $#message > 0 && $message[0] =~ /\S/)
397             {
398 150 100 66     805 if (!defined $boundary || $message[0] !~ /^--\Q$boundary\E/)
      100        
399             {
400 15 100 66     359 dbg("message: Inserting blank line at top of body to ensure correct multipart MIME parsing");
401             unshift(@message, "\012");
402 4         15 }
403 4         13 }
404              
405             # dbg("message: \$message[0]: \"" . $message[0] . "\"");
406             # dbg("message: \$message[1]: \"" . $message[1] . "\"");
407              
408             # parse queue, simple array of parts to parse:
409             # 0: part object, already in the tree
410             # 1: boundary used to focus body parsing
411             # 2: message content
412             # 3: how many MIME subparts to parse down
413             #
414             $self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];
415              
416 150         599 # If the message does need to get parsed, save off a copy of the body
417             # in a format we can easily parse later so we don't have to rip from
418             # pristine_body ... If we do want to parse now, go ahead and do so ...
419             #
420             if ($parsenow) {
421             $self->parse_body();
422 150 100       455 }
423 101         500  
424             $self;
425             }
426 150         1029  
427             # ---------------------------------------------------------------------------
428              
429             =item find_parts()
430              
431             Used to search the tree for specific MIME parts. See
432             I<Mail::SpamAssassin::Message::Node> for more details.
433              
434             =cut
435              
436             # Used to find any MIME parts whose simple content-type matches a given regexp
437             # Searches it's own and any children parts. Returns an array of MIME
438             # objects which match.
439             #
440             my $self = shift;
441              
442             # ok, we need to do the parsing now...
443 256     256 1 3782 $self->parse_body() if (exists $self->{'parse_queue'});
444              
445             # and pass through to the Message::Node version of the method
446 256 100       862 return $self->SUPER::find_parts(@_);
447             }
448              
449 256         1122 # ---------------------------------------------------------------------------
450              
451             =item get_pristine_header()
452              
453             Returns pristine headers of the message. If no specific header name
454             is given as a parameter (case-insensitive), then all headers will be
455             returned as a scalar, including the blank line at the end of the headers.
456              
457             If called in an array context, an array will be returned with each
458             specific header in a different element. In a scalar context, the last
459             specific header is returned.
460              
461             ie: If 'Subject' is specified as the header, and there are 2 Subject
462             headers in a message, the last/bottom one in the message is returned in
463             scalar context or both are returned in array context.
464              
465             Btw, returning the last header field (not the first) happens to be consistent
466             with DKIM signatures, which search for and cover multiple header fields
467             bottom-up according to the 'h' tag. Let's keep it this way.
468              
469             Note: the returned header will include the ending newline and any embedded
470             whitespace folding.
471              
472             =cut
473              
474             my ($self, $hdr) = @_;
475            
476             return $self->{pristine_headers} if !defined $hdr || $hdr eq '';
477             my(@ret) =
478 76     76 1 144 $self->{pristine_headers} =~ /^\Q$hdr\E[ \t]*:[ \t]*(.*?\n(?![ \t]))/smgi;
479             # taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)
480 76 100 66     455 if (!@ret) {
481             return $self->get_header($hdr);
482 44         556 } elsif (wantarray) {
483             return @ret;
484 44 100       148 } else {
    50          
485 34         142 return $ret[-1];
486             }
487 0         0 }
488              
489 10         42 =item get_mbox_separator()
490              
491             Returns the mbox separator found in the message, or undef if there
492             wasn't one.
493              
494             =cut
495              
496             return $_[0]->{mbox_sep};
497             }
498              
499             =item get_body()
500              
501 34     34 1 161 Returns an array of the pristine message body, one line per array element.
502              
503             =cut
504              
505             my ($self) = @_;
506             my @ret = split(/^/m, $self->{pristine_body});
507             return \@ret;
508             }
509              
510             # ---------------------------------------------------------------------------
511 0     0 1 0  
512 0         0 =item get_pristine()
513 0         0  
514             Returns a scalar of the entire pristine message.
515              
516             =cut
517              
518             my ($self) = @_;
519             return $self->{pristine_msg};
520             }
521              
522             =item get_pristine_body()
523              
524             Returns a scalar of the pristine message body.
525 102     102 1 210  
526 102         420 =cut
527              
528             my ($self) = @_;
529             return $self->{pristine_body};
530             }
531              
532             # ---------------------------------------------------------------------------
533              
534             =item extract_message_metadata($permsgstatus)
535              
536 46     46 1 112 =cut
537 46         403  
538             my ($self, $permsgstatus) = @_;
539              
540             # do this only once per message, it can be expensive
541             return if $self->{already_extracted_metadata};
542             $self->{already_extracted_metadata} = 1;
543              
544             $self->{metadata}->extract ($self, $permsgstatus);
545             }
546              
547 116     116 1 307 # ---------------------------------------------------------------------------
548              
549             =item $str = get_metadata($hdr)
550 116 100       353  
551 102         301 =cut
552              
553 102         622 my ($self, $hdr) = @_;
554             if (!$self->{metadata}) {
555             warn "metadata: oops! get_metadata() called after finish_metadata()"; return;
556             }
557             # dbg("message: get_metadata - %s: %s", $hdr, defined $_ ? $_ : '<undef>')
558             # for $self->{metadata}->{strings}->{lc $hdr};
559              
560             $self->{metadata}->{strings}->{lc $hdr};
561             }
562              
563 2754     2754 1 4285 =item put_metadata($hdr, $text)
564 2754 50       4789  
565 0         0 =cut
  0         0  
566              
567             my ($self, $hdr, $text) = @_;
568             if (!$self->{metadata}) {
569             warn "metadata: oops! put_metadata() called after finish_metadata()"; return;
570 2754         6690 }
571             # dbg("message: put_metadata - %s: %s", $hdr, $text);
572             $self->{metadata}->{strings}->{lc $hdr} = $text;
573             }
574              
575             =item delete_metadata($hdr)
576              
577             =cut
578 408     408 1 911  
579 408 50       805 my ($self, $hdr) = @_;
580 0         0 if (!$self->{metadata}) {
  0         0  
581             warn "metadata: oops! delete_metadata() called after finish_metadata()"; return;
582             }
583 408         1537 delete $self->{metadata}->{strings}->{lc $hdr};
584             }
585              
586             =item $str = get_all_metadata()
587              
588             =cut
589              
590             my ($self) = @_;
591 0     0 1 0  
592 0 0       0 if (!$self->{metadata}) {
593 0         0 warn "metadata: oops! get_all_metadata() called after finish_metadata()"; return;
  0         0  
594             }
595 0         0 my @ret;
596             my $keys_ref = $self->{metadata}->{strings};
597             foreach my $key (sort keys %$keys_ref) {
598             my $val = $keys_ref->{$key};
599             $val = '' if !defined $val;
600             push (@ret, "$key: $val\n");
601             }
602             return (wantarray ? @ret : join('', @ret));
603 16     16 1 46 }
604              
605 16 50       68 # ---------------------------------------------------------------------------
606 0         0  
  0         0  
607             =item finish_metadata()
608 16         28  
609 16         43 Destroys the metadata for this message. Once a message has been
610 16         128 scanned fully, the metadata is no longer required. Destroying
611 64         149 this will free up some memory.
612 64 50       134  
613 64         231 =cut
614              
615 16 50       103 my ($self) = @_;
616             if (defined ($self->{metadata})) {
617             $self->{metadata}->finish();
618             delete $self->{metadata};
619             }
620             }
621              
622             =item finish()
623              
624             Clean up an object so that it can be destroyed.
625              
626             =cut
627              
628             my ($self) = @_;
629 124     124 1 239  
630 124 50       406 # Clean ourself up
631 124         585 $self->finish_metadata();
632 124         387  
633             # These will only be in the root Message node
634             delete $self->{'mime_boundary_state'};
635             delete $self->{'mbox_sep'};
636             delete $self->{'normalize'};
637             delete $self->{'body_part_scan_size'};
638             delete $self->{'rawbody_part_scan_size'};
639             delete $self->{'pristine_msg'};
640             delete $self->{'pristine_body'};
641             delete $self->{'pristine_headers'};
642             delete $self->{'line_ending'};
643 92     92 1 7411 delete $self->{'missing_head_body_separator'};
644              
645             # Remove the queue variable, in case the body has not been parsed
646 92         315 delete $self->{'parse_queue'};
647              
648             my @toclean = ( $self );
649 92         199  
650 92         167 # Go ahead and clean up all of the Message::Node parts
651 92         183 while (my $part = shift @toclean) {
652 92         149 # bug 5557: windows requires tmp file be closed before it can be rm'd
653 92         144 if (ref $part->{'raw'} eq 'GLOB') {
654 92         213 close($part->{'raw'}) or die "error closing input file: $!";
655 92         192 }
656 92         181  
657 92         172 # bug 5858: avoid memory leak with deep MIME structure
658 92         139 if (defined ($part->{metadata})) {
659             $part->{metadata}->finish();
660             delete $part->{metadata};
661 92         135 }
662              
663 92         177 delete $part->{'headers'};
664             delete $part->{'raw_headers'};
665             delete $part->{'header_order'};
666 92         295 delete $part->{'raw'};
667             delete $part->{'decoded'};
668 138 100       391 delete $part->{'rendered'};
669 20 50       244 delete $part->{'visible_rendered'};
670             delete $part->{'invisible_rendered'};
671             delete $part->{'type'};
672             delete $part->{'rendered_type'};
673 138 100       315  
674 4         12 # if there are children nodes, add them to the queue of nodes to clean up
675 4         8 if (exists $part->{'body_parts'}) {
676             push(@toclean, @{$part->{'body_parts'}});
677             delete $part->{'body_parts'};
678 138         524 }
679 138         484 }
680 138         333  
681 138         333 # delete temporary files
682 138         289 if ($self->{'tmpfiles'}) {
683 138         225 for my $fn (@{$self->{'tmpfiles'}}) {
684 138         208 unlink($fn) or warn "cannot unlink $fn: $!";
685 138         238 }
686 138         206 delete $self->{'tmpfiles'};
687 138         184 }
688             }
689              
690 138 100       528 # also use a DESTROY method, just to ensure (as much as possible) that
691 23         26 # temporary files are deleted even if the finish() method is omitted
  23         41  
692 23         62 my $self = shift;
693             # best practices: prevent potential calls to eval and to system routines
694             # in code of a DESTROY method from clobbering global variables $@ and $!
695             local($@,$!); # keep outer error handling unaffected by DESTROY
696             if ($self->{'tmpfiles'}) {
697 92 50       259 for my $fn (@{$self->{'tmpfiles'}}) {
698 92         131 unlink($fn) or dbg("message: cannot unlink $fn: $!");
  92         342  
699 20 50       746 }
700             }
701 92         355 }
702              
703             # ---------------------------------------------------------------------------
704              
705             =item receive_date()
706              
707             Return a time_t value with the received date of the current message,
708 128     128   633 or current time if received time couldn't be determined.
709              
710             =cut
711 128         956  
712 128 100       2024 my($self) = @_;
713 36         68  
  36         1757  
714 0 0       0 return Mail::SpamAssassin::Util::receive_date(scalar $self->get_all_headers(0,1));
715             }
716              
717             # ---------------------------------------------------------------------------
718              
719             =back
720              
721             =head1 PARSING METHODS, NON-PUBLIC
722              
723             These methods take a RFC2822-esque formatted message and create a tree
724             with all of the MIME body parts included. Those parts will be decoded
725             as necessary, and text/html parts will be rendered into a standard text
726             format, suitable for use in SpamAssassin.
727              
728             =over 4
729 10     10 1 32  
730             =item parse_body()
731 10         68  
732             parse_body() passes the body part that was passed in onto the
733             correct part parser, either _parse_multipart() for multipart/* parts,
734             or _parse_normal() for everything else. Multipart sections become the
735             root of sub-trees, while everything else becomes a leaf in the tree.
736              
737             For multipart messages, the first call to parse_body() doesn't create a
738             new sub-tree and just uses the parent node to contain children. All other
739             calls to parse_body() will cause a new sub-tree root to be created and
740             children will exist underneath that root. (this is just so the tree
741             doesn't have a root node which points at the actual root node ...)
742              
743             =cut
744              
745             my($self) = @_;
746              
747             # This shouldn't happen, but just in case, abort.
748             return unless (exists $self->{'parse_queue'});
749              
750             dbg("message: ---- MIME PARSER START ----");
751              
752             while (my $toparse = shift @{$self->{'parse_queue'}}) {
753             # multipart sections are required to have a boundary set ... If this
754             # one doesn't, assume it's malformed and send it to be parsed as a
755             # non-multipart section
756             #
757             my ($msg, $boundary, $body, $subparse) = @$toparse;
758              
759             if ($msg->{'type'} =~ m{^multipart/}i && defined $boundary && $subparse > 0) {
760             $self->_parse_multipart($toparse);
761             }
762             else {
763 138     138 1 334 # If it's not multipart, go ahead and just deal with it.
764             $self->_parse_normal($toparse);
765              
766 138 50       376 # bug 5041: process message/*, but exclude message/partial content types
767             if ($msg->{'type'} =~ m{^message/(?!partial\z)}i && $subparse > 0)
768 138         414 {
769             # Just decode the part, but we don't need the resulting string here.
770 138         260 $msg->decode(0);
  326         1177  
771              
772             # bug 7125: decode and parse only message/rfc822 or message/global,
773             # but do not treat other message/* content types (like the ones listed
774             # here) as a message consisting of a header and a body, as they are not:
775 188         481 # message/delivery-status, message/global-delivery-status,
776             # message/feedback-report, message/global-headers,
777 188 100 66     1176 # message/global-disposition-notification,
      66        
778 21         105 # message/disposition-notification, (and message/partial)
779              
780             # bug 5051, bug 3748: check $msg->{decoded}: sometimes message/* parts
781             # have no content, and we get stuck waiting for STDIN, which is bad. :(
782 167         811  
783             if ($msg->{'type'} =~ m{^message/(?:rfc822|global)\z}i &&
784             defined $msg->{'decoded'} && $msg->{'decoded'} ne '')
785 167 100 66     1091 {
786             # Ok, so this part is still semi-recursive, since M::SA::Message
787             # calls M::SA::Message, but we don't subparse the new message,
788 4         17 # and pull a sneaky "steal our child's queue" maneuver to deal
789             # with it on our own time. Reference the decoded array directly
790             # since it's faster.
791             #
792             my $msg_obj = Mail::SpamAssassin::Message->new({
793             message => $msg->{'decoded'},
794             parsenow => 0,
795             normalize => $self->{normalize},
796             subparse => $subparse - 1,
797             });
798              
799             # Add the new message to the current node
800             $msg->add_body_part($msg_obj);
801 4 50 33     40  
      33        
802             # now this is the sneaky bit ... steal the sub-message's parse_queue
803             # and add it to ours. then we'll handle the sub-message in our
804             # normal loop and get all the glory. muhaha. :)
805             push(@{$self->{'parse_queue'}}, @{$msg_obj->{'parse_queue'}});
806             delete $msg_obj->{'parse_queue'};
807              
808             # Ok, we've subparsed, so go ahead and remove the raw and decoded
809             # data because we won't need them anymore (the tree under this part
810             # will have that data)
811             if (ref $msg->{'raw'} eq 'GLOB') {
812             # Make sure we close it if it's a temp file -- Bug 5166
813             close($msg->{'raw'})
814 4         36 or die "error closing input file: $!";
815             }
816              
817             delete $msg->{'raw'};
818 4         18
819             delete $msg->{'decoded'};
820             }
821             }
822             }
823 4         5 }
  4         6  
  4         6  
824 4         8  
825             dbg("message: ---- MIME PARSER END ----");
826              
827             # we're done parsing, so remove the queue variable
828             delete $self->{'parse_queue'};
829 4 50       11 }
830              
831 0 0       0 =item _parse_multipart()
832              
833             Generate a root node, and for each child part call parse_body()
834             to generate the tree.
835 4         5  
836             =cut
837 4         73  
838             my($self, $toparse) = @_;
839              
840             my ($msg, $boundary, $body, $subparse) = @{$toparse};
841              
842             # we're not supposed to be a leaf, so prep ourselves
843 138         413 $msg->{'body_parts'} = [];
844              
845             # the next set of objects will be one level deeper
846 138         420 $subparse--;
847              
848             dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));
849              
850             # NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
851             # compliant, but also allow possible malformations to still work. Please
852             # see Bugzilla bug 3749 for more information before making any changes!
853              
854             # ignore preamble per RFC 1521, unless there's no boundary ...
855             if ( defined $boundary ) {
856             my $line;
857 21     21   38 my $tmp_line = @{$body};
858             for ($line=0; $line < $tmp_line; $line++) {
859 21         26 # dbg("message: multipart line $line: \"" . $body->[$line] . "\"");
  21         45  
860             # specifically look for an opening boundary
861             if (substr($body->[$line],0,2) eq '--' # triage
862 21         49 && $body->[$line] =~ /^--\Q$boundary\E\s*$/) {
863             # Make note that we found the opening boundary
864             $self->{mime_boundary_state}->{$boundary} = 1;
865 21         29  
866             # if the line after the opening boundary isn't a header, flag it.
867 21 50       99 # we need to make sure that there's actually another line though.
868             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
869             if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
870             $self->{'missing_mime_headers'} = 1;
871             }
872              
873             last;
874 21 50       49 }
875 21         30 }
876 21         26  
  21         39  
877 21         50 # Found a boundary, ignore the preamble
878             if ( $line < $tmp_line ) {
879             splice @{$body}, 0, $line+1;
880 121 100 100     710 }
881              
882             # Else, there's no boundary, so leave the whole part...
883 20         72 }
884              
885             # prepare a new tree node
886             my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
887             my $in_body = 0;
888 20 100 66     132 my $header;
889 5         9 my $part_array;
890             my $found_end_boundary;
891             my $partcnt = 0;
892 20         61  
893             my $line_count = @{$body};
894             foreach ( @{$body} ) {
895             # if we're on the last body line, or we find any boundary marker,
896             # deal with the mime part;
897 21 100       40 # a triage before an unlikely-to-match regexp avoids a CPU hotspot
898 20         26 $found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
  20         51  
899             && /^--\Q$boundary\E(?:--)?\s*$/;
900             if ( --$line_count == 0 || $found_end_boundary ) {
901             my $line = $_; # remember the last line
902              
903             # If at last line and no end boundary found, the line belongs to body
904             # TODO:
905 21         134 # Is $self->{mime_boundary_state}->{$boundary}-- needed here?
906 21         45 # Could "missing end boundary" be a useful rule? Mark it somewhere?
907 21         70 # If SA processed truncated message from amavis etc, this could also
908             # be hit legimately..
909 21         0 if (!$found_end_boundary) {
910 21         26 # TODO: This is duplicate code from few pages down below..
911             while (length ($_) > MAX_BODY_LINE_LENGTH) {
912 21         29 push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
  21         31  
913 21         28 substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
  21         37  
914             }
915             push ( @{$part_array}, $_ );
916             }
917 2364   100     8443 # per rfc 1521, the CRLF before the boundary is part of the boundary:
918             # NOTE: The CRLF preceding the encapsulation line is conceptually
919 2364 100 100     5302 # attached to the boundary so that it is possible to have a part
920 46         96 # that does not end with a CRLF (line break). Body parts that must
921             # be considered to end with line breaks, therefore, must have two
922             # CRLFs preceding the encapsulation line, the first of which is part
923             # of the preceding body part, and the second of which is part of the
924             # encapsulation boundary.
925             elsif ($part_array) {
926             chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
927             splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
928 46 50       121 }
    50          
929             else {
930 0         0 # Invalid parts can have no body, so fake in a blank body
931 0         0 # in that case.
  0         0  
932 0         0 $part_array = [];
933             }
934 0         0  
  0         0  
935             my($p_boundary);
936             ($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
937             $p_boundary ||= $boundary;
938             dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
939              
940             # we've created a new node object, so add it to the queue along with the
941             # text that belongs to that part, then add the new part to the current
942             # node to create the tree.
943             push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
944             $msg->add_body_part($part_msg);
945 46         107  
946 46 100       125 # rfc 1521 says /^--boundary--$/, some MUAs may just require /^--boundary--/
  26         70  
947             # but this causes problems with horizontal lines when the boundary is
948             # made up of dashes as well, etc.
949             if (defined $boundary) {
950             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
951 0         0 if ($line =~ /^--\Q${boundary}\E--\s*$/) {
952             # Make a note that we've seen the end boundary
953             $self->{mime_boundary_state}->{$boundary}--;
954 46         68 last;
955 46         165 }
956 46   66     228 elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
957 46 50       260 # if we aren't on an end boundary and there are still lines left, it
958             # means we hit a new start boundary. therefore, the next line ought
959             # to be a mime header. if it's not, mark it.
960             $self->{'missing_mime_headers'} = 1;
961             }
962 46         71 }
  46         145  
963 46         187  
964             # Maximum parts to process, simply skip the rest of the parts
965             if (++$partcnt == 1000) {
966             dbg("message: mimepart limit exceeded, stopping parsing");
967             $self->{'mimepart_limit_exceeded'} = 1;
968 46 50       140 return;
969             }
970 46 100 33     716  
    50          
971             # make sure we start with a new clean node
972 21         64 $in_body = 0;
973 21         89 $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
974             undef $part_array;
975             undef $header;
976              
977             next;
978             }
979 0         0  
980             if (!$in_body) {
981             # s/\s+$//; # bug 5127: don't clean this up (yet)
982             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
983             if (/^[\041-\071\073-\176]+[ \t]*:/) {
984 25 50       76 if ($header) {
985 0         0 my ( $key, $value ) = split ( /:\s*/, $header, 2 );
986 0         0 $part_msg->header( $key, $value );
987 0         0 }
988             $header = $_;
989             next;
990             }
991 25         30 elsif (/^[ \t]/ && $header) {
992 25         93 # $_ =~ s/^\s*//; # bug 5127, again
993 25         44 $header .= $_;
994 25         38 next;
995             }
996 25         75 else {
997             if ($header) {
998             my ( $key, $value ) = split ( /:\s*/, $header, 2 );
999 2318 100       3086 $part_msg->header( $key, $value );
1000             }
1001             $in_body = 1;
1002 138 100 66     438  
    100          
1003 91 100       186 # if there's a blank line separator, that's good. if there isn't,
1004 51         232 # it's a body line, so drop through.
1005 51         145 if (/^\r?$/) {
1006             next;
1007 91         142 }
1008 91         199 else {
1009             $self->{'missing_mime_head_body_separator'} = 1;
1010             }
1011             }
1012 1         4 }
1013 1         2  
1014             # we run into a perl bug if the lines are astronomically long (probably
1015             # due to lots of regexp backtracking); so split any individual line
1016 46 100       132 # over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML
1017 40         243 # totally -- but IMHO the only reason a luser would use
1018 40         122 # MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
1019             while (length ($_) > MAX_BODY_LINE_LENGTH) {
1020 46         62 push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
1021             substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
1022             }
1023             push ( @{$part_array}, $_ );
1024 46 100       140 }
1025 45         103  
1026             # Look for a message epilogue
1027             # originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB
1028 1         17 # ham FPs were all "." on a line by itself.
1029             # spams seem to only have NULL chars afterwards ?
1030             if ($line_count) {
1031             for(; $line_count > 0; $line_count--) {
1032             if ($body->[-$line_count] =~ /[^\s.]/) {
1033             $self->{mime_epilogue_exists} = 1;
1034             last;
1035             }
1036             }
1037             }
1038 2181         3731  
1039 0         0 }
  0         0  
1040 0         0  
1041             =item _parse_normal()
1042 2181         2195  
  2181         4976  
1043             Generate a leaf node and add it to the parent.
1044              
1045             =cut
1046              
1047             my($self, $toparse) = @_;
1048              
1049 21 100       159 my ($msg, $boundary, $body) = @{$toparse};
1050 8         19  
1051 15 100       92 dbg("message: parsing normal part");
1052 2         5  
1053 2         12 # 0: content-type, 1: boundary, 2: charset, 3: filename
1054             my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
1055              
1056             # multipart sections are required to have a boundary set ... If this
1057             # one doesn't, assume it's malformed and revert to text/plain
1058             $msg->{'type'} = ($ct[0] !~ m@^multipart/@i || defined $boundary ) ? $ct[0] : 'text/plain';
1059             $msg->{'charset'} = $ct[2];
1060              
1061             # attempt to figure out a name for this attachment if there is one ...
1062             my $disp = $msg->header('content-disposition') || '';
1063             if ($disp =~ /name=\s*"?([^";]+)"?/i) {
1064             $msg->{'name'} = $1;
1065             }
1066             elsif ($ct[3]) {
1067 167     167   424 $msg->{'name'} = $ct[3];
1068             }
1069 167         343 if ($msg->{'name'}) {
  167         412  
1070             $msg->{'name'} = Encode::decode("MIME-Header", $msg->{'name'});
1071 167         499 }
1072              
1073             $msg->{'boundary'} = $boundary;
1074 167         499  
1075             # If the part type is not one that we're likely to want to use, go
1076             # ahead and write the part data out to a temp file -- why keep sucking
1077             # up RAM with something we're not going to use?
1078 167 50 33     1060 #
1079 167         519 if ($msg->{'type'} !~ m@^(?:text/(?:plain|html)$|message\b)@) {
1080             my($filepath, $fh);
1081             eval {
1082 167   100     526 ($filepath, $fh) = Mail::SpamAssassin::Util::secure_tmpfile(); 1;
1083 167 100       659 } or do {
    100          
1084 1         4 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1085             info("message: failed to create a temp file: %s", $eval_stat);
1086             };
1087 2         5 if ($fh) {
1088             # The temp file was created, add it to the list of pending deletions
1089 167 100       471 # we cannot just delete immediately in the POSIX idiom, as this is
1090 3         12 # unportable (to win32 at least)
1091             push @{$self->{tmpfiles}}, $filepath;
1092             dbg("message: storing a message part to file %s", $filepath);
1093 167         5769 $fh->print(@{$body}) or die "error writing to $filepath: $!";
1094             $fh->flush or die "error writing (flush) to $filepath: $!";
1095             $msg->{'raw'} = $fh;
1096             }
1097             }
1098              
1099 167 100       1092 # if the part didn't get a temp file, go ahead and store the data in memory
1100 20         29 if (!defined $msg->{'raw'}) {
1101             dbg("message: storing a body to memory");
1102 20         43 $msg->{'raw'} = $body;
  20         45  
1103 20 50       23 }
1104 0 0       0 }
  0         0  
1105 0         0  
1106             # ---------------------------------------------------------------------------
1107 20 50       35  
1108             my ($self) = @_;
1109              
1110             if (!exists $self->{mimepart_digests}) {
1111 20         21 # traverse all parts which are leaves, recursively
  20         48  
1112 20         39 $self->{mimepart_digests} =
1113 20 50       24 [ map(sha1_hex($_->decode) . ':' . lc($_->{type}||''),
  20         72  
1114 20 50       999 $self->find_parts(qr/^/,1,1)) ];
1115 20         105 }
1116             return $self->{mimepart_digests};
1117             }
1118              
1119             # ---------------------------------------------------------------------------
1120 167 100       587  
1121 147         534 # common code for get_rendered_body_text_array,
1122 147         706 # get_visible_rendered_body_text_array, get_invisible_rendered_body_text_array
1123             #
1124             my ($self, $method_name) = @_;
1125              
1126             my $key = 'text_' . $method_name;
1127             if (exists $self->{$key}) { return $self->{$key} }
1128              
1129 0     0 0 0 $self->{$key} = [];
1130              
1131 0 0       0 my $scansize = $self->{body_part_scan_size};
1132              
1133             # Find all parts which are leaves
1134 0   0     0 my @parts = $self->find_parts(qr/./,1);
1135             return $self->{$key} unless @parts;
1136              
1137 0         0 # the html metadata may have already been set, so let's not bother if it's
1138             # already been done.
1139             my $html_needs_setting = !exists $self->{metadata}->{html};
1140              
1141             my $text = $method_name eq 'invisible_rendered' ? ''
1142             : ($self->get_header('subject') || "\n");
1143              
1144             # Go through each part
1145             for (my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1146 388     388 0 805 my $p = $parts[$pt];
1147              
1148 388         863 # put a blank line between parts ...
1149 388 100       912 $text .= "\n" if $text ne '';
  247         746  
1150              
1151 141         480 my($type, $rnd) = $p->$method_name(); # decode this part
1152             # Only text/* types are rendered ...
1153 141         378 if (defined $rnd) {
1154             # Truncate to body_part_scan_size
1155             if ($scansize && length($rnd) > $scansize) {
1156 141         1080 # Try truncating at boundary, nearest 1k block is fine
1157 141 50       521 # Newline first
1158             my $idx = index($rnd, "\n", $scansize);
1159             if ($idx >= 0 && $idx - $scansize <= 1024) {
1160             substr($rnd, $idx+1) = ''; # truncate
1161 141         505 }
1162             else {
1163 141 100 100     687 # Space is fine too for rendered
1164             $idx = index($rnd, " ", $scansize);
1165             if ($idx >= 0 && $idx - $scansize <= 1024) {
1166             substr($rnd, $idx+1) = ''; # truncate
1167 141         626 }
1168 143         294 else {
1169             substr($rnd, $scansize) = ''; # truncate
1170             }
1171 143 100       561 }
1172             }
1173 143         885  
1174             # Add to rendered text
1175 143 50       489 $text .= $rnd;
1176              
1177 143 100 100     1004 # TVD - if there are multiple parts, what should we do?
1178             # right now, just use the last one. we may need to give some priority
1179             # at some point, ie: use text/html rendered if it exists, or
1180 8         31 # text/plain rendered as html otherwise.
1181 8 100 100     33 if ($html_needs_setting && $type eq 'text/html') {
1182 2         19 $self->{metadata}->{html} = $p->{html_results};
1183             }
1184             }
1185             }
1186 6         14  
1187 6 100 66     19 # whitespace handling (warning: small changes have large effects!)
1188 1         5 $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1189             # $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space
1190             $text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space
1191 5         14 $text =~ tr/\f/\n/; # form feeds => newline
1192              
1193             my @textary = split_into_array_of_short_lines($text);
1194             $self->{$key} = \@textary;
1195              
1196             return $self->{$key};
1197 143         605 }
1198              
1199             # ---------------------------------------------------------------------------
1200              
1201             my ($self) = @_;
1202             return $self->get_body_text_array_common('rendered');
1203 143 100 100     1157 }
1204 8         52  
1205             my ($self) = @_;
1206             return $self->get_body_text_array_common('visible_rendered');
1207             }
1208              
1209             my ($self) = @_;
1210 141         1693 return $self->get_body_text_array_common('invisible_rendered');
1211             }
1212 141         733  
1213 141         439 # ---------------------------------------------------------------------------
1214              
1215 141         667 my ($self) = @_;
1216 141         566  
1217             if (defined $self->{text_decoded}) { return $self->{text_decoded}; }
1218 141         753 $self->{text_decoded} = [ ];
1219              
1220             my $scansize = $self->{rawbody_part_scan_size};
1221              
1222             # Find all parts which are leaves
1223             my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
1224 340     340 0 657 return $self->{text_decoded} unless @parts;
1225 340         917  
1226             # Go through each part
1227             for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1228             # bug 4843: skip text/calendar parts since they're usually an attachment
1229 24     24 0 57 # and not displayed
1230 24         104 next if ($parts[$pt]->{'type'} eq 'text/calendar');
1231              
1232             my $rawbody = $parts[$pt]->decode();
1233             my $rawlen = length($rawbody);
1234 24     24 0 51  
1235 24         56 # Truncate to rawbody_part_scan_size
1236             if ($scansize && $rawlen > $scansize) {
1237             # Try truncating at boundary, nearest 1k block is fine
1238             # Newline first
1239             my $idx = index($rawbody, "\n", $scansize);
1240             if ($idx >= 0 && $idx - $scansize <= 1024) {
1241 110     110 0 233 substr($rawbody, $idx+1) = ''; # truncate
1242             }
1243 110 100       282 else {
  2         21  
1244 108         345 # Try > next, might be html
1245             $idx = index($rawbody, '>', $scansize);
1246 108         218 if ($idx >= 0 && $idx - $scansize <= 1024) {
1247             substr($rawbody, $idx+1) = ''; # truncate
1248             }
1249 108         543 else {
1250 108 50       406 # Space as last effort
1251             $idx = index($rawbody, ' ', $scansize);
1252             if ($idx >= 0 && $idx - $scansize <= 1024) {
1253 108         383 substr($rawbody, $idx+1) = ''; # truncate
1254             }
1255             else {
1256 108 50       336 substr($rawbody, $scansize) = ''; # truncate
1257             }
1258 108         313 }
1259 108         475 }
1260             }
1261              
1262 108 100 100     742 push(@{$self->{text_decoded}},
1263             split_into_array_of_short_paragraphs($rawbody));
1264             }
1265 8         37  
1266 8 100 100     37 return $self->{text_decoded};
1267 2         6 }
1268              
1269             # ---------------------------------------------------------------------------
1270              
1271 6         15 my @result;
1272 6 100 66     19 foreach my $line (split (/^/m, $_[0])) {
1273 1         5 while (length ($line) > MAX_BODY_LINE_LENGTH) {
1274             # try splitting "nicely" so that we don't chop a url in half or
1275             # something. if there's no space, then just split at max length.
1276             my $length = rindex($line, ' ', MAX_BODY_LINE_LENGTH) + 1;
1277 5         12 $length ||= MAX_BODY_LINE_LENGTH;
1278 5 50 33     13 push (@result, substr($line, 0, $length, ''));
1279 0         0 }
1280             push (@result, $line);
1281             }
1282 5         16 @result;
1283             }
1284              
1285             # ---------------------------------------------------------------------------
1286              
1287             # Split a text into array of paragraphs of sizes between
1288 108         205 # 1-2 * MAX_BODY_LINE_LENGTH, returning the resulting array.
  108         410  
1289             # Mainly used for rawbody splitting purposes
1290              
1291             my $text_l = length($_[0]);
1292 108         427 return ($_[0]) if $text_l <= MAX_BODY_LINE_LENGTH;
1293             my(@result,$j,$ofs);
1294             # Try to split in order \n, > (html?), space
1295             for ($ofs = 0; $text_l - $ofs > 2 * MAX_BODY_LINE_LENGTH; $ofs = $j+1) {
1296             $j = index($_[0], "\n", $ofs + MAX_BODY_LINE_LENGTH);
1297             if ($j < 0 || $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1298 141     141 0 292 $j = index($_[0], ">", $ofs + MAX_BODY_LINE_LENGTH);
1299 141         1077 if ($j < 0 || $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1300 515         1429 $j = index($_[0], " ", $ofs + MAX_BODY_LINE_LENGTH);
1301             if ($j < 0 || $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1302             $j = $ofs + MAX_BODY_LINE_LENGTH;
1303 1         12 }
1304 1   50     5 }
1305 1         8 }
1306             push(@result, substr($_[0], $ofs, $j-$ofs+1));
1307 515         1272 }
1308             push(@result, substr($_[0], $ofs)) if $ofs < $text_l;
1309 141         789 @result;
1310             }
1311              
1312             # ---------------------------------------------------------------------------
1313              
1314             1;
1315              
1316             =back
1317              
1318             =cut