File Coverage

blib/lib/Mail/SpamAssassin/Message.pm
Criterion Covered Total %
statement 389 454 85.6
branch 163 232 70.2
condition 50 78 64.1
subroutine 33 36 91.6
pod 16 24 66.6
total 651 824 79.0


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Message - decode, render, and hold an RFC-2822 message
21              
22             =head1 DESCRIPTION
23              
24             This module encapsulates an email message and allows access to the various MIME
25             message parts and message metadata.
26              
27             The message structure, after initiating a parse() cycle, looks like this:
28              
29             Message object, also top-level node in Message::Node tree
30             |
31             +---> Message::Node for other parts in MIME structure
32             | |---> [ more Message::Node parts ... ]
33             | [ others ... ]
34             |
35             +---> Message::Metadata object to hold metadata
36              
37             =head1 PUBLIC METHODS
38              
39             =over 4
40              
41             =cut
42              
43             package Mail::SpamAssassin::Message;
44              
45 40     40   300 use strict;
  40         92  
  40         1298  
46 40     40   248 use warnings;
  40         83  
  40         1427  
47 40     40   247 use re 'taint';
  40         79  
  40         3522  
48              
49             BEGIN {
50 40         21803 eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
  40         118978  
  40         1314  
51 40 50   40   152 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
  0         0  
  0         0  
52             }
53              
54 40     40   297 use Mail::SpamAssassin;
  40         85  
  40         957  
55 40     40   13476 use Mail::SpamAssassin::Message::Node;
  40         138  
  40         1699  
56 40     40   14337 use Mail::SpamAssassin::Message::Metadata;
  40         779  
  40         4456  
57 40     40   351 use Mail::SpamAssassin::Constants qw(:sa);
  40         130  
  40         8033  
58 40     40   363 use Mail::SpamAssassin::Logger;
  40         150  
  40         225146  
59              
60             our @ISA = qw(Mail::SpamAssassin::Message::Node);
61              
62             # ---------------------------------------------------------------------------
63              
64             =item new()
65              
66             Creates a Mail::SpamAssassin::Message object. Takes a hash reference
67             as a parameter. The used hash key/value pairs are as follows:
68              
69             C<message> is either undef (which will use STDIN), a scalar - a string
70             containing an entire message, a reference to such string, an array reference
71             of the message with one line per array element, or either a file glob
72             or an IO::File object which holds the entire contents of the message.
73              
74             Note: The message is expected to generally be in RFC 2822 format, optionally
75             including an mbox message separator line (the "From " line) as the first line.
76              
77             C<parse_now> specifies whether or not to create the MIME tree
78             at object-creation time or later as necessary.
79              
80             The I<parse_now> option, by default, is set to false (0).
81             This allows SpamAssassin to not have to generate the tree of
82             Mail::SpamAssassin::Message::Node objects and their related data if the
83             tree is not going to be used. This is handy, for instance, when running
84             C<spamassassin -d>, which only needs the pristine header and body which
85             is always handled when the object is created.
86              
87             C<subparse> specifies how many MIME recursion levels should be parsed.
88             Defaults to 20.
89              
90             =cut
91              
92             # month mappings (ripped from Util.pm)
93             my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
94             jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);
95              
96             # day of week mapping (starting from zero)
97             my @DAY_OF_WEEK = qw/Sun Mon Tue Wed Thu Fri Sat/ ;
98              
99             sub new {
100 137     137 1 420 my $class = shift;
101 137   33     971 $class = ref($class) || $class;
102              
103 137         413 my($opts) = @_;
104 137 50       596 my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
105 137   100     742 my $parsenow = $opts->{'parsenow'} || 0;
106 137   50     1039 my $normalize = $opts->{'normalize'} || 0;
107              
108             # Specifies whether or not to parse message/rfc822 parts into its own tree.
109             # If the # > 0, it'll subparse, otherwise it won't. By default, do twenty
110             # levels deep.
111 137 100       540 my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;
112              
113 137         1770 my $self = $class->SUPER::new({normalize=>$normalize});
114              
115 137         658 $self->{tmpfiles} = [];
116 137         626 $self->{pristine_headers} = '';
117 137         551 $self->{pristine_body} = '';
118 137         654 $self->{mime_boundary_state} = {};
119 137         658 $self->{line_ending} = "\012";
120 137         404 $self->{master_deadline} = $opts->{'master_deadline'};
121 137         339 $self->{suppl_attrib} = $opts->{'suppl_attrib'};
122              
123 137 100       520 if ($self->{suppl_attrib}) { # caller-provided additional information
124             # pristine_body_length is currently used by an eval test check_body_length
125             # Possible To-Do: Base the length on the @message array later down?
126 49 50       252 if (defined $self->{suppl_attrib}{body_size}) {
127             # Optional info provided by a caller; should reflect the original
128             # message body size if provided, and as such it may differ from the
129             # $self->{pristine_body} size, e.g. when the caller passed a truncated
130             # message to SpamAssassin, or when counting line-endings differently.
131 0         0 $self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
132             }
133 49 50       235 if (ref $self->{suppl_attrib}{mimepart_digests}) {
134             # Optional info provided by a caller: an array of digest codes (e.g. SHA1)
135             # of each MIME part. Should reflect the original message if provided.
136             # As such it may differ from digests calculated by get_mimepart_digests(),
137             # e.g. when the caller passed a truncated message to SpamAssassin.
138 0         0 $self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
139             }
140             }
141              
142 137         327 bless($self,$class);
143              
144             # create the metadata holder class
145 137         1793 $self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);
146              
147             # Ok, go ahead and do the message "parsing"
148              
149             # protect it from abuse ...
150 137         281 local $_;
151              
152             # Figure out how the message was passed to us, and deal with it.
153 137         298 my @message;
154 137 100 66     1147 if (ref $message eq 'ARRAY') {
    100          
    50          
    50          
    50          
155 63         139 @message = @{$message};
  63         349  
156             }
157             elsif (ref($message) eq 'GLOB' || ref($message) =~ /^IO::/) {
158 18 50       70 if (defined fileno $message) {
159              
160             # sysread+split avoids a Perl I/O bug (Bug 5985)
161             # and is faster than (<$message>) by 10..25 %
162             # (a drawback is a short-term double storage of a text in $raw_str)
163             #
164 18         29 my($nread,$raw_str); $raw_str = '';
  18         32  
165 18         621 while ( $nread=sysread($message, $raw_str, 16384, length $raw_str) ) { }
166 18 50       70 defined $nread or die "error reading: $!";
167 18         595 @message = split(/^/m, $raw_str, -1);
168              
169 18 50       89 if ($raw_str eq '') {
    50          
170 0         0 dbg("message: empty message read");
171             } elsif (length($raw_str) > 128*1024) {
172             # ditch rarely used large chunks of allocated memory, Bug 6514
173             # http://www.perlmonks.org/?node_id=803515
174             # about 97% of mail messages are below 128 kB,
175             # about 98% of mail messages are below 256 kB (2010 statistics)
176             # dbg("message: deallocating %.2f MB", length($raw_str)/1024/1024);
177 0         0 undef $raw_str;
178             }
179             }
180             }
181             elsif (ref $message eq 'SCALAR') {
182 0         0 @message = split(/^/m, $$message, -1);
183             }
184             elsif (ref $message) {
185 0         0 dbg("message: Input is a reference of unknown type!");
186             }
187             elsif (defined $message) {
188 56         1306 @message = split(/^/m, $message, -1);
189             }
190              
191             # Pull off mbox and mbx separators
192             # also deal with null messages
193 137 50       1618 if (!@message) {
    100          
    50          
194             # bug 4884:
195             # if we get here, it means that the input was null, so fake the message
196             # content as a single newline...
197 0         0 @message = ("\n");
198             } elsif ($message[0] =~ /^From\s+(?!:)/) {
199             # careful not to confuse with obsolete syntax which allowed WSP before ':'
200             # mbox formated mailbox
201 17         67 $self->{'mbox_sep'} = shift @message;
202             } elsif ($message[0] =~ MBX_SEPARATOR) {
203 0         0 $_ = shift @message;
204              
205             # Munge the mbx message separator into mbox format as a sort of
206             # de facto portability standard in SA's internals. We need to
207             # to this so that Mail::SpamAssassin::Util::parse_rfc822_date
208             # can parse the date string...
209 0 0       0 if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
210             # $1 = day of month
211             # $2 = month (text)
212             # $3 = year
213             # $4 = hour
214             # $5 = min
215             # $6 = sec
216 0         0 my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
217 0         0 my $address;
218 0         0 foreach (@message) {
219 0 0       0 if (/^From:[^<]*<([^>]+)>/) {
    0          
220 0         0 $address = $1;
221 0         0 last;
222             } elsif (/^From:\s*([^<> ]+)/) {
223 0         0 $address = $1;
224 0         0 last;
225             }
226             }
227 0         0 $self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
228             }
229             }
230              
231             # bug 4363
232             # Check to see if we should do CRLF instead of just LF
233             # For now, just check the first and last line and do whatever it does
234 137 50 33     1678 if (@message && ($message[0] =~ /\015\012/ || $message[-1] =~ /\015\012/)) {
      33        
235 0         0 $self->{line_ending} = "\015\012";
236 0         0 dbg("message: line ending changed to CRLF");
237             }
238              
239             # Is a CRLF -> LF line endings conversion necessary?
240 137         523 my $squash_crlf = $self->{line_ending} eq "\015\012";
241              
242             # Go through all the header fields of the message
243 137         250 my $hdr_errors = 0;
244 137         237 my $header;
245 137         329 for (;;) {
246             # make sure not to lose the last header field when there is no body
247 1056         1540 my $eof = !@message;
248 1056 100       1934 my $current = $eof ? "\n" : shift @message;
249              
250 1056 100       2648 if ( $current =~ /^[ \t]/ ) {
251             # This wasn't useful in terms of a rule, but we may want to treat it
252             # specially at some point. Perhaps ignore it?
253             #unless ($current =~ /\S/) {
254             # $self->{'obsolete_folding_whitespace'} = 1;
255             #}
256              
257 200 50       400 $header = '' if !defined $header; # header starts with a continuation!?
258 200         334 $header .= $current; # append continuations, no matter what
259 200         353 $self->{'pristine_headers'} .= $current;
260             }
261             else { # not a continuation
262             # Ok, there's a header here, let's go ahead and add it in.
263 856 100       1673 if (defined $header) { # deal with a previous header field
264 719         2197 my ($key, $value) = split (/:/s, $header, 2);
265              
266             # If it's not a valid header (aka: not in the form "foo:bar"), skip it.
267 719 100       1350 if (defined $value) {
268             # CRLF -> LF line-endings conversion if necessary
269 713 50       1168 $value =~ s/\015\012/\012/sg if $squash_crlf;
270 713         1332 $key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn
271             # limit the length of the pairs we store
272 713 50       1459 if (length($key) > MAX_HEADER_KEY_LENGTH) {
273 0         0 $key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
274 0         0 $self->{'truncated_header'} = 1;
275             }
276 713 50       1260 if (length($value) > MAX_HEADER_VALUE_LENGTH) {
277 0         0 $value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
278 0         0 $self->{'truncated_header'} = 1;
279             }
280 713         2215 $self->header($key, $value);
281             }
282             }
283              
284 856 100       3004 if ($current =~ /^\r?$/) { # a regular end of a header section
    100          
285 132 100       435 if ($eof) {
286 1         10 $self->{'missing_head_body_separator'} = 1;
287             } else {
288 131         343 $self->{'pristine_headers'} .= $current;
289             }
290 132         285 last;
291             }
292             elsif ($current =~ /^--/) { # mime boundary encountered, bail out
293 4         11 $self->{'missing_head_body_separator'} = 1;
294 4         10 unshift(@message, $current);
295 4         7 last;
296             }
297             # should we assume entering a body on encountering invalid header field?
298             else {
299             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
300 720 100       2401 if ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
301             # A field name MUST be composed of printable US-ASCII characters
302             # (i.e., characters that have values between 33 (041) and 126 (176),
303             # inclusive), except colon (072). Obsolete header field syntax
304             # allowed WSP before a colon.
305 7 100       13 if (++$hdr_errors <= 3) {
306             # just consume but ignore a few invalid header fields
307             } else { # enough is enough...
308 1         3 $self->{'missing_head_body_separator'} = 1;
309 1         2 unshift(@message, $current);
310 1         3 last;
311             }
312             }
313             }
314              
315             # start collecting a new header field
316 719         1087 $header = $current;
317 719         1785 $self->{'pristine_headers'} .= $current;
318             }
319             }
320 137         289 undef $header;
321              
322             # Store the pristine body for later -- store as a copy since @message
323             # will get modified below
324 137         1360 $self->{'pristine_body'} = join('', @message);
325              
326 137 50       590 if (!defined $self->{pristine_body_length}) {
327 137         645 $self->{'pristine_body_length'} = length $self->{'pristine_body'};
328             }
329              
330             # iterate over lines in reverse order
331             # merge multiple blank lines into a single one
332 137         282 my $start;
333 137         634 for (my $cnt=$#message; $cnt>=0; $cnt--) {
334             # CRLF -> LF line-endings conversion if necessary
335 4180 50       4907 $message[$cnt] =~ s/\015\012\z/\012/ if $squash_crlf;
336              
337             # line is blank
338 4180 100       6408 if ($message[$cnt] =~ /^\s*$/) {
339             # /^\s*$/ is about 5% faster then !/\S/, but still expensive here
340 414 100       698 if (!defined $start) {
341 346         409 $start=$cnt;
342             }
343 414 100       847 next unless $cnt == 0;
344             }
345              
346             # line is not blank, or we've reached the beginning
347              
348             # if we've got a series of blank lines, get rid of them
349 3801 100       6082 if (defined $start) {
350 346         394 my $max_blank_lines = 20;
351 346         418 my $num = $start-$cnt;
352 346 50       548 if ($num > $max_blank_lines) {
353 0         0 splice @message, $cnt+2, $num-$max_blank_lines;
354             }
355 346         554 undef $start;
356             }
357             }
358              
359             # Figure out the boundary
360 137         345 my ($boundary);
361 137         568 ($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
362 137         1012 dbg("message: main message type: ".$self->{'type'});
363              
364             # dbg("message: \$message[0]: \"" . $message[0] . "\"");
365              
366             # bug 6845: if main message type is multipart and the message body does not begin with
367             # either a blank line or the boundary (if defined), insert a blank line
368             # to ensure proper parsing - do not consider MIME headers at the beginning of the body
369             # to be part of the message headers.
370 137 100 66     942 if ($self->{'type'} =~ /^multipart\//i && $#message > 0 && $message[0] =~ /\S/)
      100        
371             {
372 14 100 66     290 if (!defined $boundary || $message[0] !~ /^--\Q$boundary\E/)
373             {
374 4         12 dbg("message: Inserting blank line at top of body to ensure correct multipart MIME parsing");
375 4         8 unshift(@message, "\012");
376             }
377             }
378              
379             # dbg("message: \$message[0]: \"" . $message[0] . "\"");
380             # dbg("message: \$message[1]: \"" . $message[1] . "\"");
381              
382             # parse queue, simple array of parts to parse:
383             # 0: part object, already in the tree
384             # 1: boundary used to focus body parsing
385             # 2: message content
386             # 3: how many MIME subparts to parse down
387             #
388 137         779 $self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];
389              
390             # If the message does need to get parsed, save off a copy of the body
391             # in a format we can easily parse later so we don't have to rip from
392             # pristine_body ... If we do want to parse now, go ahead and do so ...
393             #
394 137 100       494 if ($parsenow) {
395 101         558 $self->parse_body();
396             }
397              
398 137         695 $self;
399             }
400              
401             # ---------------------------------------------------------------------------
402              
403             =item find_parts()
404              
405             Used to search the tree for specific MIME parts. See
406             I<Mail::SpamAssassin::Message::Node> for more details.
407              
408             =cut
409              
410             # Used to find any MIME parts whose simple content-type matches a given regexp
411             # Searches it's own and any children parts. Returns an array of MIME
412             # objects which match.
413             #
414             sub find_parts {
415 231     231 1 3125 my $self = shift;
416              
417             # ok, we need to do the parsing now...
418 231 100       872 $self->parse_body() if (exists $self->{'parse_queue'});
419              
420             # and pass through to the Message::Node version of the method
421 231         1432 return $self->SUPER::find_parts(@_);
422             }
423              
424             # ---------------------------------------------------------------------------
425              
426             =item get_pristine_header()
427              
428             Returns pristine headers of the message. If no specific header name
429             is given as a parameter (case-insensitive), then all headers will be
430             returned as a scalar, including the blank line at the end of the headers.
431              
432             If called in an array context, an array will be returned with each
433             specific header in a different element. In a scalar context, the last
434             specific header is returned.
435              
436             ie: If 'Subject' is specified as the header, and there are 2 Subject
437             headers in a message, the last/bottom one in the message is returned in
438             scalar context or both are returned in array context.
439              
440             Btw, returning the last header field (not the first) happens to be consistent
441             with DKIM signatures, which search for and cover multiple header fields
442             bottom-up according to the 'h' tag. Let's keep it this way.
443              
444             Note: the returned header will include the ending newline and any embedded
445             whitespace folding.
446              
447             =cut
448              
449             sub get_pristine_header {
450 55     55 1 163 my ($self, $hdr) = @_;
451            
452 55 100 66     512 return $self->{pristine_headers} if !defined $hdr || $hdr eq '';
453             my(@ret) =
454 12         361 $self->{pristine_headers} =~ /^\Q$hdr\E[ \t]*:[ \t]*(.*?\n(?![ \t]))/smgi;
455             # taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)
456 12 100       51 if (!@ret) {
    50          
457 2         20 return $self->get_header($hdr);
458             } elsif (wantarray) {
459 0         0 return @ret;
460             } else {
461 10         47 return $ret[-1];
462             }
463             }
464              
465             =item get_mbox_separator()
466              
467             Returns the mbox separator found in the message, or undef if there
468             wasn't one.
469              
470             =cut
471              
472             sub get_mbox_separator {
473 34     34 1 211 return $_[0]->{mbox_sep};
474             }
475              
476             =item get_body()
477              
478             Returns an array of the pristine message body, one line per array element.
479              
480             =cut
481              
482             sub get_body {
483 0     0 1 0 my ($self) = @_;
484 0         0 my @ret = split(/^/m, $self->{pristine_body});
485 0         0 return \@ret;
486             }
487              
488             # ---------------------------------------------------------------------------
489              
490             =item get_pristine()
491              
492             Returns a scalar of the entire pristine message.
493              
494             =cut
495              
496             sub get_pristine {
497 98     98 1 231 my ($self) = @_;
498 98         595 return $self->{pristine_headers} . $self->{pristine_body};
499             }
500              
501             =item get_pristine_body()
502              
503             Returns a scalar of the pristine message body.
504              
505             =cut
506              
507             sub get_pristine_body {
508 46     46 1 167 my ($self) = @_;
509 46         305 return $self->{pristine_body};
510             }
511              
512             # ---------------------------------------------------------------------------
513              
514             =item extract_message_metadata($permsgstatus)
515              
516             =cut
517              
518             sub extract_message_metadata {
519 116     116 1 409 my ($self, $permsgstatus) = @_;
520              
521             # do this only once per message, it can be expensive
522 116 100       460 return if $self->{already_extracted_metadata};
523 102         400 $self->{already_extracted_metadata} = 1;
524              
525 102         771 $self->{metadata}->extract ($self, $permsgstatus);
526             }
527              
528             # ---------------------------------------------------------------------------
529              
530             =item $str = get_metadata($hdr)
531              
532             =cut
533              
534             sub get_metadata {
535 2859     2859 1 4491 my ($self, $hdr) = @_;
536 2859 50       5083 if (!$self->{metadata}) {
537 0         0 warn "metadata: oops! get_metadata() called after finish_metadata()"; return;
  0         0  
538             }
539             # dbg("message: get_metadata - %s: %s", $hdr, defined $_ ? $_ : '<undef>')
540             # for $self->{metadata}->{strings}->{lc $hdr};
541              
542 2859         6669 $self->{metadata}->{strings}->{lc $hdr};
543             }
544              
545             =item put_metadata($hdr, $text)
546              
547             =cut
548              
549             sub put_metadata {
550 408     408 1 804 my ($self, $hdr, $text) = @_;
551 408 50       924 if (!$self->{metadata}) {
552 0         0 warn "metadata: oops! put_metadata() called after finish_metadata()"; return;
  0         0  
553             }
554             # dbg("message: put_metadata - %s: %s", $hdr, $text);
555 408         1434 $self->{metadata}->{strings}->{lc $hdr} = $text;
556             }
557              
558             =item delete_metadata($hdr)
559              
560             =cut
561              
562             sub delete_metadata {
563 0     0 1 0 my ($self, $hdr) = @_;
564 0 0       0 if (!$self->{metadata}) {
565 0         0 warn "metadata: oops! delete_metadata() called after finish_metadata()"; return;
  0         0  
566             }
567 0         0 delete $self->{metadata}->{strings}->{lc $hdr};
568             }
569              
570             =item $str = get_all_metadata()
571              
572             =cut
573              
574             sub get_all_metadata {
575 16     16 1 47 my ($self) = @_;
576              
577 16 50       67 if (!$self->{metadata}) {
578 0         0 warn "metadata: oops! get_all_metadata() called after finish_metadata()"; return;
  0         0  
579             }
580 16         30 my @ret;
581 16         38 my $keys_ref = $self->{metadata}->{strings};
582 16         123 foreach my $key (sort keys %$keys_ref) {
583 64         132 my $val = $keys_ref->{$key};
584 64 50       121 $val = '' if !defined $val;
585 64         169 push (@ret, "$key: $val\n");
586             }
587 16 50       93 return (wantarray ? @ret : join('', @ret));
588             }
589              
590             # ---------------------------------------------------------------------------
591              
592             =item finish_metadata()
593              
594             Destroys the metadata for this message. Once a message has been
595             scanned fully, the metadata is no longer required. Destroying
596             this will free up some memory.
597              
598             =cut
599              
600             sub finish_metadata {
601 112     112 1 259 my ($self) = @_;
602 112 50       430 if (defined ($self->{metadata})) {
603 112         832 $self->{metadata}->finish();
604 112         439 delete $self->{metadata};
605             }
606             }
607              
608             =item finish()
609              
610             Clean up an object so that it can be destroyed.
611              
612             =cut
613              
614             sub finish {
615 80     80 1 6610 my ($self) = @_;
616              
617             # Clean ourself up
618 80         342 $self->finish_metadata();
619              
620             # These will only be in the root Message node
621 80         224 delete $self->{'mime_boundary_state'};
622 80         438 delete $self->{'mbox_sep'};
623 80         195 delete $self->{'normalize'};
624 80         217 delete $self->{'pristine_body'};
625 80         191 delete $self->{'pristine_headers'};
626 80         192 delete $self->{'line_ending'};
627 80         151 delete $self->{'missing_head_body_separator'};
628              
629             # Remove the queue variable, in case the body has not been parsed
630 80         155 delete $self->{'parse_queue'};
631              
632 80         211 my @toclean = ( $self );
633              
634             # Go ahead and clean up all of the Message::Node parts
635 80         293 while (my $part = shift @toclean) {
636             # bug 5557: windows requires tmp file be closed before it can be rm'd
637 126 100       468 if (ref $part->{'raw'} eq 'GLOB') {
638 20 50       186 close($part->{'raw'}) or die "error closing input file: $!";
639             }
640              
641             # bug 5858: avoid memory leak with deep MIME structure
642 126 100       322 if (defined ($part->{metadata})) {
643 4         14 $part->{metadata}->finish();
644 4         8 delete $part->{metadata};
645             }
646              
647 126         515 delete $part->{'headers'};
648 126         428 delete $part->{'raw_headers'};
649 126         289 delete $part->{'header_order'};
650 126         309 delete $part->{'raw'};
651 126         264 delete $part->{'decoded'};
652 126         211 delete $part->{'rendered'};
653 126         202 delete $part->{'visible_rendered'};
654 126         200 delete $part->{'invisible_rendered'};
655 126         202 delete $part->{'type'};
656 126         215 delete $part->{'rendered_type'};
657              
658             # if there are children nodes, add them to the queue of nodes to clean up
659 126 100       493 if (exists $part->{'body_parts'}) {
660 23         30 push(@toclean, @{$part->{'body_parts'}});
  23         36  
661 23         50 delete $part->{'body_parts'};
662             }
663             }
664              
665             # delete temporary files
666 80 50       292 if ($self->{'tmpfiles'}) {
667 80         133 for my $fn (@{$self->{'tmpfiles'}}) {
  80         311  
668 20 50       731 unlink($fn) or warn "cannot unlink $fn: $!";
669             }
670 80         321 delete $self->{'tmpfiles'};
671             }
672             }
673              
674             # also use a DESTROY method, just to ensure (as much as possible) that
675             # temporary files are deleted even if the finish() method is omitted
676             sub DESTROY {
677 116     116   525 my $self = shift;
678             # best practices: prevent potential calls to eval and to system routines
679             # in code of a DESTROY method from clobbering global variables $@ and $!
680 116         993 local($@,$!); # keep outer error handling unaffected by DESTROY
681 116 100       2174 if ($self->{'tmpfiles'}) {
682 36         92 for my $fn (@{$self->{'tmpfiles'}}) {
  36         1708  
683 0 0       0 unlink($fn) or dbg("message: cannot unlink $fn: $!");
684             }
685             }
686             }
687              
688             # ---------------------------------------------------------------------------
689              
690             =item receive_date()
691              
692             Return a time_t value with the received date of the current message,
693             or current time if received time couldn't be determined.
694              
695             =cut
696              
697             sub receive_date {
698 10     10 1 39 my($self) = @_;
699              
700 10         58 return Mail::SpamAssassin::Util::receive_date(scalar $self->get_all_headers(0,1));
701             }
702              
703             # ---------------------------------------------------------------------------
704              
705             =back
706              
707             =head1 PARSING METHODS, NON-PUBLIC
708              
709             These methods take a RFC2822-esque formatted message and create a tree
710             with all of the MIME body parts included. Those parts will be decoded
711             as necessary, and text/html parts will be rendered into a standard text
712             format, suitable for use in SpamAssassin.
713              
714             =over 4
715              
716             =item parse_body()
717              
718             parse_body() passes the body part that was passed in onto the
719             correct part parser, either _parse_multipart() for multipart/* parts,
720             or _parse_normal() for everything else. Multipart sections become the
721             root of sub-trees, while everything else becomes a leaf in the tree.
722              
723             For multipart messages, the first call to parse_body() doesn't create a
724             new sub-tree and just uses the parent node to contain children. All other
725             calls to parse_body() will cause a new sub-tree root to be created and
726             children will exist underneath that root. (this is just so the tree
727             doesn't have a root node which points at the actual root node ...)
728              
729             =cut
730              
731             sub parse_body {
732 125     125 1 358 my($self) = @_;
733              
734             # This shouldn't happen, but just in case, abort.
735 125 50       505 return unless (exists $self->{'parse_queue'});
736              
737 125         479 dbg("message: ---- MIME PARSER START ----");
738              
739 125         270 while (my $toparse = shift @{$self->{'parse_queue'}}) {
  298         1084  
740             # multipart sections are required to have a boundary set ... If this
741             # one doesn't, assume it's malformed and send it to be parsed as a
742             # non-multipart section
743             #
744 173         444 my ($msg, $boundary, $body, $subparse) = @$toparse;
745              
746 173 100 66     1072 if ($msg->{'type'} =~ m{^multipart/}i && defined $boundary && $subparse > 0) {
      66        
747 20         56 $self->_parse_multipart($toparse);
748             }
749             else {
750             # If it's not multipart, go ahead and just deal with it.
751 153         917 $self->_parse_normal($toparse);
752              
753             # bug 5041: process message/*, but exclude message/partial content types
754 153 100 66     1029 if ($msg->{'type'} =~ m{^message/(?!partial\z)}i && $subparse > 0)
755             {
756             # Just decode the part, but we don't need the resulting string here.
757 4         27 $msg->decode(0);
758              
759             # bug 7125: decode and parse only message/rfc822 or message/global,
760             # but do not treat other message/* content types (like the ones listed
761             # here) as a message consisting of a header and a body, as they are not:
762             # message/delivery-status, message/global-delivery-status,
763             # message/feedback-report, message/global-headers,
764             # message/global-disposition-notification,
765             # message/disposition-notification, (and message/partial)
766              
767             # bug 5051, bug 3748: check $msg->{decoded}: sometimes message/* parts
768             # have no content, and we get stuck waiting for STDIN, which is bad. :(
769              
770 4 50 33     32 if ($msg->{'type'} =~ m{^message/(?:rfc822|global)\z}i &&
      33        
771             defined $msg->{'decoded'} && $msg->{'decoded'} ne '')
772             {
773             # Ok, so this part is still semi-recursive, since M::SA::Message
774             # calls M::SA::Message, but we don't subparse the new message,
775             # and pull a sneaky "steal our child's queue" maneuver to deal
776             # with it on our own time. Reference the decoded array directly
777             # since it's faster.
778             #
779             my $msg_obj = Mail::SpamAssassin::Message->new({
780             message => $msg->{'decoded'},
781             parsenow => 0,
782             normalize => $self->{normalize},
783 4         37 subparse => $subparse - 1,
784             });
785              
786             # Add the new message to the current node
787 4         17 $msg->add_body_part($msg_obj);
788              
789             # now this is the sneaky bit ... steal the sub-message's parse_queue
790             # and add it to ours. then we'll handle the sub-message in our
791             # normal loop and get all the glory. muhaha. :)
792 4         4 push(@{$self->{'parse_queue'}}, @{$msg_obj->{'parse_queue'}});
  4         6  
  4         8  
793 4         8 delete $msg_obj->{'parse_queue'};
794              
795             # Ok, we've subparsed, so go ahead and remove the raw and decoded
796             # data because we won't need them anymore (the tree under this part
797             # will have that data)
798 4 50       11 if (ref $msg->{'raw'} eq 'GLOB') {
799             # Make sure we close it if it's a temp file -- Bug 5166
800 0 0       0 close($msg->{'raw'})
801             or die "error closing input file: $!";
802             }
803              
804 4         6 delete $msg->{'raw'};
805            
806 4         36 delete $msg->{'decoded'};
807             }
808             }
809             }
810             }
811              
812 125         440 dbg("message: ---- MIME PARSER END ----");
813              
814             # we're done parsing, so remove the queue variable
815 125         378 delete $self->{'parse_queue'};
816             }
817              
818             =item _parse_multipart()
819              
820             Generate a root node, and for each child part call parse_body()
821             to generate the tree.
822              
823             =cut
824              
825             sub _parse_multipart {
826 20     20   37 my($self, $toparse) = @_;
827              
828 20         22 my ($msg, $boundary, $body, $subparse) = @{$toparse};
  20         38  
829              
830             # we're not supposed to be a leaf, so prep ourselves
831 20         46 $msg->{'body_parts'} = [];
832              
833             # the next set of objects will be one level deeper
834 20         26 $subparse--;
835              
836 20 50       76 dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));
837              
838             # NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
839             # compliant, but also allow possible malformations to still work. Please
840             # see Bugzilla bug 3749 for more information before making any changes!
841              
842             # ignore preamble per RFC 1521, unless there's no boundary ...
843 20 50       38 if ( defined $boundary ) {
844 20         22 my $line;
845 20         26 my $tmp_line = @{$body};
  20         34  
846 20         47 for ($line=0; $line < $tmp_line; $line++) {
847             # dbg("message: multipart line $line: \"" . $body->[$line] . "\"");
848             # specifically look for an opening boundary
849 120 100 100     516 if (substr($body->[$line],0,2) eq '--' # triage
850             && $body->[$line] =~ /^--\Q$boundary\E\s*$/) {
851             # Make note that we found the opening boundary
852 19         63 $self->{mime_boundary_state}->{$boundary} = 1;
853              
854             # if the line after the opening boundary isn't a header, flag it.
855             # we need to make sure that there's actually another line though.
856             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
857 19 100 66     103 if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
858 5         12 $self->{'missing_mime_headers'} = 1;
859             }
860              
861 19         31 last;
862             }
863             }
864              
865             # Found a boundary, ignore the preamble
866 20 100       38 if ( $line < $tmp_line ) {
867 19         23 splice @{$body}, 0, $line+1;
  19         41  
868             }
869              
870             # Else, there's no boundary, so leave the whole part...
871             }
872              
873             # prepare a new tree node
874 20         158 my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
875 20         42 my $in_body = 0;
876 20         38 my $header;
877             my $part_array;
878 20         0 my $found_end_boundary;
879              
880 20         21 my $line_count = @{$body};
  20         28  
881 20         23 foreach ( @{$body} ) {
  20         36  
882             # if we're on the last body line, or we find any boundary marker,
883             # deal with the mime part;
884             # a triage before an unlikely-to-match regexp avoids a CPU hotspot
885 2260   100     5390 $found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
886             && /^--\Q$boundary\E(?:--)?\s*$/;
887 2260 100 100     4432 if ( --$line_count == 0 || $found_end_boundary ) {
888 44         57 my $line = $_; # remember the last line
889              
890             # If at last line and no end boundary found, the line belongs to body
891             # TODO:
892             # Is $self->{mime_boundary_state}->{$boundary}-- needed here?
893             # Could "missing end boundary" be a useful rule? Mark it somewhere?
894             # If SA processed truncated message from amavis etc, this could also
895             # be hit legimately..
896 44 50       93 if (!$found_end_boundary) {
    50          
897             # TODO: This is duplicate code from few pages down below..
898 0         0 while (length ($_) > MAX_BODY_LINE_LENGTH) {
899 0         0 push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
  0         0  
900 0         0 substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
901             }
902 0         0 push ( @{$part_array}, $_ );
  0         0  
903             }
904             # per rfc 1521, the CRLF before the boundary is part of the boundary:
905             # NOTE: The CRLF preceding the encapsulation line is conceptually
906             # attached to the boundary so that it is possible to have a part
907             # that does not end with a CRLF (line break). Body parts that must
908             # be considered to end with line breaks, therefore, must have two
909             # CRLFs preceding the encapsulation line, the first of which is part
910             # of the preceding body part, and the second of which is part of the
911             # encapsulation boundary.
912             elsif ($part_array) {
913 44         78 chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
914 44 100       73 splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
  24         46  
915             }
916             else {
917             # Invalid parts can have no body, so fake in a blank body
918             # in that case.
919 0         0 $part_array = [];
920             }
921              
922 44         46 my($p_boundary);
923 44         102 ($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
924 44   66     157 $p_boundary ||= $boundary;
925 44 50       168 dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
926              
927             # we've created a new node object, so add it to the queue along with the
928             # text that belongs to that part, then add the new part to the current
929             # node to create the tree.
930 44         47 push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
  44         151  
931 44         158 $msg->add_body_part($part_msg);
932              
933             # rfc 1521 says /^--boundary--$/, some MUAs may just require /^--boundary--/
934             # but this causes problems with horizontal lines when the boundary is
935             # made up of dashes as well, etc.
936 44 50       71 if (defined $boundary) {
937             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
938 44 100 33     456 if ($line =~ /^--\Q${boundary}\E--\s*$/) {
    50          
939             # Make a note that we've seen the end boundary
940 20         40 $self->{mime_boundary_state}->{$boundary}--;
941 20         38 last;
942             }
943             elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
944             # if we aren't on an end boundary and there are still lines left, it
945             # means we hit a new start boundary. therefore, the next line ought
946             # to be a mime header. if it's not, mark it.
947 0         0 $self->{'missing_mime_headers'} = 1;
948             }
949             }
950              
951             # make sure we start with a new clean node
952 24         42 $in_body = 0;
953 24         82 $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
954 24         41 undef $part_array;
955 24         28 undef $header;
956              
957 24         51 next;
958             }
959              
960 2216 100       2501 if (!$in_body) {
961             # s/\s+$//; # bug 5127: don't clean this up (yet)
962             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
963 132 100 66     362 if (/^[\041-\071\073-\176]+[ \t]*:/) {
    100          
964 87 100       135 if ($header) {
965 49         136 my ( $key, $value ) = split ( /:\s*/, $header, 2 );
966 49         112 $part_msg->header( $key, $value );
967             }
968 87         93 $header = $_;
969 87         103 next;
970             }
971             elsif (/^[ \t]/ && $header) {
972             # $_ =~ s/^\s*//; # bug 5127, again
973 1         2 $header .= $_;
974 1         2 next;
975             }
976             else {
977 44 100       87 if ($header) {
978 38         125 my ( $key, $value ) = split ( /:\s*/, $header, 2 );
979 38         81 $part_msg->header( $key, $value );
980             }
981 44         54 $in_body = 1;
982              
983             # if there's a blank line separator, that's good. if there isn't,
984             # it's a body line, so drop through.
985 44 100       113 if (/^\r?$/) {
986 43         55 next;
987             }
988             else {
989 1         9 $self->{'missing_mime_head_body_separator'} = 1;
990             }
991             }
992             }
993              
994             # we run into a perl bug if the lines are astronomically long (probably
995             # due to lots of regexp backtracking); so split any individual line
996             # over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML
997             # totally -- but IMHO the only reason a luser would use
998             # MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
999 2085         2448 while (length ($_) > MAX_BODY_LINE_LENGTH) {
1000 0         0 push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
  0         0  
1001 0         0 substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
1002             }
1003 2085         1731 push ( @{$part_array}, $_ );
  2085         2764  
1004             }
1005              
1006             # Look for a message epilogue
1007             # originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB
1008             # ham FPs were all "." on a line by itself.
1009             # spams seem to only have NULL chars afterwards ?
1010 20 100       95 if ($line_count) {
1011 8         24 for(; $line_count > 0; $line_count--) {
1012 15 100       58 if ($body->[-$line_count] =~ /[^\s.]/) {
1013 2         6 $self->{mime_epilogue_exists} = 1;
1014 2         9 last;
1015             }
1016             }
1017             }
1018              
1019             }
1020              
1021             =item _parse_normal()
1022              
1023             Generate a leaf node and add it to the parent.
1024              
1025             =cut
1026              
1027             sub _parse_normal {
1028 153     153   511 my($self, $toparse) = @_;
1029              
1030 153         308 my ($msg, $boundary, $body) = @{$toparse};
  153         444  
1031              
1032 153         501 dbg("message: parsing normal part");
1033              
1034             # 0: content-type, 1: boundary, 2: charset, 3: filename
1035 153         606 my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
1036              
1037             # multipart sections are required to have a boundary set ... If this
1038             # one doesn't, assume it's malformed and revert to text/plain
1039 153 50 33     1090 $msg->{'type'} = ($ct[0] !~ m@^multipart/@i || defined $boundary ) ? $ct[0] : 'text/plain';
1040 153         720 $msg->{'charset'} = $ct[2];
1041              
1042             # attempt to figure out a name for this attachment if there is one ...
1043 153   100     667 my $disp = $msg->header('content-disposition') || '';
1044 153 100       718 if ($disp =~ /name=\s*"?([^";]+)"?/i) {
    100          
1045 1         4 $msg->{'name'} = $1;
1046             }
1047             elsif ($ct[3]) {
1048 2         6 $msg->{'name'} = $ct[3];
1049             }
1050 153 100       510 if ($msg->{'name'}) {
1051 3         38 $msg->{'name'} = Encode::decode("MIME-Header", $msg->{'name'});
1052             }
1053              
1054 153         6102 $msg->{'boundary'} = $boundary;
1055              
1056             # If the part type is not one that we're likely to want to use, go
1057             # ahead and write the part data out to a temp file -- why keep sucking
1058             # up RAM with something we're not going to use?
1059             #
1060 153 100       1145 if ($msg->{'type'} !~ m@^(?:text/(?:plain|html)$|message\b)@) {
1061 20         28 my($filepath, $fh);
1062             eval {
1063 20         42 ($filepath, $fh) = Mail::SpamAssassin::Util::secure_tmpfile(); 1;
  20         46  
1064 20 50       23 } or do {
1065 0 0       0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
1066 0         0 info("message: failed to create a temp file: %s", $eval_stat);
1067             };
1068 20 50       33 if ($fh) {
1069             # The temp file was created, add it to the list of pending deletions
1070             # we cannot just delete immediately in the POSIX idiom, as this is
1071             # unportable (to win32 at least)
1072 20         18 push @{$self->{tmpfiles}}, $filepath;
  20         49  
1073 20         44 dbg("message: storing a message part to file %s", $filepath);
1074 20 50       20 $fh->print(@{$body}) or die "error writing to $filepath: $!";
  20         79  
1075 20 50       884 $fh->flush or die "error writing (flush) to $filepath: $!";
1076 20         91 $msg->{'raw'} = $fh;
1077             }
1078             }
1079              
1080             # if the part didn't get a temp file, go ahead and store the data in memory
1081 153 100       598 if (!defined $msg->{'raw'}) {
1082 133         498 dbg("message: storing a body to memory");
1083 133         722 $msg->{'raw'} = $body;
1084             }
1085             }
1086              
1087             # ---------------------------------------------------------------------------
1088              
1089             sub get_mimepart_digests {
1090 0     0 0 0 my ($self) = @_;
1091              
1092 0 0       0 if (!exists $self->{mimepart_digests}) {
1093             # traverse all parts which are leaves, recursively
1094             $self->{mimepart_digests} =
1095 0   0     0 [ map(sha1_hex($_->decode) . ':' . lc($_->{type}||''),
1096             $self->find_parts(qr/^/,1,1)) ];
1097             }
1098 0         0 return $self->{mimepart_digests};
1099             }
1100              
1101             # ---------------------------------------------------------------------------
1102              
1103             # common code for get_rendered_body_text_array,
1104             # get_visible_rendered_body_text_array, get_invisible_rendered_body_text_array
1105             #
1106             sub get_body_text_array_common {
1107 375     375 0 781 my ($self, $method_name) = @_;
1108              
1109 375         863 my $key = 'text_' . $method_name;
1110 375 100       1019 if (exists $self->{$key}) { return $self->{$key} }
  247         781  
1111              
1112 128         538 $self->{$key} = [];
1113              
1114             # Find all parts which are leaves
1115 128         1053 my @parts = $self->find_parts(qr/./,1);
1116 128 50       593 return $self->{$key} unless @parts;
1117              
1118             # the html metadata may have already been set, so let's not bother if it's
1119             # already been done.
1120 128         425 my $html_needs_setting = !exists $self->{metadata}->{html};
1121              
1122 128 100 100     794 my $text = $method_name eq 'invisible_rendered' ? ''
1123             : ($self->get_header('subject') || "\n");
1124              
1125             # Go through each part
1126 128         645 for (my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1127 129         293 my $p = $parts[$pt];
1128              
1129             # put a blank line between parts ...
1130 129 100       480 $text .= "\n" if $text ne '';
1131              
1132 129         1099 my($type, $rnd) = $p->$method_name(); # decode this part
1133 129 50       495 if ( defined $rnd ) {
1134             # Only text/* types are rendered ...
1135 129         770 $text .= $rnd;
1136              
1137             # TVD - if there are multiple parts, what should we do?
1138             # right now, just use the last one. we may need to give some priority
1139             # at some point, ie: use text/html rendered if it exists, or
1140             # text/plain rendered as html otherwise.
1141 129 100 100     1067 if ($html_needs_setting && $type eq 'text/html') {
1142 7         53 $self->{metadata}->{html} = $p->{html_results};
1143             }
1144             }
1145             }
1146              
1147             # whitespace handling (warning: small changes have large effects!)
1148 128         1892 $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1149             # $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space
1150 128         751 $text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space
1151 128         418 $text =~ tr/\f/\n/; # form feeds => newline
1152              
1153 128         617 my @textary = split_into_array_of_short_lines($text);
1154 128         544 $self->{$key} = \@textary;
1155              
1156 128         555 return $self->{$key};
1157             }
1158              
1159             # ---------------------------------------------------------------------------
1160              
1161             sub get_rendered_body_text_array {
1162 327     327 0 680 my ($self) = @_;
1163 327         911 return $self->get_body_text_array_common('rendered');
1164             }
1165              
1166             sub get_visible_rendered_body_text_array {
1167 24     24 0 58 my ($self) = @_;
1168 24         122 return $self->get_body_text_array_common('visible_rendered');
1169             }
1170              
1171             sub get_invisible_rendered_body_text_array {
1172 24     24 0 56 my ($self) = @_;
1173 24         60 return $self->get_body_text_array_common('invisible_rendered');
1174             }
1175              
1176             # ---------------------------------------------------------------------------
1177              
1178             sub get_decoded_body_text_array {
1179 98     98 0 257 my ($self) = @_;
1180              
1181 98 100       461 if (defined $self->{text_decoded}) { return $self->{text_decoded}; }
  2         17  
1182 96         468 $self->{text_decoded} = [ ];
1183              
1184             # Find all parts which are leaves
1185 96         614 my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
1186 96 50       398 return $self->{text_decoded} unless @parts;
1187              
1188             # Go through each part
1189 96         458 for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1190             # bug 4843: skip text/calendar parts since they're usually an attachment
1191             # and not displayed
1192 96 50       369 next if ($parts[$pt]->{'type'} eq 'text/calendar');
1193              
1194 96         193 push(@{$self->{text_decoded}},
  96         405  
1195             split_into_array_of_short_paragraphs($parts[$pt]->decode()));
1196             }
1197              
1198 96         349 return $self->{text_decoded};
1199             }
1200              
1201             # ---------------------------------------------------------------------------
1202              
1203             sub split_into_array_of_short_lines {
1204 128     128 0 264 my @result;
1205 128         877 foreach my $line (split (/^/m, $_[0])) {
1206 488         1063 while (length ($line) > MAX_BODY_LINE_LENGTH) {
1207             # try splitting "nicely" so that we don't chop a url in half or
1208             # something. if there's no space, then just split at max length.
1209 4         10 my $length = rindex($line, ' ', MAX_BODY_LINE_LENGTH) + 1;
1210 4   50     8 $length ||= MAX_BODY_LINE_LENGTH;
1211 4         21 push (@result, substr($line, 0, $length, ''));
1212             }
1213 488         957 push (@result, $line);
1214             }
1215 128         555 @result;
1216             }
1217              
1218             # ---------------------------------------------------------------------------
1219              
1220             # split a text into array of paragraphs of sizes between
1221             # $chunk_size and 2 * $chunk_size, returning the resulting array
1222              
1223             sub split_into_array_of_short_paragraphs {
1224 96     96 0 279 my @result;
1225 96         218 my $chunk_size = 1024;
1226 96         239 my $text_l = length($_[0]);
1227 96         183 my($j,$ofs);
1228 96         488 for ($ofs = 0; $text_l - $ofs > 2 * $chunk_size; $ofs = $j+1) {
1229 8         32 $j = index($_[0], "\n", $ofs+$chunk_size);
1230 8 50       30 if ($j < 0) {
1231 0         0 $j = index($_[0], " ", $ofs+$chunk_size);
1232 0 0       0 if ($j < 0) { $j = $ofs+$chunk_size }
  0         0  
1233             }
1234 8         45 push(@result, substr($_[0], $ofs, $j-$ofs+1));
1235             }
1236 96 100       598 push(@result, substr($_[0], $ofs)) if $ofs < $text_l;
1237 96         507 @result;
1238             }
1239              
1240             # ---------------------------------------------------------------------------
1241              
1242             1;
1243              
1244             =back
1245              
1246             =cut