File Coverage

blib/lib/Mail/SpamAssassin/Message.pm

Criterion	Covered	Total	%
statement	427	499	85.5
branch	181	256	70.7
condition	74	115	64.3
subroutine	33	36	91.6
pod	16	24	66.6
total	731	930	78.6

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Message - decode, render, and hold an RFC-2822 message
21
22							=head1 DESCRIPTION
23
24							This module encapsulates an email message and allows access to the various MIME
25							message parts and message metadata.
26
27							The message structure, after initiating a parse() cycle, looks like this:
28
29							Message object, also top-level node in Message::Node tree
30							\|
31							+---> Message::Node for other parts in MIME structure
32							\| \|---> [ more Message::Node parts ... ]
33							\| [ others ... ]
34							\|
35							+---> Message::Metadata object to hold metadata
36
37							=head1 PUBLIC METHODS
38
39							=over 4
40
41							=cut
42
43							package Mail::SpamAssassin::Message;
44
45	40			40		295	use strict;
	40					94
	40					5215
46	40			40		220	use warnings;
	40					90
	40					1320
47	40			40		217	use re 'taint';
	40					74
	40					10993
48
49							BEGIN {
50	40					23062	eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
	40					126759
	40					1309
51	40	50		40		143	or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
	0					0
	0					0
52							}
53
54	40			40		301	use Mail::SpamAssassin;
	40					90
	40					955
55	40			40		13945	use Mail::SpamAssassin::Message::Node;
	40					131
	40					1572
56	40			40		14449	use Mail::SpamAssassin::Message::Metadata;
	40					563
	40					4594
57	40			40		377	use Mail::SpamAssassin::Constants qw(:sa);
	40					133
	40					9088
58	40			40		390	use Mail::SpamAssassin::Logger;
	40					176
	40					248989
59
60							our @ISA = qw(Mail::SpamAssassin::Message::Node);
61
62							# ---------------------------------------------------------------------------
63
64							=item new()
65
66							Creates a Mail::SpamAssassin::Message object. Takes a hash reference
67							as a parameter. The used hash key/value pairs are as follows:
68
69							C<message> is either undef (which will use STDIN), a scalar - a string
70							containing an entire message, a reference to such string, an array reference
71							of the message with one line per array element, or either a file glob
72							or an IO::File object which holds the entire contents of the message.
73
74							Note: The message is expected to generally be in RFC 2822 format, optionally
75							including an mbox message separator line (the "From " line) as the first line.
76
77							C<parse_now> specifies whether or not to create the MIME tree
78							at object-creation time or later as necessary.
79
80							The I<parse_now> option, by default, is set to false (0).
81							This allows SpamAssassin to not have to generate the tree of
82							Mail::SpamAssassin::Message::Node objects and their related data if the
83							tree is not going to be used. This is handy, for instance, when running
84							C<spamassassin -d>, which only needs the pristine header and body which
85							is always handled when the object is created.
86
87							C<subparse> specifies how many MIME recursion levels should be parsed.
88							Defaults to 20.
89
90							=cut
91
92							# month mappings (ripped from Util.pm)
93							my %MONTH = (jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
94							jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12);
95
96							# day of week mapping (starting from zero)
97							my @DAY_OF_WEEK = qw/Sun Mon Tue Wed Thu Fri Sat/ ;
98
99							sub new {
100	149			149	1	473	my $class = shift;
101	149		33			1043	$class = ref($class) \|\| $class;
102
103	149					422	my($opts) = @_;
104	149	50				719	my $message = defined $opts->{'message'} ? $opts->{'message'} : \*STDIN;
105	149		100			694	my $parsenow = $opts->{'parsenow'} \|\| 0;
106	149		50			1004	my $normalize = $opts->{'normalize'} \|\| 0;
107
108							# Specifies whether or not to parse message/rfc822 parts into its own tree.
109							# If the # > 0, it'll subparse, otherwise it won't. By default, do twenty
110							# levels deep.
111	149	100				539	my $subparse = defined $opts->{'subparse'} ? $opts->{'subparse'} : 20;
112
113	149					1668	my $self = $class->SUPER::new({normalize=>$normalize});
114
115	149					659	$self->{tmpfiles} = [];
116	149					590	$self->{pristine_msg} = '';
117	149					567	$self->{pristine_headers} = '';
118	149					685	$self->{pristine_body} = '';
119	149					420	$self->{mime_boundary_state} = {};
120	149					553	$self->{line_ending} = "\012";
121	149					456	$self->{master_deadline} = $opts->{'master_deadline'};
122	149					342	$self->{suppl_attrib} = $opts->{'suppl_attrib'};
123	149		100			624	$self->{body_part_scan_size} = $opts->{'body_part_scan_size'} \|\| 0;
124	149		100			563	$self->{rawbody_part_scan_size} = $opts->{'rawbody_part_scan_size'} \|\| 0;
125
126	149	100				509	if ($self->{suppl_attrib}) { # caller-provided additional information
127							# pristine_body_length is currently used by an eval test check_body_length
128							# Possible To-Do: Base the length on the @message array later down?
129	49	50				173	if (defined $self->{suppl_attrib}{body_size}) {
130							# Optional info provided by a caller; should reflect the original
131							# message body size if provided, and as such it may differ from the
132							# $self->{pristine_body} size, e.g. when the caller passed a truncated
133							# message to SpamAssassin, or when counting line-endings differently.
134	0					0	$self->{pristine_body_length} = $self->{suppl_attrib}{body_size};
135							}
136	49	50				195	if (ref $self->{suppl_attrib}{mimepart_digests}) {
137							# Optional info provided by a caller: an array of digest codes (e.g. SHA1)
138							# of each MIME part. Should reflect the original message if provided.
139							# As such it may differ from digests calculated by get_mimepart_digests(),
140							# e.g. when the caller passed a truncated message to SpamAssassin.
141	0					0	$self->{mimepart_digests} = $self->{suppl_attrib}{mimepart_digests};
142							}
143							}
144
145	149					366	bless($self,$class);
146
147							# create the metadata holder class
148	149					1545	$self->{metadata} = Mail::SpamAssassin::Message::Metadata->new($self);
149
150							# Ok, go ahead and do the message "parsing"
151
152							# protect it from abuse ...
153	149					371	local $_;
154
155							# Figure out how the message was passed to us, and deal with it.
156	149					282	my @message;
157	149	100	66			1297	if (ref $message eq 'ARRAY') {
		100
		50
		50
		50
158	63					150	@message = @{$message};
	63					365
159							}
160							elsif (ref($message) eq 'GLOB' \|\| ref($message) =~ /^IO::/) {
161	18	50				66	if (defined fileno $message) {
162
163							# sysread+split avoids a Perl I/O bug (Bug 5985)
164							# and is faster than (<$message>) by 10..25 %
165							# (a drawback is a short-term double storage of a text in $raw_str)
166							#
167	18					36	my($nread,$raw_str); $raw_str = '';
	18					31
168	18					784	while ( $nread=sysread($message, $raw_str, 16384, length $raw_str) ) { }
169	18	50				74	defined $nread or die "error reading: $!";
170	18					1203	@message = split(/^/m, $raw_str, -1);
171
172	18	50				248	if ($raw_str eq '') {
		50
173	0					0	dbg("message: empty message read");
174							} elsif (length($raw_str) > 128*1024) {
175							# ditch rarely used large chunks of allocated memory, Bug 6514
176							# http://www.perlmonks.org/?node_id=803515
177							# about 97% of mail messages are below 128 kB,
178							# about 98% of mail messages are below 256 kB (2010 statistics)
179							# dbg("message: deallocating %.2f MB", length($raw_str)/1024/1024);
180	0					0	undef $raw_str;
181							}
182							}
183							}
184							elsif (ref $message eq 'SCALAR') {
185	0					0	@message = split(/^/m, $$message, -1);
186							}
187							elsif (ref $message) {
188	0					0	dbg("message: Input is a reference of unknown type!");
189							}
190							elsif (defined $message) {
191	68					1828	@message = split(/^/m, $message, -1);
192							}
193
194							# Deal with null message
195	149	50				549	if (!@message) {
196							# bug 4884:
197							# if we get here, it means that the input was null, so fake the message
198							# content as a single newline...
199	0					0	@message = ("\n");
200							}
201
202							# Bug 7648:
203							# Make sure the message is tainted. When linting, @testmsg is not, so this
204							# handles that. Perhaps 3rd party tools could call this with untainted
205							# messages? Tainting the message is important because it prevents certain
206							# exploits later.
207	149	100	66			815	if (Mail::SpamAssassin::Util::am_running_in_taint_mode() &&
208	4859					10259	grep { !Scalar::Util::tainted($_) } @message) {
209	119					220	local($_);
210							# To preserve newlines, no joining and splitting here, process each line
211							# directly as is.
212	119					331	foreach (@message) {
213	1070					2163	$_ = Mail::SpamAssassin::Util::taint_var($_);
214							}
215	119	50				303	if (grep { !Scalar::Util::tainted($_) } @message) {
	1070					2534
216	0					0	die "Mail::SpamAssassin::Message failed to enforce message taintness";
217							}
218							}
219
220							# Pull off mbox and mbx separators
221	149	100				1472	if ($message[0] =~ /^From\s+(?!:)/) {
		50
222							# careful not to confuse with obsolete syntax which allowed WSP before ':'
223							# mbox formatted mailbox
224	17					115	$self->{'mbox_sep'} = shift @message;
225							} elsif ($message[0] =~ MBX_SEPARATOR) {
226	0					0	$_ = shift @message;
227
228							# Munge the mbx message separator into mbox format as a sort of
229							# de facto portability standard in SA's internals. We need to
230							# to this so that Mail::SpamAssassin::Util::parse_rfc822_date
231							# can parse the date string...
232	0	0				0	if (/([\s\d]\d)-([a-zA-Z]{3})-(\d{4})\s(\d{2}):(\d{2}):(\d{2})/) {
233							# $1 = day of month
234							# $2 = month (text)
235							# $3 = year
236							# $4 = hour
237							# $5 = min
238							# $6 = sec
239	0					0	my @arr = localtime(timelocal($6,$5,$4,$1,$MONTH{lc($2)}-1,$3));
240	0					0	my $address;
241	0					0	foreach (@message) {
242	0	0				0	if (/^From:[^<]*<([^>]+)>/) {
		0
243	0					0	$address = $1;
244	0					0	last;
245							} elsif (/^From:\s*([^<> ]+)/) {
246	0					0	$address = $1;
247	0					0	last;
248							}
249							}
250	0					0	$self->{'mbox_sep'} = "From $address $DAY_OF_WEEK[$arr[6]] $2 $1 $4:$5:$6 $3\n";
251							}
252							}
253
254							# bug 4363
255							# Check to see if we should do CRLF instead of just LF
256							# For now, just check the first and last line and do whatever it does
257	149	50	33			1662	if (@message && ($message[0] =~ /\015\012/ \|\| $message[-1] =~ /\015\012/)) {
			33
258	0					0	$self->{line_ending} = "\015\012";
259	0					0	dbg("message: line ending changed to CRLF");
260							}
261
262							# Is a CRLF -> LF line endings conversion necessary?
263	149					596	my $squash_crlf = $self->{line_ending} eq "\015\012";
264
265							# Go through all the header fields of the message
266	149					309	my $hdr_errors = 0;
267	149					330	my $header;
268	149					277	for (;;) {
269							# make sure not to lose the last header field when there is no body
270	1166					2055	my $eof = !@message;
271	1166	100				2971	my $current = $eof ? "\n" : shift @message;
272
273	1166	100				3817	if ( $current =~ /^[ \t]/ ) {
274							# This wasn't useful in terms of a rule, but we may want to treat it
275							# specially at some point. Perhaps ignore it?
276							#unless ($current =~ /\S/) {
277							# $self->{'obsolete_folding_whitespace'} = 1;
278							#}
279
280	200	50				575	$header = '' if !defined $header; # header starts with a continuation!?
281	200					439	$header .= $current; # append continuations, no matter what
282	200					597	$self->{'pristine_headers'} .= $current;
283							}
284							else { # not a continuation
285							# Ok, there's a header here, let's go ahead and add it in.
286	966	100				2518	if (defined $header) { # deal with a previous header field
287	817					3854	my ($key, $value) = split (/:/s, $header, 2);
288
289							# If it's not a valid header (aka: not in the form "foo:bar"), skip it.
290	817	100				2318	if (defined $value) {
291							# CRLF -> LF line-endings conversion if necessary
292	811	50				1762	$value =~ s/\015\012/\012/sg if $squash_crlf;
293	811					1702	$key =~ s/[ \t]+\z//; # strip WSP before colon, obsolete rfc822 syn
294							# limit the length of the pairs we store
295	811	50				2078	if (length($key) > MAX_HEADER_KEY_LENGTH) {
296	0					0	$key = substr($key, 0, MAX_HEADER_KEY_LENGTH);
297	0					0	$self->{'truncated_header'} = 1;
298							}
299	811	50				1960	if (length($value) > MAX_HEADER_VALUE_LENGTH) {
300	0					0	$value = substr($value, 0, MAX_HEADER_VALUE_LENGTH);
301	0					0	$self->{'truncated_header'} = 1;
302							}
303	811					2749	$self->header($key, $value);
304							}
305							}
306
307	966	100				4308	if ($current =~ /^\r?$/) { # a regular end of a header section
		100
308	144	100				488	if ($eof) {
309	1					4	$self->{'missing_head_body_separator'} = 1;
310							} else {
311	143					484	$self->{'pristine_headers'} .= $current;
312							}
313	144					449	last;
314							}
315							elsif ($current =~ /^--/) { # mime boundary encountered, bail out
316	4					22	$self->{'missing_head_body_separator'} = 1;
317	4					15	unshift(@message, $current);
318	4					14	last;
319							}
320							# should we assume entering a body on encountering invalid header field?
321							else {
322							# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
323	818	100				3606	if ($current !~ /^[\041-\071\073-\176]+[ \t]*:/) {
324							# A field name MUST be composed of printable US-ASCII characters
325							# (i.e., characters that have values between 33 (041) and 126 (176),
326							# inclusive), except colon (072). Obsolete header field syntax
327							# allowed WSP before a colon.
328	7	100				19	if (++$hdr_errors <= 3) {
329							# just consume but ignore a few invalid header fields
330							} else { # enough is enough...
331	1					5	$self->{'missing_head_body_separator'} = 1;
332	1					4	unshift(@message, $current);
333	1					3	last;
334							}
335							}
336							}
337
338							# start collecting a new header field
339	817					1796	$header = $current;
340	817					3145	$self->{'pristine_headers'} .= $current;
341							}
342							}
343	149					400	undef $header;
344
345							# Store the pristine body for later -- store as a copy since @message
346							# will get modified below
347	149					1826	$self->{'pristine_body'} = join('', @message);
348
349	149	50				654	if (!defined $self->{pristine_body_length}) {
350	149					902	$self->{'pristine_body_length'} = length $self->{'pristine_body'};
351							}
352
353							# Store complete message, get_pristine() is used a lot, avoid making copies
354	149					1267	$self->{'pristine_msg'} = $self->{'pristine_headers'} . $self->{'pristine_body'};
355
356							# iterate over lines in reverse order
357							# merge multiple blank lines into a single one
358	149					366	my $start;
359	149					760	for (my $cnt=$#message; $cnt>=0; $cnt--) {
360							# CRLF -> LF line-endings conversion if necessary
361	3682	50				5724	$message[$cnt] =~ s/\015\012\z/\012/ if $squash_crlf;
362
363							# line is blank
364	3682	100				7970	if ($message[$cnt] =~ /^\s*$/) {
365							# /^\s*$/ is about 5% faster then !/\S/, but still expensive here
366	414	100				934	if (!defined $start) {
367	346					473	$start=$cnt;
368							}
369	414	100				1160	next unless $cnt == 0;
370							}
371
372							# line is not blank, or we've reached the beginning
373
374							# if we've got a series of blank lines, get rid of them
375	3303	100				8220	if (defined $start) {
376	346					521	my $max_blank_lines = 20;
377	346					568	my $num = $start-$cnt;
378	346	50				657	if ($num > $max_blank_lines) {
379	0					0	splice @message, $cnt+2, $num-$max_blank_lines;
380							}
381	346					751	undef $start;
382							}
383							}
384
385							# Figure out the boundary
386	149					320	my ($boundary);
387	149					697	($self->{'type'}, $boundary) = Mail::SpamAssassin::Util::parse_content_type($self->header('content-type'));
388	149					992	dbg("message: main message type: ".$self->{'type'});
389
390							# dbg("message: \$message[0]: \"" . $message[0] . "\"");
391
392							# bug 6845: if main message type is multipart and the message body does not begin with
393							# either a blank line or the boundary (if defined), insert a blank line
394							# to ensure proper parsing - do not consider MIME headers at the beginning of the body
395							# to be part of the message headers.
396	149	100	66			939	if ($self->{'type'} =~ /^multipart\//i && $#message > 0 && $message[0] =~ /\S/)
			100
397							{
398	14	100	66			332	if (!defined $boundary \|\| $message[0] !~ /^--\Q$boundary\E/)
399							{
400	4					23	dbg("message: Inserting blank line at top of body to ensure correct multipart MIME parsing");
401	4					11	unshift(@message, "\012");
402							}
403							}
404
405							# dbg("message: \$message[0]: \"" . $message[0] . "\"");
406							# dbg("message: \$message[1]: \"" . $message[1] . "\"");
407
408							# parse queue, simple array of parts to parse:
409							# 0: part object, already in the tree
410							# 1: boundary used to focus body parsing
411							# 2: message content
412							# 3: how many MIME subparts to parse down
413							#
414	149					761	$self->{'parse_queue'} = [ [ $self, $boundary, \@message, $subparse ] ];
415
416							# If the message does need to get parsed, save off a copy of the body
417							# in a format we can easily parse later so we don't have to rip from
418							# pristine_body ... If we do want to parse now, go ahead and do so ...
419							#
420	149	100				549	if ($parsenow) {
421	101					566	$self->parse_body();
422							}
423
424	149					1179	$self;
425							}
426
427							# ---------------------------------------------------------------------------
428
429							=item find_parts()
430
431							Used to search the tree for specific MIME parts. See
432							I<Mail::SpamAssassin::Message::Node> for more details.
433
434							=cut
435
436							# Used to find any MIME parts whose simple content-type matches a given regexp
437							# Searches it's own and any children parts. Returns an array of MIME
438							# objects which match.
439							#
440							sub find_parts {
441	255			255	1	3323	my $self = shift;
442
443							# ok, we need to do the parsing now...
444	255	100				903	$self->parse_body() if (exists $self->{'parse_queue'});
445
446							# and pass through to the Message::Node version of the method
447	255					1317	return $self->SUPER::find_parts(@_);
448							}
449
450							# ---------------------------------------------------------------------------
451
452							=item get_pristine_header()
453
454							Returns pristine headers of the message. If no specific header name
455							is given as a parameter (case-insensitive), then all headers will be
456							returned as a scalar, including the blank line at the end of the headers.
457
458							If called in an array context, an array will be returned with each
459							specific header in a different element. In a scalar context, the last
460							specific header is returned.
461
462							ie: If 'Subject' is specified as the header, and there are 2 Subject
463							headers in a message, the last/bottom one in the message is returned in
464							scalar context or both are returned in array context.
465
466							Btw, returning the last header field (not the first) happens to be consistent
467							with DKIM signatures, which search for and cover multiple header fields
468							bottom-up according to the 'h' tag. Let's keep it this way.
469
470							Note: the returned header will include the ending newline and any embedded
471							whitespace folding.
472
473							=cut
474
475							sub get_pristine_header {
476	76			76	1	205	my ($self, $hdr) = @_;
477
478	76	100	66			555	return $self->{pristine_headers} if !defined $hdr \|\| $hdr eq '';
479							my(@ret) =
480	44					1064	$self->{pristine_headers} =~ /^\Q$hdr\E[ \t]:[ \t](.*?\n(?![ \t]))/smgi;
481							# taintedness is retained by "use re 'taint'" (fix in bug 5283 now redundant)
482	44	100				181	if (!@ret) {
		50
483	34					188	return $self->get_header($hdr);
484							} elsif (wantarray) {
485	0					0	return @ret;
486							} else {
487	10					60	return $ret[-1];
488							}
489							}
490
491							=item get_mbox_separator()
492
493							Returns the mbox separator found in the message, or undef if there
494							wasn't one.
495
496							=cut
497
498							sub get_mbox_separator {
499	34			34	1	208	return $_[0]->{mbox_sep};
500							}
501
502							=item get_body()
503
504							Returns an array of the pristine message body, one line per array element.
505
506							=cut
507
508							sub get_body {
509	0			0	1	0	my ($self) = @_;
510	0					0	my @ret = split(/^/m, $self->{pristine_body});
511	0					0	return \@ret;
512							}
513
514							# ---------------------------------------------------------------------------
515
516							=item get_pristine()
517
518							Returns a scalar of the entire pristine message.
519
520							=cut
521
522							sub get_pristine {
523	102			102	1	300	my ($self) = @_;
524	102					404	return $self->{pristine_msg};
525							}
526
527							=item get_pristine_body()
528
529							Returns a scalar of the pristine message body.
530
531							=cut
532
533							sub get_pristine_body {
534	46			46	1	140	my ($self) = @_;
535	46					468	return $self->{pristine_body};
536							}
537
538							# ---------------------------------------------------------------------------
539
540							=item extract_message_metadata($permsgstatus)
541
542							=cut
543
544							sub extract_message_metadata {
545	116			116	1	360	my ($self, $permsgstatus) = @_;
546
547							# do this only once per message, it can be expensive
548	116	100				388	return if $self->{already_extracted_metadata};
549	102					362	$self->{already_extracted_metadata} = 1;
550
551	102					683	$self->{metadata}->extract ($self, $permsgstatus);
552							}
553
554							# ---------------------------------------------------------------------------
555
556							=item $str = get_metadata($hdr)
557
558							=cut
559
560							sub get_metadata {
561	2942			2942	1	5458	my ($self, $hdr) = @_;
562	2942	50				6168	if (!$self->{metadata}) {
563	0					0	warn "metadata: oops! get_metadata() called after finish_metadata()"; return;
	0					0
564							}
565							# dbg("message: get_metadata - %s: %s", $hdr, defined $_ ? $_ : '<undef>')
566							# for $self->{metadata}->{strings}->{lc $hdr};
567
568	2942					8326	$self->{metadata}->{strings}->{lc $hdr};
569							}
570
571							=item put_metadata($hdr, $text)
572
573							=cut
574
575							sub put_metadata {
576	408			408	1	1063	my ($self, $hdr, $text) = @_;
577	408	50				937	if (!$self->{metadata}) {
578	0					0	warn "metadata: oops! put_metadata() called after finish_metadata()"; return;
	0					0
579							}
580							# dbg("message: put_metadata - %s: %s", $hdr, $text);
581	408					1608	$self->{metadata}->{strings}->{lc $hdr} = $text;
582							}
583
584							=item delete_metadata($hdr)
585
586							=cut
587
588							sub delete_metadata {
589	0			0	1	0	my ($self, $hdr) = @_;
590	0	0				0	if (!$self->{metadata}) {
591	0					0	warn "metadata: oops! delete_metadata() called after finish_metadata()"; return;
	0					0
592							}
593	0					0	delete $self->{metadata}->{strings}->{lc $hdr};
594							}
595
596							=item $str = get_all_metadata()
597
598							=cut
599
600							sub get_all_metadata {
601	16			16	1	49	my ($self) = @_;
602
603	16	50				60	if (!$self->{metadata}) {
604	0					0	warn "metadata: oops! get_all_metadata() called after finish_metadata()"; return;
	0					0
605							}
606	16					31	my @ret;
607	16					38	my $keys_ref = $self->{metadata}->{strings};
608	16					121	foreach my $key (sort keys %$keys_ref) {
609	64					151	my $val = $keys_ref->{$key};
610	64	50				151	$val = '' if !defined $val;
611	64					281	push (@ret, "$key: $val\n");
612							}
613	16	50				118	return (wantarray ? @ret : join('', @ret));
614							}
615
616							# ---------------------------------------------------------------------------
617
618							=item finish_metadata()
619
620							Destroys the metadata for this message. Once a message has been
621							scanned fully, the metadata is no longer required. Destroying
622							this will free up some memory.
623
624							=cut
625
626							sub finish_metadata {
627	124			124	1	305	my ($self) = @_;
628	124	50				446	if (defined ($self->{metadata})) {
629	124					906	$self->{metadata}->finish();
630	124					472	delete $self->{metadata};
631							}
632							}
633
634							=item finish()
635
636							Clean up an object so that it can be destroyed.
637
638							=cut
639
640							sub finish {
641	92			92	1	6809	my ($self) = @_;
642
643							# Clean ourself up
644	92					520	$self->finish_metadata();
645
646							# These will only be in the root Message node
647	92					246	delete $self->{'mime_boundary_state'};
648	92					240	delete $self->{'mbox_sep'};
649	92					196	delete $self->{'normalize'};
650	92					205	delete $self->{'body_part_scan_size'};
651	92					246	delete $self->{'rawbody_part_scan_size'};
652	92					252	delete $self->{'pristine_msg'};
653	92					244	delete $self->{'pristine_body'};
654	92					246	delete $self->{'pristine_headers'};
655	92					205	delete $self->{'line_ending'};
656	92					169	delete $self->{'missing_head_body_separator'};
657
658							# Remove the queue variable, in case the body has not been parsed
659	92					171	delete $self->{'parse_queue'};
660
661	92					212	my @toclean = ( $self );
662
663							# Go ahead and clean up all of the Message::Node parts
664	92					357	while (my $part = shift @toclean) {
665							# bug 5557: windows requires tmp file be closed before it can be rm'd
666	138	100				547	if (ref $part->{'raw'} eq 'GLOB') {
667	20	50				237	close($part->{'raw'}) or die "error closing input file: $!";
668							}
669
670							# bug 5858: avoid memory leak with deep MIME structure
671	138	100				393	if (defined ($part->{metadata})) {
672	4					16	$part->{metadata}->finish();
673	4					7	delete $part->{metadata};
674							}
675
676	138					621	delete $part->{'headers'};
677	138					545	delete $part->{'raw_headers'};
678	138					415	delete $part->{'header_order'};
679	138					419	delete $part->{'raw'};
680	138					322	delete $part->{'decoded'};
681	138					251	delete $part->{'rendered'};
682	138					231	delete $part->{'visible_rendered'};
683	138					229	delete $part->{'invisible_rendered'};
684	138					255	delete $part->{'type'};
685	138					227	delete $part->{'rendered_type'};
686
687							# if there are children nodes, add them to the queue of nodes to clean up
688	138	100				653	if (exists $part->{'body_parts'}) {
689	23					35	push(@toclean, @{$part->{'body_parts'}});
	23					48
690	23					81	delete $part->{'body_parts'};
691							}
692							}
693
694							# delete temporary files
695	92	50				286	if ($self->{'tmpfiles'}) {
696	92					172	for my $fn (@{$self->{'tmpfiles'}}) {
	92					295
697	20	50				818	unlink($fn) or warn "cannot unlink $fn: $!";
698							}
699	92					369	delete $self->{'tmpfiles'};
700							}
701							}
702
703							# also use a DESTROY method, just to ensure (as much as possible) that
704							# temporary files are deleted even if the finish() method is omitted
705							sub DESTROY {
706	128			128		604	my $self = shift;
707							# best practices: prevent potential calls to eval and to system routines
708							# in code of a DESTROY method from clobbering global variables $@ and $!
709	128					1120	local($@,$!); # keep outer error handling unaffected by DESTROY
710	128	100				2358	if ($self->{'tmpfiles'}) {
711	36					92	for my $fn (@{$self->{'tmpfiles'}}) {
	36					1854
712	0	0				0	unlink($fn) or dbg("message: cannot unlink $fn: $!");
713							}
714							}
715							}
716
717							# ---------------------------------------------------------------------------
718
719							=item receive_date()
720
721							Return a time_t value with the received date of the current message,
722							or current time if received time couldn't be determined.
723
724							=cut
725
726							sub receive_date {
727	10			10	1	31	my($self) = @_;
728
729	10					55	return Mail::SpamAssassin::Util::receive_date(scalar $self->get_all_headers(0,1));
730							}
731
732							# ---------------------------------------------------------------------------
733
734							=back
735
736							=head1 PARSING METHODS, NON-PUBLIC
737
738							These methods take a RFC2822-esque formatted message and create a tree
739							with all of the MIME body parts included. Those parts will be decoded
740							as necessary, and text/html parts will be rendered into a standard text
741							format, suitable for use in SpamAssassin.
742
743							=over 4
744
745							=item parse_body()
746
747							parse_body() passes the body part that was passed in onto the
748							correct part parser, either _parse_multipart() for multipart/* parts,
749							or _parse_normal() for everything else. Multipart sections become the
750							root of sub-trees, while everything else becomes a leaf in the tree.
751
752							For multipart messages, the first call to parse_body() doesn't create a
753							new sub-tree and just uses the parent node to contain children. All other
754							calls to parse_body() will cause a new sub-tree root to be created and
755							children will exist underneath that root. (this is just so the tree
756							doesn't have a root node which points at the actual root node ...)
757
758							=cut
759
760							sub parse_body {
761	137			137	1	356	my($self) = @_;
762
763							# This shouldn't happen, but just in case, abort.
764	137	50				479	return unless (exists $self->{'parse_queue'});
765
766	137					524	dbg("message: ---- MIME PARSER START ----");
767
768	137					239	while (my $toparse = shift @{$self->{'parse_queue'}}) {
	322					1244
769							# multipart sections are required to have a boundary set ... If this
770							# one doesn't, assume it's malformed and send it to be parsed as a
771							# non-multipart section
772							#
773	185					510	my ($msg, $boundary, $body, $subparse) = @$toparse;
774
775	185	100	66			1254	if ($msg->{'type'} =~ m{^multipart/}i && defined $boundary && $subparse > 0) {
			66
776	20					73	$self->_parse_multipart($toparse);
777							}
778							else {
779							# If it's not multipart, go ahead and just deal with it.
780	165					1001	$self->_parse_normal($toparse);
781
782							# bug 5041: process message/*, but exclude message/partial content types
783	165	100	66			1222	if ($msg->{'type'} =~ m{^message/(?!partial\z)}i && $subparse > 0)
784							{
785							# Just decode the part, but we don't need the resulting string here.
786	4					20	$msg->decode(0);
787
788							# bug 7125: decode and parse only message/rfc822 or message/global,
789							# but do not treat other message/* content types (like the ones listed
790							# here) as a message consisting of a header and a body, as they are not:
791							# message/delivery-status, message/global-delivery-status,
792							# message/feedback-report, message/global-headers,
793							# message/global-disposition-notification,
794							# message/disposition-notification, (and message/partial)
795
796							# bug 5051, bug 3748: check $msg->{decoded}: sometimes message/* parts
797							# have no content, and we get stuck waiting for STDIN, which is bad. :(
798
799	4	50	33			49	if ($msg->{'type'} =~ m{^message/(?:rfc822\|global)\z}i &&
			33
800							defined $msg->{'decoded'} && $msg->{'decoded'} ne '')
801							{
802							# Ok, so this part is still semi-recursive, since M::SA::Message
803							# calls M::SA::Message, but we don't subparse the new message,
804							# and pull a sneaky "steal our child's queue" maneuver to deal
805							# with it on our own time. Reference the decoded array directly
806							# since it's faster.
807							#
808							my $msg_obj = Mail::SpamAssassin::Message->new({
809							message => $msg->{'decoded'},
810							parsenow => 0,
811							normalize => $self->{normalize},
812	4					43	subparse => $subparse - 1,
813							});
814
815							# Add the new message to the current node
816	4					20	$msg->add_body_part($msg_obj);
817
818							# now this is the sneaky bit ... steal the sub-message's parse_queue
819							# and add it to ours. then we'll handle the sub-message in our
820							# normal loop and get all the glory. muhaha. :)
821	4					5	push(@{$self->{'parse_queue'}}, @{$msg_obj->{'parse_queue'}});
	4					10
	4					9
822	4					9	delete $msg_obj->{'parse_queue'};
823
824							# Ok, we've subparsed, so go ahead and remove the raw and decoded
825							# data because we won't need them anymore (the tree under this part
826							# will have that data)
827	4	50				11	if (ref $msg->{'raw'} eq 'GLOB') {
828							# Make sure we close it if it's a temp file -- Bug 5166
829	0	0				0	close($msg->{'raw'})
830							or die "error closing input file: $!";
831							}
832
833	4					10	delete $msg->{'raw'};
834
835	4					90	delete $msg->{'decoded'};
836							}
837							}
838							}
839							}
840
841	137					642	dbg("message: ---- MIME PARSER END ----");
842
843							# we're done parsing, so remove the queue variable
844	137					485	delete $self->{'parse_queue'};
845							}
846
847							=item _parse_multipart()
848
849							Generate a root node, and for each child part call parse_body()
850							to generate the tree.
851
852							=cut
853
854							sub _parse_multipart {
855	20			20		48	my($self, $toparse) = @_;
856
857	20					31	my ($msg, $boundary, $body, $subparse) = @{$toparse};
	20					52
858
859							# we're not supposed to be a leaf, so prep ourselves
860	20					51	$msg->{'body_parts'} = [];
861
862							# the next set of objects will be one level deeper
863	20					26	$subparse--;
864
865	20	50				108	dbg("message: parsing multipart, got boundary: ".(defined $boundary ? $boundary : ''));
866
867							# NOTE: The MIME boundary REs here are very specific to be mostly RFC 1521
868							# compliant, but also allow possible malformations to still work. Please
869							# see Bugzilla bug 3749 for more information before making any changes!
870
871							# ignore preamble per RFC 1521, unless there's no boundary ...
872	20	50				54	if ( defined $boundary ) {
873	20					30	my $line;
874	20					29	my $tmp_line = @{$body};
	20					31
875	20					65	for ($line=0; $line < $tmp_line; $line++) {
876							# dbg("message: multipart line $line: \"" . $body->[$line] . "\"");
877							# specifically look for an opening boundary
878	120	100	100			813	if (substr($body->[$line],0,2) eq '--' # triage
879							&& $body->[$line] =~ /^--\Q$boundary\E\s*$/) {
880							# Make note that we found the opening boundary
881	19					86	$self->{mime_boundary_state}->{$boundary} = 1;
882
883							# if the line after the opening boundary isn't a header, flag it.
884							# we need to make sure that there's actually another line though.
885							# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
886	19	100	66			135	if ($line+1 < $tmp_line && $body->[$line+1] !~ /^[\041-\071\073-\176]+:/) {
887	5					12	$self->{'missing_mime_headers'} = 1;
888							}
889
890	19					55	last;
891							}
892							}
893
894							# Found a boundary, ignore the preamble
895	20	100				48	if ( $line < $tmp_line ) {
896	19					43	splice @{$body}, 0, $line+1;
	19					57
897							}
898
899							# Else, there's no boundary, so leave the whole part...
900							}
901
902							# prepare a new tree node
903	20					149	my $part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
904	20					49	my $in_body = 0;
905	20					44	my $header;
906							my $part_array;
907	20					0	my $found_end_boundary;
908	20					26	my $partcnt = 0;
909
910	20					33	my $line_count = @{$body};
	20					34
911	20					26	foreach ( @{$body} ) {
	20					45
912							# if we're on the last body line, or we find any boundary marker,
913							# deal with the mime part;
914							# a triage before an unlikely-to-match regexp avoids a CPU hotspot
915	2260		100			10253	$found_end_boundary = defined $boundary && substr($_,0,2) eq '--'
916							&& /^--\Q$boundary\E(?:--)?\s*$/;
917	2260	100	100			6458	if ( --$line_count == 0 \|\| $found_end_boundary ) {
918	44					99	my $line = $_; # remember the last line
919
920							# If at last line and no end boundary found, the line belongs to body
921							# TODO:
922							# Is $self->{mime_boundary_state}->{$boundary}-- needed here?
923							# Could "missing end boundary" be a useful rule? Mark it somewhere?
924							# If SA processed truncated message from amavis etc, this could also
925							# be hit legimately..
926	44	50				139	if (!$found_end_boundary) {
		50
927							# TODO: This is duplicate code from few pages down below..
928	0					0	while (length ($_) > MAX_BODY_LINE_LENGTH) {
929	0					0	push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
	0					0
930	0					0	substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
931							}
932	0					0	push ( @{$part_array}, $_ );
	0					0
933							}
934							# per rfc 1521, the CRLF before the boundary is part of the boundary:
935							# NOTE: The CRLF preceding the encapsulation line is conceptually
936							# attached to the boundary so that it is possible to have a part
937							# that does not end with a CRLF (line break). Body parts that must
938							# be considered to end with line breaks, therefore, must have two
939							# CRLFs preceding the encapsulation line, the first of which is part
940							# of the preceding body part, and the second of which is part of the
941							# encapsulation boundary.
942							elsif ($part_array) {
943	44					116	chomp( $part_array->[-1] ); # trim the CRLF that's part of the boundary
944	44	100				144	splice @{$part_array}, -1 if ( $part_array->[-1] eq '' ); # blank line for the boundary only ...
	24					66
945							}
946							else {
947							# Invalid parts can have no body, so fake in a blank body
948							# in that case.
949	0					0	$part_array = [];
950							}
951
952	44					85	my($p_boundary);
953	44					148	($part_msg->{'type'}, $p_boundary) = Mail::SpamAssassin::Util::parse_content_type($part_msg->header('content-type'));
954	44		66			268	$p_boundary \|\|= $boundary;
955	44	50				297	dbg("message: found part of type ".$part_msg->{'type'}.", boundary: ".(defined $p_boundary ? $p_boundary : ''));
956
957							# we've created a new node object, so add it to the queue along with the
958							# text that belongs to that part, then add the new part to the current
959							# node to create the tree.
960	44					70	push(@{$self->{'parse_queue'}}, [ $part_msg, $p_boundary, $part_array, $subparse ]);
	44					164
961	44					196	$msg->add_body_part($part_msg);
962
963							# rfc 1521 says /^--boundary--$/, some MUAs may just require /^--boundary--/
964							# but this causes problems with horizontal lines when the boundary is
965							# made up of dashes as well, etc.
966	44	50				112	if (defined $boundary) {
967							# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
968	44	100	33			671	if ($line =~ /^--\Q${boundary}\E--\s*$/) {
		50
969							# Make a note that we've seen the end boundary
970	20					57	$self->{mime_boundary_state}->{$boundary}--;
971	20					79	last;
972							}
973							elsif ($line_count && $body->[-$line_count] !~ /^[\041-\071\073-\176]+:/) {
974							# if we aren't on an end boundary and there are still lines left, it
975							# means we hit a new start boundary. therefore, the next line ought
976							# to be a mime header. if it's not, mark it.
977	0					0	$self->{'missing_mime_headers'} = 1;
978							}
979							}
980
981							# Maximum parts to process
982	24	50				149	if (++$partcnt == 1000) {
983	0					0	dbg("message: mimepart limit exceeded, stopping parsing");
984	0					0	$self->{'mimepart_limit_exceeded'} = 1;
985	0					0	return;
986							}
987
988							# make sure we start with a new clean node
989	24					33	$in_body = 0;
990	24					107	$part_msg = Mail::SpamAssassin::Message::Node->new({ normalize=>$self->{normalize} });
991	24					59	undef $part_array;
992	24					43	undef $header;
993
994	24					102	next;
995							}
996
997	2216	100				3495	if (!$in_body) {
998							# s/\s+$//; # bug 5127: don't clean this up (yet)
999							# no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
1000	132	100	66			509	if (/^[\041-\071\073-\176]+[ \t]*:/) {
		100
1001	87	100				250	if ($header) {
1002	49					267	my ( $key, $value ) = split ( /:\s*/, $header, 2 );
1003	49					150	$part_msg->header( $key, $value );
1004							}
1005	87					170	$header = $_;
1006	87					278	next;
1007							}
1008							elsif (/^[ \t]/ && $header) {
1009							# $_ =~ s/^\s*//; # bug 5127, again
1010	1					5	$header .= $_;
1011	1					3	next;
1012							}
1013							else {
1014	44	100				117	if ($header) {
1015	38					246	my ( $key, $value ) = split ( /:\s*/, $header, 2 );
1016	38					124	$part_msg->header( $key, $value );
1017							}
1018	44					80	$in_body = 1;
1019
1020							# if there's a blank line separator, that's good. if there isn't,
1021							# it's a body line, so drop through.
1022	44	100				158	if (/^\r?$/) {
1023	43					117	next;
1024							}
1025							else {
1026	1					4	$self->{'missing_mime_head_body_separator'} = 1;
1027							}
1028							}
1029							}
1030
1031							# we run into a perl bug if the lines are astronomically long (probably
1032							# due to lots of regexp backtracking); so split any individual line
1033							# over MAX_BODY_LINE_LENGTH bytes in length. This can wreck HTML
1034							# totally -- but IMHO the only reason a luser would use
1035							# MAX_BODY_LINE_LENGTH-byte lines is to crash filters, anyway.
1036	2085					4787	while (length ($_) > MAX_BODY_LINE_LENGTH) {
1037	0					0	push (@{$part_array}, substr($_, 0, MAX_BODY_LINE_LENGTH)."\n");
	0					0
1038	0					0	substr($_, 0, MAX_BODY_LINE_LENGTH) = '';
1039							}
1040	2085					2649	push ( @{$part_array}, $_ );
	2085					5409
1041							}
1042
1043							# Look for a message epilogue
1044							# originally ignored whitespace: 0.185 0.2037 0.0654 0.757 0.00 0.00 TVD_TAB
1045							# ham FPs were all "." on a line by itself.
1046							# spams seem to only have NULL chars afterwards ?
1047	20	100				146	if ($line_count) {
1048	8					19	for(; $line_count > 0; $line_count--) {
1049	15	100				81	if ($body->[-$line_count] =~ /[^\s.]/) {
1050	2					8	$self->{mime_epilogue_exists} = 1;
1051	2					14	last;
1052							}
1053							}
1054							}
1055
1056							}
1057
1058							=item _parse_normal()
1059
1060							Generate a leaf node and add it to the parent.
1061
1062							=cut
1063
1064							sub _parse_normal {
1065	165			165		504	my($self, $toparse) = @_;
1066
1067	165					299	my ($msg, $boundary, $body) = @{$toparse};
	165					459
1068
1069	165					561	dbg("message: parsing normal part");
1070
1071							# 0: content-type, 1: boundary, 2: charset, 3: filename
1072	165					568	my @ct = Mail::SpamAssassin::Util::parse_content_type($msg->header('content-type'));
1073
1074							# multipart sections are required to have a boundary set ... If this
1075							# one doesn't, assume it's malformed and revert to text/plain
1076	165	50	33			1262	$msg->{'type'} = ($ct[0] !~ m@^multipart/@i \|\| defined $boundary ) ? $ct[0] : 'text/plain';
1077	165					647	$msg->{'charset'} = $ct[2];
1078
1079							# attempt to figure out a name for this attachment if there is one ...
1080	165		100			661	my $disp = $msg->header('content-disposition') \|\| '';
1081	165	100				848	if ($disp =~ /name=\s*"?([^";]+)"?/i) {
		100
1082	1					4	$msg->{'name'} = $1;
1083							}
1084							elsif ($ct[3]) {
1085	2					7	$msg->{'name'} = $ct[3];
1086							}
1087	165	100				557	if ($msg->{'name'}) {
1088	3					18	$msg->{'name'} = Encode::decode("MIME-Header", $msg->{'name'});
1089							}
1090
1091	165					7208	$msg->{'boundary'} = $boundary;
1092
1093							# If the part type is not one that we're likely to want to use, go
1094							# ahead and write the part data out to a temp file -- why keep sucking
1095							# up RAM with something we're not going to use?
1096							#
1097	165	100				1100	if ($msg->{'type'} !~ m@^(?:text/(?:plain\|html)$\|message\b)@) {
1098	20					39	my($filepath, $fh);
1099							eval {
1100	20					77	($filepath, $fh) = Mail::SpamAssassin::Util::secure_tmpfile(); 1;
	20					63
1101	20	50				29	} or do {
1102	0	0				0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
1103	0					0	info("message: failed to create a temp file: %s", $eval_stat);
1104							};
1105	20	50				46	if ($fh) {
1106							# The temp file was created, add it to the list of pending deletions
1107							# we cannot just delete immediately in the POSIX idiom, as this is
1108							# unportable (to win32 at least)
1109	20					25	push @{$self->{tmpfiles}}, $filepath;
	20					58
1110	20					47	dbg("message: storing a message part to file %s", $filepath);
1111	20	50				33	$fh->print(@{$body}) or die "error writing to $filepath: $!";
	20					84
1112	20	50				1090	$fh->flush or die "error writing (flush) to $filepath: $!";
1113	20					117	$msg->{'raw'} = $fh;
1114							}
1115							}
1116
1117							# if the part didn't get a temp file, go ahead and store the data in memory
1118	165	100				659	if (!defined $msg->{'raw'}) {
1119	145					530	dbg("message: storing a body to memory");
1120	145					869	$msg->{'raw'} = $body;
1121							}
1122							}
1123
1124							# ---------------------------------------------------------------------------
1125
1126							sub get_mimepart_digests {
1127	0			0	0	0	my ($self) = @_;
1128
1129	0	0				0	if (!exists $self->{mimepart_digests}) {
1130							# traverse all parts which are leaves, recursively
1131							$self->{mimepart_digests} =
1132	0		0			0	[ map(sha1_hex($_->decode) . ':' . lc($_->{type}\|\|''),
1133							$self->find_parts(qr/^/,1,1)) ];
1134							}
1135	0					0	return $self->{mimepart_digests};
1136							}
1137
1138							# ---------------------------------------------------------------------------
1139
1140							# common code for get_rendered_body_text_array,
1141							# get_visible_rendered_body_text_array, get_invisible_rendered_body_text_array
1142							#
1143							sub get_body_text_array_common {
1144	387			387	0	1093	my ($self, $method_name) = @_;
1145
1146	387					967	my $key = 'text_' . $method_name;
1147	387	100				1084	if (exists $self->{$key}) { return $self->{$key} }
	247					954
1148
1149	140					522	$self->{$key} = [];
1150
1151	140					335	my $scansize = $self->{body_part_scan_size};
1152
1153							# Find all parts which are leaves
1154	140					1086	my @parts = $self->find_parts(qr/./,1);
1155	140	50				595	return $self->{$key} unless @parts;
1156
1157							# the html metadata may have already been set, so let's not bother if it's
1158							# already been done.
1159	140					494	my $html_needs_setting = !exists $self->{metadata}->{html};
1160
1161	140	100	100			799	my $text = $method_name eq 'invisible_rendered' ? ''
1162							: ($self->get_header('subject') \|\| "\n");
1163
1164							# Go through each part
1165	140					699	for (my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1166	141					387	my $p = $parts[$pt];
1167
1168							# put a blank line between parts ...
1169	141	100				507	$text .= "\n" if $text ne '';
1170
1171	141					1023	my($type, $rnd) = $p->$method_name(); # decode this part
1172							# Only text/* types are rendered ...
1173	141	50				552	if (defined $rnd) {
1174							# Truncate to body_part_scan_size
1175	141	100	100			1097	if ($scansize && length($rnd) > $scansize) {
1176							# Try truncating at boundary, nearest 1k block is fine
1177							# Newline first
1178	8					43	my $idx = index($rnd, "\n", $scansize);
1179	8	100	100			37	if ($idx >= 0 && $idx - $scansize <= 1024) {
1180	2					10	substr($rnd, $idx+1) = ''; # truncate
1181							}
1182							else {
1183							# Space is fine too for rendered
1184	6					18	$idx = index($rnd, " ", $scansize);
1185	6	100	66			24	if ($idx >= 0 && $idx - $scansize <= 1024) {
1186	1					6	substr($rnd, $idx+1) = ''; # truncate
1187							}
1188							else {
1189	5					19	substr($rnd, $scansize) = ''; # truncate
1190							}
1191							}
1192							}
1193
1194							# Add to rendered text
1195	141					608	$text .= $rnd;
1196
1197							# TVD - if there are multiple parts, what should we do?
1198							# right now, just use the last one. we may need to give some priority
1199							# at some point, ie: use text/html rendered if it exists, or
1200							# text/plain rendered as html otherwise.
1201	141	100	100			1306	if ($html_needs_setting && $type eq 'text/html') {
1202	7					51	$self->{metadata}->{html} = $p->{html_results};
1203							}
1204							}
1205							}
1206
1207							# whitespace handling (warning: small changes have large effects!)
1208	140					2020	$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1209							# $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space
1210	140					879	$text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space
1211	140					579	$text =~ tr/\f/\n/; # form feeds => newline
1212
1213	140					658	my @textary = split_into_array_of_short_lines($text);
1214	140					490	$self->{$key} = \@textary;
1215
1216	140					959	return $self->{$key};
1217							}
1218
1219							# ---------------------------------------------------------------------------
1220
1221							sub get_rendered_body_text_array {
1222	339			339	0	702	my ($self) = @_;
1223	339					1008	return $self->get_body_text_array_common('rendered');
1224							}
1225
1226							sub get_visible_rendered_body_text_array {
1227	24			24	0	68	my ($self) = @_;
1228	24					134	return $self->get_body_text_array_common('visible_rendered');
1229							}
1230
1231							sub get_invisible_rendered_body_text_array {
1232	24			24	0	64	my ($self) = @_;
1233	24					57	return $self->get_body_text_array_common('invisible_rendered');
1234							}
1235
1236							# ---------------------------------------------------------------------------
1237
1238							sub get_decoded_body_text_array {
1239	110			110	0	278	my ($self) = @_;
1240
1241	110	100				349	if (defined $self->{text_decoded}) { return $self->{text_decoded}; }
	2					18
1242	108					427	$self->{text_decoded} = [ ];
1243
1244	108					261	my $scansize = $self->{rawbody_part_scan_size};
1245
1246							# Find all parts which are leaves
1247	108					649	my @parts = $self->find_parts(qr/^(?:text\|message)\b/i,1);
1248	108	50				484	return $self->{text_decoded} unless @parts;
1249
1250							# Go through each part
1251	108					520	for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
1252							# bug 4843: skip text/calendar parts since they're usually an attachment
1253							# and not displayed
1254	108	50				534	next if ($parts[$pt]->{'type'} eq 'text/calendar');
1255
1256	108					390	my $rawbody = $parts[$pt]->decode();
1257	108					479	my $rawlen = length($rawbody);
1258
1259							# Truncate to rawbody_part_scan_size
1260	108	100	100			824	if ($scansize && $rawlen > $scansize) {
1261							# Try truncating at boundary, nearest 1k block is fine
1262							# Newline first
1263	8					32	my $idx = index($rawbody, "\n", $scansize);
1264	8	100	100			52	if ($idx >= 0 && $idx - $scansize <= 1024) {
1265	2					8	substr($rawbody, $idx+1) = ''; # truncate
1266							}
1267							else {
1268							# Try > next, might be html
1269	6					32	$idx = index($rawbody, '>', $scansize);
1270	6	100	66			27	if ($idx >= 0 && $idx - $scansize <= 1024) {
1271	1					26	substr($rawbody, $idx+1) = ''; # truncate
1272							}
1273							else {
1274							# Space as last effort
1275	5					13	$idx = index($rawbody, ' ', $scansize);
1276	5	50	33			18	if ($idx >= 0 && $idx - $scansize <= 1024) {
1277	0					0	substr($rawbody, $idx+1) = ''; # truncate
1278							}
1279							else {
1280	5					18	substr($rawbody, $scansize) = ''; # truncate
1281							}
1282							}
1283							}
1284							}
1285
1286	108					250	push(@{$self->{text_decoded}},
	108					547
1287							split_into_array_of_short_paragraphs($rawbody));
1288							}
1289
1290	108					466	return $self->{text_decoded};
1291							}
1292
1293							# ---------------------------------------------------------------------------
1294
1295							sub split_into_array_of_short_lines {
1296	140			140	0	303	my @result;
1297	140					1129	foreach my $line (split (/^/m, $_[0])) {
1298	512					1523	while (length ($line) > MAX_BODY_LINE_LENGTH) {
1299							# try splitting "nicely" so that we don't chop a url in half or
1300							# something. if there's no space, then just split at max length.
1301	1					8	my $length = rindex($line, ' ', MAX_BODY_LINE_LENGTH) + 1;
1302	1		50			12	$length \|\|= MAX_BODY_LINE_LENGTH;
1303	1					8	push (@result, substr($line, 0, $length, ''));
1304							}
1305	512					1405	push (@result, $line);
1306							}
1307	140					862	@result;
1308							}
1309
1310							# ---------------------------------------------------------------------------
1311
1312							# Split a text into array of paragraphs of sizes between
1313							# 1-2 * MAX_BODY_LINE_LENGTH, returning the resulting array.
1314							# Mainly used for rawbody splitting purposes
1315
1316							sub split_into_array_of_short_paragraphs {
1317	108			108	0	365	my $text_l = length($_[0]);
1318	108	100				958	return ($_[0]) if $text_l <= MAX_BODY_LINE_LENGTH;
1319	5					14	my(@result,$j,$ofs);
1320							# Try to split in order \n, > (html?), space
1321	5					26	for ($ofs = 0; $text_l - $ofs > 2 * MAX_BODY_LINE_LENGTH; $ofs = $j+1) {
1322	1					14	$j = index($_[0], "\n", $ofs + MAX_BODY_LINE_LENGTH);
1323	1	50	33			9	if ($j < 0 \|\| $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1324	0					0	$j = index($_[0], ">", $ofs + MAX_BODY_LINE_LENGTH);
1325	0	0	0			0	if ($j < 0 \|\| $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1326	0					0	$j = index($_[0], " ", $ofs + MAX_BODY_LINE_LENGTH);
1327	0	0	0			0	if ($j < 0 \|\| $j-$ofs+1 > 2 * MAX_BODY_LINE_LENGTH) {
1328	0					0	$j = $ofs + MAX_BODY_LINE_LENGTH;
1329							}
1330							}
1331							}
1332	1					9	push(@result, substr($_[0], $ofs, $j-$ofs+1));
1333							}
1334	5	50				64	push(@result, substr($_[0], $ofs)) if $ofs < $text_l;
1335	5					36	@result;
1336							}
1337
1338							# ---------------------------------------------------------------------------
1339
1340							1;
1341
1342							=back
1343
1344							=cut