File Coverage

blib/lib/Mail/SpamAssassin/Plugin/BodyEval.pm

Criterion	Covered	Total	%
statement	33	163	20.2
branch	0	70	0.0
condition	1	54	1.8
subroutine	7	21	33.3
pod	1	12	8.3
total	42	320	13.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18
19							use Mail::SpamAssassin::Plugin;
20	22			22		158	use Mail::SpamAssassin::Logger;
	22					49
	22					629
21	22			22		124	use Mail::SpamAssassin::Constants qw(:sa);
	22					54
	22					1316
22	22			22		170
	22					44
	22					3125
23							use strict;
24	22			22		143	use warnings;
	22					41
	22					604
25	22			22		167	# use bytes;
	22					45
	22					747
26							use re 'taint';
27	22			22		132
	22					43
	22					43213
28							our @ISA = qw(Mail::SpamAssassin::Plugin);
29
30							# constructor: register the eval rule
31							my $class = shift;
32							my $mailsaobject = shift;
33	63			63	1	220
34	63					151	# some boilerplate...
35							$class = ref($class) \|\| $class;
36							my $self = $class->SUPER::new($mailsaobject);
37	63		33			415	bless ($self, $class);
38	63					331
39	63					203	# the important bit!
40							$self->register_eval_rule("multipart_alternative_difference");
41							$self->register_eval_rule("multipart_alternative_difference_count");
42	63					264	$self->register_eval_rule("check_blank_line_ratio");
43	63					193	$self->register_eval_rule("tvd_vertical_words");
44	63					226	$self->register_eval_rule("check_stock_info");
45	63					195	$self->register_eval_rule("check_body_length");
46	63					189
47	63					194	$self->register_eval_rule("plaintext_body_length");
48							$self->register_eval_rule("plaintext_sig_length");
49	63					177	$self->register_eval_rule("plaintext_body_sig_ratio");
50	63					181
51	63					192	return $self;
52							}
53	63					478
54							my ($self, $pms, $fulltext, $min, $max) = @_;
55
56							$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
57	0			0	0
58							if (($min == 0 \|\| $pms->{madiff} > $min) &&
59	0	0					($max eq "undef" \|\| $pms->{madiff} <= $max)) {
60							return 1;
61	0	0	0				}
			0
			0
62							return 0;
63	0						}
64
65	0						my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
66							$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
67							return 0 unless $pms->{madiff_html} > $minhtml;
68							return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
69	0			0	0		}
70	0	0
71	0	0					my ($self, $pms) = @_;
72	0						$pms->{madiff} = 0;
73							$pms->{madiff_html} = 0;
74							$pms->{madiff_text} = 0;
75
76	0			0			my $msg = $pms->{msg};
77	0
78	0						# Find all multipart/alternative parts in the message
79	0						my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
80
81	0						# If there are no multipart/alternative sections, skip this test.
82							return if (!@ma);
83
84	0						# Figure out what the MIME content of the message looks like
85							my @content = $msg->content_summary();
86
87	0	0					# Exchange meeting requests come in as m/a text/html text/calendar,
88							# which we want to ignore because of the high FP rate it would cause.
89							#
90	0						if (@content == 3 && $content[2] eq 'text/calendar' &&
91							$content[1] eq 'text/html' &&
92							$content[0] eq 'multipart/alternative') {
93							return;
94							}
95	0	0	0
			0
			0
96							# Go through each of the multipart parts
97							foreach my $part (@ma) {
98	0						my %html;
99							my %text;
100
101							# limit our search to text-based parts
102	0						my @txt = $part->find_parts(qr@^text\b@i);
103	0						foreach my $text (@txt) {
104							# we only care about the rendered version of the part
105							my ($type, $rnd) = $text->rendered();
106							next unless defined $type;
107	0
108	0						# parse the rendered text into tokens. assume they are whitespace
109							# separated, and ignore anything that doesn't have a word-character
110	0						# in it (0-9a-zA-Z_) since those are probably things like bullet
111	0	0					# points, horizontal lines, etc. this assumes that punctuation
112							# in one part will be the same in other parts.
113							#
114							if ($type eq 'text/html') {
115							foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
116							#dbg("eval: HTML: $w");
117							$html{$w}++;
118							}
119	0	0
120	0						# If there are no words, mark if there's at least 1 image ...
121							if (!%html && exists $pms->{html}{inside}{img}) {
122	0						# Use "\n" as the mark since it can't ever occur normally
123							$html{"\n"}=1;
124							}
125							}
126	0	0	0				else {
127							foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
128	0						#dbg("eval: TEXT: $w");
129							$text{$w}++;
130							}
131							}
132	0						}
133
134	0						# How many HTML tokens do we have at the start?
135							my $orig = keys %html;
136							next if ($orig == 0);
137
138							$pms->{madiff_html} = $orig;
139							$pms->{madiff_text} = keys %text;
140	0						dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});
141	0	0
142							# If the token appears at least as many times in the text part as
143	0						# in the html part, remove it from the list of html tokens.
144	0						while(my ($k,$v) = each %text) {
145	0						delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
146							}
147
148							#map { dbg("eval: LEFT: $_") } keys %html;
149	0
150	0	0	0				# In theory, the tokens should be the same in both text and html
151							# parts, so there would be 0 tokens left in the html token list, for
152							# a 0% difference rate. Calculate it here, and record the difference
153							# if it's been the highest so far in this message.
154							my $diff = scalar(keys %html)/$orig*100;
155							$pms->{madiff} = $diff if ($diff > $pms->{madiff});
156
157							dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
158							}
159	0
160	0	0					return;
161							}
162	0
163							my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
164
165	0						if (!defined $minlines \|\| $minlines < 1) {
166							$minlines = 1;
167							}
168
169	0			0	0		my $blank_line_ratio_ref = $pms->{blank_line_ratio};
170
171	0	0	0				if (! exists $blank_line_ratio_ref->{$minlines}) {
172	0						$fulltext = $pms->get_decoded_body_text_array();
173
174							my $blank = 0;
175	0						my $nlines = 0;
176							foreach my $chunk (@$fulltext) {
177	0	0					foreach (split(/^/m, $chunk, -1)) {
178	0						$nlines++;
179							$blank++ if !/\S/;
180	0						}
181	0						}
182	0
183	0						# report -1 if it's a blank message ...
184	0						$blank_line_ratio_ref->{$minlines} =
185	0	0					$nlines < $minlines ? -1 : 100 * $blank / $nlines;
186							}
187
188							return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) \|\|
189							($blank_line_ratio_ref->{$minlines} > $min &&
190	0	0					$blank_line_ratio_ref->{$minlines} <= $max));
191							}
192
193							my ($self, $pms, $text, $min, $max) = @_;
194
195							# klugy
196	0		0				$max = 101 if ($max >= 100);
197
198							if (!defined $pms->{tvd_vertical_words}) {
199							$pms->{tvd_vertical_words} = -1;
200	0			0	0
201							foreach (@{$text}) {
202							my $l = length $_;
203	0	0					next unless ($l > 5);
204							my $spaces = tr/ / /;
205	0	0					my $nonspaces = $l - $spaces;
206	0						my $pct;
207							if ($spaces > $nonspaces \|\| $nonspaces == 0) {
208	0						$pct = 100;
	0
209	0						}
210	0	0					else {
211	0						$pct = int(100*$spaces/$nonspaces);
212	0						}
213	0						$pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
214	0	0	0				}
215	0						}
216
217							dbg("eval: tvd_vertical_words value: $pms->{tvd_vertical_words} / min: $min / max: $max - value must be >= min and < max");
218	0						return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
219							}
220	0	0
221							my ($self, $pms, $fulltext, $min) = @_;
222
223							$self->_check_stock_info($pms) unless (exists $pms->{stock_info});
224	0
225	0	0	0				if ($min == 0 \|\| $pms->{stock_info} >= $min) {
226							return 1;
227							}
228							return 0;
229	0			0	0		}
230
231	0	0					my ($self, $pms) = @_;
232							$pms->{stock_info} = 0;
233	0	0	0
234	0						# Find all multipart/alternative parts in the message
235							my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
236	0						return if (!@parts);
237
238							# Go through each of the multipart parts
239							my %hits;
240	0			0			my $part = $parts[0];
241	0						my ($type, $rnd) = $part->rendered();
242							return unless $type;
243
244	0						# bug 5644,5717: avoid pathological cases where a regexp takes massive amount
245	0	0					# of time by applying the regexp to limited-size text chunks, one at a time
246
247							foreach my $rnd_chunk (
248	0						Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
249	0						{
250	0						foreach ( $rnd_chunk =~ /^\s([^:\s][^:\n]{2,29})\s:\s*\S/mg ) {
251	0	0					my $str = lc $_;
252							$str =~ tr/a-z//cd;
253							#$str =~ s/([a-z])0([a-z])/$1o$2/g;
254
255							if ($str =~ /(
256	0						^trad(?:e\|ing)date\|
257							company(?:name)?\|
258							s\w?(?:t\w?o\w?c\w?k\|y\w?m(?:\w?b\w?o\w?l)?)\|
259	0						t(?:arget\|icker)\|
260	0						(?:opening\|current)p(?:rice)?\|
261	0						p(?:rojected\|osition)\|
262							expectations\|
263							weeks?high\|
264	0	0					marketperformance\|
265							(?:year\|week\|month\|day\|price)(?:target\|estimates?)\|
266							sector\|
267							r(?:ecommendation\|ating)
268							)$/x) {
269							$hits{$1}++;
270							dbg("eval: stock info hit: $1");
271							}
272							}
273							}
274
275							$pms->{stock_info} = scalar keys %hits;
276							dbg("eval: stock info total: ".$pms->{stock_info});
277
278	0						return;
279	0						}
280
281							my ($self, $pms, undef, $min) = @_;
282
283							my $body_length = $pms->{msg}->{pristine_body_length};
284	0						dbg("eval: body_length - %s - check for min of %s", $body_length, $min);
285	0
286							return (defined $body_length && $body_length <= $min) ? 1 : 0;
287	0						}
288
289
290							# For plain text parts with a signature delimiter, evaluate the ratio and
291	0			0	0		# lengths (in bytes) of the body and signature parts.
292							#
293	0						# Arguments: min and (optional) max value
294	0						#
295							# body __SIG_RATIO_EXCESSIVE eval:plaintext_body_sig_ratio('0','0.5')
296	0	0	0
297							my ($self, $pms, undef, $min, $max) = @_;
298
299							$self->_plaintext_body_sig_ratio($pms);
300
301							my $len = $pms->{plaintext_body_sig_ratio}->{body_length};
302
303							return ( defined $len
304							&& $len >= $min
305							&& (defined $max ? $len <= $max : 1) ) ? 1 : 0;
306							}
307
308	0			0	0		my ($self, $pms, undef, $min, $max) = @_;
309
310	0						$self->_plaintext_body_sig_ratio($pms);
311
312	0						my $len = $pms->{plaintext_body_sig_ratio}->{sig_length};
313
314	0	0	0				return ( defined $len
315							&& $len >= $min
316							&& (defined $max ? $len <= $max : 1) ) ? 1 : 0;
317							}
318
319							my ($self, $pms, undef, $min, $max) = @_;
320	0			0	0
321							$self->_plaintext_body_sig_ratio($pms);
322	0
323							my $len = $pms->{plaintext_body_sig_ratio}->{ratio};
324	0
325							return ( defined $len
326	0	0	0				&& $len >= $min
327							&& (defined $max ? $len <= $max : 10**6) ) ? 1 : 0;
328							}
329
330							my ($self, $pms) = @_;
331
332	0			0	0		return if exists $pms->{plaintext_body_sig_ratio};
333
334	0						$pms->{plaintext_body_sig_ratio} = {};
335
336	0						# Find the first text/plain MIME part.
337
338	0	0	0				# Naive approach. This should commonly match the text/plain part we want,
339							# but could be enhanced to better cope with complex MIME structures.
340							my $part = ($pms->{msg}->find_parts(qr/^text\/plain/))[0];
341
342							return unless defined $part;
343
344	0			0			# Decode if necessary, do not render or alter whitespace.
345							my $text = $part->decode();
346	0	0
347							# Find the last occurence of a signature delimiter and get the body and
348	0						# signature lengths.
349							my ($len_b, $len_s) = map { length } $text =~ /(^\|.\n)-- \n(.?)$/s;
350
351							if (! defined $len_b) { # no sig marker, all body
352							$len_b = length $text;
353							$len_s = 0;
354	0						}
355
356	0	0					$pms->{plaintext_body_sig_ratio}->{body_length} = $len_b;
357							$pms->{plaintext_body_sig_ratio}->{sig_length} = $len_s;
358
359	0						$pms->{plaintext_body_sig_ratio}->{ratio} = $len_s ? $len_b/$len_s : 10**6;
360
361							return 1;
362							}
363	0
	0
364
365	0	0					# ---------------------------------------------------------------------------
366	0
367	0						# capability checks for "if can()":
368							#
369
370	0
371	0						1;