File Coverage

blib/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
Criterion Covered Total %
statement 30 132 22.7
branch 0 56 0.0
condition 1 45 2.2
subroutine 7 16 43.7
pod 1 8 12.5
total 39 257 15.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             package Mail::SpamAssassin::Plugin::BodyEval;
19              
20 22     22   155 use Mail::SpamAssassin::Plugin;
  22         50  
  22         710  
21 22     22   116 use Mail::SpamAssassin::Logger;
  22         50  
  22         1399  
22 22     22   163 use Mail::SpamAssassin::Constants qw(:sa);
  22         66  
  22         3191  
23              
24 22     22   199 use strict;
  22         59  
  22         633  
25 22     22   123 use warnings;
  22         37  
  22         687  
26             # use bytes;
27 22     22   136 use re 'taint';
  22         54  
  22         36961  
28              
29             our @ISA = qw(Mail::SpamAssassin::Plugin);
30              
31             # constructor: register the eval rule
32             sub new {
33 63     63 1 206 my $class = shift;
34 63         147 my $mailsaobject = shift;
35              
36             # some boilerplate...
37 63   33     507 $class = ref($class) || $class;
38 63         357 my $self = $class->SUPER::new($mailsaobject);
39 63         167 bless ($self, $class);
40              
41             # the important bit!
42 63         310 $self->register_eval_rule("multipart_alternative_difference");
43 63         236 $self->register_eval_rule("multipart_alternative_difference_count");
44 63         219 $self->register_eval_rule("check_blank_line_ratio");
45 63         221 $self->register_eval_rule("tvd_vertical_words");
46 63         231 $self->register_eval_rule("check_stock_info");
47 63         214 $self->register_eval_rule("check_body_length");
48              
49 63         503 return $self;
50             }
51              
52             sub multipart_alternative_difference {
53 0     0 0   my ($self, $pms, $fulltext, $min, $max) = @_;
54              
55 0 0         $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
56              
57 0 0 0       if (($min == 0 || $pms->{madiff} > $min) &&
      0        
      0        
58             ($max eq "undef" || $pms->{madiff} <= $max)) {
59 0           return 1;
60             }
61 0           return 0;
62             }
63              
64             sub multipart_alternative_difference_count {
65 0     0 0   my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
66 0 0         $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
67 0 0         return 0 unless $pms->{madiff_html} > $minhtml;
68 0           return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
69             }
70              
71             sub _multipart_alternative_difference {
72 0     0     my ($self, $pms) = @_;
73 0           $pms->{madiff} = 0;
74 0           $pms->{madiff_html} = 0;
75 0           $pms->{madiff_text} = 0;
76              
77 0           my $msg = $pms->{msg};
78              
79             # Find all multipart/alternative parts in the message
80 0           my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
81              
82             # If there are no multipart/alternative sections, skip this test.
83 0 0         return if (!@ma);
84              
85             # Figure out what the MIME content of the message looks like
86 0           my @content = $msg->content_summary();
87              
88             # Exchange meeting requests come in as m/a text/html text/calendar,
89             # which we want to ignore because of the high FP rate it would cause.
90             #
91 0 0 0       if (@content == 3 && $content[2] eq 'text/calendar' &&
      0        
      0        
92             $content[1] eq 'text/html' &&
93             $content[0] eq 'multipart/alternative') {
94 0           return;
95             }
96              
97             # Go through each of the multipart parts
98 0           foreach my $part (@ma) {
99 0           my %html;
100             my %text;
101              
102             # limit our search to text-based parts
103 0           my @txt = $part->find_parts(qr@^text\b@i);
104 0           foreach my $text (@txt) {
105             # we only care about the rendered version of the part
106 0           my ($type, $rnd) = $text->rendered();
107 0 0         next unless defined $type;
108              
109             # parse the rendered text into tokens. assume they are whitespace
110             # separated, and ignore anything that doesn't have a word-character
111             # in it (0-9a-zA-Z_) since those are probably things like bullet
112             # points, horizontal lines, etc. this assumes that punctuation
113             # in one part will be the same in other parts.
114             #
115 0 0         if ($type eq 'text/html') {
116 0           foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
117             #dbg("eval: HTML: $w");
118 0           $html{$w}++;
119             }
120              
121             # If there are no words, mark if there's at least 1 image ...
122 0 0 0       if (!%html && exists $pms->{html}{inside}{img}) {
123             # Use "\n" as the mark since it can't ever occur normally
124 0           $html{"\n"}=1;
125             }
126             }
127             else {
128 0           foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
129             #dbg("eval: TEXT: $w");
130 0           $text{$w}++;
131             }
132             }
133             }
134              
135             # How many HTML tokens do we have at the start?
136 0           my $orig = keys %html;
137 0 0         next if ($orig == 0);
138              
139 0           $pms->{madiff_html} = $orig;
140 0           $pms->{madiff_text} = keys %text;
141 0           dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});
142              
143             # If the token appears at least as many times in the text part as
144             # in the html part, remove it from the list of html tokens.
145 0           while(my ($k,$v) = each %text) {
146 0 0 0       delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
147             }
148              
149             #map { dbg("eval: LEFT: $_") } keys %html;
150              
151             # In theory, the tokens should be the same in both text and html
152             # parts, so there would be 0 tokens left in the html token list, for
153             # a 0% difference rate. Calculate it here, and record the difference
154             # if it's been the highest so far in this message.
155 0           my $diff = scalar(keys %html)/$orig*100;
156 0 0         $pms->{madiff} = $diff if ($diff > $pms->{madiff});
157              
158 0           dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
159             }
160              
161 0           return;
162             }
163              
164             sub check_blank_line_ratio {
165 0     0 0   my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
166              
167 0 0 0       if (!defined $minlines || $minlines < 1) {
168 0           $minlines = 1;
169             }
170              
171 0           my $blank_line_ratio_ref = $pms->{blank_line_ratio};
172              
173 0 0         if (! exists $blank_line_ratio_ref->{$minlines}) {
174 0           $fulltext = $pms->get_decoded_body_text_array();
175              
176 0           my $blank = 0;
177 0           my $nlines = 0;
178 0           foreach my $chunk (@$fulltext) {
179 0           foreach (split(/^/m, $chunk, -1)) {
180 0           $nlines++;
181 0 0         $blank++ if !/\S/;
182             }
183             }
184              
185             # report -1 if it's a blank message ...
186 0 0         $blank_line_ratio_ref->{$minlines} =
187             $nlines < $minlines ? -1 : 100 * $blank / $nlines;
188             }
189              
190             return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) ||
191             ($blank_line_ratio_ref->{$minlines} > $min &&
192 0   0       $blank_line_ratio_ref->{$minlines} <= $max));
193             }
194              
195             sub tvd_vertical_words {
196 0     0 0   my ($self, $pms, $text, $min, $max) = @_;
197              
198             # klugy
199 0 0         $max = 101 if ($max >= 100);
200              
201 0 0         if (!defined $pms->{tvd_vertical_words}) {
202 0           $pms->{tvd_vertical_words} = -1;
203              
204 0           foreach (@{$text}) {
  0            
205 0           my $l = length $_;
206 0 0         next unless ($l > 5);
207 0           my $spaces = tr/ / /;
208 0           my $nonspaces = $l - $spaces;
209 0           my $pct;
210 0 0 0       if ($spaces > $nonspaces || $nonspaces == 0) {
211 0           $pct = 100;
212             }
213             else {
214 0           $pct = int(100*$spaces/$nonspaces);
215             }
216 0 0         $pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
217             }
218             }
219              
220 0           dbg("eval: tvd_vertical_words value: $pms->{tvd_vertical_words} / min: $min / max: $max - value must be >= min and < max");
221 0 0 0       return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
222             }
223              
224             sub check_stock_info {
225 0     0 0   my ($self, $pms, $fulltext, $min) = @_;
226              
227 0 0         $self->_check_stock_info($pms) unless (exists $pms->{stock_info});
228              
229 0 0 0       if ($min == 0 || $pms->{stock_info} >= $min) {
230 0           return 1;
231             }
232 0           return 0;
233             }
234              
235             sub _check_stock_info {
236 0     0     my ($self, $pms) = @_;
237 0           $pms->{stock_info} = 0;
238              
239             # Find all multipart/alternative parts in the message
240 0           my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
241 0 0         return if (!@parts);
242              
243             # Go through each of the multipart parts
244 0           my %hits;
245 0           my $part = $parts[0];
246 0           my ($type, $rnd) = $part->rendered();
247 0 0         return unless $type;
248              
249             # bug 5644,5717: avoid pathological cases where a regexp takes massive amount
250             # of time by applying the regexp to limited-size text chunks, one at a time
251              
252 0           foreach my $rnd_chunk (
253             Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
254             {
255 0           foreach ( $rnd_chunk =~ /^\s*([^:\s][^:\n]{2,29})\s*:\s*\S/mg ) {
256 0           my $str = lc $_;
257 0           $str =~ tr/a-z//cd;
258             #$str =~ s/([a-z])0([a-z])/$1o$2/g;
259              
260 0 0         if ($str =~ /(
261             ^trad(?:e|ing)date|
262             company(?:name)?|
263             s\w?(?:t\w?o\w?c\w?k|y\w?m(?:\w?b\w?o\w?l)?)|
264             t(?:arget|icker)|
265             (?:opening|current)p(?:rice)?|
266             p(?:rojected|osition)|
267             expectations|
268             weeks?high|
269             marketperformance|
270             (?:year|week|month|day|price)(?:target|estimates?)|
271             sector|
272             r(?:ecommendation|ating)
273             )$/x) {
274 0           $hits{$1}++;
275 0           dbg("eval: stock info hit: $1");
276             }
277             }
278             }
279              
280 0           $pms->{stock_info} = scalar keys %hits;
281 0           dbg("eval: stock info total: ".$pms->{stock_info});
282              
283 0           return;
284             }
285              
286             sub check_body_length {
287 0     0 0   my ($self, $pms, undef, $min) = @_;
288              
289 0           my $body_length = $pms->{msg}->{pristine_body_length};
290 0           dbg("eval: body_length - %s - check for min of %s", $body_length, $min);
291              
292 0 0 0       return (defined $body_length && $body_length <= $min) ? 1 : 0;
293             }
294              
295             # ---------------------------------------------------------------------------
296              
297             # capability checks for "if can()":
298             #
299 0     0 0   sub has_check_body_length { 1 }
300              
301             1;