File Coverage

blib/lib/Mail/SpamAssassin/Plugin/BodyEval.pm
Criterion Covered Total %
statement 33 163 20.2
branch 0 70 0.0
condition 1 54 1.8
subroutine 7 21 33.3
pod 1 12 8.3
total 42 320 13.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18              
19             use Mail::SpamAssassin::Plugin;
20 22     22   158 use Mail::SpamAssassin::Logger;
  22         49  
  22         629  
21 22     22   124 use Mail::SpamAssassin::Constants qw(:sa);
  22         54  
  22         1316  
22 22     22   170  
  22         44  
  22         3125  
23             use strict;
24 22     22   143 use warnings;
  22         41  
  22         604  
25 22     22   167 # use bytes;
  22         45  
  22         747  
26             use re 'taint';
27 22     22   132  
  22         43  
  22         43213  
28             our @ISA = qw(Mail::SpamAssassin::Plugin);
29              
30             # constructor: register the eval rule
31             my $class = shift;
32             my $mailsaobject = shift;
33 63     63 1 220  
34 63         151 # some boilerplate...
35             $class = ref($class) || $class;
36             my $self = $class->SUPER::new($mailsaobject);
37 63   33     415 bless ($self, $class);
38 63         331  
39 63         203 # the important bit!
40             $self->register_eval_rule("multipart_alternative_difference");
41             $self->register_eval_rule("multipart_alternative_difference_count");
42 63         264 $self->register_eval_rule("check_blank_line_ratio");
43 63         193 $self->register_eval_rule("tvd_vertical_words");
44 63         226 $self->register_eval_rule("check_stock_info");
45 63         195 $self->register_eval_rule("check_body_length");
46 63         189  
47 63         194 $self->register_eval_rule("plaintext_body_length");
48             $self->register_eval_rule("plaintext_sig_length");
49 63         177 $self->register_eval_rule("plaintext_body_sig_ratio");
50 63         181  
51 63         192 return $self;
52             }
53 63         478  
54             my ($self, $pms, $fulltext, $min, $max) = @_;
55              
56             $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
57 0     0 0    
58             if (($min == 0 || $pms->{madiff} > $min) &&
59 0 0         ($max eq "undef" || $pms->{madiff} <= $max)) {
60             return 1;
61 0 0 0       }
      0        
      0        
62             return 0;
63 0           }
64              
65 0           my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
66             $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
67             return 0 unless $pms->{madiff_html} > $minhtml;
68             return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
69 0     0 0   }
70 0 0          
71 0 0         my ($self, $pms) = @_;
72 0           $pms->{madiff} = 0;
73             $pms->{madiff_html} = 0;
74             $pms->{madiff_text} = 0;
75              
76 0     0     my $msg = $pms->{msg};
77 0            
78 0           # Find all multipart/alternative parts in the message
79 0           my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
80              
81 0           # If there are no multipart/alternative sections, skip this test.
82             return if (!@ma);
83              
84 0           # Figure out what the MIME content of the message looks like
85             my @content = $msg->content_summary();
86              
87 0 0         # Exchange meeting requests come in as m/a text/html text/calendar,
88             # which we want to ignore because of the high FP rate it would cause.
89             #
90 0           if (@content == 3 && $content[2] eq 'text/calendar' &&
91             $content[1] eq 'text/html' &&
92             $content[0] eq 'multipart/alternative') {
93             return;
94             }
95 0 0 0        
      0        
      0        
96             # Go through each of the multipart parts
97             foreach my $part (@ma) {
98 0           my %html;
99             my %text;
100              
101             # limit our search to text-based parts
102 0           my @txt = $part->find_parts(qr@^text\b@i);
103 0           foreach my $text (@txt) {
104             # we only care about the rendered version of the part
105             my ($type, $rnd) = $text->rendered();
106             next unless defined $type;
107 0            
108 0           # parse the rendered text into tokens. assume they are whitespace
109             # separated, and ignore anything that doesn't have a word-character
110 0           # in it (0-9a-zA-Z_) since those are probably things like bullet
111 0 0         # points, horizontal lines, etc. this assumes that punctuation
112             # in one part will be the same in other parts.
113             #
114             if ($type eq 'text/html') {
115             foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
116             #dbg("eval: HTML: $w");
117             $html{$w}++;
118             }
119 0 0          
120 0           # If there are no words, mark if there's at least 1 image ...
121             if (!%html && exists $pms->{html}{inside}{img}) {
122 0           # Use "\n" as the mark since it can't ever occur normally
123             $html{"\n"}=1;
124             }
125             }
126 0 0 0       else {
127             foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
128 0           #dbg("eval: TEXT: $w");
129             $text{$w}++;
130             }
131             }
132 0           }
133              
134 0           # How many HTML tokens do we have at the start?
135             my $orig = keys %html;
136             next if ($orig == 0);
137              
138             $pms->{madiff_html} = $orig;
139             $pms->{madiff_text} = keys %text;
140 0           dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});
141 0 0          
142             # If the token appears at least as many times in the text part as
143 0           # in the html part, remove it from the list of html tokens.
144 0           while(my ($k,$v) = each %text) {
145 0           delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
146             }
147              
148             #map { dbg("eval: LEFT: $_") } keys %html;
149 0            
150 0 0 0       # In theory, the tokens should be the same in both text and html
151             # parts, so there would be 0 tokens left in the html token list, for
152             # a 0% difference rate. Calculate it here, and record the difference
153             # if it's been the highest so far in this message.
154             my $diff = scalar(keys %html)/$orig*100;
155             $pms->{madiff} = $diff if ($diff > $pms->{madiff});
156              
157             dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
158             }
159 0            
160 0 0         return;
161             }
162 0            
163             my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
164              
165 0           if (!defined $minlines || $minlines < 1) {
166             $minlines = 1;
167             }
168              
169 0     0 0   my $blank_line_ratio_ref = $pms->{blank_line_ratio};
170              
171 0 0 0       if (! exists $blank_line_ratio_ref->{$minlines}) {
172 0           $fulltext = $pms->get_decoded_body_text_array();
173              
174             my $blank = 0;
175 0           my $nlines = 0;
176             foreach my $chunk (@$fulltext) {
177 0 0         foreach (split(/^/m, $chunk, -1)) {
178 0           $nlines++;
179             $blank++ if !/\S/;
180 0           }
181 0           }
182 0            
183 0           # report -1 if it's a blank message ...
184 0           $blank_line_ratio_ref->{$minlines} =
185 0 0         $nlines < $minlines ? -1 : 100 * $blank / $nlines;
186             }
187              
188             return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) ||
189             ($blank_line_ratio_ref->{$minlines} > $min &&
190 0 0         $blank_line_ratio_ref->{$minlines} <= $max));
191             }
192              
193             my ($self, $pms, $text, $min, $max) = @_;
194              
195             # klugy
196 0   0       $max = 101 if ($max >= 100);
197              
198             if (!defined $pms->{tvd_vertical_words}) {
199             $pms->{tvd_vertical_words} = -1;
200 0     0 0    
201             foreach (@{$text}) {
202             my $l = length $_;
203 0 0         next unless ($l > 5);
204             my $spaces = tr/ / /;
205 0 0         my $nonspaces = $l - $spaces;
206 0           my $pct;
207             if ($spaces > $nonspaces || $nonspaces == 0) {
208 0           $pct = 100;
  0            
209 0           }
210 0 0         else {
211 0           $pct = int(100*$spaces/$nonspaces);
212 0           }
213 0           $pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
214 0 0 0       }
215 0           }
216              
217             dbg("eval: tvd_vertical_words value: $pms->{tvd_vertical_words} / min: $min / max: $max - value must be >= min and < max");
218 0           return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
219             }
220 0 0          
221             my ($self, $pms, $fulltext, $min) = @_;
222              
223             $self->_check_stock_info($pms) unless (exists $pms->{stock_info});
224 0            
225 0 0 0       if ($min == 0 || $pms->{stock_info} >= $min) {
226             return 1;
227             }
228             return 0;
229 0     0 0   }
230              
231 0 0         my ($self, $pms) = @_;
232             $pms->{stock_info} = 0;
233 0 0 0        
234 0           # Find all multipart/alternative parts in the message
235             my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
236 0           return if (!@parts);
237              
238             # Go through each of the multipart parts
239             my %hits;
240 0     0     my $part = $parts[0];
241 0           my ($type, $rnd) = $part->rendered();
242             return unless $type;
243              
244 0           # bug 5644,5717: avoid pathological cases where a regexp takes massive amount
245 0 0         # of time by applying the regexp to limited-size text chunks, one at a time
246              
247             foreach my $rnd_chunk (
248 0           Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
249 0           {
250 0           foreach ( $rnd_chunk =~ /^\s*([^:\s][^:\n]{2,29})\s*:\s*\S/mg ) {
251 0 0         my $str = lc $_;
252             $str =~ tr/a-z//cd;
253             #$str =~ s/([a-z])0([a-z])/$1o$2/g;
254              
255             if ($str =~ /(
256 0           ^trad(?:e|ing)date|
257             company(?:name)?|
258             s\w?(?:t\w?o\w?c\w?k|y\w?m(?:\w?b\w?o\w?l)?)|
259 0           t(?:arget|icker)|
260 0           (?:opening|current)p(?:rice)?|
261 0           p(?:rojected|osition)|
262             expectations|
263             weeks?high|
264 0 0         marketperformance|
265             (?:year|week|month|day|price)(?:target|estimates?)|
266             sector|
267             r(?:ecommendation|ating)
268             )$/x) {
269             $hits{$1}++;
270             dbg("eval: stock info hit: $1");
271             }
272             }
273             }
274              
275             $pms->{stock_info} = scalar keys %hits;
276             dbg("eval: stock info total: ".$pms->{stock_info});
277              
278 0           return;
279 0           }
280              
281             my ($self, $pms, undef, $min) = @_;
282              
283             my $body_length = $pms->{msg}->{pristine_body_length};
284 0           dbg("eval: body_length - %s - check for min of %s", $body_length, $min);
285 0            
286             return (defined $body_length && $body_length <= $min) ? 1 : 0;
287 0           }
288              
289              
290             # For plain text parts with a signature delimiter, evaluate the ratio and
291 0     0 0   # lengths (in bytes) of the body and signature parts.
292             #
293 0           # Arguments: min and (optional) max value
294 0           #
295             # body __SIG_RATIO_EXCESSIVE eval:plaintext_body_sig_ratio('0','0.5')
296 0 0 0        
297             my ($self, $pms, undef, $min, $max) = @_;
298              
299             $self->_plaintext_body_sig_ratio($pms);
300              
301             my $len = $pms->{plaintext_body_sig_ratio}->{body_length};
302              
303             return ( defined $len
304             && $len >= $min
305             && (defined $max ? $len <= $max : 1) ) ? 1 : 0;
306             }
307              
308 0     0 0   my ($self, $pms, undef, $min, $max) = @_;
309              
310 0           $self->_plaintext_body_sig_ratio($pms);
311              
312 0           my $len = $pms->{plaintext_body_sig_ratio}->{sig_length};
313              
314 0 0 0       return ( defined $len
315             && $len >= $min
316             && (defined $max ? $len <= $max : 1) ) ? 1 : 0;
317             }
318              
319             my ($self, $pms, undef, $min, $max) = @_;
320 0     0 0    
321             $self->_plaintext_body_sig_ratio($pms);
322 0            
323             my $len = $pms->{plaintext_body_sig_ratio}->{ratio};
324 0            
325             return ( defined $len
326 0 0 0       && $len >= $min
327             && (defined $max ? $len <= $max : 10**6) ) ? 1 : 0;
328             }
329              
330             my ($self, $pms) = @_;
331              
332 0     0 0   return if exists $pms->{plaintext_body_sig_ratio};
333              
334 0           $pms->{plaintext_body_sig_ratio} = {};
335              
336 0           # Find the first text/plain MIME part.
337              
338 0 0 0       # Naive approach. This should commonly match the text/plain part we want,
339             # but could be enhanced to better cope with complex MIME structures.
340             my $part = ($pms->{msg}->find_parts(qr/^text\/plain/))[0];
341              
342             return unless defined $part;
343              
344 0     0     # Decode if necessary, do not render or alter whitespace.
345             my $text = $part->decode();
346 0 0          
347             # Find the last occurence of a signature delimiter and get the body and
348 0           # signature lengths.
349             my ($len_b, $len_s) = map { length } $text =~ /(^|.*\n)-- \n(.*?)$/s;
350              
351             if (! defined $len_b) { # no sig marker, all body
352             $len_b = length $text;
353             $len_s = 0;
354 0           }
355              
356 0 0         $pms->{plaintext_body_sig_ratio}->{body_length} = $len_b;
357             $pms->{plaintext_body_sig_ratio}->{sig_length} = $len_s;
358              
359 0           $pms->{plaintext_body_sig_ratio}->{ratio} = $len_s ? $len_b/$len_s : 10**6;
360              
361             return 1;
362             }
363 0            
  0            
364              
365 0 0         # ---------------------------------------------------------------------------
366 0            
367 0           # capability checks for "if can()":
368             #
369              
370 0            
371 0           1;