File Coverage

blib/lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
Criterion Covered Total %
statement 45 285 15.7
branch 1 158 0.6
condition 1 101 0.9
subroutine 10 30 33.3
pod 7 19 36.8
total 64 593 10.7


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             MIMEEval - perform various tests against MIME structure and body
21              
22             =head1 SYNOPSIS
23              
24             loadplugin Mail::SpamAssassin::Plugin::MIMEEval
25              
26             body NAME_OF_RULE eval:check_for_mime
27             body NAME_OF_RULE eval:check_for_mime_html
28             body NAME_OF_RULE eval:check_for_mime_html_only
29             body NAME_OF_RULE eval:check_mime_multipart_ratio
30             body NAME_OF_RULE eval:check_msg_parse_flags
31             body NAME_OF_RULE eval:check_for_ascii_text_illegal
32             body NAME_OF_RULE eval:check_abundant_unicode_ratio
33             body NAME_OF_RULE eval:check_for_faraway_charset
34             body NAME_OF_RULE eval:check_for_uppercase
35             body NAME_OF_RULE eval:check_ma_non_text
36             body NAME_OF_RULE eval:check_base64_length
37             body NAME_OF_RULE eval:check_qp_ratio
38              
39             =head1 DESCRIPTION
40              
41             Perform various tests against MIME structure and body.
42              
43             =cut
44              
45             package Mail::SpamAssassin::Plugin::MIMEEval;
46              
47 22     22   163 use strict;
  22         56  
  22         707  
48 22     22   120 use warnings;
  22         55  
  22         688  
49             # use bytes;
50 22     22   124 use re 'taint';
  22         56  
  22         871  
51              
52 22     22   135 use Mail::SpamAssassin::Plugin;
  22         47  
  22         519  
53 22     22   122 use Mail::SpamAssassin::Locales;
  22         68  
  22         564  
54 22     22   133 use Mail::SpamAssassin::Constants qw(:sa CHARSETS_LIKELY_TO_FP_AS_CAPS);
  22         56  
  22         3257  
55 22     22   163 use Mail::SpamAssassin::Util qw(untaint_var);
  22         57  
  22         1104  
56 22     22   164 use Mail::SpamAssassin::Logger;
  22         50  
  22         73433  
57              
58             our @ISA = qw(Mail::SpamAssassin::Plugin);
59              
# Constructor: builds the plugin object through the parent class and
# registers every eval rule this plugin implements.
#
#   $class        - class name, or an existing object whose class is reused
#   $mailsaobject - passed through unchanged to the parent constructor
#
# Returns the blessed plugin object.
sub new {
  my ($proto, $mailsaobject) = @_;

  # some boilerplate: accept a class name or an instance as invocant
  my $class = ref($proto) || $proto;
  my $self  = $class->SUPER::new($mailsaobject);
  bless($self, $class);

  # the important bit: make each eval rule callable from rule definitions
  my @eval_rules = qw(
    check_for_mime
    check_for_mime_html
    check_for_mime_html_only
    check_mime_multipart_ratio
    check_msg_parse_flags
    check_for_ascii_text_illegal
    check_abundant_unicode_ratio
    check_for_faraway_charset
    check_for_uppercase
    check_ma_non_text
    check_base64_length
    check_qp_ratio
  );
  foreach my $rule (@eval_rules) {
    $self->register_eval_rule($rule);
  }

  return $self;
}
86              
87             ###########################################################################
88              
# Report whether $str is dominated by 8-bit characters.
#
# Characters in the range \200-\377 count as "high"; every other character
# counts as "low".  Returns true when the high characters are at least as
# numerous as the low ones and there are more than three of them.  Returns
# false otherwise, including for an undefined or empty string.
sub are_more_high_bits_set {
  my ($self, $str) = @_;

  # nothing to count; also avoids "uninitialized value" warnings
  return 0 unless defined $str;

  # tr/// with an empty replacement list only counts: it neither modifies
  # the string nor builds a list holding every matched character
  my $numhis = ($str =~ tr/\200-\377//);
  my $numlos = length($str) - $numhis;

  return ($numlos <= $numhis && $numhis > 3);
}
98              
99             =over 4
100              
101             =item has_check_for_ascii_text_illegal
102              
103             Adds capability check for "if can()" for check_for_ascii_text_illegal
104              
105             =cut
106              
sub has_check_for_ascii_text_illegal {
  # constant true: its existence is what an "if can()" test looks for
  return 1;
}
108              
109             =item check_for_ascii_text_illegal
110              
111             If a MIME part claims to be text/plain or text/plain;charset=us-ascii and the Content-Transfer-Encoding is 7bit (either explicitly or by default), then we should enforce the actual text being only TAB, NL, SPACE through TILDE, i.e. all 7bit characters excluding NO-WS-CTL (per RFC-2822).
112              
113             All mainstream MTA's get this right.
114              
115             =cut
116              
sub check_for_ascii_text_illegal {
  my ($self, $pms) = @_;

  # the MIME scan fills in the counter; run it only when it has not been
  # stored in $pms yet
  if (!exists $pms->{mime_ascii_text_illegal}) {
    $self->_check_attachments($pms);
  }

  # true when at least one offending line was counted
  my $illegal_lines = $pms->{mime_ascii_text_illegal};
  return ($illegal_lines > 0);
}
123              
124             =item has_check_abundant_unicode_ratio
125              
126             Adds capability check for "if can()" for check_abundant_unicode_ratio
127              
128             =cut
129              
sub has_check_abundant_unicode_ratio {
  # constant true: its existence is what an "if can()" test looks for
  return 1;
}
131              
132             =item check_abundant_unicode_ratio
133              
134             A MIME part claiming to be text/plain and containing Unicode characters must be encoded as quoted-printable or base64, or use UTF data coding (typically with 8bit encoding). Any message in 7bit or 8bit encoding containing (HTML) Unicode entities will not render them as Unicode, but literally.
135              
136             Thus a few such sequences might occur on a mailing list of developers discussing such characters, but a message with a high density of such characters is likely spam.
137              
138             =cut
139              
# Eval rule: true when the stored unicode-entity ratio of the message
# ($pms->{mime_text_unicode_ratio}) is at least $ratio.
#
#   $pms   - per-message state hash; the MIME scan is run if the ratio has
#            not been stored in it yet
#   $ratio - threshold written as a decimal fraction, e.g. "0.02" or ".5"
#            (up to three digits on either side of the point)
#
# Returns 0 when $ratio is missing or not in that format.
sub check_abundant_unicode_ratio {
  my ($self, $pms, undef, $ratio) = @_;

  # a rule written without an argument supplies no ratio; say "no hit"
  # explicitly instead of warning about an uninitialized value below
  return 0 unless defined $ratio;

  # validate ratio: anything that is not a plain decimal fraction is ignored
  return 0 unless ($ratio =~ /^\d{0,3}\.\d{1,3}$/);

  $self->_check_attachments($pms) unless exists $pms->{mime_text_unicode_ratio};
  return ($pms->{mime_text_unicode_ratio} >= $ratio);
}
149              
# Eval rule: returns 1 when the charset named in the message's Content-Type
# header is not acceptable for the configured ok_locales AND the body text
# is dominated by 8-bit characters; returns 0 in every other case.
#
#   $body - reference to the array of body lines
sub check_for_faraway_charset {
  my ($self, $pms, $body) = @_;

  my $ctline = $pms->get('Content-Type',undef);

  my @locales = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});

  # the locale "all" accepts every charset
  foreach my $locale (@locales) {
    return 0 if $locale eq "all";
  }

  # no header, or no charset in it: nothing to judge
  return 0 unless defined $ctline;
  my $charset = get_charset_from_ct_line($ctline);
  return 0 unless defined $charset;

  return 0 if Mail::SpamAssassin::Locales::is_charset_ok_for_locales($charset, @locales);

  # sanity check.  Some charsets (e.g. koi8-r) include the ASCII
  # 7-bit charset as well, so make sure we actually have a high
  # number of 8-bit chars in the body text first.
  my $text = join("\n", @$body);
  return 1 if $self->are_more_high_bits_set($text);

  return 0;
}
177              
# Eval rule: returns the value stored in $pms under the key $test, running
# the MIME scan first when that key is not present yet.
sub check_for_mime {
  my ($self, $pms, undef, $test) = @_;

  if (!exists $pms->{$test}) {
    $self->_check_attachments($pms);
  }

  return $pms->{$test};
}
184              
# Eval rule: any text/html MIME part.  Returns 1 straight away when the
# message's own Content-Type starts with text/html; otherwise true when the
# MIME scan counted at least one text/html part.
sub check_for_mime_html {
  my ($self, $pms) = @_;

  # a top-level text/html type settles it without looking at the parts
  return 1 if $pms->get('Content-Type') =~ m{^text/html}i;

  if (!exists $pms->{mime_body_html_count}) {
    $self->_check_attachments($pms);
  }

  my $html_parts = $pms->{mime_body_html_count};
  return ($html_parts > 0);
}
195              
# Eval rule: HTML without some other type of MIME text part.  Returns 1
# straight away when the message's own Content-Type starts with text/html;
# otherwise true when the MIME scan counted text/html parts but no other
# text parts.
sub check_for_mime_html_only {
  my ($self, $pms) = @_;

  return 1 if $pms->get('Content-Type') =~ m{^text/html}i;

  if (!exists $pms->{mime_body_html_count}) {
    $self->_check_attachments($pms);
  }

  my $html_parts = $pms->{mime_body_html_count};
  my $text_parts = $pms->{mime_body_text_count};
  return ($html_parts > 0 && $text_parts == 0);
}
207              
# Eval rule: true when the stored multipart ratio of the message
# ($pms->{mime_multipart_ratio}) lies in the half-open range [$min, $max).
# The MIME scan is run first if it has not stored its results yet.
sub check_mime_multipart_ratio {
  my ($self, $pms, undef, $min, $max) = @_;

  if (!exists $pms->{mime_multipart_alternative}) {
    $self->_check_attachments($pms);
  }

  my $ratio = $pms->{mime_multipart_ratio};
  return ($ratio >= $min && $ratio < $max);
}
216              
# Record what the headers of one MIME part say, updating counters and flags
# in $pms.
#
#   $ctype   - content type of the part
#   $cte     - Content-Transfer-Encoding value
#   $cd      - Content-Disposition value (may be empty)
#   $charset - charset parameter (may be undefined or empty)
#   $name    - name parameter; accepted but not used by the active code
#
# Keys touched: mime_body_html_count, mime_body_text_count,
# mime_base64_count, mime_qp_count, mime_attachment,
# mime_base64_encoded_text, mime_bad_iso_charset, mime_html_charsets and
# mime_faraway_charset.
#
# The MIME_BASE64_LATIN, MIME_QP_NO_CHARSET and MIME_HTML_NO_CHARSET checks
# that used to live here were zero-hitters and are no longer computed.
sub _check_mime_header {
  my ($self, $pms, $ctype, $cte, $cd, $charset, $name) = @_;

  $charset = '' unless $charset;

  # text parts: text/html is counted apart from every other text type
  if ($ctype eq 'text/html') {
    $pms->{mime_body_html_count}++;
  }
  elsif ($ctype =~ m@^text@i) {
    $pms->{mime_body_text_count}++;
  }

  # transfer encodings: a part is counted as base64 or as QP, never both
  my $is_base64 = ($cte =~ /base64/);
  if ($is_base64) {
    $pms->{mime_base64_count}++;
  }
  elsif ($cte =~ /quoted-printable/) {
    $pms->{mime_qp_count}++;
  }

  $pms->{mime_attachment}++ if ($cd && $cd =~ /attachment/);

  # base64-encoded text with no charset, or one of the US-ASCII names,
  # that is not explicitly disposed as attachment or inline
  if ($is_base64 && $ctype =~ /^text/) {
    my $ascii_or_unset = (!$charset || $charset =~ /(?:us-ascii|ansi_x3\.4-1968|iso-ir-6|ansi_x3\.4-1986|iso_646\.irv:1991|ascii|iso646-us|us|ibm367|cp367|csascii)/);
    my $disposed = ($cd && $cd =~ /^(?:attachment|inline)/);
    if ($ascii_or_unset && !$disposed) {
      $pms->{mime_base64_encoded_text} = 1;
    }
  }

  # something shaped like an ISO charset name that is neither
  # iso-8859-N nor iso-2022-jp/kr
  my $iso_shaped = ($charset =~ /iso-\S+-\S+\b/i);
  if ($iso_shaped && $charset !~ /iso-(?:8859-\d{1,2}|2022-(?:jp|kr))\b/) {
    $pms->{mime_bad_iso_charset} = 1;
  }

  # everything below needs a charset that contains at least one letter
  return unless $charset =~ /[a-z]/i;

  # keep a space-separated list of every charset seen
  if (defined $pms->{mime_html_charsets}) {
    $pms->{mime_html_charsets} .= " ".$charset;
  }
  else {
    $pms->{mime_html_charsets} = $charset;
  }

  # once the faraway flag is set there is nothing more to find out
  return if $pms->{mime_faraway_charset};

  my @locales = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
  my $accept_all = grep { $_ eq "all" } @locales;
  if (!$accept_all &&
      !Mail::SpamAssassin::Locales::is_charset_ok_for_locales($charset, @locales))
  {
    $pms->{mime_faraway_charset} = 1;
  }
  return;
}
293              
# Walk every MIME part of the message once and store the findings in $pms
# for the eval rules of this plugin to read.
#
# Sets $pms->{mime_checked_attachments} and resets, then fills:
#   mime_base64_count, mime_base64_encoded_text, mime_body_html_count,
#   mime_body_text_count, mime_faraway_charset (all via _check_mime_header),
#   mime_missing_boundary, mime_multipart_alternative, mime_multipart_ratio,
#   mime_qp_count, mime_qp_long_line, mime_qp_ratio,
#   mime_ascii_text_illegal and mime_text_unicode_ratio.
# mime_qp_bytes is additionally stored when any QP bytes were seen.
#
# Checks that were zero-hitters or too expensive (mime_base64_blanks,
# mime_base64_illegal, mime_base64_latin, mime_html_no_charset,
# mime_qp_illegal, mime_qp_inline_no_charset, MIME_QP_DEFICIENT) are no
# longer computed.
sub _check_attachments {
  my ($self, $pms) = @_;

  my $qp_bytes = 0;       # total bytes in QP regions
  my $qp_count = 0;       # QP-encoded bytes in QP regions
  my @part_bytes;         # per leaf part: counted bytes (undef for attachments)
  my @part_type;          # per leaf part: content type
  my $normal_chars = 0;   # plain-text characters left after removing entities
  my $unicode_chars = 0;  # number of &#xHHHH; entities removed
  my $part = -1;          # index of the current leaf part

  # indicate the scan has taken place
  $pms->{mime_checked_attachments} = 1;

  # results
  foreach my $key (qw(
    mime_base64_count
    mime_base64_encoded_text
    mime_body_html_count
    mime_body_text_count
    mime_faraway_charset
    mime_missing_boundary
    mime_multipart_alternative
    mime_qp_count
    mime_qp_long_line
    mime_qp_ratio
    mime_ascii_text_illegal
    mime_text_unicode_ratio
  )) {
    $pms->{$key} = 0;
  }
  $pms->{mime_multipart_ratio} = 1.0;

  # Get all parts ...
  foreach my $p ($pms->{msg}->find_parts(qr/./)) {
    # message headers (the boundary parameter is not needed here)
    my ($ctype, undef, $charset, $name) =
      Mail::SpamAssassin::Util::parse_content_type($p->get_header("content-type"));

    $pms->{mime_multipart_alternative} = 1 if $ctype eq 'multipart/alternative';

    # header values are compared lower-cased and without a trailing newline
    my $cte = lc($p->get_header('Content-Transfer-Encoding') || '');
    chomp($cte);
    my $cd = lc($p->get_header('Content-Disposition') || '');
    chomp($cd);

    $charset = lc $charset if ($charset);
    $name = lc $name if ($name);

    $self->_check_mime_header($pms, $ctype, $cte, $cd, $charset, $name);

    # If we're not in a leaf node in the tree, there will be no raw
    # section, so skip it.
    next unless $p->is_leaf();

    $part++;
    $part_type[$part] = $ctype;
    my $is_attachment = ($cd =~ /attachment/);
    $part_bytes[$part] = 0 unless $is_attachment;

    # none of these change while the lines of this part are read
    my $is_plain = ($ctype eq 'text/plain');
    my $count_bytes = ($pms->{mime_multipart_alternative} && !$is_attachment &&
                       ($is_plain || $ctype eq 'text/html'));
    my $is_qp = ($cte eq "quoted-printable");
    my $is_ascii_plain = ($is_plain &&
                          (!defined $charset || $charset eq 'us-ascii'));
    my $is_unencoded_plain = ($is_plain &&
                              ($cte eq '' || $cte eq '7bit' || $cte eq '8bit'));

    foreach my $line (@{$p->raw()}) {
      my $line_len = length($line);

      # size of the text/plain and text/html alternatives, for the ratio
      $part_bytes[$part] += $line_len if $count_bytes;

      # QP statistics; lines starting with "SPAM: " are left out
      if ($is_qp && $line !~ /^SPAM: /) {
        # RFC 5322: Each line SHOULD be no more than 78 characters,
        #           excluding the CRLF.
        # RFC 2045: The Quoted-Printable encoding REQUIRES that
        #           encoded lines be no more than 76 characters long.
        # Bug 5491: 6% of email classified as HAM by SA triggered the
        # MIME_QP_LONG_LINE rule.  Apple Mail can generate a QP-line
        # that is 2 chars too long.  Same goes for Outlook Web Access.
        # So only the RFC 5322 limit is enforced; lines include one
        # trailing \n character, hence the +1.
        $pms->{mime_qp_long_line} = 1 if $line_len > 78+1;
        $qp_bytes += $line_len;

        # count excessive QP bytes
        if (index($line, '=') != -1) {
          # list assignment in scalar context yields the number of matches
          my $qp = () = $line =~ m/=(?:09|3[0-9ABCEF]|[2456][0-9A-F]|7[0-9A-E])/g;
          if ($qp) {
            $qp_count += $qp;
            # tabs and spaces at end of encoded line are okay.  Also, multiple
            # whitespace at the end of a line are OK, like ">=20=20=20=20=20=20".
            my ($trailing) = $line =~ m/((?:=09|=20)+)\s*$/g;
            $qp_count -= (length($trailing) / 3) if $trailing;
          }
        }
      }

      # text/plain with no charset or charset us-ascii: a line containing
      # NUL, CR or any 8-bit character violates the definition of us-ascii
      if ($is_ascii_plain && $line =~ m/[\x00\x0d\x80-\xff]+/) {
        if (would_log('dbg', 'eval')) {
          # show the offending characters as hex in the debug output
          my $str = $line;
          $str =~ s/([\x00\x0d\x80-\xff]+)/'<' . unpack('H*', $1) . '>'/eg;
          dbg("check: ascii_text_illegal: matches " . $str . "\n");
        }
        $pms->{mime_ascii_text_illegal}++;
      }

      # if we're text/plain, we should never see unicode escapes in this
      # format, especially not for 7bit or 8bit.
      if ($is_unencoded_plain) {
        my $text = $line;
        my $subs = ($text =~ s/&#x[0-9A-F]{4};//g);
        $normal_chars += length($text);
        $unicode_chars += $subs;

        if ($subs && would_log('dbg', 'eval')) {
          my $str = $line;
          $str = substr($str, 0, 512) . '...' if (length($str) > 512);
          dbg("check: abundant_unicode: " . $str . " (" . $subs . ")\n");
        }
      }
    }
  }

  # note: mime_qp_count is replaced here by the number of encoded bytes
  if ($qp_bytes) {
    $pms->{mime_qp_ratio} = $qp_count / $qp_bytes;
    $pms->{mime_qp_count} = $qp_count;
    $pms->{mime_qp_bytes} = $qp_bytes;
  }

  if ($normal_chars) {
    $pms->{mime_text_unicode_ratio} = $unicode_chars / $normal_chars;
  }

  if ($pms->{mime_multipart_alternative}) {
    my ($text, $html);
    # bug 4207: we want the size of the last parts
    foreach my $i (reverse 0 .. $part) {
      next if !defined $part_bytes[$i];
      if (!defined($html) && $part_type[$i] eq 'text/html') {
        $html = $part_bytes[$i];
      }
      elsif (!defined($text) && $part_type[$i] eq 'text/plain') {
        $text = $part_bytes[$i];
      }
      last if (defined($html) && defined($text));
    }
    if (defined($text) && defined($html) && $html > 0) {
      $pms->{mime_multipart_ratio} = ($text / $html);
    }
  }

  # Look to see if any multipart boundaries are not "balanced"
  foreach my $val (values %{$pms->{msg}->{mime_boundary_state}}) {
    if ($val != 0) {
      $pms->{mime_missing_boundary} = 1;
      last;
    }
  }
}
498              
499             =item has_check_qp_ratio
500              
501             Adds capability check for "if can()" for check_qp_ratio
502              
503             =cut
504              
sub has_check_qp_ratio {
  # constant true: its existence is what an "if can()" test looks for
  return 1;
}
506              
507             =item check_qp_ratio
508              
509             Takes a minimum ratio to use in an eval rule and checks whether the ratio of
510             quoted-printable encoded bytes to total bytes in an email reaches it.
511              
512             =back
513              
514             =cut
515              
sub check_qp_ratio {
  my ($self, $pms, undef, $min) = @_;

  # make sure the MIME scan has stored its results
  if (!exists $pms->{mime_checked_attachments}) {
    $self->_check_attachments($pms);
  }

  my $qp_ratio = $pms->{mime_qp_ratio};

  dbg("eval: qp_ratio - %s - check for min of %s", $qp_ratio, $min);

  # 1 when a ratio is stored and reaches the minimum, 0 otherwise
  return 0 unless defined $qp_ratio;
  return ($qp_ratio >= $min) ? 1 : 0;
}
527              
528              
# Eval rule: true when the message object carries a defined value under the
# given flag name ($pms->{msg}->{$flag}).
#
# The flag name is normally the third argument.  When the third argument
# is a reference (presumably the body array handed to body evals -- TODO
# confirm), the fourth argument is used instead.
sub check_msg_parse_flags {
  my ($self, $pms, $type, $type2) = @_;

  my $flag = ref($type) ? $type2 : $type;
  return defined $pms->{msg}->{$flag};
}
534              
# Eval rule: true when the percentage of upper-case characters in the body
# is greater than $min and at most $max.
#
#   $body - reference to the array of body lines
#
# The percentage is computed once and cached in $pms->{uppercase}.  It is
# forced to 0 (and the rule returns 0) when the body charset is likely to
# cause false positives, or when fewer than 200 characters were examined.
sub check_for_uppercase {
  my ($self, $pms, $body, $min, $max) = @_;
  local ($_);

  # already computed for this message: just apply the range
  if (exists $pms->{uppercase}) {
    my $cached = $pms->{uppercase};
    return ($cached > $min && $cached <= $max);
  }

  if ($self->body_charset_is_likely_to_fp($pms)) {
    $pms->{uppercase} = 0;
    return 0;
  }

  # Dec 20 2002 jm: trade off some speed for low memory footprint, by
  # iterating over the array computing sums, instead of joining the
  # array into a giant string and working from that.

  my ($examined, $lower, $upper) = (0, 0, 0);
  LINE: foreach (@{$body}) {
    # examine lines in the body that have an intermediate space
    next LINE unless /\S\s+\S/;
    # strip out lingering base64 (currently possible for forwarded messages)
    next LINE if /^(?:[A-Za-z0-9+\/=]{60,76} ){2}/;

    my $line = $_;  # work on a copy so the body itself stays untouched

    # drop everything from an ESC $ B sequence through the last ESC ( B
    # on the line (described in the original as shift-JIS charset codes)
    $line =~ s/\x1b\$B.*\x1b\(B//gs;

    $examined += length($line);

    # count numerals as lower case, otherwise 'date|mail' is spam
    $lower += ($line =~ tr/a-z0-9//d);
    $upper += ($line =~ tr/A-Z//);
  }

  # report only on mails above a minimum size; otherwise one
  # or two acronyms can throw it off
  if ($examined < 200) {
    $pms->{uppercase} = 0;
    return 0;
  }

  my $counted = $upper + $lower;
  $pms->{uppercase} = ($counted == 0) ? 0 : ($upper / $counted) * 100;

  return ($pms->{uppercase} > $min && $pms->{uppercase} <= $max);
}
586              
# Returns 1 when any charset of the message -- the one in the Content-Type
# header or any recorded in $pms->{mime_html_charsets} -- matches
# CHARSETS_LIKELY_TO_FP_AS_CAPS as a whole; returns 0 otherwise.
sub body_charset_is_likely_to_fp {
  my ($self, $pms) = @_;

  # check for charsets where this test will FP -- iso-2022-jp, gb2312,
  # koi8-r etc.
  #
  if (!exists $pms->{mime_checked_attachments}) {
    $self->_check_attachments($pms);
  }

  # collect the candidates: header charset first, then the per-part ones
  my @charsets;
  my $ctline = $pms->get('Content-Type',undef);
  if (defined $ctline) {
    my $header_charset = get_charset_from_ct_line($ctline);
    push(@charsets, $header_charset) if defined $header_charset;
  }
  if (defined $pms->{mime_html_charsets}) {
    push(@charsets, split(' ', $pms->{mime_html_charsets}));
  }

  my $fp_charsets = CHARSETS_LIKELY_TO_FP_AS_CAPS();
  my $hits = grep { /^${fp_charsets}$/ } @charsets;
  return $hits ? 1 : 0;
}
610              
# Extract the charset parameter from a Content-Type header value.
#
#   $type - the header value, e.g. 'text/plain; charset="utf-8"'
#
# Returns the charset as written (case is preserved), taken from a
# double-quoted, single-quoted or unquoted "charset=" parameter, tried in
# that order.  Returns nothing (undef in scalar context) when $type is
# undefined or carries no charset parameter.
sub get_charset_from_ct_line {
  my $type = shift;
  return unless defined $type;

  return $1 if $type =~ /charset="([^"]+)"/i;
  return $1 if $type =~ /charset='([^']+)'/i;

  # unquoted value: stop at whitespace or at the ";" that separates
  # parameters, so "charset=koi8-r; format=flowed" yields "koi8-r" rather
  # than "koi8-r;"
  return $1 if $type =~ /charset=([^\s;]+)/i;

  return;
}
619              
# came up on the users@ list, look for multipart/alternative parts which
# include non-text parts -- skip certain types which occur normally in ham
#
# Returns 1 as soon as a part found under a multipart/alternative part has
# a type other than multipart/related, application/rtf or text/*; returns 0
# when there is no such part.
sub check_ma_non_text {
  my ($self, $pms) = @_;

  foreach my $alternative ($pms->{msg}->find_parts(qr@^multipart/alternative$@i)) {
    foreach my $child ($alternative->find_parts(qr/./, 1, 0)) {
      my $type = $child->{'type'};
      my $lc_type = lc $type;

      # types that occur normally in ham
      next if ($lc_type eq 'multipart/related' || $lc_type eq 'application/rtf');
      next if ($type =~ m@^text/@i);

      return 1;
    }
  }

  return 0;
}
636              
# Eval rule: true when the longest base64 line of the message is at least
# $min long; always 0 when $max is given and that length exceeds it.
#
# The length is computed once and cached in $pms->{base64_length}.  The
# third argument (the body array) is not needed.
sub check_base64_length {
  my ($self, $pms, undef, $min, $max) = @_;

  if (!defined $pms->{base64_length}) {
    $pms->{base64_length} = $self->_check_base64_length($pms->{msg});
  }
  my $longest = $pms->{base64_length};

  return 0 if (defined $max && $longest > $max);
  return $longest >= $min;
}
651              
# Returns the length of the longest raw line found in the parts of $msg
# whose Content-Transfer-Encoding is base64, or 0 when there is none.
# Parts of type application/ics are left out.
sub _check_base64_length {
  my ($self, $msg) = @_;

  my $longest = 0;

  foreach my $p ($msg->find_parts(qr@.@, 1)) {
    my $ctype =
      Mail::SpamAssassin::Util::parse_content_type($p->get_header('content-type'));

    # FPs from Google Calendar invites, etc.
    # (open question from the original: perhaps limit the check to text
    # and image parts instead?)
    next if ($ctype eq 'application/ics');

    my $cte = lc($p->get_header('content-transfer-encoding') || '');
    next unless ($cte =~ /^base64$/);

    foreach my $line (@{$p->raw()}) {
      my $line_len = length $line;
      $longest = $line_len if $line_len > $longest;
    }
  }

  return $longest;
}
675              
676             1;