File Coverage

blib/lib/Mail/SpamAssassin/Plugin/MIMEEval.pm
Criterion Covered Total %
statement 45 293 15.3
branch 1 174 0.5
condition 1 101 0.9
subroutine 10 30 33.3
pod 7 19 36.8
total 64 617 10.3


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             MIMEEval - perform various tests against MIME structure and body
21              
22             =head1 SYNOPSIS
23              
24             loadplugin Mail::SpamAssassin::Plugin::MIMEEval
25              
26             body NAME_OF_RULE eval:check_for_mime
27             body NAME_OF_RULE eval:check_for_mime_html
28             body NAME_OF_RULE eval:check_for_mime_html_only
29             body NAME_OF_RULE eval:check_mime_multipart_ratio
30             body NAME_OF_RULE eval:check_msg_parse_flags
31             body NAME_OF_RULE eval:check_for_ascii_text_illegal
32             body NAME_OF_RULE eval:check_abundant_unicode_ratio
33             body NAME_OF_RULE eval:check_for_faraway_charset
34             body NAME_OF_RULE eval:check_for_uppercase
35             body NAME_OF_RULE eval:check_ma_non_text
36             body NAME_OF_RULE eval:check_base64_length
37             body NAME_OF_RULE eval:check_qp_ratio
38              
39             =head1 DESCRIPTION
40              
41             Perform various tests against MIME structure and body.
42              
43             =cut
44              
45             package Mail::SpamAssassin::Plugin::MIMEEval;
46              
47 21     21   159 use strict;
  21         54  
  21         622  
48 21     21   129 use warnings;
  21         49  
  21         668  
49             # use bytes;
50 21     21   121 use re 'taint';
  21         52  
  21         686  
51              
52 21     21   146 use Mail::SpamAssassin::Plugin;
  21         56  
  21         506  
53 21     21   138 use Mail::SpamAssassin::Locales;
  21         52  
  21         582  
54 21     21   128 use Mail::SpamAssassin::Constants qw(:sa CHARSETS_LIKELY_TO_FP_AS_CAPS);
  21         55  
  21         3503  
55 21     21   162 use Mail::SpamAssassin::Util qw(untaint_var);
  21         60  
  21         1146  
56 21     21   156 use Mail::SpamAssassin::Logger;
  21         68  
  21         75249  
57              
58             our @ISA = qw(Mail::SpamAssassin::Plugin);
59              
60             # constructor: register the eval rule
61             sub new {
62 62     62 1 247 my $class = shift;
63 62         182 my $mailsaobject = shift;
64              
65             # some boilerplate...
66 62   33     812 $class = ref($class) || $class;
67 62         392 my $self = $class->SUPER::new($mailsaobject);
68 62         240 bless ($self, $class);
69              
70             # the important bit!
71 62         363 $self->register_eval_rule("check_for_mime");
72 62         288 $self->register_eval_rule("check_for_mime_html");
73 62         289 $self->register_eval_rule("check_for_mime_html_only");
74 62         246 $self->register_eval_rule("check_mime_multipart_ratio");
75 62         288 $self->register_eval_rule("check_msg_parse_flags");
76 62         221 $self->register_eval_rule("check_for_ascii_text_illegal");
77 62         203 $self->register_eval_rule("check_abundant_unicode_ratio");
78 62         255 $self->register_eval_rule("check_for_faraway_charset");
79 62         211 $self->register_eval_rule("check_for_uppercase");
80 62         243 $self->register_eval_rule("check_ma_non_text");
81 62         203 $self->register_eval_rule("check_base64_length");
82 62         217 $self->register_eval_rule("check_qp_ratio");
83              
84 62         580 return $self;
85             }
86              
87             ###########################################################################
88              
89             sub are_more_high_bits_set {
90 0     0 0 0 my ($self, $str) = @_;
91              
92             # TODO: I suspect a tr// trick may be faster here
93 0         0 my $numhis = () = ($str =~ /[\200-\377]/g);
94 0         0 my $numlos = length($str) - $numhis;
95              
96 0 0       0 ($numlos <= $numhis && $numhis > 3);
97             }
98              
99             =over 4
100              
101             =item has_check_for_ascii_text_illegal
102              
103             Adds capability check for "if can()" for check_for_ascii_text_illegal
104              
105             =cut
106              
107 0     0 1 0 sub has_check_for_ascii_text_illegal { 1 }
108              
109             =item check_for_ascii_text_illegal
110              
111             If a MIME part claims to be text/plain or text/plain;charset=us-ascii and the Content-Transfer-Encoding is 7bit (either explicitly or by default), then we should enforce the actual text being only TAB, NL, SPACE through TILDE, i.e. all 7bit characters excluding NO-WS-CTL (per RFC-2822).
112              
113             All mainstream MTA's get this right.
114              
115             =cut
116              
117             sub check_for_ascii_text_illegal {
118 0     0 1 0 my ($self, $pms) = @_;
119              
120 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
121 0 0       0 return 0 unless exists $pms->{mime_ascii_text_illegal};
122 0         0 return ($pms->{mime_ascii_text_illegal} > 0);
123             }
124              
125             =item has_check_abundant_unicode_ratio
126              
127             Adds capability check for "if can()" for check_abundant_unicode_ratio
128              
129             =cut
130              
131 0     0 1 0 sub has_check_abundant_unicode_ratio { 1 }
132              
133             =item check_abundant_unicode_ratio
134              
135             A MIME part claiming to be text/plain and containing Unicode characters must be encoded as quoted-printable or base64, or use UTF data coding (typically with 8bit encoding). Any message in 7bit or 8bit encoding containing (HTML) Unicode entities will not render them as Unicode, but literally.
136              
137             Thus a few such sequences might occur on a mailing list of developers discussing such characters, but a message with a high density of such characters is likely spam.
138              
139             =cut
140              
141             sub check_abundant_unicode_ratio {
142 0     0 1 0 my ($self, $pms, undef, $ratio) = @_;
143              
144             # validate ratio?
145 0 0       0 return 0 unless ($ratio =~ /^\d{0,3}\.\d{1,3}$/);
146              
147 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
148 0 0       0 return 0 unless exists $pms->{mime_text_unicode_ratio};
149 0         0 return ($pms->{mime_text_unicode_ratio} >= $ratio);
150             }
151              
152             sub check_for_faraway_charset {
153 0     0 0 0 my ($self, $pms, $body) = @_;
154              
155 0         0 my $type = $pms->get('Content-Type',undef);
156              
157 0         0 my @locales = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
158              
159 0 0       0 return 0 if grep { $_ eq "all" } @locales;
  0         0  
160              
161 0 0       0 $type = get_charset_from_ct_line($type) if defined $type;
162              
163 0 0 0     0 if (defined $type &&
164             !Mail::SpamAssassin::Locales::is_charset_ok_for_locales
165             ($type, @locales))
166             {
167             # sanity check. Some charsets (e.g. koi8-r) include the ASCII
168             # 7-bit charset as well, so make sure we actually have a high
169             # number of 8-bit chars in the body text first.
170              
171 0         0 $body = join("\n", @$body);
172 0 0       0 if ($self->are_more_high_bits_set ($body)) {
173 0         0 return 1;
174             }
175             }
176              
177 0         0 0;
178             }
179              
180             sub check_for_mime {
181 0     0 0 0 my ($self, $pms, undef, $test) = @_;
182              
183 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
184 0 0       0 return 0 unless exists $pms->{$test};
185 0         0 return $pms->{$test};
186             }
187              
188             # any text/html MIME part
189             sub check_for_mime_html {
190 0     0 0 0 my ($self, $pms) = @_;
191              
192 0         0 my $ctype = $pms->get('Content-Type');
193 0 0       0 return 1 if $ctype =~ m{^text/html}i;
194              
195 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
196 0 0       0 return 0 unless exists $pms->{mime_body_html_count};
197 0         0 return ($pms->{mime_body_html_count} > 0);
198             }
199              
200             # HTML without some other type of MIME text part
201             sub check_for_mime_html_only {
202 0     0 0 0 my ($self, $pms) = @_;
203              
204 0         0 my $ctype = $pms->get('Content-Type');
205 0 0       0 return 1 if $ctype =~ m{^text/html}i;
206              
207 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
208 0 0       0 return 0 unless exists $pms->{mime_body_html_count};
209 0 0       0 return 0 unless exists $pms->{mime_body_text_count};
210             return ($pms->{mime_body_html_count} > 0 &&
211 0   0     0 $pms->{mime_body_text_count} == 0);
212             }
213              
214             sub check_mime_multipart_ratio {
215 0     0 0 0 my ($self, $pms, undef, $min, $max) = @_;
216              
217 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
218 0 0       0 return 0 unless exists $pms->{mime_multipart_ratio};
219             return ($pms->{mime_multipart_ratio} >= $min &&
220 0   0     0 $pms->{mime_multipart_ratio} < $max);
221             }
222              
223             sub _check_mime_header {
224 0     0   0 my ($self, $pms, $ctype, $cte, $cd, $charset, $name) = @_;
225              
226 0   0     0 $charset ||= '';
227              
228 0 0       0 if ($ctype eq 'text/html') {
    0          
229 0         0 $pms->{mime_body_html_count}++;
230             }
231             elsif ($ctype =~ m@^text@i) {
232 0         0 $pms->{mime_body_text_count}++;
233             }
234              
235 0 0       0 if ($cte =~ /base64/) {
    0          
236 0         0 $pms->{mime_base64_count}++;
237             }
238             elsif ($cte =~ /quoted-printable/) {
239 0         0 $pms->{mime_qp_count}++;
240             }
241              
242 0 0 0     0 if ($cd && $cd =~ /attachment/) {
243 0         0 $pms->{mime_attachment}++;
244             }
245              
246 0 0 0     0 if ($ctype =~ /^text/ &&
      0        
      0        
      0        
      0        
247             $cte =~ /base64/ &&
248             (!$charset || $charset =~ /(?:us-ascii|ansi_x3\.4-1968|iso-ir-6|ansi_x3\.4-1986|iso_646\.irv:1991|ascii|iso646-us|us|ibm367|cp367|csascii)/) &&
249             !($cd && $cd =~ /^(?:attachment|inline)/))
250             {
251 0         0 $pms->{mime_base64_encoded_text} = 1;
252             }
253              
254 0 0 0     0 if ($charset =~ /iso-\S+-\S+\b/i &&
255             $charset !~ /iso-(?:8859-\d{1,2}|2022-(?:jp|kr))\b/)
256             {
257 0         0 $pms->{mime_bad_iso_charset} = 1;
258             }
259              
260             # MIME_BASE64_LATIN: now a zero-hitter
261             # if (!$name &&
262             # $cte =~ /base64/ &&
263             # $charset =~ /\b(?:us-ascii|iso-8859-(?:[12349]|1[0345])|windows-(?:125[0247]))\b/)
264             # {
265             # $pms->{mime_base64_latin} = 1;
266             # }
267              
268             # MIME_QP_NO_CHARSET: now a zero-hitter
269             # if ($cte =~ /quoted-printable/ && $cd =~ /inline/ && !$charset) {
270             # $pms->{mime_qp_inline_no_charset} = 1;
271             # }
272              
273             # MIME_HTML_NO_CHARSET: now a zero-hitter
274             # if ($ctype eq 'text/html' &&
275             # !(defined($charset) && $charset) &&
276             # !($cd && $cd =~ /^(?:attachment|inline)/))
277             # {
278             # $pms->{mime_html_no_charset} = 1;
279             # }
280              
281 0 0       0 if ($charset =~ /[a-z]/i) {
282 0 0       0 if (defined $pms->{mime_html_charsets}) {
283 0         0 $pms->{mime_html_charsets} .= " ".$charset;
284             } else {
285 0         0 $pms->{mime_html_charsets} = $charset;
286             }
287              
288 0 0       0 if (! $pms->{mime_faraway_charset}) {
289 0         0 my @l = Mail::SpamAssassin::Util::get_my_locales($self->{main}->{conf}->{ok_locales});
290              
291 0 0 0     0 if (!(grep { $_ eq "all" } @l) &&
292             !Mail::SpamAssassin::Locales::is_charset_ok_for_locales($charset, @l))
293             {
294 0         0 $pms->{mime_faraway_charset} = 1;
295             }
296             }
297             }
298             }
299              
300             sub _check_attachments {
301 0     0   0 my ($self, $pms) = @_;
302              
303             # MIME status
304 0         0 my $where = -1; # -1 = start, 0 = nowhere, 1 = header, 2 = body
305 0         0 my $qp_bytes = 0; # total bytes in QP regions
306 0         0 my $qp_count = 0; # QP-encoded bytes in QP regions
307 0         0 my @part_bytes; # MIME part total bytes
308             my @part_type; # MIME part types
309              
310 0         0 my $normal_chars = 0; # MIME text bytes that aren't encoded
311 0         0 my $unicode_chars = 0; # MIME text bytes that are unicode entities
312              
313             # MIME header information
314 0         0 my $part = -1; # MIME part index
315              
316             # indicate the scan has taken place
317 0         0 $pms->{mime_checked_attachments} = 1;
318              
319             # results
320             # $pms->{mime_base64_blanks} = 0; # expensive to determine, no longer avail
321 0         0 $pms->{mime_base64_count} = 0;
322 0         0 $pms->{mime_base64_encoded_text} = 0;
323             # $pms->{mime_base64_illegal} = 0;
324             # $pms->{mime_base64_latin} = 0;
325 0         0 $pms->{mime_body_html_count} = 0;
326 0         0 $pms->{mime_body_text_count} = 0;
327 0         0 $pms->{mime_faraway_charset} = 0;
328             # $pms->{mime_html_no_charset} = 0;
329 0         0 $pms->{mime_missing_boundary} = 0;
330 0         0 $pms->{mime_multipart_alternative} = 0;
331 0         0 $pms->{mime_multipart_ratio} = 1.0;
332 0         0 $pms->{mime_qp_count} = 0;
333             # $pms->{mime_qp_illegal} = 0;
334             # $pms->{mime_qp_inline_no_charset} = 0;
335 0         0 $pms->{mime_qp_long_line} = 0;
336 0         0 $pms->{mime_qp_ratio} = 0;
337 0         0 $pms->{mime_ascii_text_illegal} = 0;
338 0         0 $pms->{mime_text_unicode_ratio} = 0;
339              
340             # Get all parts ...
341 0         0 foreach my $p ($pms->{msg}->find_parts(qr/./)) {
342             # message headers
343 0         0 my ($ctype, $boundary, $charset, $name) = Mail::SpamAssassin::Util::parse_content_type($p->get_header("content-type"));
344              
345 0 0       0 if ($ctype eq 'multipart/alternative') {
346 0         0 $pms->{mime_multipart_alternative} = 1;
347             }
348              
349 0   0     0 my $cte = $p->get_header('Content-Transfer-Encoding') || '';
350 0 0       0 chomp($cte = defined($cte) ? lc $cte : "");
351              
352 0   0     0 my $cd = $p->get_header('Content-Disposition') || '';
353 0 0       0 chomp($cd = defined($cd) ? lc $cd : "");
354              
355 0 0       0 $charset = lc $charset if ($charset);
356 0 0       0 $name = lc $name if ($name);
357              
358 0         0 $self->_check_mime_header($pms, $ctype, $cte, $cd, $charset, $name);
359              
360             # If we're not in a leaf node in the tree, there will be no raw
361             # section, so skip it.
362 0 0       0 if (! $p->is_leaf()) {
363 0         0 next;
364             }
365              
366 0         0 $part++;
367 0         0 $part_type[$part] = $ctype;
368 0 0       0 $part_bytes[$part] = 0 if $cd !~ /attachment/;
369              
370 0         0 my $cte_is_base64 = $cte =~ /base64/i;
371 0         0 my $previous = '';
372 0         0 foreach (@{$p->raw()}) {
  0         0  
373              
374             # if ($cte_is_base64) {
375             # if ($previous =~ /^\s*$/ && /^\s*$/) { # expensive, avoid!
376             # $pms->{mime_base64_blanks} = 1; # never used, don't bother
377             # }
378             # # MIME_BASE64_ILLEGAL: now a zero-hitter
379             # # if (m@[^A-Za-z0-9+/=\n]@ || /=[^=\s]/) {
380             # # $pms->{mime_base64_illegal} = 1;
381             # # }
382             # }
383              
384             # if ($pms->{mime_html_no_charset} && $ctype eq 'text/html' && defined $charset) {
385             # $pms->{mime_html_no_charset} = 0;
386             # }
387 0 0 0     0 if ($pms->{mime_multipart_alternative} && $cd !~ /attachment/ &&
      0        
      0        
388             ($ctype eq 'text/plain' || $ctype eq 'text/html')) {
389 0         0 $part_bytes[$part] += length;
390             }
391              
392 0 0 0     0 if ($where != 1 && $cte eq "quoted-printable" && ! /^SPAM: /) {
      0        
393             # RFC 5322: Each line SHOULD be no more than 78 characters,
394             # excluding the CRLF.
395             # RFC 2045: The Quoted-Printable encoding REQUIRES that
396             # encoded lines be no more than 76 characters long.
397             # Bug 5491: 6% of email classified as HAM by SA triggered the
398             # MIME_QP_LONG_LINE rule. Apple Mail can generate a QP-line
399             # that is 2 chars too long. Same goes for Outlook Web Access.
400             # lines include one trailing \n character
401             # if (length > 76+1) { # conforms to RFC 5322 and RFC 2045
402 0 0       0 if (length > 78+1) { # conforms to RFC 5322 only, not RFC 2045
403 0         0 $pms->{mime_qp_long_line} = 1;
404             }
405 0         0 $qp_bytes += length;
406              
407             # MIME_QP_DEFICIENT: zero-hitter now
408              
409             # check for illegal substrings (RFC 2045), hexadecimal values 7F-FF and
410             # control characters other than TAB, or CR and LF as parts of CRLF pairs
411             # if (!$pms->{mime_qp_illegal} && /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]/)
412             # {
413             # $pms->{mime_qp_illegal} = 1;
414             # }
415              
416             # count excessive QP bytes
417 0 0       0 if (index($_, '=') != -1) {
418             # whoever wrote this next line is an evil hacker -- jm
419 0         0 my $qp = () = m/=(?:09|3[0-9ABCEF]|[2456][0-9A-F]|7[0-9A-E])/g;
420 0 0       0 if ($qp) {
421 0         0 $qp_count += $qp;
422             # tabs and spaces at end of encoded line are okay. Also, multiple
423             # whitespace at the end of a line are OK, like ">=20=20=20=20=20=20".
424 0         0 my ($trailing) = m/((?:=09|=20)+)\s*$/g;
425 0 0       0 if ($trailing) {
426 0         0 $qp_count -= (length($trailing) / 3);
427             }
428             }
429             }
430             }
431              
432             # if our charset is ASCII, this should only contain 7-bit characters
433             # except NUL or a free-standing CR. anything else is a violation of
434             # the definition of charset="us-ascii".
435 0 0 0     0 if ($ctype eq 'text/plain' && (!defined $charset || $charset eq 'us-ascii')) {
      0        
436             # no re "strict"; # since perl 5.21.8: Ranges of ASCII printables...
437 0 0       0 if (m/[\x00\x0d\x80-\xff]+/) {
438 0 0       0 if (would_log('dbg', 'eval')) {
439 0         0 my $str = $_;
440 0         0 $str =~ s/([\x00\x0d\x80-\xff]+)/'<' . unpack('H*', $1) . '>'/eg;
  0         0  
441 0         0 dbg("check: ascii_text_illegal: matches " . $str . "\n");
442             }
443 0         0 $pms->{mime_ascii_text_illegal}++;
444             }
445             }
446              
447             # if we're text/plain, we should never see unicode escapes in this
448             # format, especially not for 7bit or 8bit.
449 0 0 0     0 if ($ctype eq 'text/plain' && ($cte eq '' || $cte eq '7bit' || $cte eq '8bit')) {
      0        
450 0         0 my ($text, $subs) = $_;
451              
452 0         0 $subs = $text =~ s/&#x[0-9A-F]{4};//g;
453 0         0 $normal_chars += length($text);
454 0         0 $unicode_chars += $subs;
455              
456 0 0 0     0 if ($subs && would_log('dbg', 'eval')) {
457 0         0 my $str = $_;
458 0 0       0 $str = substr($str, 0, 512) . '...' if (length($str) > 512);
459 0         0 dbg("check: abundant_unicode: " . $str . " (" . $subs . ")\n");
460             }
461             }
462              
463 0         0 $previous = $_;
464             }
465             }
466              
467 0 0       0 if ($qp_bytes) {
468 0         0 $pms->{mime_qp_ratio} = $qp_count / $qp_bytes;
469 0         0 $pms->{mime_qp_count} = $qp_count;
470 0         0 $pms->{mime_qp_bytes} = $qp_bytes;
471             }
472              
473 0 0       0 if ($normal_chars) {
474 0         0 $pms->{mime_text_unicode_ratio} = $unicode_chars / $normal_chars;
475             }
476              
477 0 0       0 if ($pms->{mime_multipart_alternative}) {
478 0         0 my $text;
479             my $html;
480             # bug 4207: we want the size of the last parts
481 0         0 for (my $i = $part; $i >= 0; $i--) {
482 0 0       0 next if !defined $part_bytes[$i];
483 0 0 0     0 if (!defined($html) && $part_type[$i] eq 'text/html') {
    0 0        
484 0         0 $html = $part_bytes[$i];
485             }
486             elsif (!defined($text) && $part_type[$i] eq 'text/plain') {
487 0         0 $text = $part_bytes[$i];
488             }
489 0 0 0     0 last if (defined($html) && defined($text));
490             }
491 0 0 0     0 if (defined($text) && defined($html) && $html > 0) {
      0        
492 0         0 $pms->{mime_multipart_ratio} = ($text / $html);
493             }
494             }
495              
496             # Look to see if any multipart boundaries are not "balanced"
497 0         0 foreach my $val (values %{$pms->{msg}->{mime_boundary_state}}) {
  0         0  
498 0 0       0 if ($val != 0) {
499 0         0 $pms->{mime_missing_boundary} = 1;
500 0         0 last;
501             }
502             }
503             }
504              
505             =item has_check_qp_ratio
506              
507             Adds capability check for "if can()" for check_qp_ratio
508              
509             =cut
510              
511 0     0 1 0 sub has_check_qp_ratio { 1 }
512              
513             =item check_qp_ratio
514              
515             Takes a min ratio to use in eval to see if there is an spamminess to the ratio of
516             quoted printable to total bytes in an email.
517              
518             =back
519              
520             =cut
521              
522             sub check_qp_ratio {
523 0     0 1 0 my ($self, $pms, undef, $min) = @_;
524              
525 0 0       0 $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
526 0 0       0 return 0 unless exists $pms->{mime_qp_ratio};
527              
528 0         0 my $qp_ratio = $pms->{mime_qp_ratio};
529              
530 0         0 dbg("eval: qp_ratio - %s - check for min of %s", $qp_ratio, $min);
531              
532 0 0 0     0 return (defined $qp_ratio && $qp_ratio >= $min) ? 1 : 0;
533             }
534              
535              
536             sub check_msg_parse_flags {
537 81     81 0 347 my($self, $pms, $type, $type2) = @_;
538 81 50       365 $type = $type2 if ref($type);
539 81         1527 return defined $pms->{msg}->{$type};
540             }
541              
542             sub check_for_uppercase {
543 0     0 0   my ($self, $pms, $body, $min, $max) = @_;
544 0           local ($_);
545              
546 0 0         if (exists $pms->{uppercase}) {
547 0   0       return ($pms->{uppercase} > $min && $pms->{uppercase} <= $max);
548             }
549              
550 0 0         if ($self->body_charset_is_likely_to_fp($pms)) {
551 0           $pms->{uppercase} = 0; return 0;
  0            
552             }
553              
554             # Dec 20 2002 jm: trade off some speed for low memory footprint, by
555             # iterating over the array computing sums, instead of joining the
556             # array into a giant string and working from that.
557              
558 0           my $len = 0;
559 0           my $lower = 0;
560 0           my $upper = 0;
561 0           foreach (@{$body}) {
  0            
562             # examine lines in the body that have an intermediate space
563 0 0         next unless /\S\s+\S/;
564             # strip out lingering base64 (currently possible for forwarded messages)
565 0 0         next if /^(?:[A-Za-z0-9+\/=]{60,76} ){2}/;
566              
567 0           my $line = $_; # copy so we don't muck up the original
568              
569             # remove shift-JIS charset codes
570 0           $line =~ s/\x1b\$B.*\x1b\(B//gs;
571              
572 0           $len += length($line);
573              
574             # count numerals as lower case, otherwise 'date|mail' is spam
575 0           $lower += ($line =~ tr/a-z0-9//d);
576 0           $upper += ($line =~ tr/A-Z//);
577             }
578              
579             # report only on mails above a minimum size; otherwise one
580             # or two acronyms can throw it off
581 0 0         if ($len < 200) {
582 0           $pms->{uppercase} = 0;
583 0           return 0;
584             }
585 0 0         if (($upper + $lower) == 0) {
586 0           $pms->{uppercase} = 0;
587             } else {
588 0           $pms->{uppercase} = ($upper / ($upper + $lower)) * 100;
589             }
590              
591 0   0       return ($pms->{uppercase} > $min && $pms->{uppercase} <= $max);
592             }
593              
594             sub body_charset_is_likely_to_fp {
595 0     0 0   my ($self, $pms) = @_;
596              
597             # check for charsets where this test will FP -- iso-2022-jp, gb2312,
598             # koi8-r etc.
599             #
600 0 0         $self->_check_attachments($pms) unless exists $pms->{mime_checked_attachments};
601 0           my @charsets;
602 0           my $type = $pms->get('Content-Type',undef);
603 0 0         $type = get_charset_from_ct_line($type) if defined $type;
604 0 0         push (@charsets, $type) if defined $type;
605 0 0         if (defined $pms->{mime_html_charsets}) {
606 0           push (@charsets, split(' ', $pms->{mime_html_charsets}));
607             }
608              
609 0           my $CHARSETS_LIKELY_TO_FP_AS_CAPS = CHARSETS_LIKELY_TO_FP_AS_CAPS;
610 0           foreach my $charset (@charsets) {
611 0 0         if ($charset =~ /^${CHARSETS_LIKELY_TO_FP_AS_CAPS}$/) {
612 0           return 1;
613             }
614             }
615 0           return 0;
616             }
617              
618             sub get_charset_from_ct_line {
619 0     0 0   my $type = shift;
620 0 0         if (!defined $type) { return; }
  0            
621 0 0         if ($type =~ /charset="([^"]+)"/i) { return $1; }
  0            
622 0 0         if ($type =~ /charset='([^']+)'/i) { return $1; }
  0            
623 0 0         if ($type =~ /charset=(\S+)/i) { return $1; }
  0            
624 0           return;
625             }
626              
627             # came up on the users@ list, look for multipart/alternative parts which
628             # include non-text parts -- skip certain types which occur normally in ham
629             sub check_ma_non_text {
630 0     0 0   my($self, $pms) = @_;
631              
632 0           foreach my $map ($pms->{msg}->find_parts(qr@^multipart/alternative$@i)) {
633 0           foreach my $p ($map->find_parts(qr/./, 1, 0)) {
634 0 0         next if (lc $p->{'type'} eq 'multipart/related');
635 0 0         next if (lc $p->{'type'} eq 'application/rtf');
636 0 0         next if ($p->{'type'} =~ m@^text/@i);
637 0           return 1;
638             }
639             }
640            
641 0           return 0;
642             }
643              
644             sub check_base64_length {
645 0     0 0   my $self = shift;
646 0           my $pms = shift;
647 0           shift; # body array, unnecessary
648 0           my $min = shift;
649 0           my $max = shift;
650              
651 0 0         if (!defined $pms->{base64_length}) {
652 0           $pms->{base64_length} = $self->_check_base64_length($pms->{msg});
653             }
654              
655 0 0 0       return 0 if (defined $max && $pms->{base64_length} > $max);
656 0           return $pms->{base64_length} >= $min;
657             }
658              
659             sub _check_base64_length {
660 0     0     my $self = shift;
661 0           my $msg = shift;
662              
663 0           my $result = 0;
664              
665 0           foreach my $p ($msg->find_parts(qr@.@, 1)) {
666 0           my $ctype=
667             Mail::SpamAssassin::Util::parse_content_type($p->get_header('content-type'));
668              
669             # FPs from Google Calendar invites, etc.
670             # perhaps just limit to test, and image?
671 0 0         next if ($ctype eq 'application/ics');
672              
673 0   0       my $cte = lc($p->get_header('content-transfer-encoding') || '');
674 0 0         next if ($cte !~ /^base64$/);
675 0           foreach my $l ( @{$p->raw()} ) {
  0            
676 0 0         $result = length $l if length $l > $result;
677             }
678             }
679            
680 0           return $result;
681             }
682              
683             1;