File Coverage

blib/lib/Mail/SpamAssassin/PerMsgStatus.pm
Criterion Covered Total %
statement 958 1278 74.9
branch 284 560 50.7
condition 91 224 40.6
subroutine 111 125 88.8
pod 43 94 45.7
total 1487 2281 65.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::PerMsgStatus - per-message status (spam or not-spam)
21              
22             =head1 SYNOPSIS
23              
24             my $spamtest = new Mail::SpamAssassin ({
25             'rules_filename' => '/etc/spamassassin.rules',
26             'userprefs_filename' => $ENV{HOME}.'/.spamassassin/user_prefs'
27             });
28             my $mail = $spamtest->parse();
29              
30             my $status = $spamtest->check ($mail);
31              
32             my $rewritten_mail;
33             if ($status->is_spam()) {
34             $rewritten_mail = $status->rewrite_mail ();
35             }
36             ...
37              
38              
39             =head1 DESCRIPTION
40              
41             The Mail::SpamAssassin C<check()> method returns an object of this
42             class. This object encapsulates all the per-message state.
43              
44             =head1 METHODS
45              
46             =over 4
47              
48             =cut
49              
50              
51             use strict;
52 41     41   251 use warnings;
  41         74  
  41         1177  
53 41     41   208 use re 'taint';
  41         102  
  41         1200  
54 41     41   262  
  41         96  
  41         1346  
55             use Errno qw(ENOENT);
56 41     41   237 use Time::HiRes qw(time);
  41         109  
  41         1976  
57 41     41   271  
  41         101  
  41         512  
58             use Mail::SpamAssassin::Constants qw(:sa);
59 41     41   5211 use Mail::SpamAssassin::AsyncLoop;
  41         121  
  41         5493  
60 41     41   11901 use Mail::SpamAssassin::Conf;
  41         1425  
  41         1304  
61 41     41   299 use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize is_fqdn_valid);
  41         679  
  41         988  
62 41     41   195 use Mail::SpamAssassin::Timeout;
  41         83  
  41         2362  
63 41     41   13263 use Mail::SpamAssassin::Logger;
  41         113  
  41         1209  
64 41     41   240  
  41         72  
  41         42723  
65             our @ISA = qw();
66              
67             # methods defined by the compiled ruleset; deleted in finish_tests()
68             our @TEMPORARY_METHODS;
69              
70             # methods defined by register_plugin_eval_glue(); deleted in finish_tests()
71             our %TEMPORARY_EVAL_GLUE_METHODS;
72              
73             ###########################################################################
74              
75             our %common_tags;
76              
77             BEGIN {
78             %common_tags = (
79              
80             YESNO => sub {
81             my $pms = shift;
82 34         71 $pms->_get_tag_value_for_yesno(@_);
83 34         135 },
84              
85             YESNOCAPS => sub {
86             my $pms = shift;
87 2         5 uc $pms->_get_tag_value_for_yesno(@_);
88 2         10 },
89              
90             SCORE => sub {
91             my $pms = shift;
92 38         103 $pms->_get_tag_value_for_score(@_);
93 38         132 },
94              
95             HITS => sub {
96             my $pms = shift;
97 0         0 $pms->_get_tag_value_for_score(@_);
98 0         0 },
99              
100             REQD => sub {
101             my $pms = shift;
102 38         64 $pms->_get_tag_value_for_required_score(@_);
103 38         176 },
104              
105             VERSION => \&Mail::SpamAssassin::Version,
106              
107             SUBVERSION => sub { $Mail::SpamAssassin::SUB_VERSION },
108 34         88  
109             RULESVERSION => sub {
110             my $pms = shift;
111 0         0 my $conf = $pms->{conf};
112 0         0 my @fnames;
113 0         0 @fnames =
114             keys %{$conf->{update_version}} if $conf->{update_version};
115 0 0       0 @fnames = sort @fnames if @fnames > 1;
  0         0  
116 0 0       0 join(',', map($conf->{update_version}{$_}, @fnames));
117 0         0 },
118              
119             HOSTNAME => sub {
120             my $pms = shift;
121 39         84 $pms->{conf}->{report_hostname} ||
122             Mail::SpamAssassin::Util::fq_hostname();
123 39 50       282 },
124              
125             REMOTEHOSTNAME => sub {
126             my $pms = shift;
127 0         0 $pms->{tag_data}->{'REMOTEHOSTNAME'} || "localhost";
128 0 0       0 },
129              
130             REMOTEHOSTADDR => sub {
131             my $pms = shift;
132 0         0 $pms->{tag_data}->{'REMOTEHOSTADDR'} || "127.0.0.1";
133 0 0       0 },
134              
135             LASTEXTERNALIP => sub {
136             my $pms = shift;
137 0         0 my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
138 0         0 $lasthop ? $lasthop->{ip} : '';
139 0 0       0 },
140              
141             LASTEXTERNALRDNS => sub {
142             my $pms = shift;
143 0         0 my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
144 0         0 $lasthop ? $lasthop->{rdns} : '';
145 0 0       0 },
146              
147             LASTEXTERNALHELO => sub {
148             my $pms = shift;
149 0         0 my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
150 0         0 $lasthop ? $lasthop->{helo} : '';
151 0 0       0 },
152              
153             CONTACTADDRESS => sub {
154             my $pms = shift;
155 4         10 $pms->{conf}->{report_contact};
156 4         16 },
157              
158             BAYES => sub {
159             my $pms = shift;
160 0         0 defined $pms->{bayes_score} ? sprintf("%3.4f", $pms->{bayes_score})
161             : "0.5";
162 0 0       0 },
163              
164             DATE => sub { Mail::SpamAssassin::Util::time_to_rfc822_date() },
165 0         0  
166             STARS => sub {
167             my $pms = shift;
168 34         71 my $arg = (shift || "*");
169 34   50     132 my $length = int($pms->{score});
170 34         93 $length = 50 if $length > 50;
171 34 50       95 # avoid a perl 5.21 warning: "Negative repeat count does nothing"
172             $length > 0 ? $arg x $length : '';
173 34 100       143 },
174              
175             AUTOLEARN => sub {
176             my $pms = shift;
177 34         97 $pms->get_autolearn_status();
178 34         150 },
179              
180             AUTOLEARNSCORE => sub {
181             my $pms = shift;
182 0         0 $pms->get_autolearn_points();
183 0         0 },
184              
185             TESTS => sub {
186             my $pms = shift;
187 34         61 my $arg = (shift || ',');
188 34   50     143 join($arg, sort @{$pms->{test_names_hit}}) || "none";
189 34 100       73 },
  34         226  
190              
191             SUBTESTS => sub {
192             my $pms = shift;
193 0         0 my $arg = (shift || ',');
194 0   0     0 join($arg, sort @{$pms->{subtest_names_hit}}) || "none";
195 0 0       0 },
  0         0  
196              
197             SUBTESTSCOLLAPSED => sub {
198             my $pms = shift;
199 0         0 my $arg = (shift || ',');
200 0   0     0 my (@subtests) = $pms->get_names_of_subtests_hit("collapsed");
201 0         0 join($arg, sort @subtests) || "none";
202 0 0       0 },
203              
204             TESTSSCORES => sub {
205             my $pms = shift;
206 0         0 my $arg = (shift || ",");
207 0   0     0 my $scores = $pms->{conf}->{scores};
208 0         0 join($arg, map($_ . "=" . ($scores->{$_} || '0'),
209             sort @{$pms->{test_names_hit}})) || "none";
210 0 0 0     0 },
  0         0  
211              
212             PREVIEW => sub {
213             my $pms = shift;
214 4         9 $pms->get_content_preview();
215 4         24 },
216              
217             REPORT => sub {
218             my $pms = shift;
219 3         7 "\n" . ($pms->{tag_data}->{REPORT} || "");
220 3   50     25 },
221              
222             SUBJPREFIX => sub {
223             my $pms = shift;
224 0         0 ($pms->{tag_data}->{SUBJPREFIX} || "");
225 0 0       0 },
226              
227             HEADER => sub {
228             my $pms = shift;
229 0         0 my $hdr = shift;
230 0         0 return if !$hdr;
231 0 0       0 $pms->get($hdr,undef);
232 0         0 },
233              
234             TIMING => sub {
235             my $pms = shift;
236 0         0 $pms->{main}->timer_report();
237 0         0 },
238              
239             ADDEDHEADERHAM => sub {
240             my $pms = shift;
241 0         0 $pms->_get_added_headers('headers_ham');
242 0         0 },
243              
244             ADDEDHEADERSPAM => sub {
245             my $pms = shift;
246 0         0 $pms->_get_added_headers('headers_spam');
247 0         0 },
248              
249             ADDEDHEADER => sub {
250             my $pms = shift;
251 0         0 $pms->_get_added_headers(
252             $pms->{is_spam} ? 'headers_spam' : 'headers_ham');
253 0 0       0 },
254              
255             );
256 41     41   525826 }
257              
258             my $class = shift;
259             $class = ref($class) || $class;
260 155     155 0 760 my ($main, $msg, $opts) = @_;
261 155   33     931  
262 155         465 my $self = {
263             'main' => $main,
264             'msg' => $msg,
265             'score' => 0,
266             'test_log_msgs' => { },
267             'test_names_hit' => [ ],
268             'subtest_names_hit' => [ ],
269             'spamd_result_log_items' => [ ],
270             'tests_already_hit' => { },
271             'c' => { },
272             'tag_data' => { },
273             'rule_errors' => 0,
274             'disable_auto_learning' => 0,
275             'auto_learn_status' => undef,
276             'auto_learn_force_status' => undef,
277             'conf' => $main->{conf},
278             'async' => Mail::SpamAssassin::AsyncLoop->new($main),
279             'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
280             'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
281             'uri_detail_list' => { },
282 155         2112 'subjprefix' => "",
283             };
284              
285             dbg("check: pms new, time limit in %.3f s",
286             $self->{master_deadline} - time) if $self->{master_deadline};
287              
288 155 50       1502 if (defined $opts && $opts->{disable_auto_learning}) {
289             $self->{disable_auto_learning} = 1;
290 155 50 66     747 }
291 49         123  
292             # used with "mass-check --loghits"
293             if ($self->{main}->{save_pattern_hits}) {
294             $self->{save_pattern_hits} = 1;
295 155 50       505 $self->{pattern_hits} = { };
296 0         0 }
297 0         0  
298             delete $self->{should_log_rule_hits};
299             my $dbgcache = would_log('dbg', 'rules');
300 155         277 if ($dbgcache || $self->{save_pattern_hits}) {
301 155         641 $self->{should_log_rule_hits} = 1;
302 155 50 33     1016 }
303 0         0  
304             # known valid tags that might not get their entry in pms->{tag_data}
305             # in some circumstances
306             my $tag_data_ref = $self->{tag_data};
307             foreach (qw(SUMMARY REPORT SUBJPREFIX RBL)) { $tag_data_ref->{$_} = '' }
308 155         342 foreach (qw(AWL AWLMEAN AWLCOUNT AWLPRESCORE
309 155         431 DCCB DCCR DCCREP PYZOR DKIMIDENTITY DKIMDOMAIN DKIMSELECTOR
  620         1553  
310 155         531 BAYESTC BAYESTCLEARNED BAYESTCSPAMMY BAYESTCHAMMY
311             HAMMYTOKENS SPAMMYTOKENS TOKENSUMMARY)) {
312             $tag_data_ref->{$_} = undef; # exist, but undefined
313             }
314 2790         6106  
315             bless ($self, $class);
316             $self;
317 155         458 }
318 155         570  
319             my ($self) = shift;
320             local $@;
321             eval { $self->delete_fulltext_tmpfile() }; # Bug 5808
322 152     152   3164707 }
323 152         331  
324 152         409 ###########################################################################
  152         718  
325              
326             =item $status->check ()
327              
328             Runs the SpamAssassin rules against the message pointed to by the object.
329              
330             =cut
331              
332             my ($self) = shift;
333             my $master_deadline = $self->{master_deadline};
334             if (!$master_deadline) {
335             $self->check_timed(@_);
336 97     97 1 248 } else {
337 97         321 my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
338 97 50       277 my $err = $t->run(sub { $self->check_timed(@_) });
339 0         0 if (time > $master_deadline && !$self->{deadline_exceeded}) {
340             info("check: exceeded time limit in pms check");
341 97         1130 $self->{deadline_exceeded} = 1;
342 97     97   1022 }
  97         442  
343 96 50 33     929 }
344 0         0 }
345 0         0  
346             my ($self) = @_;
347             local ($_);
348              
349             $self->{learned_points} = 0;
350             $self->{body_only_points} = 0;
351 97     97 0 297 $self->{head_only_points} = 0;
352 97         207 $self->{score} = 0;
353              
354 97         401 # clear NetSet cache before every check to prevent it growing too large
355 97         366 foreach my $nset_name (qw(internal_networks trusted_networks msa_networks)) {
356 97         230 my $netset = $self->{conf}->{$nset_name};
357 97         218 $netset->ditch_cache() if $netset;
358             }
359              
360 97         292 $self->{main}->call_plugins ("check_start", { permsgstatus => $self });
361 291         702  
362 291 50       1167 # in order of slowness; fastest first, slowest last.
363             # we do ALL the tests, even if a spam triggers lots of them early on.
364             # this lets us see ludicrously spammish mails (score: 40) etc., which
365 97         647 # we can then immediately submit to spamblocking services.
366             #
367             # TODO: change this to do whitelist/blacklists first? probably a plan
368             # NOTE: definitely need AWL stuff last, for regression-to-mean of score
369              
370             # TVD: we may want to do more than just clearing out the headers, but ...
371             $self->{msg}->delete_header('X-Spam-.*');
372              
373             # Resident Mail::SpamAssassin code will possibly never change score
374             # sets, even if bayes becomes available. So we should do a quick check
375             # to see if we should go from {0,1} to {2,3}. We of course don't need
376 97         648 # to do this switch if we're already using bayes ... ;)
377             my $set = $self->{conf}->get_score_set();
378             if (($set & 2) == 0 && $self->{main}->{bayes_scanner} && $self->{main}->{bayes_scanner}->is_scan_available() && $self->{conf}->{use_bayes_rules}) {
379             dbg("check: scoreset $set but bayes is available, switching scoresets");
380             $self->{conf}->set_score_set ($set|2);
381             }
382 97         533 dbg("check: using scoreset $set in M:S:Pms");
383 97 0 66     1078 # The primary check functionality occurs via a plugin call. For more
      66        
      33        
384 0         0 # information, please see: Mail::SpamAssassin::Plugin::Check
385 0         0 if (!$self->{main}->call_plugins ("check_main", { permsgstatus => $self }))
386             {
387 97         557 # did anything happen? if not, this is fatal
388             if (!$self->{main}->have_plugin("check_main")) {
389             die "check: no loaded plugin implements 'check_main': cannot scan!\n".
390 97 100       398 "Check that the necessary '.pre' files are in the config directory.\n".
391             "At a minimum, v320.pre loads the Check plugin which is required.\n";
392             }
393 1 50       10 }
394 1         21  
395             # delete temporary storage and memory allocation used during checking
396             $self->delete_fulltext_tmpfile();
397              
398             # now that we've finished checking the mail, clear out this cache
399             # to avoid unforeseen side-effects.
400             $self->{c} = { };
401 96         702  
402             # Round the score to 3 decimal places to avoid rounding issues
403             # We assume required_score to be properly rounded already.
404             # add 0 to force it back to numeric representation instead of string.
405 96         1055 $self->{score} = (sprintf "%0.3f", $self->{score}) + 0;
406              
407             dbg("check: is spam? score=".$self->{score}.
408             " required=".$self->{conf}->{required_score});
409             dbg("check: tests=".$self->get_names_of_tests_hit());
410 96         1420 dbg("check: subtests=".$self->get_names_of_subtests_hit("dbg"));
411             $self->{is_spam} = $self->is_spam();
412              
413 96         892 $self->{main}->{resolver}->bgabort();
414 96         555 $self->{main}->call_plugins ("check_end", { permsgstatus => $self });
415 96         497  
416 96         378 1;
417             }
418 96         527  
419 96         473 ###########################################################################
420              
421 96         1592 =item $status->learn()
422              
423             After a mail message has been checked, this method can be called. If the score
424             is outside a certain range around the threshold, ie. if the message is judged
425             more-or-less definitely spam or definitely non-spam, it will be fed into
426             SpamAssassin's learning systems (currently the naive Bayesian classifier),
427             so that future similar mails will be caught.
428              
429             =cut
430              
431             my ($self) = shift;
432             my $master_deadline = $self->{master_deadline};
433             if (!$master_deadline) {
434             $self->learn_timed(@_);
435             } else {
436             my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
437 96     96 1 203 my $err = $t->run(sub { $self->learn_timed(@_) });
438 96         186 if (time > $master_deadline && !$self->{deadline_exceeded}) {
439 96 50       273 info("learn: exceeded time limit in pms learn");
440 0         0 $self->{deadline_exceeded} = 1;
441             }
442 96         414 }
443 96     96   840 }
  96         602  
444 96 50 33     816  
445 0         0 my ($self) = @_;
446 0         0  
447             if (!$self->{conf}->{bayes_auto_learn} ||
448             !$self->{conf}->{use_bayes} ||
449             $self->{disable_auto_learning})
450             {
451             $self->{auto_learn_status} = "disabled";
452 96     96 0 267 return;
453             }
454 96 100 100     692  
      100        
455             my ($isspam, $force_autolearn, $force_autolearn_names, $arrayref);
456             $arrayref = $self->{main}->call_plugins ("autolearn_discriminator", {
457             permsgstatus => $self
458 84         262 });
459 84         1124  
460             $isspam = $arrayref->[0];
461             $force_autolearn = $arrayref->[1];
462 12         36 $force_autolearn_names = $arrayref->[2];
463 12         61  
464             #AUTOLEARN_FORCE FLAG INFORMATION
465             if (defined $force_autolearn and $force_autolearn > 0) {
466             $self->{auto_learn_force_status} = "yes";
467 12         48 if (defined $force_autolearn_names) {
468 12         24 $self->{auto_learn_force_status} .= " ($force_autolearn_names)";
469 12         19 }
470             } else {
471             $self->{auto_learn_force_status} = "no";
472 12 50 66     70 }
473 0         0  
474 0 0       0 if (!defined $isspam) {
475 0         0 $self->{auto_learn_status} = 'no';
476             return;
477             }
478 12         28  
479              
480             my $timer = $self->{main}->time_method("learn");
481 12 100       59  
482 10         28 $self->{main}->call_plugins ("autolearn", {
483 10         144 permsgstatus => $self,
484             isspam => $isspam
485             });
486              
487 2         11 # bug 3704: temporarily override learn's ability to re-learn a message
488             my $orig_learner = $self->{main}->init_learner({ "no_relearn" => 1 });
489 2         11  
490             my $eval_stat;
491             eval {
492             my $learnstatus = $self->{main}->learn ($self->{msg}, undef, $isspam, 0);
493             if ($learnstatus->did_learn()) {
494             $self->{auto_learn_status} = $isspam ? "spam" : "ham";
495 2         13 }
496             # This must wait until the did_learn call.
497 2         5 $learnstatus->finish();
498             $self->{main}->finish_learner(); # for now
499 2         12  
500 2 50       8 if (exists $self->{main}->{bayes_scanner}) {
501 0 0       0 $self->{main}->{bayes_scanner}->force_close();
502             }
503             1;
504 2         8 } or do {
505 2         11 $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
506             };
507 2 50       6  
508 2         9 # reset learner options to their original values
509             $self->{main}->init_learner($orig_learner);
510 2         12  
511 2 50       3 if (defined $eval_stat) {
512 0 0       0 dbg("learn: auto-learning failed: $eval_stat");
  0         0  
513             $self->{auto_learn_status} = "failed";
514             }
515             }
516 2         9  
517             =item $score = $status->get_autolearn_points()
518 2 50       52  
519 0         0 Return the message's score as computed for auto-learning. Certain tests are
520 0         0 ignored:
521              
522             - rules with tflags set to 'learn' (the Bayesian rules)
523              
524             - rules with tflags set to 'userconf' (user white/black-listing rules, etc)
525              
526             - rules with tflags set to 'noautolearn'
527              
528             Also note that auto-learning occurs using scores from either scoreset 0 or 1,
529             depending on what scoreset is used during message check. It is likely that the
530             message check and auto-learn scores will be different.
531              
532             =cut
533              
534             my ($self) = @_;
535             $self->_get_autolearn_points();
536             return $self->{autolearn_points};
537             }
538              
539             =item $score = $status->get_head_only_points()
540              
541             Return the message's score as computed for auto-learning, ignoring
542 12     12 1 28 all rules except for header-based ones.
543 12         65  
544 12         49 =cut
545              
546             my ($self) = @_;
547             $self->_get_autolearn_points();
548             return $self->{head_only_points};
549             }
550              
551             =item $score = $status->get_learned_points()
552              
553             Return the message's score as computed for auto-learning, ignoring
554             all rules except for learning-based ones.
555 12     12 1 30  
556 12         30 =cut
557 12         44  
558             my ($self) = @_;
559             $self->_get_autolearn_points();
560             return $self->{learned_points};
561             }
562              
563             =item $score = $status->get_body_only_points()
564              
565             Return the message's score as computed for auto-learning, ignoring
566             all rules except for body-based ones.
567              
568 12     12 1 32 =cut
569 12         31  
570 12         34 my ($self) = @_;
571             $self->_get_autolearn_points();
572             return $self->{body_only_points};
573             }
574              
575             =item $score = $status->get_autolearn_force_status()
576              
577             Return whether a message's score included any rules that are flagged as
578             autolearn_force.
579              
580             =cut
581 12     12 1 36  
582 12         36 my ($self) = @_;
583 12         35 $self->_get_autolearn_points();
584             return $self->{autolearn_force};
585             }
586              
587             =item $rule_names = $status->get_autolearn_force_names()
588              
589             Return a list of comma separated list of rule names if a message's
590             score included any rules that are flagged as autolearn_force.
591              
592             =cut
593              
594 12     12 1 31 my ($self) = @_;
595 12         35 my ($names);
596 12         29  
597             $self->_get_autolearn_points();
598             $names = $self->{autolearn_force_names};
599              
600             if (defined $names) {
601             #remove trailing comma
602             $names =~ s/,$//;
603             } else {
604             $names = "";
605             }
606              
607 12     12 1 24 return $names;
608 12         27 }
609              
610 12         33 my ($self) = @_;
611 12         32  
612             return if (exists $self->{autolearn_points});
613 12 50       30 # ensure it only gets computed once, even if we return early
614             $self->{autolearn_points} = 0;
615 0         0  
616             # This function needs to use use sum($score[scoreset % 2]) not just {score}.
617 12         28 # otherwise we shift what we autolearn on and it gets really weird. - tvd
618             my $orig_scoreset = $self->{conf}->get_score_set();
619             my $new_scoreset = $orig_scoreset;
620 12         29 my $scores = $self->{conf}->{scores};
621              
622             if (($orig_scoreset & 2) == 0) { # we don't need to recompute
623             dbg("learn: auto-learn: currently using scoreset $orig_scoreset");
624 72     72   106 }
625             else {
626 72 100       159 $new_scoreset = $orig_scoreset & ~2;
627             dbg("learn: auto-learn: currently using scoreset $orig_scoreset, recomputing score based on scoreset $new_scoreset");
628 12         49 $scores = $self->{conf}->{scoreset}->[$new_scoreset];
629             }
630              
631             my $tflags = $self->{conf}->{tflags};
632 12         64 my $points = 0;
633 12         27  
634 12         26 # Just in case this function is called multiple times, clear out the
635             # previous calculated values
636 12 50       54 $self->{learned_points} = 0;
637 12         59 $self->{body_only_points} = 0;
638             $self->{head_only_points} = 0;
639             $self->{autolearn_force} = 0;
640 0         0  
641 0         0 foreach my $test (@{$self->{test_names_hit}}) {
642 0         0 # According to the documentation, noautolearn, userconf, and learn
643             # rules are ignored for autolearning.
644             if (exists $tflags->{$test}) {
645 12         39 next if $tflags->{$test} =~ /\bnoautolearn\b/;
646 12         23 next if $tflags->{$test} =~ /\buserconf\b/;
647              
648             # Keep track of the learn points for an additional autolearn check.
649             # Use the original scoreset since it'll be 0 in sets 0 and 1.
650 12         29 if ($tflags->{$test} =~ /\blearn\b/) {
651 12         26 # we're guaranteed that the score will be defined
652 12         18 $self->{learned_points} += $self->{conf}->{scoreset}->[$orig_scoreset]->{$test};
653 12         33 next;
654             }
655 12         21  
  12         60  
656             #IF ANY RULES ARE AUTOLEARN FORCE, SET THAT FLAG
657             if ($tflags->{$test} =~ /\bautolearn_force\b/) {
658 19 100       73 $self->{autolearn_force}++;
659 6 100       26 #ADD RULE NAME TO LIST
660 5 50       50 $self->{autolearn_force_names}.="$test,";
661             }
662             }
663              
664 0 0       0 # ignore tests with 0 score (or undefined) in this scoreset
665             next if !$scores->{$test};
666 0         0  
667 0         0 # Go ahead and add points to the proper locations
668             # Changed logic because in testing, I was getting both head and body. Bug 5503
669             if ($self->{conf}->maybe_header_only ($test)) {
670             $self->{head_only_points} += $scores->{$test};
671 0 0       0 dbg("learn: auto-learn: adding head_only points $scores->{$test}");
672 0         0 } elsif ($self->{conf}->maybe_body_only ($test)) {
673             $self->{body_only_points} += $scores->{$test};
674 0         0 dbg("learn: auto-learn: adding body_only points $scores->{$test}");
675             } else {
676             dbg("learn: auto-learn: not considered head or body scores: $scores->{$test}");
677             }
678              
679 13 50       65 $points += $scores->{$test};
680             }
681              
682             # Figure out the final value we'll use for autolearning
683 13 100       65 $points = (sprintf "%0.3f", $points) + 0;
    50          
684 9         38 dbg("learn: auto-learn: message score: ".$self->{score}.", computed score for autolearn: $points");
685 9         75  
686             $self->{autolearn_points} = $points;
687 4         13 }
688 4         28  
689             ###########################################################################
690 0         0  
691             =item $isspam = $status->is_spam ()
692              
693 13         49 After a mail message has been checked, this method can be called. It will
694             return 1 for mail determined likely to be spam, 0 if it does not seem
695             spam-like.
696              
697 12         160 =cut
698 12         131  
699             my ($self) = @_;
700 12         44 # changed to test this so sub-tests can ask "is_spam" during a run
701             return ($self->{score} >= $self->{conf}->{required_score});
702             }
703              
704             ###########################################################################
705              
706             =item $list = $status->get_names_of_tests_hit ()
707              
708             After a mail message has been checked, this method can be called. It will
709             return a comma-separated string, listing all the symbolic test names
710             of the tests which were triggered by the mail.
711              
712             =cut
713              
714 99     99 1 198 my ($self) = @_;
715              
716 99         462 return join(',', sort @{$self->{test_names_hit}});
717             }
718              
719             =item $list = $status->get_names_of_tests_hit_with_scores_hash ()
720              
721             After a mail message has been checked, this method can be called. It will
722             return a pointer to a hash for rule & score pairs for all the symbolic
723             test names and individual scores of the tests which were triggered by the mail.
724              
725             =cut
726              
727             my ($self) = @_;
728              
729             #BASED ON CODE FOR TESTSSCORES TAG
730 97     97 1 260 my $scores = $self->{conf}->{scores};
731             my %testsscores;
732 97         169 $testsscores{$_} = $scores->{$_} || '0' for @{$self->{test_names_hit}};
  97         653  
733             return \%testsscores;
734             }
735              
736             =item $list = $status->get_names_of_tests_hit_with_scores ()
737              
738             After a mail message has been checked, this method can be called. It will
739             return a comma-separated string of rule=score pairs for all the symbolic
740             test names and individual scores of the tests which were triggered by the mail.
741              
742             =cut
743              
744 0     0 1 0 my ($self) = @_;
745              
746             #BASED ON CODE FOR TESTSSCORES TAG
747 0         0 my $scores = $self->{conf}->{scores};
748 0         0 return join(',', map($_ . '=' . ($scores->{$_} || '0'),
749 0   0     0 sort @{$self->{test_names_hit}})) || "none";
  0         0  
750 0         0 }
751              
752              
753             ###########################################################################
754              
755             =item $list = $status->get_names_of_subtests_hit ()
756              
757             After a mail message has been checked, this method can be called. It will
758             return a comma-separated string, listing all the symbolic test names of the
759             meta-rule sub-tests which were triggered by the mail. Sub-tests are the
760             normally-hidden rules, which score 0 and have names beginning with two
761             underscores, used in meta rules.
762 0     0 1 0  
763             If a parameter of collapsed or dbg is passed, the output will be a condensed
764             array of sub-tests with multiple hits reduced to one entry.
765 0         0  
766             If the parameter of dbg is passed, the output will be a condensed string of
767 0   0     0 sub-tests with multiple hits reduced to one entry with the number of hits
768             in parentheses. Some information is also added at the end regarding the
769             multiple hits.
770              
771             =cut
772              
773             my ($self, $mode) = @_;
774              
775             if (defined $mode && ($mode eq 'dbg' || $mode eq 'collapsed')) {
776             # This routine prints only one instance of a subrule hit with a count of how many times it hit if greater than 1
777             my $total_hits = scalar(@{$self->{subtest_names_hit}});
778             return '' if !$total_hits;
779              
780             my %subtest_names_hit;
781             $subtest_names_hit{$_}++ foreach @{$self->{subtest_names_hit}};
782              
783             my @subtests = sort keys %subtest_names_hit;
784             my $deduplicated_hits = scalar(@subtests);
785              
786             my @result;
787             foreach my $rule (@subtests) {
788             if ($subtest_names_hit{$rule} > 1) {
789             push @result, "$rule($subtest_names_hit{$rule})";
790             } else {
791             push @result, $rule;
792 96     96 1 270 }
793             }
794 96 50 33     766  
      33        
795             if ($mode eq 'dbg') {
796 96         224 return join(',', @result)." (Total Subtest Hits: $total_hits / Deduplicated Total Hits: $deduplicated_hits)";
  96         242  
797 96 100       517 } else {
798             return join(',', @result);
799 42         69 }
800 42         100 } else {
  42         232  
801             # Return the simpler string with duplicates and commas
802 42         265 return join(',', sort @{$self->{subtest_names_hit}});
803 42         94 }
804             }
805 42         68  
806 42         139 ###########################################################################
807 84 50       201  
808 0         0 =item $num = $status->get_score ()
809              
810 84         165 After a mail message has been checked, this method can be called. It will
811             return the message's score.
812              
813             =cut
814 42 50       135  
815 42         363 my ($self) = @_;
816             return $self->{score};
817 0         0 }
818              
819             # left as backward compatibility
820             my ($self) = @_;
821 0         0 return $self->{score};
  0         0  
822             }
823              
824             ###########################################################################
825              
826             =item $num = $status->get_required_score ()
827              
828             After a mail message has been checked, this method can be called. It will
829             return the score required for a mail to be considered spam.
830              
831             =cut
832              
833             my ($self) = @_;
834             return $self->{conf}->{required_score};
835 0     0 1 0 }
836 0         0  
837             # left as backward compatibility
838             my ($self) = @_;
839             return $self->{conf}->{required_score};
840             }
841 0     0 0 0  
842 0         0 ###########################################################################
843              
844             =item $num = $status->get_autolearn_status ()
845              
846             After a mail message has been checked, this method can be called. It will
847             return one of the following strings depending on whether the mail was
848             auto-learned or not: "ham", "no", "spam", "disabled", "failed", "unavailable".
849              
850             It also returns is flagged with auto_learn_force, it will also include the status
851             and the rules hit. For example: "autolearn_force=yes (AUTOLEARNTEST_BODY)"
852              
853             =cut
854              
855 0     0 1 0 my ($self) = @_;
856 0         0 my ($status) = $self->{auto_learn_status} || "unavailable";
857              
858             if (defined $self->{auto_learn_force_status}) {
859             $status .= " autolearn_force=".$self->{auto_learn_force_status};
860             }
861 0     0 0 0  
862 0         0 return $status;
863             }
864              
865             ###########################################################################
866              
867             =item $report = $status->get_report ()
868              
869             Deliver a "spam report" on the checked mail message. This contains details of
870             how many spam detection rules it triggered.
871              
872             The report is returned as a multi-line string, with the lines separated by
873             C<\n> characters.
874              
875             =cut
876              
877             my ($self) = @_;
878              
879 34     34 1 77 if (!exists $self->{'report'}) {
880 34   50     117 my $report;
881              
882 34 100       94 my $timer = $self->{main}->time_method("get_report");
883 2         8 $report = $self->{conf}->{report_template};
884             $report ||= '(no report template found)';
885              
886 34         85 $report = $self->_replace_tags($report);
887              
888             $report =~ s/\n*$/\n\n/s;
889             $self->{report} = $report;
890             }
891              
892             return $self->{report};
893             }
894              
895             ###########################################################################
896              
897             =item $preview = $status->get_content_preview ()
898              
899             Give a "preview" of the content.
900              
901             This is returned as a multi-line string, with the lines separated by C<\n>
902 4     4 1 1626 characters, containing a fully-decoded, safe, plain-text sample of the first
903             few lines of the message body.
904 4 50       17  
905 4         6 =cut
906              
907 4         22 my ($self) = @_;
908 4         17  
909 4   50     24 my $str = '';
910             my @ary = @{$self->get_decoded_stripped_body_text_array()};
911 4         31 shift @ary; # drop the subject line
912              
913 4         289 my $numlines = 3;
914 4         32 while (length ($str) < 200 && @ary && $numlines-- > 0) {
915             $str .= shift @ary;
916             }
917 4         16  
918             # in case the last line was huge, trim it back to around 200 chars
919             local $1;
920             $str =~ s/^(.{200}).+$/$1 [...]/gm;
921             chomp ($str); $str .= "\n";
922              
923             # now, some tidy-ups that make things look a bit prettier
924             $str =~ s/-----Original Message-----.*$//gm;
925             $str =~ s/This is a multi-part message in MIME format\.//gs;
926             $str =~ s/[-_*.]{10,}//gs;
927             $str =~ s/\s+/ /gs;
928              
929             # add "Content preview:" ourselves, so that the text aligns
930             # correctly with the template -- then trim it off. We don't
931             # have to get this *exactly* right, but it's nicer if we
932             # make a bit of an effort ;)
933 4     4 1 11 $str = Mail::SpamAssassin::Util::wrap($str, " ", "Content preview: ", 75, 1);
934             $str =~ s/^Content preview:\s+//gs;
935 4         21  
936 4         15 return $str;
  4         23  
937 4         9 }
938              
939 4         9 ###########################################################################
940 4   33     47  
      66        
941 12         127 =item $msg = $status->get_message()
942              
943             Return the object representing the message being scanned.
944              
945 4         15 =cut
946 4         23  
947 4         10 my ($self) = @_;
  4         19  
948             return $self->{msg};
949             }
950 4         13  
951 4         11 ###########################################################################
952 4         20  
953 4         53 =item $status->rewrite_mail ()
954              
955             Rewrite the mail message. This will at minimum add headers, and at
956             maximum MIME-encapsulate the message text, to reflect its spam or not-spam
957             status. The function will return a scalar of the rewritten message.
958              
959 4         24 The actual modifications depend on the configuration (see
960 4         34 C<Mail::SpamAssassin::Conf> for more information).
961              
962 4         29 The possible modifications are as follows:
963              
964             =over 4
965              
966             =item To:, From: and Subject: modification on spam mails
967              
968             Depending on the configuration, the To: and From: lines can have a
969             user-defined RFC 2822 comment appended for spam mail. The subject line
970             may have a user-defined string prepended to it for spam mail.
971              
972             =item X-Spam-* headers for all mails
973              
974 0     0 1 0 Depending on the configuration, zero or more headers with names
975 0         0 beginning with C<X-Spam-> will be added to mail depending on whether
976             it is spam or ham.
977              
978             =item spam message with report_safe
979              
980             If report_safe is set to true (1), then spam messages are encapsulated
981             into their own message/rfc822 MIME attachment without any modifications
982             being made.
983              
984             If report_safe is set to false (0), then the message will only have the
985             above headers added/modified.
986              
987             =back
988              
989             =cut
990              
991             my ($self) = @_;
992              
993             my $timer = $self->{main}->time_method("rewrite_mail");
994             my $msg = $self->{msg}->get_mbox_separator() || '';
995              
996             if ($self->{is_spam} && $self->{conf}->{report_safe}) {
997             $msg .= $self->rewrite_report_safe();
998             }
999             else {
1000             $msg .= $self->rewrite_no_report_safe();
1001             }
1002              
1003             return $msg;
1004             }
1005              
1006             # Make the line endings in the passed string reference appropriate
1007             # for the original mail. Callers must note bug 5250: don't rewrite
1008             # the message body, since that will corrupt 8bit attachments/MIME parts.
1009             #
1010             my ($self, $strref) = @_;
1011             if ($self->{msg}->{line_ending} ne "\n") {
1012             $$strref =~ s/\r?\n/$self->{msg}->{line_ending}/gs;
1013             }
1014             }
1015              
1016             my ($self, $which) = @_;
1017             my $str = '';
1018             # use string appends to put this back together -- I finally benchmarked it.
1019 34     34 1 263 # join() is 56% of the speed of just using string appends. ;)
1020             foreach my $hf_ref (@{$self->{conf}->{$which}}) {
1021 34         125 my($hfname, $hfbody) = @$hf_ref;
1022 34   100     129 my $line = $self->_process_header($hfname,$hfbody);
1023             $line = $self->qp_encode_header($line);
1024 34 100 66     144 $str .= "X-Spam-$hfname: $line\n";
1025 2         11 }
1026             return $str;
1027             };
1028 32         120  
1029             # rewrite the message in report_safe mode
1030             # should not be called directly, use rewrite_mail instead
1031 34         147 #
1032             my ($self) = @_;
1033              
1034             my $tag;
1035              
1036             # This is the original message. We do not want to make any modifications so
1037             # we may recover it if necessary. It will be put into the new message as a
1038             # message/rfc822 MIME part.
1039 36     36   77 my $original = $self->{msg}->get_pristine();
1040 36 50       149  
1041 0         0 # This is the new message.
1042             my $newmsg = '';
1043              
1044             # the report charset
1045             my $report_charset = "; charset=iso-8859-1";
1046 34     34   83 if ($self->{conf}->{report_charset}) {
1047 34         55 $report_charset = "; charset=" . $self->{conf}->{report_charset};
1048             }
1049              
1050 34         63 # the SpamAssassin report
  34         152  
1051 168         629 my $report = $self->get_report();
1052 168         376  
1053 168         481 # If there are any wide characters, need to MIME-encode in UTF-8
1054 168         829 # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
1055             # we could try converting to that charset if possible
1056 34         115 unless ($] < 5.008 || utf8::downgrade($report, 1)) {
1057             $report_charset = "; charset=utf-8";
1058             utf8::encode($report);
1059             }
1060              
1061             # get original headers, "pristine" if we can do it
1062             my $from = $self->{msg}->get_pristine_header("From");
1063 2     2 0 6 my $to = $self->{msg}->get_pristine_header("To");
1064             my $cc = $self->{msg}->get_pristine_header("Cc");
1065 2         4 my $subject = $self->{msg}->get_pristine_header("Subject");
1066             my $msgid = $self->{msg}->get_pristine_header('Message-Id');
1067             my $date = $self->{msg}->get_pristine_header("Date");
1068              
1069             # It'd be nice to do this with a foreach loop, but with only three
1070 2         10 # possibilities right now, it's easier not to...
1071              
1072             if (defined $self->{conf}->{rewrite_header}->{Subject}) {
1073 2         5 # Add a prefix to the subject if needed
1074             $subject = "\n" if !defined $subject;
1075             if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
1076 2         7 $tag = $self->_replace_tags($self->{subjprefix});
1077 2 50       9 $tag =~ s/\n/ /gs;
1078 0         0 $subject = $tag . $subject;
1079             }
1080             # Add a **SPAM** prefix
1081             $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{Subject});
1082 2         9 $tag =~ s/\n/ /gs; # strip tag's newlines
1083             $subject =~ s/^(?:\Q${tag}\E )?/${tag} /g; # For some reason the tag may already be there!?
1084             }
1085              
1086             if (defined $self->{conf}->{rewrite_header}->{To}) {
1087 2 50 33     18 $to = "\n" if !defined $to;
1088 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{To});
1089 0         0 $tag =~ s/\n/ /gs; # strip tag's newlines
1090             $to =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1091             }
1092              
1093 2         14 if (defined $self->{conf}->{rewrite_header}->{From}) {
1094 2         9 $from = "\n" if !defined $from;
1095 2         10 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{From});
1096 2         9 $tag =~ s/\n+//gs; # strip tag's newlines
1097 2         14 $from =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1098 2         10 }
1099              
1100             # add report headers to message
1101             $newmsg .= "From: $from" if defined $from;
1102             $newmsg .= "To: $to" if defined $to;
1103 2 50       11 $newmsg .= "Cc: $cc" if defined $cc;
1104             $newmsg .= "Subject: $subject" if defined $subject;
1105 0 0       0 $newmsg .= "Date: $date" if defined $date;
1106 0 0 0     0 $newmsg .= "Message-Id: $msgid" if defined $msgid;
1107 0         0 $newmsg .= $self->_get_added_headers('headers_spam');
1108 0         0  
1109 0         0 if (defined $self->{conf}->{report_safe_copy_headers}) {
1110             my %already_added = map { $_ => 1 } qw/from to cc subject date message-id/;
1111              
1112 0         0 foreach my $hdr (@{$self->{conf}->{report_safe_copy_headers}}) {
1113 0         0 next if exists $already_added{lc $hdr};
1114 0         0 my @hdrtext = $self->{msg}->get_pristine_header($hdr);
1115             $already_added{lc $hdr}++;
1116              
1117 2 50       17 if (lc $hdr eq "received") { # add Received at the top ...
1118 0 0       0 my $rhdr = "";
1119 0         0 foreach (@hdrtext) {
1120 0         0 $rhdr .= "$hdr: $_";
1121 0         0 }
1122             $newmsg = "$rhdr$newmsg";
1123             }
1124 2 50       9 else {
1125 0 0       0 foreach (@hdrtext) {
1126 0         0 $newmsg .= "$hdr: $_";
1127 0         0 }
1128 0         0 }
1129             }
1130             }
1131              
1132 2 50       14 # jm: add a SpamAssassin Received header to note markup time etc.
1133 2 50       17 # emulates the fetchmail style.
1134 2 50       6 # tvd: do this after report_safe_copy_headers so Received will be done correctly
1135 2 50       9 $newmsg = "Received: from localhost by " .
1136 2 50       11 Mail::SpamAssassin::Util::fq_hostname() . "\n" .
1137 2 50       16 "\twith SpamAssassin (version " .
1138 2         11 Mail::SpamAssassin::Version() . ");\n" .
1139             "\t" . Mail::SpamAssassin::Util::time_to_rfc822_date() . "\n" .
1140 2 50       12 $newmsg;
1141 2         6  
  12         24  
1142             # MIME boundary
1143 2         6 my $boundary = "----------=_" . sprintf("%08X.%08X",time,int(rand(2 ** 32)));
  2         10  
1144 0 0       0  
1145 0         0 # ensure it's unique, so we can't be attacked this way
1146 0         0 while ($original =~ /^\Q${boundary}\E(?:--)?$/m) {
1147             $boundary .= "/".sprintf("%08X",int(rand(2 ** 32)));
1148 0 0       0 }
1149 0         0  
1150 0         0 # determine whether Content-Disposition should be "attachment" or "inline"
1151 0         0 my $disposition;
1152             my $ct = $self->{msg}->get_header("Content-Type");
1153 0         0 if (defined $ct && $ct ne '' && $ct !~ m{text/plain}i) {
1154             $disposition = "attachment";
1155             $report .= $self->_replace_tags($self->{conf}->{unsafe_report_template});
1156 0         0 # if we wanted to defang the attachment, this would be the place
1157 0         0 }
1158             else {
1159             $disposition = "inline";
1160             }
1161              
1162             my $type = "message/rfc822";
1163             $type = "text/plain" if $self->{conf}->{report_safe} > 1;
1164              
1165             my $description = $self->{conf}->{'encapsulated_content_description'};
1166 2         7  
1167             # Note: the message should end in blank line since mbox format wants
1168             # blank line at end and messages may be concatenated! In addition, the
1169             # x-spam-type parameter is fixed since we will use it later to recognize
1170             # original messages that can be extracted.
1171             $newmsg .= <<"EOM";
1172             MIME-Version: 1.0
1173             Content-Type: multipart/mixed; boundary="$boundary"
1174 2         19  
1175             This is a multi-part message in MIME format.
1176              
1177 2         58 --$boundary
1178 0         0 Content-Type: text/plain$report_charset
1179             Content-Disposition: inline
1180             Content-Transfer-Encoding: 8bit
1181              
1182 2         6 $report
1183 2         11  
1184 2 50 33     108 --$boundary
      33        
1185 2         7 Content-Type: $type; x-spam-type=original
1186 2         10 Content-Description: $description
1187             Content-Disposition: $disposition
1188             Content-Transfer-Encoding: 8bit
1189              
1190 0         0 EOM
1191              
1192             my $newmsgtrailer = "\n--$boundary--\n\n";
1193 2         6  
1194 2 50       9 # now fix line endings in both headers, report_safe body parts,
1195             # and new MIME boundaries and structure
1196 2         7 $self->_fixup_report_line_endings(\$newmsg);
1197             $self->_fixup_report_line_endings(\$newmsgtrailer);
1198             $newmsg .= $original.$newmsgtrailer;
1199              
1200             return $newmsg;
1201             }
1202 2         43  
1203             # rewrite the message in non-report_safe mode (just headers)
1204             # should not be called directly, use rewrite_mail instead
1205             #
1206             my ($self) = @_;
1207              
1208             my $ntag;
1209             my $pref_subject = 0;
1210              
1211             # put the pristine headers into an array
1212             # skip the X-Spam- headers, but allow the X-Spam-Prev headers to remain.
1213             # since there may be a missing header/body
1214             #
1215             my @pristine_headers = split(/^/m, $self->{msg}->get_pristine_header());
1216             for (my $line = 0; $line <= $#pristine_headers; $line++) {
1217             next unless ($pristine_headers[$line] =~ /^X-Spam-(?!Prev-)/i);
1218             splice @pristine_headers, $line, 1 while ($pristine_headers[$line] =~ /^(?:X-Spam-(?!Prev-)|[ \t])/i);
1219             $line--;
1220             }
1221             my $separator = '';
1222             if (@pristine_headers && $pristine_headers[$#pristine_headers] =~ /^\s*$/) {
1223 2         11 $separator = pop @pristine_headers;
1224             }
1225              
1226             my $addition = 'headers_ham';
1227 2         8  
1228 2         6 if($self->{is_spam})
1229 2         54 {
1230             # special-case: Subject lines. ensure one exists, if we're
1231 2         34 # supposed to mark it up.
1232             my $created_subject = 0;
1233             my $subject = $self->{msg}->get_pristine_header('Subject');
1234             if (!defined($subject) && $self->{is_spam}
1235             && exists $self->{conf}->{rewrite_header}->{'Subject'})
1236             {
1237             push(@pristine_headers, "Subject: \n");
1238 32     32 0 64 $created_subject = 1;
1239             }
1240 32         55  
1241 32         47 # Deal with header rewriting
1242             foreach (@pristine_headers) {
1243             # if we're not going to do a rewrite, skip this header!
1244             next if (!/^(From|Subject|To):/i);
1245             my $hdr = ucfirst(lc($1));
1246             next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1247 32         132  
1248 32         155 # pop the original version onto the end of the header array
1249 120 50       426 if ($created_subject) {
1250 0         0 push(@pristine_headers, "X-Spam-Prev-Subject: (nonexistent)\n");
1251 0         0 } else {
1252             push(@pristine_headers, "X-Spam-Prev-$_");
1253 32         63 }
1254 32 50 33     333  
1255 32         90 # Figure out the rewrite piece
1256             my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{$hdr});
1257             $tag =~ s/\n/ /gs;
1258 32         78  
1259             # The tag should be a comment for this header ...
1260 32 50       91 $tag = "($tag)" if ($hdr =~ /^(?:From|To)$/);
1261              
1262             if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1263             if($self->{subjprefix} ne "") {
1264 0         0 $ntag = $self->_replace_tags($self->{subjprefix});
1265 0         0 $ntag =~ s/\n/ /gs;
1266 0 0 0     0 $ntag =~ s/\s+$//;
      0        
1267              
1268             local $1;
1269 0         0 if(defined $ntag) {
1270 0         0 s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1271             }
1272             }
1273             }
1274 0         0 s/^([^:]+:)[ \t]*(?:\Q${tag}\E )?/$1 ${tag} /i;
1275             }
1276 0 0       0  
1277 0         0 $addition = 'headers_spam';
1278 0 0       0 } else {
1279             # special-case: Subject lines. ensure one exists, if we're
1280             # supposed to mark it up.
1281 0 0       0 my $created_subject = 0;
1282 0         0 my $subject = $self->{msg}->get_pristine_header('Subject');
1283             if (!defined($subject)
1284 0         0 && exists $self->{conf}->{rewrite_header}->{'Subject'})
1285             {
1286             push(@pristine_headers, "Subject: \n");
1287             $created_subject = 1;
1288 0         0 }
1289 0         0  
1290             # Deal with header rewriting
1291             foreach (@pristine_headers) {
1292 0 0       0 # if we're not going to do a rewrite, skip this header!
1293             next if (!/^(Subject):/i);
1294 0 0 0     0 my $hdr = ucfirst(lc($1));
1295 0 0       0 next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1296 0         0  
1297 0         0 if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1298 0         0 if($self->{subjprefix} ne "") {
1299             $ntag = $self->_replace_tags($self->{subjprefix});
1300 0         0 $ntag =~ s/\n/ /gs;
1301 0 0       0 $ntag =~ s/\s+$//;
1302 0         0  
1303             local $1;
1304             if(defined $ntag) {
1305             s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1306 0         0 }
1307             }
1308             }
1309 0         0 }
1310              
1311             }
1312              
1313 32         92 # Break the pristine header set into two blocks; $new_hdrs_pre is the stuff
1314 32         113 # that we want to ensure comes before any SpamAssassin markup headers,
1315 32 50 33     242 # like the Return-Path header (see bug 3409).
1316             #
1317             # all the rest of the message headers (as left in @pristine_headers), is
1318 0         0 # to be placed after the SpamAssassin markup hdrs. Once one of those headers
1319 0         0 # is seen, all further headers go into that set; it's assumed that it's an
1320             # old copy of the header, or attempted spoofing, if it crops up halfway
1321             # through the headers.
1322              
1323 32         107 my $new_hdrs_pre = '';
1324             if (@pristine_headers && $pristine_headers[0] =~ /^Return-Path:/i) {
1325 88 50       219 $new_hdrs_pre .= shift(@pristine_headers);
1326 0         0 while (@pristine_headers && $pristine_headers[0] =~ /^[ \t]/) {
1327 0 0       0 $new_hdrs_pre .= shift(@pristine_headers);
1328             }
1329 0 0 0     0 }
1330 0 0       0 $new_hdrs_pre .= $self->_get_added_headers($addition);
1331 0         0  
1332 0         0 # fix up line endings appropriately
1333 0         0 my $newmsg = $new_hdrs_pre . join('',@pristine_headers) . $separator;
1334             $self->_fixup_report_line_endings(\$newmsg);
1335 0         0  
1336 0 0       0 return $newmsg.$self->{msg}->get_pristine_body();
1337 0         0 }
1338              
1339             my ($self, $text) = @_;
1340              
1341             # return unchanged if there are no 8-bit characters
1342             return $text if $text !~ tr/\x00-\x7F//c;
1343              
1344             my $cs = 'ISO-8859-1';
1345             if ($self->{report_charset}) {
1346             $cs = $self->{report_charset};
1347             }
1348              
1349             my @hexchars = split('', '0123456789abcdef');
1350             my $ord;
1351             local $1;
1352             $text =~ s{([\x80-\xff])}{
1353             $ord = ord $1;
1354             '='.$hexchars[($ord & 0xf0) >> 4].$hexchars[$ord & 0x0f]
1355 32         87 }ges;
1356 32 50 33     217  
1357 0         0 $text = '=?'.$cs.'?Q?'.$text.'?=';
1358 0   0     0  
1359 0         0 dbg("markup: encoding header in $cs: $text");
1360             return $text;
1361             }
1362 32         147  
1363             my ($self, $hdr_name, $hdr_data) = @_;
1364              
1365 32         220 $hdr_data = $self->_replace_tags($hdr_data);
1366 32         156 $hdr_data =~ s/(?:\r?\n)+$//; # make sure there are no trailing newlines ...
1367              
1368 32         149 if ($self->{conf}->{fold_headers}) {
1369             if ($hdr_data =~ /\n/) {
1370             $hdr_data =~ s/\s*\n\s*/\n\t/g;
1371             return $hdr_data;
1372 168     168 0 436 }
1373             else {
1374             # use '!!' instead of ': ' so it doesn't wrap on the space
1375 168 50       697 my $hdr = "X-Spam-$hdr_name!!$hdr_data";
1376             $hdr = Mail::SpamAssassin::Util::wrap($hdr, "\t", "", 79, 0, '(?<=[\s,])');
1377 0         0 # make sure there are no blank lines in headers
1378 0 0       0 # buggy wrap might not prefix blank lines with \t, so use \s* (bug 7672)
1379 0         0 $hdr =~ s/^\s*\n//gm;
1380             return (split (/!!/, $hdr, 2))[1]; # just return the data part
1381             }
1382 0         0 }
1383 0         0 else {
1384 0         0 $hdr_data =~ s/\n/ /g; # Can't have newlines in headers, unless folded
1385 0         0 return $hdr_data;
1386 0         0 }
1387 0         0 }
1388              
1389             my $self = shift;
1390 0         0 my $text = shift;
1391              
1392 0         0 # default to leaving the original string in place, if we cannot find
1393 0         0 # a tag for it (bug 4793)
1394             local($1,$2,$3);
1395             $text =~ s{(_(\w+?)(?:\((.*?)\))?_)}{
1396             my $full = $1;
1397 168     168   477 my $tag = $2;
1398             my $result;
1399 168         364 if ($tag =~ /^ADDEDHEADER(?:HAM|SPAM|)\z/) {
1400 168         382 # Bug 6278: break infinite recursion through _get_added_headers and
1401             # _get_tag on an attempt to use such tag in add_header template
1402 168 50       404 } else {
1403 168 50       316 $result = $self->get_tag_raw($tag,$3);
1404 0         0 $result = join(' ',@$result) if ref $result eq 'ARRAY';
1405 0         0 }
1406             defined $result ? $result : $full;
1407             }ge;
1408              
1409 168         504 return $text;
1410 168         437 }
1411              
1412             ###########################################################################
1413 168         417  
1414 168         856 # public API for plugins
1415              
1416             =item $status->action_depends_on_tags($tags, $code, @args)
1417              
1418 0         0 Enqueue the supplied subroutine reference C<$code>, to become runnable when
1419 0         0 all the specified tags become available. The C<$tags> may be a simple
1420             scalar - a tag name, or a listref of tag names. The subroutine C<&$code>
1421             when called will be passed a C<permessagestatus> object as its first argument,
1422             followed by the supplied (optional) list C<@args> .
1423              
1424 174     174   255 =cut
1425 174         303  
1426             my($self, $tags, $code, @args) = @_;
1427              
1428             ref $code eq 'CODE'
1429 174         502 or die "action_depends_on_tags: argument must be a subroutine ref";
1430 174         1024  
1431 430         934 # tag names on which the given action depends
1432 430         803 my @dep_tags = !ref $tags ? uc $tags : map(uc($_),@$tags);
1433 430         517  
1434 430 50       700 # @{$self->{tagrun_subs}} list of all submitted subroutines
1435             # @{$self->{tagrun_actions}{$tag}} bitmask of action indices blocked by tag
1436             # $self->{tagrun_tagscnt}[$action_ind] count of tags still pending
1437              
1438 430         712 # store action details, obtain its index
1439 430 50       995 push(@{$self->{tagrun_subs}}, [$code,@args]);
1440             my $action_ind = $#{$self->{tagrun_subs}};
1441 430 50       2098  
1442             # list dependency tag names which are not already satisfied
1443             my @blocking_tags;
1444 174         677 foreach (@dep_tags) {
1445             my $data = $self->get_tag($_);
1446             if (!defined $data || $data eq '') {
1447             push @blocking_tags, $_;
1448             }
1449             }
1450              
1451             $self->{tagrun_tagscnt}[$action_ind] = scalar @blocking_tags;
1452             $self->{tagrun_actions}{$_}[$action_ind] = 1 for @blocking_tags;
1453              
1454             if (@blocking_tags) {
1455             dbg("check: tagrun - action %s blocking on tags %s",
1456             $action_ind, join(', ',@blocking_tags));
1457             } else {
1458             dbg("check: tagrun - tag %s was ready, action %s runnable immediately: %s",
1459             join(', ',@dep_tags), $action_ind, join(', ',$code,@args));
1460             &$code($self, @args);
1461             }
1462 81     81 1 232 }
1463              
1464 81 50       260 # tag_is_ready() will be called by set_tag(), indicating that a given
1465             # tag just received its value, possibly unblocking an action routine
1466             # as declared by action_depends_on_tags().
1467             #
1468 81 50       317 # Well-behaving plugins should call set_tag() once when a tag is fully
1469             # assembled and ready. Multiple calls to set the same tag value are handled
1470             # gracefully, but may result in premature activation of a pending action.
1471             # Setting tag values by plugins should not be done directly but only through
1472             # the public API set_tag(), otherwise a pending action release may be missed.
1473             #
1474             my($self, $tag) = @_;
1475 81         120 $tag = uc $tag;
  81         319  
1476 81         139  
  81         220  
1477             if (would_log('dbg', 'check')) {
1478             my $tag_val = $self->{tag_data}{$tag};
1479 81         135 dbg("check: tagrun - tag %s is now ready, value: %s",
1480 81         181 $tag, !defined $tag_val ? '<UNDEF>'
1481 81         284 : ref $tag_val ne 'ARRAY' ? $tag_val
1482 81 50 33     408 : 'ARY:[' . join(',',@$tag_val) . ']' );
1483 81         222 }
1484             if (ref $self->{tagrun_actions}{$tag}) { # any action blocking on this tag?
1485             my $action_ind = 0;
1486             foreach my $action_pending (@{$self->{tagrun_actions}{$tag}}) {
1487 81         250 if ($action_pending) {
1488 81         301 $self->{tagrun_actions}{$tag}[$action_ind] = 0;
1489             if ($self->{tagrun_tagscnt}[$action_ind] <= 0) {
1490 81 50       205 # should not happen, warn and ignore
1491 81         357 warn "tagrun error: count for $action_ind is ".
1492             $self->{tagrun_tagscnt}[$action_ind]."\n";
1493             } elsif (! --($self->{tagrun_tagscnt}[$action_ind])) {
1494 0         0 my($code,@args) = @{$self->{tagrun_subs}[$action_ind]};
1495             dbg("check: tagrun - tag %s unblocking the action %s: %s",
1496 0         0 $tag, $action_ind, join(', ',$code,@args));
1497             &$code($self, @args);
1498             }
1499             }
1500             $action_ind++;
1501             }
1502             }
1503             }
1504              
1505             # debugging aid: show actions that are still pending, waiting for their
1506             # tags to receive a value
1507             #
1508             my($self) = @_;
1509             my @tags;
1510             @tags = keys %{$self->{tagrun_actions}} if ref $self->{tagrun_actions};
1511 897     897 0 1345 for my $tag (@tags) {
1512 897         1164 my @pending_actions = grep($self->{tagrun_actions}{$tag}[$_],
1513             (0 .. $#{$self->{tagrun_actions}{$tag}}));
1514 897 50       1759 dbg("check: tagrun - tag %s is still blocking action %s",
1515 0         0 $tag, join(', ', @pending_actions)) if @pending_actions;
1516 0 0       0 }
    0          
1517             }
1518              
1519             =item $status->set_tag($tagname, $value)
1520              
1521 897 50       2695 Set a template tag, as used in C<add_header>, report templates, etc.
1522 0         0 This API is intended for use by plugins. Tag names will be converted
1523 0         0 to an all-uppercase representation internally.
  0         0  
1524 0 0       0  
1525 0         0 C<$value> can be a simple scalar (string or number), or a reference to an
1526 0 0       0 array, in which case the public method get_tag will join array elements
    0          
1527             using a space as a separator, returning a single string for backward
1528             compatibility.
1529 0         0  
1530             C<$value> can also be a subroutine reference, which will be evaluated
1531 0         0 each time the template is expanded. The first argument passed by get_tag
  0         0  
1532 0         0 to a called subroutine will be a PerMsgStatus object (this module's object),
1533             followed by optional arguments provided a caller to get_tag.
1534 0         0  
1535             Note that perl supports closures, which means that variables set in the
1536             caller's scope can be accessed inside this C<sub>. For example:
1537 0         0  
1538             my $text = "hello world!";
1539             $status->set_tag("FOO", sub {
1540             my $pms = shift;
1541             return $text;
1542             });
1543              
1544             See C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details
1545             on how template tags are used.
1546 136     136 0 305  
1547 136         212 C<undef> will be returned if a tag by that name has not been defined.
1548 136 100       424  
  98         399  
1549 136         476 =cut
1550              
1551 79         183 my($self,$tag,$val) = @_;
  79         412  
1552 79 50       506 $self->{tag_data}->{uc $tag} = $val;
1553             $self->tag_is_ready($tag);
1554             }
1555              
1556             # public API for plugins
1557              
1558             =item $string = $status->get_tag($tagname)
1559              
1560             Get the current value of a template tag, as used in C<add_header>, report
1561             templates, etc. This API is intended for use by plugins. Tag names will be
1562             converted to an all-uppercase representation internally. See
1563             C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details on
1564             tags.
1565              
1566             C<undef> will be returned if a tag by that name has not been defined.
1567              
1568             =cut
1569              
1570             my($self, $tag, @args) = @_;
1571              
1572             return if !defined $tag;
1573             $tag = uc $tag;
1574             my $data;
1575             if (exists $common_tags{$tag}) {
1576             # tag data from traditional pre-defined tag subroutines
1577             $data = $common_tags{$tag};
1578             $data = $data->($self,@args) if ref $data eq 'CODE';
1579             $data = join(' ',@$data) if ref $data eq 'ARRAY';
1580             $data = "" if !defined $data;
1581             } elsif (exists $self->{tag_data}->{$tag}) {
1582             # tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1583             $data = $self->{tag_data}->{$tag};
1584             $data = $data->($self,@args) if ref $data eq 'CODE';
1585             $data = join(' ',@$data) if ref $data eq 'ARRAY';
1586             $data = "" if !defined $data;
1587             }
1588             return $data;
1589             }
1590 897     897 1 1831  
1591 897         6869 =item $string = $status->get_tag_raw($tagname, @args)
1592 897         1838  
1593             Similar to C<get_tag>, but keeps a tag name unchanged (does not uppercase it),
1594             and does not convert arrayref tag values into a single string.
1595              
1596             =cut
1597              
1598             my($self, $tag, @args) = @_;
1599              
1600             return if !defined $tag;
1601             my $data;
1602             if (exists $common_tags{$tag}) {
1603             # tag data from traditional pre-defined tag subroutines
1604             $data = $common_tags{$tag};
1605             $data = $data->($self,@args) if ref $data eq 'CODE';
1606             $data = "" if !defined $data;
1607             } elsif (exists $self->{tag_data}->{$tag}) {
1608             # tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1609             $data = $self->{tag_data}->{$tag};
1610 95     95 1 263 $data = $data->($self,@args) if ref $data eq 'CODE';
1611             $data = "" if !defined $data;
1612 95 50       240 }
1613 95         192 return $data;
1614 95         112 }
1615 95 100       598  
    50          
1616             ###########################################################################
1617 4         12  
1618 4 50       29 # public API for plugins
1619 4 50       15  
1620 4 50       14 =item $status->set_spamd_result_item($subref)
1621              
1622             Set an entry for the spamd result log line. C<$subref> should be a code
1623 91         195 reference for a subroutine which will return a string in C<'name=VALUE'>
1624 91 50       257 format, similar to the other entries in the spamd result line:
1625 91 50       215  
1626 91 50       287 Jul 17 14:10:47 radish spamd[16670]: spamd: result: Y 22 - ALL_NATURAL,
1627             DATE_IN_FUTURE_03_06,DIET_1,DRUGS_ERECTILE,DRUGS_PAIN,
1628 95         239 TEST_FORGED_YAHOO_RCVD,TEST_INVALID_DATE,TEST_NOREALNAME,
1629             TEST_NORMAL_HTTP_TO_IP,UNDISC_RECIPS scantime=0.4,size=3138,user=jm,
1630             uid=1000,required_score=5.0,rhost=localhost,raddr=127.0.0.1,
1631             rport=33153,mid=<9PS291LhupY>,autolearn=spam
1632              
1633             C<name> and C<VALUE> must not contain C<=> or C<,> characters, as it
1634             is important that these log lines are easy to parse.
1635              
1636             The code reference will be called by spamd after the message has been scanned,
1637             and the C<PerMsgStatus::check()> method has returned.
1638              
1639 430     430 1 1075 =cut
1640              
1641 430 50       942 my ($self, $ref) = @_;
1642 430         480 push @{$self->{spamd_result_log_items}}, $ref;
1643 430 100       1110 }
    50          
1644              
1645 362         576 # called by spamd
1646 362 50       1161 my ($self) = @_;
1647 362 50       759 my @ret;
1648             foreach my $ref (@{$self->{spamd_result_log_items}}) {
1649             push @ret, &$ref;
1650 68         164 }
1651 68 50       185 return @ret;
1652 68 50       181 }
1653              
1654 430         968 ###########################################################################
1655              
1656             my($self, $arg) = @_;
1657             my($arg_spam, $arg_ham);
1658             ($arg_spam, $arg_ham) = split(/,/, $arg, 2) if defined $arg;
1659             return $self->{is_spam} ? (defined $arg_spam ? $arg_spam : 'Yes')
1660             : (defined $arg_ham ? $arg_ham : 'No');
1661             }
1662              
1663             my ($self, $pad) = @_;
1664              
1665             my $score = sprintf("%2.1f", $self->{score});
1666             my $rscore = $self->_get_tag_value_for_required_score();
1667              
1668             #Change due to bug 6419 to use Util function for consistency with spamd
1669             #and PerMessageStatus
1670             $score = Mail::SpamAssassin::Util::get_tag_value_for_score($score, $rscore, $self->{is_spam});
1671              
1672             #$pad IS PROVIDED BY THE _SCORE(PAD)_ tag
1673             if (defined $pad && $pad =~ /^(0+| +)$/) {
1674             my $count = length($1) + 3 - length($score);
1675             $score = (substr($pad, 0, $count) . $score) if $count > 0;
1676             }
1677             return $score;
1678              
1679             }
1680              
1681             my $self = shift;
1682             return sprintf("%2.1f", $self->{conf}->{required_score});
1683 81     81 1 223 }
1684 81         130  
  81         307  
1685              
1686             ###########################################################################
1687              
1688             =item $status->finish ()
1689 0     0 0 0  
1690 0         0 Indicate that this C<$status> object is finished with, and can be destroyed.
1691 0         0  
  0         0  
1692 0         0 If you are using SpamAssassin in a persistent environment, or checking many
1693             mail messages from one C<Mail::SpamAssassin> factory, this method should be
1694 0         0 called to ensure Perl's garbage collection will clean up old status objects.
1695              
1696             =cut
1697              
1698             my ($self) = @_;
1699              
1700 36     36   102 $self->{main}->call_plugins ("per_msg_finish", {
1701 36         69 permsgstatus => $self
1702 36 50       108 });
1703 36 50       163  
    50          
    100          
1704             $self->report_unsatisfied_actions;
1705              
1706             # Delete out all of the members of $self. This will remove any direct
1707             # circular references and let the memory get reclaimed while also being more
1708 38     38   93 # efficient than a foreach() loop over the keys.
1709             %{$self} = ();
1710 38         246 }
1711 38         142  
1712             my ($conf) = @_;
1713             foreach my $method (@TEMPORARY_METHODS) {
1714             if (defined &{$method}) {
1715 38         180 undef &{$method};
1716             }
1717             }
1718 38 50 33     128 @TEMPORARY_METHODS = (); # clear for next time
1719 0         0 %TEMPORARY_EVAL_GLUE_METHODS = ();
1720 0 0       0 }
1721              
1722 38         96  
1723             =item $name = $status->get_current_eval_rule_name()
1724              
1725             Return the name of the currently-running eval rule. C<undef> is
1726             returned if no eval rule is currently being run. Useful for plugins
1727 76     76   104 to determine the current rule name while inside an eval test function
1728 76         319 call.
1729              
1730             =cut
1731              
1732             my ($self) = @_;
1733             return $self->{current_rule_name};
1734             }
1735              
1736             ###########################################################################
1737              
1738             my ($self) = @_;
1739              
1740             my $timer = $self->{main}->time_method("extract_message_metadata");
1741             $self->{msg}->extract_message_metadata($self);
1742              
1743             foreach my $item (qw(
1744             relays_trusted relays_trusted_str num_relays_trusted
1745 136     136 1 26053 relays_untrusted relays_untrusted_str num_relays_untrusted
1746             relays_internal relays_internal_str num_relays_internal
1747 136         701 relays_external relays_external_str num_relays_external
1748             num_relays_unparseable last_trusted_relay_index
1749             last_internal_relay_index
1750             ))
1751 136         867 {
1752             $self->{$item} = $self->{msg}->{metadata}->{$item};
1753             }
1754              
1755             # TODO: International domain names (UTF-8) must be converted to
1756 136         268 # ASCII-compatible encoding (ACE) for the purpose of setting the
  136         3855  
1757             # SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
1758             # (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
1759             #
1760 0     0 0 0 { local $1;
1761 0         0 my $addr = $self->get('EnvelopeFrom:addr', undef);
1762 0 0       0 # collect a FQDN, ignoring potential trailing WSP
  0         0  
1763 0         0 if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
  0         0  
1764             $self->set_tag('SENDERDOMAIN', lc $1);
1765             }
1766 0         0 # TODO: the get ':addr' only returns the first address; this should be
1767 0         0 # augmented to be able to return all addresses in a header field, multiple
1768             # addresses in a From header field are allowed according to RFC 5322
1769             $addr = $self->get('From:addr', undef);
1770             if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1771             $self->set_tag('AUTHORDOMAIN', lc $1);
1772             }
1773             }
1774              
1775             $self->set_tag('RELAYSTRUSTED', $self->{relays_trusted_str});
1776             $self->set_tag('RELAYSUNTRUSTED', $self->{relays_untrusted_str});
1777             $self->set_tag('RELAYSINTERNAL', $self->{relays_internal_str});
1778             $self->set_tag('RELAYSEXTERNAL', $self->{relays_external_str});
1779             $self->set_tag('LANGUAGES', $self->{msg}->get_metadata("X-Languages"));
1780              
1781 0     0 1 0 # This should happen before we get called, but just in case.
1782 0         0 if (!defined $self->{msg}->{metadata}->{html}) {
1783             $self->get_decoded_stripped_body_text_array();
1784             }
1785             $self->{html} = $self->{msg}->{metadata}->{html};
1786              
1787             # allow plugins to add more metadata, read the stuff that's there, etc.
1788 96     96 0 228 $self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
1789             }
1790 96         366  
1791 96         536 ###########################################################################
1792              
1793 96         501 =item $status->get_decoded_body_text_array ()
1794              
1795             Returns the message body, with B<base64> or B<quoted-printable> encodings
1796             decoded, and non-text parts or non-inline attachments stripped.
1797              
1798             This is the same result text as used in 'rawbody' rules.
1799              
1800             It is returned as an array of strings, with each string being a 2-4kB chunk
1801             of the body, split from boundaries if possible.
1802 1440         3012  
1803             =cut
1804              
1805             return $_[0]->{msg}->get_decoded_body_text_array();
1806             }
1807              
1808             =item $status->get_decoded_stripped_body_text_array ()
1809              
1810 96         198 Returns the message body, decoded (as described in
  96         307  
1811 96         654 get_decoded_body_text_array()), with HTML rendered, and with whitespace
1812             normalized.
1813 96 100 66     505  
1814 1         7 This is the same result text as used in 'body' rules.
1815              
1816             It will always render text/html.
1817              
1818             It is returned as an array of strings, with each string representing one
1819 96         295 'paragraph'. Paragraphs, in plain-text mails, are double-newline-separated
1820 96 100 66     1001 blocks of multi-line text.
1821 57         611  
1822             =cut
1823              
1824             return $_[0]->{msg}->get_rendered_body_text_array();
1825 96         403 }
1826 96         355  
1827 96         339 ###########################################################################
1828 96         332  
1829 96         419 =item $status->get (header_name [, default_value])
1830              
1831             Returns a message header, pseudo-header, real name or address.
1832 96 100       396 C<header_name> is the name of a mail header, such as 'Subject', 'To',
1833 95         361 etc. If C<default_value> is given, it will be used if the requested
1834             C<header_name> does not exist.
1835 96         334  
1836             Appending C<:raw> to the header name will inhibit decoding of quoted-printable
1837             or base-64 encoded strings.
1838 96         545  
1839             Appending a modifier C<:addr> to a header field name will cause everything
1840             except the first email address to be removed from the header field. It is
1841             mainly applicable to header fields 'From', 'Sender', 'To', 'Cc' along with
1842             their 'Resent-*' counterparts, and the 'Return-Path'. For example, all of
1843             the following will result in "example@foo":
1844              
1845             =over 4
1846              
1847             =item example@foo
1848              
1849             =item example@foo (Foo Blah)
1850              
1851             =item example@foo, example@bar
1852              
1853             =item display: example@foo (Foo Blah), example@bar ;
1854              
1855             =item Foo Blah <example@foo>
1856 110     110 1 598  
1857             =item "Foo Blah" <example@foo>
1858              
1859             =item "'Foo Blah'" <example@foo>
1860              
1861             =back
1862              
1863             Appending a modifier C<:name> to a header field name will cause everything
1864             except the first display name to be removed from the header field. It is
1865             mainly applicable to header fields containing a single mail address: 'From',
1866             'Sender', along with their 'Resent-From' and 'Resent-Sender' counterparts.
1867             For example, all of the following will result in "Foo Blah". One level of
1868             single quotes is stripped too, as it is often seen.
1869              
1870             =over 4
1871              
1872             =item example@foo (Foo Blah)
1873              
1874             =item example@foo (Foo Blah), example@bar
1875              
1876 340     340 1 1331 =item display: example@foo (Foo Blah), example@bar ;
1877              
1878             =item Foo Blah <example@foo>
1879              
1880             =item "Foo Blah" <example@foo>
1881              
1882             =item "'Foo Blah'" <example@foo>
1883              
1884             =back
1885              
1886             There are several special pseudo-headers that can be specified:
1887              
1888             =over 4
1889              
1890             =item C<ALL> can be used to mean the text of all the message's headers.
1891             Each header is decoded and unfolded to single line, unless called with :raw.
1892              
1893             =item C<ALL-TRUSTED> can be used to mean the text of all the message's headers
1894             that could only have been added by trusted relays.
1895              
1896             =item C<ALL-INTERNAL> can be used to mean the text of all the message's headers
1897             that could only have been added by internal relays.
1898              
1899             =item C<ALL-UNTRUSTED> can be used to mean the text of all the message's
1900             headers that may have been added by untrusted relays. To make this
1901             pseudo-header more useful for header rules the 'Received' header that was added
1902             by the last trusted relay is included, even though it can be trusted.
1903              
1904             =item C<ALL-EXTERNAL> can be used to mean the text of all the message's headers
1905             that may have been added by external relays. Like C<ALL-UNTRUSTED> the
1906             'Received' header added by the last internal relay is included.
1907              
1908             =item C<ToCc> can be used to mean the contents of both the 'To' and 'Cc'
1909             headers.
1910              
1911             =item C<EnvelopeFrom> is the address used in the 'MAIL FROM:' phase of the SMTP
1912             transaction that delivered this message, if this data has been made available
1913             by the SMTP server.
1914              
1915             =item C<MESSAGEID> is a symbol meaning all Message-Id's found in the message;
1916             some mailing list software moves the real 'Message-Id' to 'Resent-Message-Id'
1917             or 'X-Message-Id', then uses its own one in the 'Message-Id' header. The value
1918             returned for this symbol is the text from all 3 headers, separated by newlines.
1919              
1920             =item C<X-Spam-Relays-Untrusted> is the generated metadata of untrusted relays
1921             the message has passed through
1922              
1923             =item C<X-Spam-Relays-Trusted> is the generated metadata of trusted relays
1924             the message has passed through
1925              
1926             =back
1927              
1928             =cut
1929              
1930             # only uses two arguments, ignores $defval
1931             my ($self, $request) = @_;
1932              
1933             my $result;
1934             my $getaddr = 0;
1935             my $getname = 0;
1936             my $getraw = 0;
1937              
1938             # special queries - process and strip modifiers
1939             if (index($request,':') >= 0) { # triage
1940             local $1;
1941             while ($request =~ s/:([^:]*)//) {
1942             if ($1 eq 'raw') { $getraw = 1 }
1943             elsif ($1 eq 'addr') { $getaddr = $getraw = 1 }
1944             elsif ($1 eq 'name') { $getname = 1 }
1945             }
1946             }
1947             my $request_lc = lc $request;
1948              
1949             # ALL: entire pristine or semi-raw headers
1950             if ($request eq 'ALL') {
1951             return ($getraw ? $self->{msg}->get_pristine_header()
1952             : $self->{msg}->get_all_headers(0));
1953             }
1954             # ALL-TRUSTED: entire trusted raw headers
1955             elsif ($request eq 'ALL-TRUSTED') {
1956             # '+1' since we added the received header even though it's not considered
1957             # trusted, so we know that those headers can be trusted too
1958             return $self->get_all_hdrs_in_rcvd_index_range(
1959             undef, $self->{last_trusted_relay_index}+1,
1960             undef, undef, $getraw);
1961             }
1962             # ALL-INTERNAL: entire internal raw headers
1963             elsif ($request eq 'ALL-INTERNAL') {
1964             # '+1' for the same reason as in ALL-TRUSTED above
1965             return $self->get_all_hdrs_in_rcvd_index_range(
1966             undef, $self->{last_internal_relay_index}+1,
1967             undef, undef, $getraw);
1968             }
1969             # ALL-UNTRUSTED: entire untrusted raw headers
1970             elsif ($request eq 'ALL-UNTRUSTED') {
1971             # '+1' for the same reason as in ALL-TRUSTED above
1972             return $self->get_all_hdrs_in_rcvd_index_range(
1973             $self->{last_trusted_relay_index}+1, undef,
1974             undef, undef, $getraw);
1975             }
1976             # ALL-EXTERNAL: entire external raw headers
1977             elsif ($request eq 'ALL-EXTERNAL') {
1978             # '+1' for the same reason as in ALL-TRUSTED above
1979             return $self->get_all_hdrs_in_rcvd_index_range(
1980             $self->{last_internal_relay_index}+1, undef,
1981             undef, undef, $getraw);
1982             }
1983             # EnvelopeFrom: the SMTP MAIL FROM: address
1984 3306     3306   5921 elsif ($request_lc eq "\LEnvelopeFrom") {
1985             $result = $self->get_envelope_from();
1986 3306         3548 }
1987 3306         3662 # untrusted relays list, as string
1988 3306         3482 elsif ($request_lc eq "\LX-Spam-Relays-Untrusted") {
1989 3306         3418 $result = $self->{relays_untrusted_str};
1990             }
1991             # trusted relays list, as string
1992 3306 100       6628 elsif ($request_lc eq "\LX-Spam-Relays-Trusted") {
1993 747         1560 $result = $self->{relays_trusted_str};
1994 747         4135 }
1995 747 100       2386 # external relays list, as string
  1 100       5  
    50          
1996 742         2211 elsif ($request_lc eq "\LX-Spam-Relays-External") {
1997 4         11 $result = $self->{relays_external_str};
1998             }
1999             # internal relays list, as string
2000 3306         4739 elsif ($request_lc eq "\LX-Spam-Relays-Internal") {
2001             $result = $self->{relays_internal_str};
2002             }
2003 3306 100       17256 # ToCc: the combined recipients list
    50          
    100          
    50          
    50          
    100          
    50          
    50          
    50          
    50          
    100          
    100          
2004             elsif ($request_lc eq "\LToCc") {
2005 15 50       94 $result = join("\n", $self->{msg}->get_header('To', $getraw));
2006             if ($result ne '') {
2007             chomp $result;
2008             $result .= ", " if $result =~ /\S/;
2009             }
2010             $result .= join("\n", $self->{msg}->get_header('Cc', $getraw));
2011             $result = undef if $result eq '';
2012 0         0 }
2013             # MESSAGEID: handle lists which move the real message-id to another
2014             # header for resending.
2015             elsif ($request eq 'MESSAGEID') {
2016             $result = join("\n", grep { defined($_) && $_ ne '' }
2017             $self->{msg}->get_header('X-Message-Id', $getraw),
2018             $self->{msg}->get_header('Resent-Message-Id', $getraw),
2019 4         23 $self->{msg}->get_header('X-Original-Message-ID', $getraw),
2020             $self->{msg}->get_header('Message-Id', $getraw));
2021             }
2022             # a conventional header
2023             else {
2024             my @results = $getraw ? $self->{msg}->raw_header($request)
2025             : $self->{msg}->get_header($request);
2026 0         0 # dbg("message: get(%s)%s = %s",
2027             # $request, $getraw?'raw':'', join(", ",@results));
2028             if (@results) {
2029             $result = join('', @results);
2030             } else { # metadata
2031             $result = $self->{msg}->get_metadata($request);
2032             }
2033 0         0 }
2034              
2035             # special queries
2036             if (defined $result && ($getaddr || $getname)) {
2037             local $1;
2038 110         557 $result =~ s/^[^:]+:(.*);\s*$/$1/gs; # 'undisclosed-recipients: ;'
2039             $result =~ s/\s+/ /g; # reduce whitespace
2040             $result =~ s/^\s+//; # leading whitespace
2041             $result =~ s/\s+$//; # trailing whitespace
2042 0         0  
2043             if ($getaddr) {
2044             # Get the email address out of the header
2045             # All of these should result in "jm@foo":
2046 0         0 # jm@foo
2047             # jm@foo (Foo Blah)
2048             # jm@foo, jm@bar
2049             # display: jm@foo (Foo Blah), jm@bar ;
2050 0         0 # Foo Blah <jm@foo>
2051             # "Foo Blah" <jm@foo>
2052             # "'Foo Blah'" <jm@foo>
2053             #
2054 0         0 # strip out the (comments)
2055             $result =~ s/\s*\(.*?\)//g;
2056             # strip out the "quoted text", unless it's the only thing in the string
2057             if ($result !~ /^".*"$/) {
2058 81         317 $result =~ s/(?<!<)"[^"]*"(?!\@)//g; #" emacs
2059 81 100       303 }
2060 8         24 # Foo Blah <jm@xxx> or <jm@xxx>
2061 8 50       48 local $1;
2062             $result =~ s/^[^"<]*?<(.*?)>.*$/$1/;
2063 81         257 # multiple addresses on one line? remove all but first
2064 81 100       237 $result =~ s/,.*$//;
2065             }
2066             elsif ($getname) {
2067             # Get the display name out of the header
2068             # All of these should result in "Foo Blah":
2069 42 50       592 #
2070             # jm@foo (Foo Blah)
2071             # (Foo Blah) jm@foo
2072             # jm@foo (Foo Blah), jm@bar
2073 81         352 # display: jm@foo (Foo Blah), jm@bar ;
2074             # Foo Blah <jm@foo>
2075             # "Foo Blah" <jm@foo>
2076             # "'Foo Blah'" <jm@foo>
2077             #
2078 3015 100       8793 local $1;
2079             # does not handle mailbox-list or address-list or quotes well, to be improved
2080             if ($result =~ /^ \s* " (.*?) (?<!\\)" \s* < [^<>]* >/sx ||
2081 3015 100       4727 $result =~ /^ \s* (.*?) \s* < [^<>]* >/sx) {
2082 357         1399 $result = $1; # display-name, RFC 5322
2083             # name-addr = [display-name] angle-addr
2084 2658         5795 # display-name = phrase
2085             # phrase = 1*word / obs-phrase
2086             # word = atom / quoted-string
2087             # obs-phrase = word *(word / "." / CFWS)
2088             $result =~ s{ " ( (?: [^"\\] | \\. )* ) " }
2089 3287 100 100     9131 { my $s=$1; $s=~s{\\(.)}{$1}gs; $s }gsxe;
      100        
2090 153         400 $result =~ s/\\"/"/gs;
2091 153         349 } elsif ($result =~ /^ [^(,]*? \( (.*?) \) /sx) { # legacy form
2092 153         902 # nested comments are not handled, to be improved
2093 153         616 $result = $1;
2094 153         598 } else { # no display name
2095             $result = '';
2096 153 100       359 }
    50          
2097             $result =~ s/^ \s* ' \s* (.*?) \s* ' \s* \z/$1/sx;
2098             }
2099             }
2100             return $result;
2101             }
2102              
2103             # optimized for speed
2104             # $_[0] is self
2105             # $_[1] is request
2106             # $_[2] is defval
2107             my $cache = $_[0]->{c};
2108 149         429 my $found;
2109             if (exists $cache->{$_[1]}) {
2110 149 100       481 # return cache entry if it is known
2111 148         316 # (measured hit/attempts rate on a production mailer is about 47%)
2112             $found = $cache->{$_[1]};
2113             } else {
2114 149         299 # fill in a cache entry
2115 149         410 $found = _get(@_);
2116             $cache->{$_[1]} = $found;
2117 149         449 }
2118             # if the requested header wasn't found, we should return a default value
2119             # as specified by the caller: if defval argument is present it represents
2120             # a default value even if undef; if defval argument is absent a default
2121             # value is an empty string for upwards compatibility
2122             return (defined $found ? $found : @_ > 2 ? $_[2] : '');
2123             }
2124              
2125             ###########################################################################
2126              
2127             # uri parsing from plain text:
2128             # The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
2129             # ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
2130             # URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
2131 4         7 # When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
2132             # of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
2133 4 100 66     111 # because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
    50          
2134             # The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
2135 3         11 # then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
2136             # likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
2137             # it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
2138             # 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
2139             #
2140             # We get the entire URI that would be linkified before dealing with it, in order to do the right thing
2141 3         23 # with URI-encodings and redirecting URIs.
  0         0  
  0         0  
  0         0  
2142 3         14 #
2143             # The delimiters for start of a URI in TBird are @(`{|[\"'<>,\s in OE they are ("<\s
2144             #
2145 0         0 # Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
2146             # TBird's end delimiters are )`{}|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
2147 1         5 # OE only uses space as a delimiter, but ignores [~!@#^&*()_+`-={}|[]:";'<>?,.]* at the end.
2148             #
2149 4         12 # Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
2150             # TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
2151             # delimiters include ( and '
2152 3287         7911 #
2153             # bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
2154              
2155             my ($self) = @_;
2156              
2157             # Cached?
2158             return $self->{tbirdurire} if exists $self->{tbirdurire};
2159              
2160 4131     4131 1 18447 # a hybrid of tbird and oe's version of uri parsing
2161 4131         4313 my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2162 4131 100       7606 my $iso2022shift = "\x1b" . '\(.'; # bug 4522
2163             my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2164             my $nonASCII = '\x80-\xff';
2165 825         1503  
2166             # schemeless uri start delimiter, combo of most punctuations and delims above
2167             my $scstartdelim = qr/[\!\"\#\$\&\'\(\)\*\+\,\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\x1b\xa0]/;
2168 3306         5411  
2169 3306         9278 # bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
2170             # it's a common thing to find in text
2171             my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
2172             my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
2173              
2174             # valid TLDs
2175 4131 100       24452 my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
    100          
2176              
2177             # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
2178             # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
2179             my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,2048})?/i;
2180             my $uriknownscheme = qr/(?:(?:https?|ftp):\/\/|(?:www\d{0,2}|ftp)\.)[^$tbirdenddelim]{1,2048}/i;
2181             my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
2182              
2183             $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
2184             (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
2185             (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
2186             (?:(?:^|(?<=$scstartdelim))($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/ix;
2187              
2188             return $self->{tbirdurire};
2189             }
2190              
2191             =item $status->get_uri_list ()
2192              
2193             Returns an array of all unique URIs found in the message. It takes
2194             a combination of the URIs found in the rendered (decoded and HTML
2195             stripped) body and the URIs found when parsing the HTML in the message.
2196             Will also set $status->{uri_list} (the array as returned by this function).
2197              
2198             The returned array will include the "raw" URI as well as
2199             "slightly cooked" versions. For example, the single URI
2200             'http://%77&#00119;%77.example.com/' will get turned into:
2201             ( 'http://%77&#00119;%77.example.com/', 'http://www.example.com/' )
2202              
2203             =cut
2204              
2205             my ($self) = @_;
2206              
2207             # use cached answer if available
2208             if (exists $self->{uri_list}) {
2209 131     131   331 return @{$self->{uri_list}};
2210             }
2211              
2212 131 50       405 my %uris;
2213             # $self->{redirect_num} = 0;
2214              
2215 131         338 # get URIs from text/HTML parsing
2216 131         236 while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
2217 131         230 if ($info->{cleaned}) {
2218 131         321 foreach (@{$info->{cleaned}}) {
2219             $uris{$_} = 1;
2220              
2221 131         537 # count redirection attempts and log it
2222             # if (my @http = m{\b(https?:/{0,2})}gi) {
2223             # $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
2224             # }
2225 131         461 }
2226 131         377 }
2227             }
2228              
2229 131         421 @{$self->{uri_list}} = keys %uris;
2230             # $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
2231              
2232             return @{$self->{uri_list}};
2233 131         333584 }
2234 131         2483  
2235 131         1908 =item $status->get_uri_detail_list ()
2236              
2237 131         338651 Returns a hash reference of all unique URIs found in the message and
2238             various data about where the URIs were found in the message. It takes a
2239             combination of the URIs found in the rendered (decoded and HTML stripped)
2240             body and the URIs found when parsing the HTML in the message. Will also
2241             set $status->{uri_detail_list} (the hash reference as returned by this
2242 131         1786 function).
2243              
2244             The hash format looks something like this:
2245              
2246             raw_uri => {
2247             types => { a => 1, img => 1, parsed => 1, domainkeys => 1,
2248             unlinked => 1, schemeless => 1 },
2249             cleaned => [ canonicalized_uri ],
2250             anchor_text => [ "click here", "no click here" ],
2251             domains => { domain1 => 1, domain2 => 1 },
2252             hosts => { host1 => domain1, host2 => domain2 },
2253             }
2254              
2255             C<raw_uri> is whatever the URI was in the message itself
2256             (http://spamassassin.apache%2Eorg/). Uris parsed from text will be prefixed
2257             with scheme if missing (http://, mailto: etc). HTML uris are as found.
2258              
2259             C<types> is a hash of the HTML tags (lowercase) which referenced the
2260 131     131 1 462 raw_uri. I<parsed> is a faked type which specifies that the raw_uri was
2261             seen in the rendered text. I<domainkeys> is defined when raw_uri was found
2262             from DK/DKIM d= field. I<unlinked> is defined when it's assumed that MUA
2263 131 50       472 will not linkify uri (found in body without scheme or www. prefix).
2264 0         0 I<schemeless> is always added for uris without scheme, regardless of
  0         0  
2265             linkifying (i.e. email address found in body without mailto:).
2266              
2267 131         240 C<cleaned> is an array of the raw and canonicalized version of the raw_uri
2268             (http://spamassassin.apache%2Eorg/, https://spamassassin.apache.org/).
2269              
2270             C<anchor_text> is an array of the anchor text (text between <a> and
2271 131         249 </a>), if any, which linked to the URI.
  619         1109  
2272 488 50       835  
2273 488         494 C<domains> is a hash of the domains found in the canonicalized URIs.
  488         675  
2274 699         1247  
2275             C<hosts> is a hash of unstripped hostnames found in the canonicalized URIs
2276             as hash keys, with their domain part stored as a value of each hash entry.
2277              
2278             =cut
2279              
2280             my ($self) = @_;
2281              
2282             # process only once, use unique uri_detail_list_run flag,
2283             # in case add_uri_detail_list has already been called
2284 131         500 if ($self->{uri_detail_list_run}) {
  131         470  
2285             return $self->{uri_detail_list};
2286             }
2287 131         247 $self->{uri_detail_list_run} = 1;
  131         766  
2288              
2289             my $timer = $self->{main}->time_method("get_uri_detail_list");
2290              
2291             # process text parsed uris
2292             $self->_process_text_uri_list();
2293             # process html uris
2294             $self->_process_html_uri_list();
2295             # process dkim uris
2296             $self->_process_dkim_uri_list();
2297              
2298             return $self->{uri_detail_list};
2299             }
2300              
2301             my ($self) = @_;
2302              
2303             # Use decoded stripped body, which does not contain HTML
2304             my $textary = $self->get_decoded_stripped_body_text_array();
2305             my $tbirdurire = $self->_tbirdurire;
2306             my %seen;
2307             my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
2308              
2309             foreach my $text (@$textary) {
2310             # a workaround for [perl #69973] bug:
2311             # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
2312             # Bug 6225, regexp and string should both be utf8, or none of them;
2313             # untainting string also seems to avoid the crash
2314             #
2315             # Bug 6225: untaint the string in an attempt to work around a perl crash
2316             local $_ = untaint_var($text);
2317              
2318             local($1,$2,$3);
2319             while (/$tbirdurire/igo) {
2320             my $rawuri = $1||$2||$3;
2321             my $schost = $4;
2322             my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
2323             $rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
2324             $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
2325              
2326             next if exists $seen{$rawuri};
2327             $seen{$rawuri} = 1;
2328              
2329             dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
2330              
2331             # Quick ignore if schemeless host not valid
2332             next if defined $schost && !is_fqdn_valid($schost, 1);
2333              
2334             # Ignore cid: mid: as they can be mistaken for emails,
2335             # these should not be parsed from stripped body in any case.
2336 623     623 1 912 # Example: [cid:image001.png@01D4986E.E3459640]
2337             next if $rawuri =~ /^[cm]id:/i;
2338              
2339             # Ignore empty uris
2340 623 100       1120 next if $rawuri =~ /^\w+:\/{0,2}$/i;
2341 492         1128  
2342             my $types = {parsed => 1};
2343 131         300  
2344             # If it's a hostname that was just sitting out in the
2345 131         563 # open, without a protocol, and not inside of an HTML tag,
2346             # the we should add the proper protocol in front, rather
2347             # than using the base URI.
2348 131         814 my $uri = $rawuri;
2349             if ($uri !~ /^(?:https?|ftp|mailto):/i) {
2350 131         728 if ($uri =~ /^ftp\./i) {
2351             $uri = "ftp://$uri";
2352 131         598 }
2353             elsif ($uri =~ /^www\d{0,2}\./i) {
2354 131         737 $uri = "http://$uri";
2355             }
2356             elsif (index($uri, '@') != -1) {
2357             # This is not linkified by MUAs: foo@bar%2Ecom
2358 131     131   322 # This IS linkified: foo@bar%2Ebar.com
2359             # And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
2360             # And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
2361 131         436 # ...
2362 131         540 # Skip if no dot found after @, tested without urldecoding,
2363 131         314 # quick skip for crap like Vi@gra.
2364 131         679 next unless $uri =~ /\@.+?\./;
2365             next if index($uri, '&nbsp;') != -1; # ignore garbled
2366 131         645 $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
2367             $uri = "mailto:$uri";
2368             }
2369             else {
2370             # some spammers are using unschemed URIs to escape filters
2371             # flag that this is a URI that MUAs don't linkify so only use for RBLs
2372             # (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
2373 672         1730 next if index($uri, '.') == -1; # skip unless dot found, garbage
2374             $uri = "http://$uri";
2375 672         1618 $types->{unlinked} = 1;
2376 672         56723 }
2377 325   66     1201 # Mark any of those schemeless
2378 325         561 $types->{schemeless} = 1;
2379 325 100       772 }
    100          
2380 325         685  
2381 325         1601 if ($uri =~ /^mailto:/i) {
2382             # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
2383 325 100       939 $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
2384 306         668 # Skip unless @ found after decoding, then check tld is valid
2385             next unless $uri =~ /\@([^?&>]*)/;
2386 306 50       500 my $host = $1; $host =~ s/(?:\%20)+$//; # strip trailing %20 from host
2387             next unless $self->{main}->{registryboundaries}->is_domain_valid($host);
2388             }
2389 306 100 100     756  
2390             dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
2391              
2392             $self->add_uri_detail_list($uri, $types, 'parsed', 1);
2393             }
2394 297 50       653 }
2395             }
2396              
2397 297 50       746 my ($self) = @_;
2398              
2399 297         623 # get URIs from HTML parsing
2400             # use the metadata version since $self->{html} may not be setup
2401             my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
2402             $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
2403              
2404             # canonicalize the HTML parsed URIs
2405 297         435 while(my($uri, $info) = each %{ $detail }) {
2406 297 100       825 if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
2407 140 100       501 # Need also to copy and uniq anchor text
    100          
    100          
2408 1         4 if (exists $info->{anchor_text}) {
2409             my %seen;
2410             foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
2411 59         143 push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
2412             }
2413             }
2414             }
2415             }
2416             }
2417              
2418             my ($self) = @_;
2419              
2420             # This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
2421 29 50       115 # this functionality
2422 29 50       76 # 2013-01-07
2423 29         48 # This functionality is re-enabled as a configuration option disabled by
2424 29         66 # default (bug 7087)
2425             # 2014-10-06
2426              
2427             # Look for the domain in DK/DKIM headers
2428             if ($self->{conf}->{parse_dkim_uris}) {
2429             my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef ),
2430 51 50       113 $self->get('DKIM-Signature',undef) ));
2431 51         89 while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
2432 51         96 my $d = $1;
2433             $d =~ s/\s+//g;
2434             # prefix with domainkeys: so it doesn't merge with identical keys
2435 140         225 $self->add_uri_detail_list("domainkeys:$d",
2436             {'domainkeys'=>1, 'nocanon'=>1, 'noclean'=>1},
2437             'domainkeys', 1);
2438 297 100       703 }
2439             }
2440 36 50       92 }
2441              
2442 36 100       116 =item $status->add_uri_detail_list ($raw_uri, $types, $source, $valid_domain)
2443 34         64  
  34         62  
2444 34 100       114 Adds values to internal uri_detail_list. When used from Plugins, recommended
2445             to call from parsed_metadata (along with register_method_priority, -10) so
2446             other Plugins calling get_uri_detail_list() will see it.
2447 292 50       487  
2448             C<raw_uri> is the URI to be added. The only required parameter.
2449 292         796  
2450             C<types> is an optional hash reference, contents are added to
2451             uri_detail_list->{types} (see get_uri_detail_list for known keys).
2452             I<parsed> is default is no hash given. I<nocanon> does not run
2453             uri_list_canonicalize (no redirector, uri fixing). I<noclean> skips adding
2454             uri_detail_list->{cleaned}, so it would not be used in "uri" rule checks,
2455 131     131   386 but domain/hosts would still be used for URIBL/RBL purposes.
2456              
2457             C<source> is an optional simple string, only used for debug logging purposes
2458             to identify where uri originates from (default: "parsed").
2459 131   100     1166  
2460 131 50       469 C<valid_domain> is an optional boolean (0/1). If true, uri will not be
2461             added unless hostname/domain is in valid format and contains a valid TLD.
2462             (default: 0)
2463 131         254  
  344         1333  
2464 213 50       442 =cut
2465              
2466 213 100       403 my ($self, $uri, $types, $source, $valid_domain) = @_;
2467 119         138  
2468 119         137 $types = {'parsed' => 1} unless defined $types;
  203         503  
  119         213  
2469 119         147 $source ||= 'parsed';
  119         410  
2470              
2471             my (%domains, %hosts, %cleaned);
2472             my $udl = $self->{uri_detail_list};
2473              
2474             dbg("uri: canonicalizing $source uri: $uri");
2475              
2476             my @uris;
2477 131     131   304 if ($types->{nocanon}) {
2478             push @uris, $uri;
2479             } else {
2480             @uris = uri_list_canonicalize($self->{conf}->{redirector_patterns}, $uri);
2481             }
2482             foreach my $cleanuri (@uris) {
2483             # Make sure all the URIs are nice and short
2484             if (length($cleanuri) > MAX_URI_LENGTH) {
2485             $self->{'uri_truncated'} = 1;
2486             $cleanuri = substr($cleanuri, 0, MAX_URI_LENGTH);
2487 131 50       490 }
2488 131         525 dbg("uri: cleaned uri: $cleanuri");
  262         833  
2489             $cleaned{$cleanuri} = 1;
2490 131         629 my ($domain, $host) = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
2491 0         0 if (defined $domain) {
2492 0         0 dbg("uri: added host: $host domain: $domain");
2493             $domains{$domain} = 1;
2494 0         0 $hosts{$host} = $domain;
2495             }
2496             }
2497              
2498             # Bail out if no good uri found
2499             return unless %cleaned;
2500              
2501             # Bail out if no domains/hosts found?
2502             return if $valid_domain && !%domains;
2503              
2504             # Merge cleaned
2505             if (!$types->{noclean}) {
2506             if ($udl->{$uri}->{cleaned}) {
2507             $cleaned{$_} = 1 foreach (@{$udl->{$uri}->{cleaned}});
2508             }
2509             @{$udl->{$uri}->{cleaned}} = keys %cleaned;
2510             }
2511              
2512             # Domains/hosts (there might not be any)
2513             $udl->{$uri}->{domains}->{$_} = 1 foreach keys %domains;
2514             $udl->{$uri}->{hosts}->{$_} = $hosts{$_} foreach keys %hosts;
2515              
2516             # Types
2517             $udl->{$uri}->{types}->{$_} = 1 foreach keys %$types;
2518              
2519             # Invalidate uri_list cache
2520             delete $self->{uri_list};
2521              
2522             return 1;
2523             }
2524              
2525              
2526 505     505 1 1008 ###########################################################################
2527              
2528 505 50       832 my $self = shift;
2529 505   50     848 my $metarule = shift;
2530             # @_ is now the list of rules
2531 505         614  
2532 505         664 foreach my $r (@_) {
2533             # dbg("rules: meta rule depends on net rule $r");
2534 505         1590 next if ($self->is_rule_complete($r));
2535              
2536 505         641 dbg("rules: meta rule $metarule depends on pending rule $r, blocking");
2537 505 50       833 my $timer = $self->{main}->time_method("wait_for_pending_rules");
2538 0         0  
2539             my $start = time;
2540 505         1455 $self->harvest_until_rule_completes($r);
2541             my $elapsed = sprintf "%.2f", time - $start;
2542 505         924  
2543             if (!$self->is_rule_complete($r)) {
2544 717 50       1294 dbg("rules: rule $r is still not complete; exited early?");
2545 0         0 }
2546 0         0 elsif ($elapsed > 0) {
2547             my $txt = "rules: $r took $elapsed seconds to complete, for $metarule";
2548 717         1998 # Info only if something took over 1 sec to wait, prevent log flood
2549 717         1059 if ($elapsed >= 1) { info($txt); } else { dbg($txt); }
2550 717         2075 }
2551 717 100       1437 }
2552 574         1764 }
2553 574         1146  
2554 574         1354 ###########################################################################
2555              
2556             # use a separate sub here, for brevity
2557             # called out of generated eval
2558             my ($self, $rulename) = @_;
2559 505 50       878 warn "rules: failed to run $rulename test, skipping:\n\t($@)\n";
2560             $self->{rule_errors}++;
2561             }
2562 505 100 100     1427  
2563             my ($self, $function) = @_;
2564              
2565 489 50       861 if (!$function) {
2566 489 100       1127 warn "rules: empty function name";
2567 1         3 return;
  1         4  
2568             }
2569 489         833  
  489         1408  
2570             # only need to call this once per fn (globally)
2571             return if exists $TEMPORARY_EVAL_GLUE_METHODS{$function};
2572             $TEMPORARY_EVAL_GLUE_METHODS{$function} = undef;
2573 489         1489  
2574 489         1411 # return if it's not an eval_plugin function
2575             return if (!exists $self->{conf}->{eval_plugins}->{$function});
2576              
2577 489         1680 # return if it's been registered already
2578             return if ($self->can ($function) &&
2579             defined &{'Mail::SpamAssassin::PerMsgStatus::'.$function});
2580 489         712  
2581             my $evalstr = <<"ENDOFEVAL";
2582 489         4613 {
2583             package Mail::SpamAssassin::PerMsgStatus;
2584              
2585             sub $function {
2586             my (\$self) = shift;
2587             my \$plugin = \$self->{conf}->{eval_plugins}->{$function};
2588             return \$plugin->$function (\$self, \@_);
2589 0     0 0 0 }
2590 0         0  
2591             1;
2592             }
2593 0         0 ENDOFEVAL
2594             eval $evalstr . '; 1' ## no critic
2595 0 0       0 or do {
2596             my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
2597 0         0 warn "rules: failed to run header tests, skipping some: $eval_stat\n";
2598 0         0 $self->{rule_errors}++;
2599             };
2600 0         0  
2601 0         0 # ensure this method is deleted if finish_tests() is called
2602 0         0 push (@TEMPORARY_METHODS, $function);
2603             }
2604 0 0       0  
    0          
2605 0         0 ###########################################################################
2606              
2607             # note: only eval tests should store state in $self->{test_log_msgs};
2608 0         0 # pattern tests do not.
2609             #
2610 0 0       0 # the clearing of the test state is now inlined as:
  0         0  
  0         0  
2611             #
2612             # %{$self->{test_log_msgs}} = (); # clear test state
2613             #
2614             # except for this public API for plugin use:
2615              
2616             =item $status->clear_test_state()
2617              
2618             Clear test state, including test log messages from C<$status-E<gt>test_log()>.
2619              
2620 0     0 0 0 =cut
2621 0         0  
2622 0         0 my ($self) = @_;
2623             %{$self->{test_log_msgs}} = ();
2624             }
2625              
2626 3773     3773 0 5877 # internal API, called only by got_hit()
2627             # TODO: refactor and merge this into that function
2628 3773 50       5813 my ($self, $rule, $score, $area, $ruletype, $desc) = @_;
2629 0         0  
2630 0         0 $self->{main}->call_plugins ("hit_rule", {
2631             permsgstatus => $self,
2632             rulename => $rule,
2633             ruletype => $ruletype,
2634 3773 100       39582 score => $score
2635 283         616 });
2636              
2637             # ignore meta-match sub-rules.
2638 283 50       946 if ($rule =~ /^__/) { push(@{$self->{subtest_names_hit}}, $rule); return; }
2639              
2640             # this should not happen; warn about it
2641             if (!defined $score) {
2642 283 50 33     1682 warn "rules: score undef for rule '$rule' in '$area' '$desc'";
  0         0  
2643             return;
2644 283         775 }
2645              
2646             # this should not happen; warn about NaN (bug 3364)
2647             if ($score != $score) {
2648             warn "rules: score '$score' for rule '$rule' in '$area' '$desc'";
2649             return;
2650             }
2651              
2652             # Add the rule hit to the score
2653             $self->{score} += $score;
2654              
2655             push(@{$self->{test_names_hit}}, $rule);
2656             $area ||= '';
2657              
2658 283 50   466 1 27814 if ($score >= 10 || $score <= -10) {
  466     101 1 958  
  466     81 0 923  
  466     81 0 1448  
  101     4 0 256  
  101     77 0 263  
  101     93 1 383  
  81     81 1 250  
  81     81 0 223  
  81     81 1 302  
  81     134 0 249  
  81     93 0 251  
  81     81 0 327  
  4     555 0 10  
  4     81 0 11  
  4     81 0 58  
  77     4 0 226  
  77     4 0 569  
  77     4 0 371  
  81     4 0 256  
  81     4 0 224  
  81     4 0 452  
  81     28 0 231  
  81     4 0 235  
  81     4 0 348  
  81     4 0 252  
  81     0 0 247  
  81     77 0 415  
  81     77 0 249  
  81     77 0 240  
  81     77 0 339  
  81     77 0 235  
  81     77 0 259  
  81     77 0 368  
  81         249  
  81         204  
  81         383  
  81         240  
  81         216  
  81         339  
  543         1138  
  543         1019  
  543         1685  
  81         230  
  81         240  
  81         474  
  81         267  
  81         226  
  81         370  
  4         12  
  4         12  
  4         17  
  4         9  
  4         12  
  4         19  
  4         10  
  4         15  
  4         20  
  4         11  
  4         11  
  4         17  
  4         11  
  4         18  
  4         24  
  4         11  
  4         12  
  4         16  
  28         78  
  28         64  
  28         78  
  4         33  
  4         17  
  4         20  
  4         12  
  4         17  
  4         21  
  4         8  
  4         11  
  4         19  
  0         0  
  0         0  
  0         0  
  77         212  
  77         212  
  77         323  
  77         239  
  77         219  
  77         392  
  77         233  
  77         208  
  77         378  
  77         209  
  89         267  
  89         365  
  89         255  
  89         306  
  77         335  
  77         266  
  77         207  
  77         383  
  77         222  
  77         208  
  77         377  
2659 0 0       0 $score = sprintf("%4.0f", $score);
  0         0  
2660 0         0 }
2661 0         0 else {
2662             $score = sprintf("%4.1f", $score);
2663             }
2664              
2665 283         3504 # save both summaries
2666             # TODO: this is slower than necessary, if we only need one
2667             $self->{tag_data}->{REPORT} .= sprintf ("* %s %s %s%s\n%s",
2668             $score, $rule, $area,
2669             $self->_wrap_desc($desc,
2670             4+length($rule)+length($score)+length($area), "* "),
2671             ($self->{test_log_msgs}->{TERSE} ?
2672             "* " . $self->{test_log_msgs}->{TERSE} : ''));
2673              
2674             $self->{tag_data}->{SUMMARY} .= sprintf ("%s %-22s %s%s\n%s",
2675             $score, $rule, $area,
2676             $self->_wrap_desc($desc,
2677             3+length($rule)+length($score)+length($area), " " x 28),
2678             ($self->{test_log_msgs}->{LONG} || ''));
2679             if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
2680             $self->{tag_data}->{SUBJPREFIX} = $self->{subjprefix};
2681             }
2682             }
2683              
2684             my ($self, $desc, $firstlinelength, $prefix) = @_;
2685              
2686 0     0 1 0 my $firstline = " " x $firstlinelength;
2687 0         0 my $wrapped = Mail::SpamAssassin::Util::wrap($desc, $prefix, $firstline, $self->{conf}->{report_wrap_width}, 0);
  0         0  
2688             $wrapped =~ s/^\s+//s;
2689             $wrapped;
2690             }
2691              
2692             ###########################################################################
2693 140     128   416  
2694             =item $status->got_hit ($rulename, $desc_prepend [, name => value, ...])
2695 140         836  
2696             Register a hit against a rule in the ruleset.
2697              
2698             There are two mandatory arguments. These are C<$rulename>, the name of the rule
2699             that fired, and C<$desc_prepend>, which is a short string that will be
2700             prepended to the rules C<describe> string in output reports.
2701              
2702             In addition, callers can supplement that with the following optional
2703 140 100       657 data:
  88         166  
  88         205  
  88         232  
2704              
2705             =over 4
2706 56 50       207  
2707 4         16 =item score => $num
2708 4         19  
2709             Optional: the score to use for the rule hit. If unspecified,
2710             the value from the C<Mail::SpamAssassin::Conf> object's C<{scores}>
2711             hash will be used (a configured score), and in its absence the
2712 56 50       245 C<defscore> option value.
2713 4         11  
2714 16         64 =item defscore => $num
2715              
2716             Optional: the score to use for the rule hit if neither the
2717             option C<score> is provided, nor a configured score value is provided.
2718 56         179  
2719             =item value => $num
2720 56         114  
  56         168  
2721 56   100     320 Optional: the value to assign to the rule; the default value is C<1>.
2722             I<tflags multiple> rules use values of greater than 1 to indicate
2723 56 100 66     327 multiple hits. This value is accessible to meta rules.
2724 17         87  
2725             =item ruletype => $type
2726              
2727 55         730 Optional, but recommended: the rule type string. This is used in the
2728             C<hit_rule> plugin call, called by this method. If unset, I<'unknown'> is
2729             used.
2730              
2731             =item tflags => $string
2732              
2733             Optional: a string, i.e. a space-separated list of additional tflags
2734             to be appended to an existing list of flags in $self->{conf}->{tflags},
2735             such as: "nice noautolearn multiple". No syntax checks are performed.
2736              
2737 56 100       427 =item description => $string
2738              
2739             Optional: a custom rule description string. This is used in the
2740             C<hit_rule> plugin call, called by this method. If unset, the static
2741             description is used.
2742              
2743 56   100     371 =back
2744 140 50 33     636  
2745 88         151 Backward compatibility: the two mandatory arguments have been part of this API
2746             since SpamAssassin 2.x. The optional I<name=<gt>value> pairs, however, are a
2747             new addition in SpamAssassin 3.2.0.
2748              
2749             =cut
2750 192     104   646  
2751             my ($self, $rule, $area, %params) = @_;
2752 108         344  
2753 120         496 my $conf_ref = $self->{conf};
2754 108         522  
2755 108         939 my $dynamic_score_provided;
2756             my $score = $params{score};
2757             if (defined $score) { # overrides any configured scores
2758             $dynamic_score_provided = 1;
2759             } else {
2760             $score = $conf_ref->{scores}->{$rule};
2761             $score = $params{defscore} if !defined $score;
2762             }
2763              
2764             # adding a hit does nothing if we don't have a score -- we probably
2765             # shouldn't have run it in the first place
2766             if (!$score) {
2767             %{$self->{test_log_msgs}} = ();
2768             return;
2769             }
2770              
2771             # ensure that rule values always result in an *increase*
2772             # of $self->{tests_already_hit}->{$rule}:
2773             my $value = $params{value};
2774             if (!$value || $value <= 0) { $value = 1 }
2775              
2776             my $tflags_ref = $conf_ref->{tflags};
2777             my $tflags_add = $params{tflags};
2778             if (defined $tflags_add && $tflags_add ne '') {
2779             $_ = (!defined $_ || $_ eq '') ? $tflags_add : ($_ . ' ' . $tflags_add)
2780             for $tflags_ref->{$rule};
2781             };
2782              
2783             my $already_hit = $self->{tests_already_hit}->{$rule} || 0;
2784             # don't count hits multiple times, unless 'tflags multiple' is on
2785             if ($already_hit && ($tflags_ref->{$rule}||'') !~ /\bmultiple\b/) {
2786             %{$self->{test_log_msgs}} = ();
2787             return;
2788             }
2789              
2790             $self->{tests_already_hit}->{$rule} = $already_hit + $value;
2791              
2792             # default ruletype, if not specified:
2793             $params{ruletype} ||= 'unknown';
2794              
2795             if ($dynamic_score_provided) { # copy it to static for proper reporting
2796             $conf_ref->{scoreset}->[$_]->{$rule} = $score for (0..3);
2797             $conf_ref->{scores}->{$rule} = $score;
2798             }
2799              
2800             my $rule_descr = $params{description};
2801             if (defined $rule_descr) {
2802             $conf_ref->{descriptions}->{$rule} = $rule_descr; # save dynamic descr.
2803             } else {
2804             $rule_descr = $conf_ref->get_description_for_rule($rule); # static
2805             }
2806             # Bug 6880 Set Rule Description to something that says no rule
2807             #$rule_descr = $rule if !defined $rule_descr || $rule_descr eq '';
2808             $rule_descr = "No description available." if !defined $rule_descr || $rule_descr eq '';
2809              
2810             if(defined $self->{conf}->{rewrite_header}->{Subject}) {
2811             my $rule_subjprefix = $conf_ref->{subjprefix}->{$rule};
2812             if (defined $rule_subjprefix) {
2813             dbg("subjprefix: setting Subject prefix to $rule_subjprefix");
2814             if($self->{subjprefix} !~ /\Q$rule_subjprefix\E/) {
2815             $self->{subjprefix} .= $rule_subjprefix . " "; # save dynamic subject prefix.
2816             }
2817             }
2818 132     132 1 2563 }
2819              
2820 132         308 $self->_handle_hit($rule,
2821             $score,
2822 170         243 $area,
2823 182         301 $params{ruletype},
2824 180 100       440 $rule_descr);
2825 50         98  
2826             # take care of duplicate rules, too (bug 5206)
2827 188         680 my $dups = $conf_ref->{duplicate_rules}->{$rule};
2828 188 50       635 if ($dups && @{$dups}) {
2829             foreach my $dup (@{$dups}) {
2830             $self->got_hit($dup, $area, %params);
2831             }
2832             }
2833 146 50       376  
2834 10         24 %{$self->{test_log_msgs}} = (); # clear test logs
  10         24  
2835 42         92 return 1;
2836             }
2837              
2838             ###########################################################################
2839              
2840 128         233 # TODO: this needs API doc
2841 128 100 66     356 my ($self, $msg) = @_;
  164         268  
2842             local $1;
2843 128         197 while ($msg =~ s/^(.{30,48})\s//) {
2844 128         197 $self->_test_log_line ($1);
2845 170 50 33     392 }
2846             $self->_test_log_line ($msg);
2847 42 0 0     50 }
2848              
2849             my ($self, $msg) = @_;
2850 170   100     587  
2851             $self->{test_log_msgs}->{TERSE} .= sprintf ("[%s]\n", $msg);
2852 170 100 50     485 if (length($msg) > 47) {
      66        
2853 42         158 $self->{test_log_msgs}->{LONG} .= sprintf ("%78s\n", "[$msg]");
  0         0  
2854 42         253 } else {
2855             $self->{test_log_msgs}->{LONG} .= sprintf ("%27s [%s]\n", "", $msg);
2856             }
2857 170         417 }
2858              
2859             ###########################################################################
2860 170   100     483  
2861             # helper for get()
2862 170 100       485 my ($self) = @_;
2863 0         0  
2864 84         172 # Cached?
2865             return $self->{envelopefrom} if exists $self->{envelopefrom};
2866              
2867 212         327 # bug 2142:
2868 212 50       479 # Get the SMTP MAIL FROM:, aka. the "envelope sender", if our
2869 84         281 # calling app has helpfully marked up the source message
2870             # with it. Various MTAs and calling apps each have their
2871 212         1002 # own idea of what header to use for this! see
2872              
2873             my $envf;
2874              
2875 186 100 66     873 # Rely on the 'envelope-sender-header' header if the user has configured one.
2876             # Assume that because they have configured it, their MTA will always add it.
2877 186 50       448 # This will prevent us falling through and picking up inappropriate headers.
2878 58         69 if (defined $self->{conf}->{envelope_sender_header}) {
2879 58 0       80 # make sure we get the most recent copy - there can be only one EnvelopeSender.
2880 58         106 $envf = $self->get($self->{conf}->{envelope_sender_header}.":addr",undef);
2881 2 0       4 # ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
2882 56         104 if (defined $envf && (index($envf, '@') > 0 || $envf eq '')) {
2883             dbg("message: using envelope_sender_header '%s' as EnvelopeFrom: '%s'",
2884             $self->{conf}->{envelope_sender_header}, $envf);
2885             $self->{envelopefrom} = $envf;
2886             return $envf;
2887             }
2888             # Warn them if it's configured, but not there or not usable.
2889             if (defined $envf) {
2890             dbg("message: envelope_sender_header '%s': '%s' is not valid, ignoring",
2891 184         636 $self->{conf}->{envelope_sender_header}, $envf);
2892             } else {
2893             dbg("message: envelope_sender_header '%s' not found in message",
2894 186         403 $self->{conf}->{envelope_sender_header});
2895 128 50 33     333 }
  0         0  
2896 0         0 # Couldn't get envelope-sender using the configured header.
  111         267  
2897 111         4798 $self->{envelopefrom} = undef;
2898             return;
2899             }
2900              
2901 182         243 # User hasn't given us a header to trust, so try to guess the sender.
  186         339  
2902 186         1676  
2903             # use the "envelope-sender" string found in the Received headers,
2904             # if possible... use the last untrusted header, in case there's
2905             # trusted headers.
2906             my $lasthop = $self->{relays_untrusted}->[0];
2907             my $lasthop_str = 'last untrusted';
2908             if (!defined $lasthop) {
2909 58     4 0 113 # no untrusted headers? in that case, the message is ALL_TRUSTED.
2910 0         0 # use the first trusted header (ie. the oldest, originating one).
2911 58         166 $lasthop = $self->{relays_trusted}->[-1];
2912 70         182 $lasthop_str = 'first trusted';
2913             }
2914 20         52  
2915             if (defined $lasthop) {
2916             $envf = $lasthop->{envfrom};
2917             # ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
2918 8     4   16 if (defined $envf && (index($envf, '@') > 0 || $envf eq '')) {
2919             dbg("message: using $lasthop_str relay envelope-from as EnvelopeFrom: '$envf'");
2920 8         22 $self->{envelopefrom} = $envf;
2921 62 50       108 return $envf;
2922 62         228 }
2923             }
2924 62         156  
2925             # WARNING: a lot of list software adds an X-Sender for the original env-from
2926             # (including Yahoo! Groups). Unfortunately, fetchmail will pick it up and
2927             # reuse it as the env-from for *its* delivery -- even though the list
2928             # software had used a different env-from in the intervening delivery. Hence,
2929             # if this header is present, and there's a fetchmail sig in the Received
2930             # lines, we cannot trust any Envelope-From headers, since they're likely to
2931             # be incorrect fetchmail guesses.
2932 96     98 0 241  
2933             if (index($self->get("X-Sender"), '@') != -1) {
2934             my $rcvd = join(' ', $self->get("Received"));
2935 108 50       364 if (index($rcvd, '(fetchmail') != -1) {
2936             dbg("message: X-Sender and fetchmail signatures found, cannot trust envelope-from");
2937             $self->{envelopefrom} = undef;
2938             return;
2939             }
2940             }
2941              
2942             # procmailrc notes this (we now recommend adding it to Received instead)
2943 166         275 if (defined($envf = $self->get("X-Envelope-From:addr",undef))) {
2944             # heuristic: this could have been relayed via a list which then used
2945             # a *new* Envelope-from. check
2946             if ($self->get("ALL") =~ /^Received:.*?^X-Envelope-From:/smi) {
2947             dbg("message: X-Envelope-From header found after 1 or more Received lines, cannot trust envelope-from");
2948 166 50       556 $self->{envelopefrom} = undef;
2949             return;
2950 60         139 } else {
2951             dbg("message: using X-Envelope-From header as EnvelopeFrom: '$envf'");
2952 62 0 0     174 $self->{envelopefrom} = $envf;
      0        
2953             return $envf;
2954 72         247 }
2955 62         156 }
2956 12         69  
2957             # qmail, new-inject(1)
2958             if (defined($envf = $self->get("Envelope-Sender:addr",undef))) {
2959 12 0       53 # heuristic: this could have been relayed via a list which then used
2960             # a *new* Envelope-from. check
2961 12         80 if ($self->get("ALL") =~ /^Received:.*?^Envelope-Sender:/smi) {
2962             dbg("message: Envelope-Sender header found after 1 or more Received lines, cannot trust envelope-from");
2963             } else {
2964 0         0 dbg("message: using Envelope-Sender header as EnvelopeFrom: '$envf'");
2965             $self->{envelopefrom} = $envf;
2966             return $envf;
2967 0         0 }
2968 50         130 }
2969              
2970             # Postfix, sendmail, amavisd-new, ...
2971             # RFC 2821 requires it:
2972             # When the delivery SMTP server makes the "final delivery" of a
2973             # message, it inserts a return-path line at the beginning of the mail
2974             # data. This use of return-path is required; mail systems MUST support
2975             # it. The return-path line preserves the information in the <reverse-
2976 144         311 # path> from the MAIL command.
2977 144         339 if (defined($envf = $self->get("Return-Path:addr",undef))) {
2978 94 100       261 # heuristic: this could have been relayed via a list which then used
2979             # a *new* Envelope-from. check
2980             if ($self->get("ALL") =~ /^Received:.*?^Return-Path:/smi) {
2981 86         211 dbg("message: Return-Path header found after 1 or more Received lines, cannot trust envelope-from");
2982 86         165 } else {
2983             dbg("message: using Return-Path header as EnvelopeFrom: '$envf'");
2984             $self->{envelopefrom} = $envf;
2985 94 100       234 return $envf;
2986 83         132 }
2987             }
2988 83 0 0     181  
      33        
2989 50         642 # give up.
2990 42         67 $self->{envelopefrom} = undef;
2991 42         86 return;
2992             }
2993              
2994             ###########################################################################
2995              
2996             # helper for get(ALL-*). get() caches its results, so don't call this
2997             # directly unless you need a range of headers not covered by the ALL-*
2998             # psuedo-headers!
2999              
3000             # Get all the headers found between an index range of received headers, the
3001             # index doesn't care if we could parse the received headers or not.
3002             # Use undef for the $start_rcvd or $end_rcvd numbers to start/end with the
3003 148 50       460 # first/last header in the message, otherwise indicate the index number you
3004 15         50 # want to start/end at. Set $include_start_rcvd or $include_end_rcvd to 0 to
3005 42 0       76 # indicate you don't want to include the received header found at the start or
3006 45         71 # end indexes... basically toggles between [s,e], [s,e), (s,e], (s,e).
3007 57         144 my ($self, $start_rcvd, $end_rcvd, $include_start_rcvd, $include_end_rcvd, $getraw) = @_;
3008 57         134  
3009             # prevent bad input causing us to return the first header found
3010             return if (defined $end_rcvd && $end_rcvd < 0);
3011              
3012             $include_start_rcvd = 1 unless defined $include_start_rcvd;
3013 106 50       457 $include_end_rcvd = 1 unless defined $include_end_rcvd;
3014              
3015             my $cur_rcvd_index = -1; # none found yet
3016 45 0       174 my $result = '';
3017 16         47  
3018 16         101 my @hdrs;
3019 16         125 if ($getraw) {
3020             @hdrs = $self->{msg}->get_pristine_header() =~ /^([^ \t].*?\n)(?![ \t])/smgi;
3021 16         100 } else {
3022 12         372 @hdrs = split(/^/m, $self->{msg}->get_all_headers(0));
3023 12         25 }
3024              
3025             foreach my $hdr (@hdrs) {
3026             if ($hdr =~ /^Received:/i) {
3027             $cur_rcvd_index++;
3028 106 50       754 next if (defined $start_rcvd && !$include_start_rcvd &&
3029             $start_rcvd == $cur_rcvd_index);
3030             last if (defined $end_rcvd && !$include_end_rcvd &&
3031 30 0       164 $end_rcvd == $cur_rcvd_index);
3032 20         74 }
3033             if ((!defined $start_rcvd || $start_rcvd <= $cur_rcvd_index) &&
3034 12         94 (!defined $end_rcvd || $cur_rcvd_index < $end_rcvd)) {
3035 12         116 $result .= $hdr;
3036 12         40 }
3037             elsif (defined $end_rcvd && $cur_rcvd_index == $end_rcvd) {
3038             $result .= $hdr;
3039             last;
3040             }
3041             }
3042             return ($result eq '' ? undef : $result);
3043             }
3044              
3045             ###########################################################################
3046              
3047 106 100       429  
3048             ###########################################################################
3049              
3050 1 50       4 =item $status->create_fulltext_tmpfile (fulltext_ref)
3051 4         11  
3052             This function creates a temporary file containing the passed scalar
3053 5         14 reference data (typically the full/pristine text of the message).
3054 5         14 This is typically used by external programs like pyzor and dccproc, to
3055 5         13 avoid hangs due to buffering issues. Methods that need this, should
3056             call $self->create_fulltext_tmpfile($fulltext) to retrieve the temporary
3057             filename; it will be created if it has not already been.
3058              
3059             Note: This can only be called once until $status->delete_fulltext_tmpfile() is
3060 97         324 called.
3061 97         244  
3062             =cut
3063              
3064             my ($self, $fulltext) = @_;
3065              
3066             if (defined $self->{fulltext_tmpfile}) {
3067             return $self->{fulltext_tmpfile};
3068             }
3069              
3070             my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
3071             $tmpfh or die "failed to create a temporary file";
3072              
3073             # PerlIO's buffered print writes in 8 kB chunks - which can be slow.
3074             # print $tmpfh $$fulltext or die "error writing to $tmpf: $!";
3075             #
3076             # reducing the number of writes and bypassing extra buffering in PerlIO
3077             # speeds up writing of larger text by a factor of 2
3078 0     4 0 0 my $nwrites;
3079             for (my $ofs = 0; $ofs < length($$fulltext); $ofs += $nwrites) {
3080             $nwrites = $tmpfh->syswrite($$fulltext, length($$fulltext)-$ofs, $ofs);
3081 0 50 33     0 defined $nwrites or die "error writing to $tmpf: $!";
3082             }
3083 0 50       0 close $tmpfh or die "error closing $tmpf: $!";
3084 0 50       0  
3085             $self->{fulltext_tmpfile} = $tmpf;
3086 0         0  
3087 4         19 dbg("check: create_fulltext_tmpfile, written %d bytes to file %s",
3088             length($$fulltext), $tmpf);
3089 0         0  
3090 0 50       0 return $self->{fulltext_tmpfile};
3091 0         0 }
3092              
3093 0         0 =item $status->delete_fulltext_tmpfile ()
3094              
3095             Will cleanup after a $status->create_fulltext_tmpfile() call. Deletes the
3096 0         0 temporary file and uncaches the filename.
3097 4 50       13  
3098 0         0 =cut
3099 0 0 0     0  
      0        
3100             my ($self) = @_;
3101 0 0 0     0 if (defined $self->{fulltext_tmpfile}) {
      0        
3102             if (!unlink $self->{fulltext_tmpfile}) {
3103             my $msg = sprintf("cannot unlink %s: %s", $self->{fulltext_tmpfile}, $!);
3104 0 50 33     0 # don't fuss too much if file is missing, perhaps it wasn't even created
    0 33        
      33        
      0        
3105             if ($! == ENOENT) { warn $msg } else { die $msg }
3106 0         0 }
3107             $self->{fulltext_tmpfile} = undef;
3108             }
3109 0         0 }
3110 0         0  
3111             ###########################################################################
3112              
3113 4 50       10 my ($self) = @_;
3114              
3115             if (exists $self->{all_from_addrs}) { return @{$self->{all_from_addrs}}; }
3116              
3117             my @addrs;
3118 0     88 0 0  
3119             # Resent- headers take priority, if present. see bug 672
3120             my $resent = $self->get('Resent-From',undef);
3121             if (defined $resent && $resent =~ /\S/) {
3122             @addrs = $self->{main}->find_all_addrs_in_line ($resent);
3123             }
3124             else {
3125             # bug 2292: Used to use find_all_addrs_in_line() with the same
3126             # headers, but the would catch addresses in comments which caused
3127             # FNs for things like whitelist_from. Since all of these are From
3128             # headers, there should only be 1 address in each anyway (not exactly
3129             # true, RFC 2822 allows multiple addresses in a From header field),
3130             # so use the :addr code...
3131             # bug 3366: some addresses come in as 'foo@bar...', which is invalid.
3132             # so deal with the multiple periods.
3133             ## no critic
3134             @addrs = map { tr/././s; $_ } grep { $_ ne '' }
3135             ($self->get('From:addr'), # std
3136             $self->get('Envelope-Sender:addr'), # qmail: new-inject(1)
3137 0     4 1 0 $self->get('Resent-Sender:addr'), # procmailrc manpage
3138             $self->get('X-Envelope-From:addr'), # procmailrc manpage
3139 0 50       0 $self->get('EnvelopeFrom:addr')); # SMTP envelope
3140 0         0 # http://www.cs.tut.fi/~jkorpela/headers.html is useful here
3141             }
3142              
3143 0         0 # Remove duplicate addresses
3144 4 50       17 my %addrs = map { $_ => 1 } @addrs;
3145             @addrs = keys %addrs;
3146              
3147             dbg("eval: all '*From' addrs: " . join(" ", @addrs));
3148             $self->{all_from_addrs} = \@addrs;
3149             return @addrs;
3150             }
3151 0         0  
3152 0         0 =item all_from_addrs_domains
3153 0         0  
3154 0 50       0 This function returns all the various from addresses in a message using all_from_addrs()
3155             and then returns only the domain names.
3156 0 50       0  
3157             =cut
3158 4         9  
3159             my ($self) = @_;
3160 4         9  
3161             if (exists $self->{all_from_addrs_domains}) {
3162             return @{$self->{all_from_addrs_domains}};
3163 4         12 }
3164              
3165             #TEST POINT - my @addrs = ("test.voipquotes2.net","test.voipquotes2.co.uk");
3166             #Start with all the normal from addrs
3167             my @addrs = all_from_addrs($self);
3168              
3169             dbg("eval: all '*From' addrs domains (before): " . join(" ", @addrs));
3170              
3171             #Take just the domain with a dummy localpart
3172             #removing invalid and duplicate domains
3173             my(%addrs_seen, @addrs_filtered);
3174 191     191 1 408 foreach my $a (@addrs) {
3175 191 100       1254 my $domain = $self->{main}->{registryboundaries}->uri_to_domain($a);
3176 4 50       8 next if !defined $domain || $addrs_seen{lc $domain}++;
3177 4         6 push(@addrs_filtered, 'dummy@'.$domain);
3178             }
3179 4 0       6  
  4         4  
  4         9  
3180             dbg("eval: all '*From' addrs domains (after uri to domain): " .
3181 0         0 join(" ", @addrs_filtered));
3182              
3183             $self->{all_from_addrs_domains} = \@addrs_filtered;
3184              
3185             return @addrs_filtered;
3186             }
3187              
3188 237     275 0 433 my ($self) = @_;
3189              
3190 237 100       577 if (exists $self->{all_to_addrs}) { return @{$self->{all_to_addrs}}; }
  171         282  
  154         923  
3191              
3192 79         146 my @addrs;
3193              
3194             # Resent- headers take priority, if present. see bug 672
3195 79         218 my $resent = join('', $self->get('Resent-To'), $self->get('Resent-Cc'));
3196 96 50 33     411 if ($resent =~ /\S/) {
3197 17         39 @addrs = $self->{main}->find_all_addrs_in_line($resent);
3198             } else {
3199             # OK, a fetchmail trick: try to find the recipient address from
3200             # the most recent 3 Received lines. This is required for sendmail,
3201             # since it does not add a helpful header like exim, qmail
3202             # or Postfix do.
3203             #
3204             my $rcvd = $self->get('Received');
3205             $rcvd =~ s/\n[ \t]+/ /gs;
3206             $rcvd =~ s/\n+/\n/gs;
3207              
3208             my @rcvdlines = split(/\n/, $rcvd, 4); pop @rcvdlines; # forget last one
3209 79         218 my @rcvdaddrs;
  39         162  
  43         216  
  395         784  
3210             foreach my $line (@rcvdlines) {
3211             if ($line =~ / for (\S+\@\S+);/) { push (@rcvdaddrs, $1); }
3212             }
3213              
3214             @addrs = $self->{main}->find_all_addrs_in_line (
3215             join('',
3216             join(" ", @rcvdaddrs)."\n",
3217             $self->get('To'), # std
3218             $self->get('Apparently-To'), # sendmail, from envelope
3219 83         215 $self->get('Delivered-To'), # Postfix, poss qmail
  43         177  
3220 79         219 $self->get('Envelope-Recipients'), # qmail: new-inject(1)
3221             $self->get('Apparently-Resent-To'), # procmailrc manpage
3222 83         415 $self->get('X-Envelope-To'), # procmailrc manpage
3223 83         262 $self->get('Envelope-To'), # exim
3224 83         506 $self->get('X-Delivered-To'), # procmail quick start
3225             $self->get('X-Original-To'), # procmail quick start
3226             $self->get('X-Rcpt-To'), # procmail quick start
3227             $self->get('X-Real-To'), # procmail quick start
3228             $self->get('Cc'))); # std
3229             # those are taken from various sources; thanks to Nancy McGough, who
3230             # noted some in <http://www.ii.com/internet/robots/procmail/qs/#envelope>
3231             }
3232              
3233             dbg("eval: all '*To' addrs: " . join(" ", @addrs));
3234             $self->{all_to_addrs} = \@addrs;
3235 4     50 1 18 return @addrs;
3236              
3237 4 0       72 # http://www.cs.tut.fi/~jkorpela/headers.html is useful here, also
3238 4         220 # http://www.exim.org/pipermail/exim-users/Week-of-Mon-20001009/021672.html
  4         52  
3239             }
3240              
3241             ###########################################################################
3242              
3243 4         13 1;
3244              
3245 4         20 =back
3246              
3247             =head1 SEE ALSO
3248              
3249 4         20 Mail::SpamAssassin(3)
3250 12         20 spamassassin(1)
3251 12         51