File Coverage

blib/lib/Mail/SpamAssassin/PerMsgStatus.pm
Criterion Covered Total %
statement 951 1264 75.2
branch 284 558 50.9
condition 91 221 41.1
subroutine 111 125 88.8
pod 43 94 45.7
total 1480 2262 65.4


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::PerMsgStatus - per-message status (spam or not-spam)
21              
22             =head1 SYNOPSIS
23              
24             my $spamtest = new Mail::SpamAssassin ({
25             'rules_filename' => '/etc/spamassassin.rules',
26             'userprefs_filename' => $ENV{HOME}.'/.spamassassin/user_prefs'
27             });
28             my $mail = $spamtest->parse();
29              
30             my $status = $spamtest->check ($mail);
31              
32             my $rewritten_mail;
33             if ($status->is_spam()) {
34             $rewritten_mail = $status->rewrite_mail ();
35             }
36             ...
37              
38              
39             =head1 DESCRIPTION
40              
41             The Mail::SpamAssassin C<check()> method returns an object of this
42             class. This object encapsulates all the per-message state.
43              
44             =head1 METHODS
45              
46             =over 4
47              
48             =cut
49              
50             package Mail::SpamAssassin::PerMsgStatus;
51              
52 40     40   282 use strict;
  40         87  
  40         1360  
53 40     40   226 use warnings;
  40         88  
  40         1354  
54 40     40   258 use re 'taint';
  40         99  
  40         1830  
55              
56 40     40   260 use Errno qw(ENOENT);
  40         98  
  40         2236  
57 40     40   258 use Time::HiRes qw(time);
  40         86  
  40         502  
58              
59 40     40   5715 use Mail::SpamAssassin::Constants qw(:sa);
  40         97  
  40         5873  
60 40     40   14013 use Mail::SpamAssassin::AsyncLoop;
  40         1451  
  40         5703  
61 40     40   297 use Mail::SpamAssassin::Conf;
  40         737  
  40         1126  
62 40     40   280 use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize is_fqdn_valid);
  40         77  
  40         2327  
63 40     40   19019 use Mail::SpamAssassin::Timeout;
  40         104  
  40         1260  
64 40     40   274 use Mail::SpamAssassin::Logger;
  40         75  
  40         50638  
65              
66             our @ISA = qw();
67              
68             # methods defined by the compiled ruleset; deleted in finish_tests()
69             our @TEMPORARY_METHODS;
70              
71             # methods defined by register_plugin_eval_glue(); deleted in finish_tests()
72             our %TEMPORARY_EVAL_GLUE_METHODS;
73              
74             ###########################################################################
75              
76             our %common_tags;
77              
78             BEGIN {
79             %common_tags = (
80              
81             YESNO => sub {
82 34         71 my $pms = shift;
83 34         181 $pms->_get_tag_value_for_yesno(@_);
84             },
85              
86             YESNOCAPS => sub {
87 2         6 my $pms = shift;
88 2         11 uc $pms->_get_tag_value_for_yesno(@_);
89             },
90              
91             SCORE => sub {
92 38         115 my $pms = shift;
93 38         164 $pms->_get_tag_value_for_score(@_);
94             },
95              
96             HITS => sub {
97 0         0 my $pms = shift;
98 0         0 $pms->_get_tag_value_for_score(@_);
99             },
100              
101             REQD => sub {
102 38         101 my $pms = shift;
103 38         152 $pms->_get_tag_value_for_required_score(@_);
104             },
105              
106             VERSION => \&Mail::SpamAssassin::Version,
107              
108 34         117 SUBVERSION => sub { $Mail::SpamAssassin::SUB_VERSION },
109              
110             RULESVERSION => sub {
111 0         0 my $pms = shift;
112 0         0 my $conf = $pms->{conf};
113 0         0 my @fnames;
114             @fnames =
115 0 0       0 keys %{$conf->{update_version}} if $conf->{update_version};
  0         0  
116 0 0       0 @fnames = sort @fnames if @fnames > 1;
117 0         0 join(',', map($conf->{update_version}{$_}, @fnames));
118             },
119              
120             HOSTNAME => sub {
121 39         83 my $pms = shift;
122             $pms->{conf}->{report_hostname} ||
123 39 50       400 Mail::SpamAssassin::Util::fq_hostname();
124             },
125              
126             REMOTEHOSTNAME => sub {
127 0         0 my $pms = shift;
128 0 0       0 $pms->{tag_data}->{'REMOTEHOSTNAME'} || "localhost";
129             },
130              
131             REMOTEHOSTADDR => sub {
132 0         0 my $pms = shift;
133 0 0       0 $pms->{tag_data}->{'REMOTEHOSTADDR'} || "127.0.0.1";
134             },
135              
136             LASTEXTERNALIP => sub {
137 0         0 my $pms = shift;
138 0         0 my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
139 0 0       0 $lasthop ? $lasthop->{ip} : '';
140             },
141              
142             LASTEXTERNALRDNS => sub {
143 0         0 my $pms = shift;
144 0         0 my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
145 0 0       0 $lasthop ? $lasthop->{rdns} : '';
146             },
147              
148             LASTEXTERNALHELO => sub {
149 0         0 my $pms = shift;
150 0         0 my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
151 0 0       0 $lasthop ? $lasthop->{helo} : '';
152             },
153              
154             CONTACTADDRESS => sub {
155 4         10 my $pms = shift;
156 4         24 $pms->{conf}->{report_contact};
157             },
158              
159             BAYES => sub {
160 0         0 my $pms = shift;
161             defined $pms->{bayes_score} ? sprintf("%3.4f", $pms->{bayes_score})
162 0 0       0 : "0.5";
163             },
164              
165 0         0 DATE => sub { Mail::SpamAssassin::Util::time_to_rfc822_date() },
166              
167             STARS => sub {
168 34         95 my $pms = shift;
169 34   50     156 my $arg = (shift || "*");
170 34         111 my $length = int($pms->{score});
171 34 50       157 $length = 50 if $length > 50;
172             # avoid a perl 5.21 warning: "Negative repeat count does nothing"
173 34 100       174 $length > 0 ? $arg x $length : '';
174             },
175              
176             AUTOLEARN => sub {
177 34         83 my $pms = shift;
178 34         173 $pms->get_autolearn_status();
179             },
180              
181             AUTOLEARNSCORE => sub {
182 0         0 my $pms = shift;
183 0         0 $pms->get_autolearn_points();
184             },
185              
186             TESTS => sub {
187 34         93 my $pms = shift;
188 34   50     203 my $arg = (shift || ',');
189 34 100       87 join($arg, sort @{$pms->{test_names_hit}}) || "none";
  34         268  
190             },
191              
192             SUBTESTS => sub {
193 0         0 my $pms = shift;
194 0   0     0 my $arg = (shift || ',');
195 0 0       0 join($arg, sort @{$pms->{subtest_names_hit}}) || "none";
  0         0  
196             },
197              
198             SUBTESTSCOLLAPSED => sub {
199 0         0 my $pms = shift;
200 0   0     0 my $arg = (shift || ',');
201 0         0 my (@subtests) = $pms->get_names_of_subtests_hit("collapsed");
202 0 0       0 join($arg, sort @subtests) || "none";
203             },
204              
205             TESTSSCORES => sub {
206 0         0 my $pms = shift;
207 0   0     0 my $arg = (shift || ",");
208 0         0 my $scores = $pms->{conf}->{scores};
209             join($arg, map($_ . "=" . ($scores->{$_} || '0'),
210 0 0 0     0 sort @{$pms->{test_names_hit}})) || "none";
  0         0  
211             },
212              
213             PREVIEW => sub {
214 4         11 my $pms = shift;
215 4         30 $pms->get_content_preview();
216             },
217              
218             REPORT => sub {
219 3         8 my $pms = shift;
220 3   50     37 "\n" . ($pms->{tag_data}->{REPORT} || "");
221             },
222              
223             SUBJPREFIX => sub {
224 0         0 my $pms = shift;
225 0 0       0 ($pms->{tag_data}->{SUBJPREFIX} || "");
226             },
227              
228             HEADER => sub {
229 0         0 my $pms = shift;
230 0         0 my $hdr = shift;
231 0 0       0 return if !$hdr;
232 0         0 $pms->get($hdr,undef);
233             },
234              
235             TIMING => sub {
236 0         0 my $pms = shift;
237 0         0 $pms->{main}->timer_report();
238             },
239              
240             ADDEDHEADERHAM => sub {
241 0         0 my $pms = shift;
242 0         0 $pms->_get_added_headers('headers_ham');
243             },
244              
245             ADDEDHEADERSPAM => sub {
246 0         0 my $pms = shift;
247 0         0 $pms->_get_added_headers('headers_spam');
248             },
249              
250             ADDEDHEADER => sub {
251 0         0 my $pms = shift;
252             $pms->_get_added_headers(
253 0 0       0 $pms->{is_spam} ? 'headers_spam' : 'headers_ham');
254             },
255              
256 40     40   577978 );
257             }
258              
259             sub new {
260 154     154 0 767 my $class = shift;
261 154   33     932 $class = ref($class) || $class;
262 154         499 my ($main, $msg, $opts) = @_;
263              
264             my $self = {
265             'main' => $main,
266             'msg' => $msg,
267             'score' => 0,
268             'test_log_msgs' => { },
269             'test_names_hit' => [ ],
270             'subtest_names_hit' => [ ],
271             'spamd_result_log_items' => [ ],
272             'tests_already_hit' => { },
273             'c' => { },
274             'tag_data' => { },
275             'rule_errors' => 0,
276             'disable_auto_learning' => 0,
277             'auto_learn_status' => undef,
278             'auto_learn_force_status' => undef,
279             'conf' => $main->{conf},
280             'async' => Mail::SpamAssassin::AsyncLoop->new($main),
281             'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
282 154         2083 'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
283             'uri_detail_list' => { },
284             'subjprefix' => "",
285             };
286              
287             dbg("check: pms new, time limit in %.3f s",
288 154 50       1472 $self->{master_deadline} - time) if $self->{master_deadline};
289              
290 154 50 66     792 if (defined $opts && $opts->{disable_auto_learning}) {
291 49         136 $self->{disable_auto_learning} = 1;
292             }
293              
294             # used with "mass-check --loghits"
295 154 50       622 if ($self->{main}->{save_pattern_hits}) {
296 0         0 $self->{save_pattern_hits} = 1;
297 0         0 $self->{pattern_hits} = { };
298             }
299              
300 154         350 delete $self->{should_log_rule_hits};
301 154         643 my $dbgcache = would_log('dbg', 'rules');
302 154 50 33     1017 if ($dbgcache || $self->{save_pattern_hits}) {
303 0         0 $self->{should_log_rule_hits} = 1;
304             }
305              
306             # known valid tags that might not get their entry in pms->{tag_data}
307             # in some circumstances
308 154         405 my $tag_data_ref = $self->{tag_data};
309 154         492 foreach (qw(SUMMARY REPORT SUBJPREFIX RBL)) { $tag_data_ref->{$_} = '' }
  616         1777  
310 154         606 foreach (qw(AWL AWLMEAN AWLCOUNT AWLPRESCORE
311             DCCB DCCR DCCREP PYZOR DKIMIDENTITY DKIMDOMAIN
312             BAYESTC BAYESTCLEARNED BAYESTCSPAMMY BAYESTCHAMMY
313             HAMMYTOKENS SPAMMYTOKENS TOKENSUMMARY)) {
314 2618         6468 $tag_data_ref->{$_} = undef; # exist, but undefined
315             }
316              
317 154         529 bless ($self, $class);
318 154         606 $self;
319             }
320              
321             sub DESTROY {
322 151     151   3482825 my ($self) = shift;
323 151         365 local $@;
324 151         447 eval { $self->delete_fulltext_tmpfile() }; # Bug 5808
  151         764  
325             }
326              
327             ###########################################################################
328              
329             =item $status->check ()
330              
331             Runs the SpamAssassin rules against the message pointed to by the object.
332              
333             =cut
334              
335             sub check {
336 97     97 1 278 my ($self) = shift;
337 97         293 my $master_deadline = $self->{master_deadline};
338 97 50       320 if (!$master_deadline) {
339 0         0 $self->check_timed(@_);
340             } else {
341 97         1111 my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
342 97     97   1171 my $err = $t->run(sub { $self->check_timed(@_) });
  97         675  
343 96 50 33     1113 if (time > $master_deadline && !$self->{deadline_exceeded}) {
344 0         0 info("check: exceeded time limit in pms check");
345 0         0 $self->{deadline_exceeded} = 1;
346             }
347             }
348             }
349              
350             sub check_timed {
351 97     97 0 343 my ($self) = @_;
352 97         258 local ($_);
353              
354 97         332 $self->{learned_points} = 0;
355 97         298 $self->{body_only_points} = 0;
356 97         240 $self->{head_only_points} = 0;
357 97         194 $self->{score} = 0;
358              
359             # clear NetSet cache before every check to prevent it growing too large
360 97         289 foreach my $nset_name (qw(internal_networks trusted_networks msa_networks)) {
361 291         816 my $netset = $self->{conf}->{$nset_name};
362 291 50       1321 $netset->ditch_cache() if $netset;
363             }
364              
365 97         754 $self->{main}->call_plugins ("check_start", { permsgstatus => $self });
366              
367             # in order of slowness; fastest first, slowest last.
368             # we do ALL the tests, even if a spam triggers lots of them early on.
369             # this lets us see ludicrously spammish mails (score: 40) etc., which
370             # we can then immediately submit to spamblocking services.
371             #
372             # TODO: change this to do whitelist/blacklists first? probably a plan
373             # NOTE: definitely need AWL stuff last, for regression-to-mean of score
374              
375             # TVD: we may want to do more than just clearing out the headers, but ...
376 97         897 $self->{msg}->delete_header('X-Spam-.*');
377              
378             # Resident Mail::SpamAssassin code will possibly never change score
379             # sets, even if bayes becomes available. So we should do a quick check
380             # to see if we should go from {0,1} to {2,3}. We of course don't need
381             # to do this switch if we're already using bayes ... ;)
382 97         565 my $set = $self->{conf}->get_score_set();
383 97 0 66     1144 if (($set & 2) == 0 && $self->{main}->{bayes_scanner} && $self->{main}->{bayes_scanner}->is_scan_available() && $self->{conf}->{use_bayes_rules}) {
      66        
      33        
384 0         0 dbg("check: scoreset $set but bayes is available, switching scoresets");
385 0         0 $self->{conf}->set_score_set ($set|2);
386             }
387 97         579 dbg("check: using scoreset $set in M:S:Pms");
388             # The primary check functionality occurs via a plugin call. For more
389             # information, please see: Mail::SpamAssassin::Plugin::Check
390 97 100       489 if (!$self->{main}->call_plugins ("check_main", { permsgstatus => $self }))
391             {
392             # did anything happen? if not, this is fatal
393 1 50       12 if (!$self->{main}->have_plugin("check_main")) {
394 1         25 die "check: no loaded plugin implements 'check_main': cannot scan!\n".
395             "Check that the necessary '.pre' files are in the config directory.\n".
396             "At a minimum, v320.pre loads the Check plugin which is required.\n";
397             }
398             }
399              
400             # delete temporary storage and memory allocation used during checking
401 96         762 $self->delete_fulltext_tmpfile();
402              
403             # now that we've finished checking the mail, clear out this cache
404             # to avoid unforeseen side-effects.
405 96         1133 $self->{c} = { };
406              
407             # Round the score to 3 decimal places to avoid rounding issues
408             # We assume required_score to be properly rounded already.
409             # add 0 to force it back to numeric representation instead of string.
410 96         1690 $self->{score} = (sprintf "%0.3f", $self->{score}) + 0;
411              
412             dbg("check: is spam? score=".$self->{score}.
413 96         1030 " required=".$self->{conf}->{required_score});
414 96         587 dbg("check: tests=".$self->get_names_of_tests_hit());
415 96         589 dbg("check: subtests=".$self->get_names_of_subtests_hit("dbg"));
416 96         433 $self->{is_spam} = $self->is_spam();
417              
418 96         508 $self->{main}->{resolver}->bgabort();
419 96         519 $self->{main}->call_plugins ("check_end", { permsgstatus => $self });
420              
421 96         1720 1;
422             }
423              
424             ###########################################################################
425              
426             =item $status->learn()
427              
428             After a mail message has been checked, this method can be called. If the score
429             is outside a certain range around the threshold, ie. if the message is judged
430             more-or-less definitely spam or definitely non-spam, it will be fed into
431             SpamAssassin's learning systems (currently the naive Bayesian classifier),
432             so that future similar mails will be caught.
433              
434             =cut
435              
436             sub learn {
437 96     96 1 294 my ($self) = shift;
438 96         224 my $master_deadline = $self->{master_deadline};
439 96 50       260 if (!$master_deadline) {
440 0         0 $self->learn_timed(@_);
441             } else {
442 96         465 my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
443 96     96   953 my $err = $t->run(sub { $self->learn_timed(@_) });
  96         815  
444 96 50 33     978 if (time > $master_deadline && !$self->{deadline_exceeded}) {
445 0         0 info("learn: exceeded time limit in pms learn");
446 0         0 $self->{deadline_exceeded} = 1;
447             }
448             }
449             }
450              
451             sub learn_timed {
452 96     96 0 388 my ($self) = @_;
453              
454 96 100 100     848 if (!$self->{conf}->{bayes_auto_learn} ||
      100        
455             !$self->{conf}->{use_bayes} ||
456             $self->{disable_auto_learning})
457             {
458 84         274 $self->{auto_learn_status} = "disabled";
459 84         1083 return;
460             }
461              
462 12         29 my ($isspam, $force_autolearn, $force_autolearn_names, $arrayref);
463 12         81 $arrayref = $self->{main}->call_plugins ("autolearn_discriminator", {
464             permsgstatus => $self
465             });
466              
467 12         42 $isspam = $arrayref->[0];
468 12         24 $force_autolearn = $arrayref->[1];
469 12         24 $force_autolearn_names = $arrayref->[2];
470              
471             #AUTOLEARN_FORCE FLAG INFORMATION
472 12 50 66     70 if (defined $force_autolearn and $force_autolearn > 0) {
473 0         0 $self->{auto_learn_force_status} = "yes";
474 0 0       0 if (defined $force_autolearn_names) {
475 0         0 $self->{auto_learn_force_status} .= " ($force_autolearn_names)";
476             }
477             } else {
478 12         33 $self->{auto_learn_force_status} = "no";
479             }
480              
481 12 100       37 if (!defined $isspam) {
482 10         22 $self->{auto_learn_status} = 'no';
483 10         151 return;
484             }
485              
486              
487 2         11 my $timer = $self->{main}->time_method("learn");
488              
489 2         13 $self->{main}->call_plugins ("autolearn", {
490             permsgstatus => $self,
491             isspam => $isspam
492             });
493              
494             # bug 3704: temporarily override learn's ability to re-learn a message
495 2         19 my $orig_learner = $self->{main}->init_learner({ "no_relearn" => 1 });
496              
497 2         5 my $eval_stat;
498             eval {
499 2         21 my $learnstatus = $self->{main}->learn ($self->{msg}, undef, $isspam, 0);
500 2 50       9 if ($learnstatus->did_learn()) {
501 0 0       0 $self->{auto_learn_status} = $isspam ? "spam" : "ham";
502             }
503             # This must wait until the did_learn call.
504 2         6 $learnstatus->finish();
505 2         8 $self->{main}->finish_learner(); # for now
506              
507 2 50       7 if (exists $self->{main}->{bayes_scanner}) {
508 2         8 $self->{main}->{bayes_scanner}->force_close();
509             }
510 2         12 1;
511 2 50       6 } or do {
512 0 0       0 $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
513             };
514              
515             # reset learner options to their original values
516 2         11 $self->{main}->init_learner($orig_learner);
517              
518 2 50       42 if (defined $eval_stat) {
519 0         0 dbg("learn: auto-learning failed: $eval_stat");
520 0         0 $self->{auto_learn_status} = "failed";
521             }
522             }
523              
524             =item $score = $status->get_autolearn_points()
525              
526             Return the message's score as computed for auto-learning. Certain tests are
527             ignored:
528              
529             - rules with tflags set to 'learn' (the Bayesian rules)
530              
531             - rules with tflags set to 'userconf' (user white/black-listing rules, etc)
532              
533             - rules with tflags set to 'noautolearn'
534              
535             Also note that auto-learning occurs using scores from either scoreset 0 or 1,
536             depending on what scoreset is used during message check. It is likely that the
537             message check and auto-learn scores will be different.
538              
539             =cut
540              
541             sub get_autolearn_points {
542 12     12 1 41 my ($self) = @_;
543 12         62 $self->_get_autolearn_points();
544 12         62 return $self->{autolearn_points};
545             }
546              
547             =item $score = $status->get_head_only_points()
548              
549             Return the message's score as computed for auto-learning, ignoring
550             all rules except for header-based ones.
551              
552             =cut
553              
554             sub get_head_only_points {
555 12     12 1 40 my ($self) = @_;
556 12         31 $self->_get_autolearn_points();
557 12         36 return $self->{head_only_points};
558             }
559              
560             =item $score = $status->get_learned_points()
561              
562             Return the message's score as computed for auto-learning, ignoring
563             all rules except for learning-based ones.
564              
565             =cut
566              
567             sub get_learned_points {
568 12     12 1 32 my ($self) = @_;
569 12         26 $self->_get_autolearn_points();
570 12         38 return $self->{learned_points};
571             }
572              
573             =item $score = $status->get_body_only_points()
574              
575             Return the message's score as computed for auto-learning, ignoring
576             all rules except for body-based ones.
577              
578             =cut
579              
580             sub get_body_only_points {
581 12     12 1 36 my ($self) = @_;
582 12         45 $self->_get_autolearn_points();
583 12         40 return $self->{body_only_points};
584             }
585              
586             =item $score = $status->get_autolearn_force_status()
587              
588             Return whether a message's score included any rules that are flagged as
589             autolearn_force.
590              
591             =cut
592              
593             sub get_autolearn_force_status {
594 12     12 1 30 my ($self) = @_;
595 12         32 $self->_get_autolearn_points();
596 12         27 return $self->{autolearn_force};
597             }
598              
599             =item $rule_names = $status->get_autolearn_force_names()
600              
601             Return a list of comma separated list of rule names if a message's
602             score included any rules that are flagged as autolearn_force.
603              
604             =cut
605              
606             sub get_autolearn_force_names {
607 12     12 1 30 my ($self) = @_;
608 12         27 my ($names);
609              
610 12         39 $self->_get_autolearn_points();
611 12         30 $names = $self->{autolearn_force_names};
612              
613 12 50       51 if (defined $names) {
614             #remove trailing comma
615 0         0 $names =~ s/,$//;
616             } else {
617 12         38 $names = "";
618             }
619              
620 12         38 return $names;
621             }
622              
623             sub _get_autolearn_points {
624 72     72   125 my ($self) = @_;
625              
626 72 100       160 return if (exists $self->{autolearn_points});
627             # ensure it only gets computed once, even if we return early
628 12         41 $self->{autolearn_points} = 0;
629              
630             # This function needs to use use sum($score[scoreset % 2]) not just {score}.
631             # otherwise we shift what we autolearn on and it gets really weird. - tvd
632 12         51 my $orig_scoreset = $self->{conf}->get_score_set();
633 12         25 my $new_scoreset = $orig_scoreset;
634 12         25 my $scores = $self->{conf}->{scores};
635              
636 12 50       61 if (($orig_scoreset & 2) == 0) { # we don't need to recompute
637 12         75 dbg("learn: auto-learn: currently using scoreset $orig_scoreset");
638             }
639             else {
640 0         0 $new_scoreset = $orig_scoreset & ~2;
641 0         0 dbg("learn: auto-learn: currently using scoreset $orig_scoreset, recomputing score based on scoreset $new_scoreset");
642 0         0 $scores = $self->{conf}->{scoreset}->[$new_scoreset];
643             }
644              
645 12         41 my $tflags = $self->{conf}->{tflags};
646 12         34 my $points = 0;
647              
648             # Just in case this function is called multiple times, clear out the
649             # previous calculated values
650 12         37 $self->{learned_points} = 0;
651 12         22 $self->{body_only_points} = 0;
652 12         27 $self->{head_only_points} = 0;
653 12         26 $self->{autolearn_force} = 0;
654              
655 12         19 foreach my $test (@{$self->{test_names_hit}}) {
  12         59  
656             # According to the documentation, noautolearn, userconf, and learn
657             # rules are ignored for autolearning.
658 19 100       78 if (exists $tflags->{$test}) {
659 6 100       40 next if $tflags->{$test} =~ /\bnoautolearn\b/;
660 5 50       71 next if $tflags->{$test} =~ /\buserconf\b/;
661              
662             # Keep track of the learn points for an additional autolearn check.
663             # Use the original scoreset since it'll be 0 in sets 0 and 1.
664 0 0       0 if ($tflags->{$test} =~ /\blearn\b/) {
665             # we're guaranteed that the score will be defined
666 0         0 $self->{learned_points} += $self->{conf}->{scoreset}->[$orig_scoreset]->{$test};
667 0         0 next;
668             }
669              
670             #IF ANY RULES ARE AUTOLEARN FORCE, SET THAT FLAG
671 0 0       0 if ($tflags->{$test} =~ /\bautolearn_force\b/) {
672 0         0 $self->{autolearn_force}++;
673             #ADD RULE NAME TO LIST
674 0         0 $self->{autolearn_force_names}.="$test,";
675             }
676             }
677              
678             # ignore tests with 0 score (or undefined) in this scoreset
679 13 50       55 next if !$scores->{$test};
680              
681             # Go ahead and add points to the proper locations
682             # Changed logic because in testing, I was getting both head and body. Bug 5503
683 13 100       57 if ($self->{conf}->maybe_header_only ($test)) {
    50          
684 9         36 $self->{head_only_points} += $scores->{$test};
685 9         94 dbg("learn: auto-learn: adding head_only points $scores->{$test}");
686             } elsif ($self->{conf}->maybe_body_only ($test)) {
687 4         13 $self->{body_only_points} += $scores->{$test};
688 4         79 dbg("learn: auto-learn: adding body_only points $scores->{$test}");
689             } else {
690 0         0 dbg("learn: auto-learn: not considered head or body scores: $scores->{$test}");
691             }
692              
693 13         64 $points += $scores->{$test};
694             }
695              
696             # Figure out the final value we'll use for autolearning
697 12         144 $points = (sprintf "%0.3f", $points) + 0;
698 12         148 dbg("learn: auto-learn: message score: ".$self->{score}.", computed score for autolearn: $points");
699              
700 12         49 $self->{autolearn_points} = $points;
701             }
702              
703             ###########################################################################
704              
705             =item $isspam = $status->is_spam ()
706              
707             After a mail message has been checked, this method can be called. It will
708             return 1 for mail determined likely to be spam, 0 if it does not seem
709             spam-like.
710              
711             =cut
712              
713             sub is_spam {
714 99     99 1 254 my ($self) = @_;
715             # changed to test this so sub-tests can ask "is_spam" during a run
716 99         462 return ($self->{score} >= $self->{conf}->{required_score});
717             }
718              
719             ###########################################################################
720              
721             =item $list = $status->get_names_of_tests_hit ()
722              
723             After a mail message has been checked, this method can be called. It will
724             return a comma-separated string, listing all the symbolic test names
725             of the tests which were triggered by the mail.
726              
727             =cut
728              
729             sub get_names_of_tests_hit {
730 97     97 1 294 my ($self) = @_;
731              
732 97         203 return join(',', sort @{$self->{test_names_hit}});
  97         787  
733             }
734              
735             =item $list = $status->get_names_of_tests_hit_with_scores_hash ()
736              
737             After a mail message has been checked, this method can be called. It will
738             return a pointer to a hash for rule & score pairs for all the symbolic
739             test names and individual scores of the tests which were triggered by the mail.
740              
741             =cut
742              
743             sub get_names_of_tests_hit_with_scores_hash {
744 0     0 1 0 my ($self) = @_;
745              
746             #BASED ON CODE FOR TESTSSCORES TAG
747 0         0 my $scores = $self->{conf}->{scores};
748 0         0 my %testsscores;
749 0   0     0 $testsscores{$_} = $scores->{$_} || '0' for @{$self->{test_names_hit}};
  0         0  
750 0         0 return \%testsscores;
751             }
752              
753             =item $list = $status->get_names_of_tests_hit_with_scores ()
754              
755             After a mail message has been checked, this method can be called. It will
756             return a comma-separated string of rule=score pairs for all the symbolic
757             test names and individual scores of the tests which were triggered by the mail.
758              
759             =cut
760              
761             sub get_names_of_tests_hit_with_scores {
762 0     0 1 0 my ($self) = @_;
763              
764             #BASED ON CODE FOR TESTSSCORES TAG
765 0         0 my $scores = $self->{conf}->{scores};
766             return join(',', map($_ . '=' . ($scores->{$_} || '0'),
767 0   0     0 sort @{$self->{test_names_hit}})) || "none";
768             }
769              
770              
771             ###########################################################################
772              
773             =item $list = $status->get_names_of_subtests_hit ()
774              
775             After a mail message has been checked, this method can be called. It will
776             return a comma-separated string, listing all the symbolic test names of the
777             meta-rule sub-tests which were triggered by the mail. Sub-tests are the
778             normally-hidden rules, which score 0 and have names beginning with two
779             underscores, used in meta rules.
780              
781             If a parameter of collapsed or dbg is passed, the output will be a condensed
782             array of sub-tests with multiple hits reduced to one entry.
783              
784             If the parameter of dbg is passed, the output will be a condensed string of
785             sub-tests with multiple hits reduced to one entry with the number of hits
786             in parentheses. Some information is also added at the end regarding the
787             multiple hits.
788              
789             =cut
790              
791             sub get_names_of_subtests_hit {
792 96     96 1 319 my ($self, $mode) = @_;
793              
794 96 50 33     929 if (defined $mode && ($mode eq 'dbg' || $mode eq 'collapsed')) {
      33        
795             # This routine prints only one instance of a subrule hit with a count of how many times it hit if greater than 1
796 96         213 my $total_hits = scalar(@{$self->{subtest_names_hit}});
  96         308  
797 96 100       568 return '' if !$total_hits;
798              
799 42         85 my %subtest_names_hit;
800 42         89 $subtest_names_hit{$_}++ foreach @{$self->{subtest_names_hit}};
  42         316  
801              
802 42         291 my @subtests = sort keys %subtest_names_hit;
803 42         111 my $deduplicated_hits = scalar(@subtests);
804              
805 42         89 my @result;
806 42         138 foreach my $rule (@subtests) {
807 84 50       249 if ($subtest_names_hit{$rule} > 1) {
808 0         0 push @result, "$rule($subtest_names_hit{$rule})";
809             } else {
810 84         203 push @result, $rule;
811             }
812             }
813              
814 42 50       172 if ($mode eq 'dbg') {
815 42         406 return join(',', @result)." (Total Subtest Hits: $total_hits / Deduplicated Total Hits: $deduplicated_hits)";
816             } else {
817 0         0 return join(',', @result);
818             }
819             } else {
820             # Return the simpler string with duplicates and commas
821 0         0 return join(',', sort @{$self->{subtest_names_hit}});
  0         0  
822             }
823             }
824              
825             ###########################################################################
826              
827             =item $num = $status->get_score ()
828              
829             After a mail message has been checked, this method can be called. It will
830             return the message's score.
831              
832             =cut
833              
834             sub get_score {
835 0     0 1 0 my ($self) = @_;
836 0         0 return $self->{score};
837             }
838              
839             # left as backward compatibility
840             sub get_hits {
841 0     0 0 0 my ($self) = @_;
842 0         0 return $self->{score};
843             }
844              
845             ###########################################################################
846              
847             =item $num = $status->get_required_score ()
848              
849             After a mail message has been checked, this method can be called. It will
850             return the score required for a mail to be considered spam.
851              
852             =cut
853              
854             sub get_required_score {
855 0     0 1 0 my ($self) = @_;
856 0         0 return $self->{conf}->{required_score};
857             }
858              
859             # left as backward compatibility
860             sub get_required_hits {
861 0     0 0 0 my ($self) = @_;
862 0         0 return $self->{conf}->{required_score};
863             }
864              
865             ###########################################################################
866              
867             =item $num = $status->get_autolearn_status ()
868              
869             After a mail message has been checked, this method can be called. It will
870             return one of the following strings depending on whether the mail was
871             auto-learned or not: "ham", "no", "spam", "disabled", "failed", "unavailable".
872              
873             It also returns is flagged with auto_learn_force, it will also include the status
874             and the rules hit. For example: "autolearn_force=yes (AUTOLEARNTEST_BODY)"
875              
876             =cut
877              
878             sub get_autolearn_status {
879 34     34 1 98 my ($self) = @_;
880 34   50     150 my ($status) = $self->{auto_learn_status} || "unavailable";
881              
882 34 100       126 if (defined $self->{auto_learn_force_status}) {
883 2         9 $status .= " autolearn_force=".$self->{auto_learn_force_status};
884             }
885              
886 34         105 return $status;
887             }
888              
889             ###########################################################################
890              
891             =item $report = $status->get_report ()
892              
893             Deliver a "spam report" on the checked mail message. This contains details of
894             how many spam detection rules it triggered.
895              
896             The report is returned as a multi-line string, with the lines separated by
897             C<\n> characters.
898              
899             =cut
900              
901             sub get_report {
902 4     4 1 901 my ($self) = @_;
903              
904 4 50       21 if (!exists $self->{'report'}) {
905 4         8 my $report;
906              
907 4         20 my $timer = $self->{main}->time_method("get_report");
908 4         18 $report = $self->{conf}->{report_template};
909 4   50     19 $report ||= '(no report template found)';
910              
911 4         21 $report = $self->_replace_tags($report);
912              
913 4         298 $report =~ s/\n*$/\n\n/s;
914 4         19 $self->{report} = $report;
915             }
916              
917 4         19 return $self->{report};
918             }
919              
920             ###########################################################################
921              
922             =item $preview = $status->get_content_preview ()
923              
924             Give a "preview" of the content.
925              
926             This is returned as a multi-line string, with the lines separated by C<\n>
927             characters, containing a fully-decoded, safe, plain-text sample of the first
928             few lines of the message body.
929              
930             =cut
931              
932             sub get_content_preview {
933 4     4 1 12 my ($self) = @_;
934              
935 4         13 my $str = '';
936 4         7 my @ary = @{$self->get_decoded_stripped_body_text_array()};
  4         17  
937 4         13 shift @ary; # drop the subject line
938              
939 4         11 my $numlines = 3;
940 4   33     50 while (length ($str) < 200 && @ary && $numlines-- > 0) {
      66        
941 12         102 $str .= shift @ary;
942             }
943              
944             # in case the last line was huge, trim it back to around 200 chars
945 4         16 local $1;
946 4         13 $str =~ s/^(.{200}).+$/$1 [...]/gm;
947 4         23 chomp ($str); $str .= "\n";
  4         12  
948              
949             # now, some tidy-ups that make things look a bit prettier
950 4         31 $str =~ s/-----Original Message-----.*$//gm;
951 4         15 $str =~ s/This is a multi-part message in MIME format\.//gs;
952 4         32 $str =~ s/[-_*.]{10,}//gs;
953 4         69 $str =~ s/\s+/ /gs;
954              
955             # add "Content preview:" ourselves, so that the text aligns
956             # correctly with the template -- then trim it off. We don't
957             # have to get this *exactly* right, but it's nicer if we
958             # make a bit of an effort ;)
959 4         36 $str = Mail::SpamAssassin::Util::wrap($str, " ", "Content preview: ", 75, 1);
960 4         23 $str =~ s/^Content preview:\s+//gs;
961              
962 4         29 return $str;
963             }
964              
965             ###########################################################################
966              
967             =item $msg = $status->get_message()
968              
969             Return the object representing the message being scanned.
970              
971             =cut
972              
973             sub get_message {
974 0     0 1 0 my ($self) = @_;
975 0         0 return $self->{msg};
976             }
977              
978             ###########################################################################
979              
980             =item $status->rewrite_mail ()
981              
982             Rewrite the mail message. This will at minimum add headers, and at
983             maximum MIME-encapsulate the message text, to reflect its spam or not-spam
984             status. The function will return a scalar of the rewritten message.
985              
986             The actual modifications depend on the configuration (see
987             C<Mail::SpamAssassin::Conf> for more information).
988              
989             The possible modifications are as follows:
990              
991             =over 4
992              
993             =item To:, From: and Subject: modification on spam mails
994              
995             Depending on the configuration, the To: and From: lines can have a
996             user-defined RFC 2822 comment appended for spam mail. The subject line
997             may have a user-defined string prepended to it for spam mail.
998              
999             =item X-Spam-* headers for all mails
1000              
1001             Depending on the configuration, zero or more headers with names
1002             beginning with C<X-Spam-> will be added to mail depending on whether
1003             it is spam or ham.
1004              
1005             =item spam message with report_safe
1006              
1007             If report_safe is set to true (1), then spam messages are encapsulated
1008             into their own message/rfc822 MIME attachment without any modifications
1009             being made.
1010              
1011             If report_safe is set to false (0), then the message will only have the
1012             above headers added/modified.
1013              
1014             =back
1015              
1016             =cut
1017              
1018             sub rewrite_mail {
1019 34     34 1 299 my ($self) = @_;
1020              
1021 34         163 my $timer = $self->{main}->time_method("rewrite_mail");
1022 34   100     213 my $msg = $self->{msg}->get_mbox_separator() || '';
1023              
1024 34 100 66     203 if ($self->{is_spam} && $self->{conf}->{report_safe}) {
1025 2         19 $msg .= $self->rewrite_report_safe();
1026             }
1027             else {
1028 32         172 $msg .= $self->rewrite_no_report_safe();
1029             }
1030              
1031 34         211 return $msg;
1032             }
1033              
1034             # Make the line endings in the passed string reference appropriate
1035             # for the original mail. Callers must note bug 5250: don't rewrite
1036             # the message body, since that will corrupt 8bit attachments/MIME parts.
1037             #
1038             sub _fixup_report_line_endings {
1039 36     36   131 my ($self, $strref) = @_;
1040 36 50       187 if ($self->{msg}->{line_ending} ne "\n") {
1041 0         0 $$strref =~ s/\r?\n/$self->{msg}->{line_ending}/gs;
1042             }
1043             }
1044              
1045             sub _get_added_headers {
1046 34     34   143 my ($self, $which) = @_;
1047 34         113 my $str = '';
1048             # use string appends to put this back together -- I finally benchmarked it.
1049             # join() is 56% of the speed of just using string appends. ;)
1050 34         77 foreach my $hf_ref (@{$self->{conf}->{$which}}) {
  34         166  
1051 168         895 my($hfname, $hfbody) = @$hf_ref;
1052 168         471 my $line = $self->_process_header($hfname,$hfbody);
1053 168         609 $line = $self->qp_encode_header($line);
1054 168         1016 $str .= "X-Spam-$hfname: $line\n";
1055             }
1056 34         237 return $str;
1057             };
1058              
1059             # rewrite the message in report_safe mode
1060             # should not be called directly, use rewrite_mail instead
1061             #
1062             sub rewrite_report_safe {
1063 2     2 0 7 my ($self) = @_;
1064              
1065 2         4 my $tag;
1066              
1067             # This is the original message. We do not want to make any modifications so
1068             # we may recover it if necessary. It will be put into the new message as a
1069             # message/rfc822 MIME part.
1070 2         12 my $original = $self->{msg}->get_pristine();
1071              
1072             # This is the new message.
1073 2         12 my $newmsg = '';
1074              
1075             # the report charset
1076 2         6 my $report_charset = "; charset=iso-8859-1";
1077 2 50       11 if ($self->{conf}->{report_charset}) {
1078 0         0 $report_charset = "; charset=" . $self->{conf}->{report_charset};
1079             }
1080              
1081             # the SpamAssassin report
1082 2         15 my $report = $self->get_report();
1083              
1084             # If there are any wide characters, need to MIME-encode in UTF-8
1085             # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
1086             # we could try converting to that charset if possible
1087 2 50 33     43 unless ($] < 5.008 || utf8::downgrade($report, 1)) {
1088 0         0 $report_charset = "; charset=utf-8";
1089 0         0 utf8::encode($report);
1090             }
1091              
1092             # get original headers, "pristine" if we can do it
1093 2         16 my $from = $self->{msg}->get_pristine_header("From");
1094 2         16 my $to = $self->{msg}->get_pristine_header("To");
1095 2         12 my $cc = $self->{msg}->get_pristine_header("Cc");
1096 2         10 my $subject = $self->{msg}->get_pristine_header("Subject");
1097 2         14 my $msgid = $self->{msg}->get_pristine_header('Message-Id');
1098 2         12 my $date = $self->{msg}->get_pristine_header("Date");
1099              
1100             # It'd be nice to do this with a foreach loop, but with only three
1101             # possibilities right now, it's easier not to...
1102              
1103 2 50       16 if (defined $self->{conf}->{rewrite_header}->{Subject}) {
1104             # Add a prefix to the subject if needed
1105 0 0 0     0 if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
1106 0         0 $tag = $self->_replace_tags($self->{subjprefix});
1107 0         0 $tag =~ s/\n/ /gs;
1108 0         0 $subject = $tag . $subject;
1109             }
1110             # Add a **SPAM** prefix
1111 0 0       0 $subject = "\n" if !defined $subject;
1112 0         0 $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{Subject});
1113 0         0 $tag =~ s/\n/ /gs; # strip tag's newlines
1114 0         0 $subject =~ s/^(?:\Q${tag}\E )?/${tag} /g; # For some reason the tag may already be there!?
1115             }
1116              
1117 2 50       9 if (defined $self->{conf}->{rewrite_header}->{To}) {
1118 0 0       0 $to = "\n" if !defined $to;
1119 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{To});
1120 0         0 $tag =~ s/\n/ /gs; # strip tag's newlines
1121 0         0 $to =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1122             }
1123              
1124 2 50       11 if (defined $self->{conf}->{rewrite_header}->{From}) {
1125 0 0       0 $from = "\n" if !defined $from;
1126 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{From});
1127 0         0 $tag =~ s/\n+//gs; # strip tag's newlines
1128 0         0 $from =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1129             }
1130              
1131             # add report headers to message
1132 2 50       16 $newmsg .= "From: $from" if defined $from;
1133 2 50       14 $newmsg .= "To: $to" if defined $to;
1134 2 50       9 $newmsg .= "Cc: $cc" if defined $cc;
1135 2 50       29 $newmsg .= "Subject: $subject" if defined $subject;
1136 2 50       14 $newmsg .= "Date: $date" if defined $date;
1137 2 50       11 $newmsg .= "Message-Id: $msgid" if defined $msgid;
1138 2         13 $newmsg .= $self->_get_added_headers('headers_spam');
1139              
1140 2 50       15 if (defined $self->{conf}->{report_safe_copy_headers}) {
1141 2         10 my %already_added = map { $_ => 1 } qw/from to cc subject date message-id/;
  12         38  
1142              
1143 2         18 foreach my $hdr (@{$self->{conf}->{report_safe_copy_headers}}) {
  2         21  
1144 0 0       0 next if exists $already_added{lc $hdr};
1145 0         0 my @hdrtext = $self->{msg}->get_pristine_header($hdr);
1146 0         0 $already_added{lc $hdr}++;
1147              
1148 0 0       0 if (lc $hdr eq "received") { # add Received at the top ...
1149 0         0 my $rhdr = "";
1150 0         0 foreach (@hdrtext) {
1151 0         0 $rhdr .= "$hdr: $_";
1152             }
1153 0         0 $newmsg = "$rhdr$newmsg";
1154             }
1155             else {
1156 0         0 foreach (@hdrtext) {
1157 0         0 $newmsg .= "$hdr: $_";
1158             }
1159             }
1160             }
1161             }
1162              
1163             # jm: add a SpamAssassin Received header to note markup time etc.
1164             # emulates the fetchmail style.
1165             # tvd: do this after report_safe_copy_headers so Received will be done correctly
1166 2         13 $newmsg = "Received: from localhost by " .
1167             Mail::SpamAssassin::Util::fq_hostname() . "\n" .
1168             "\twith SpamAssassin (version " .
1169             Mail::SpamAssassin::Version() . ");\n" .
1170             "\t" . Mail::SpamAssassin::Util::time_to_rfc822_date() . "\n" .
1171             $newmsg;
1172              
1173             # MIME boundary
1174 2         33 my $boundary = "----------=_" . sprintf("%08X.%08X",time,int(rand(2 ** 32)));
1175              
1176             # ensure it's unique, so we can't be attacked this way
1177 2         78 while ($original =~ /^\Q${boundary}\E(?:--)?$/m) {
1178 0         0 $boundary .= "/".sprintf("%08X",int(rand(2 ** 32)));
1179             }
1180              
1181             # determine whether Content-Disposition should be "attachment" or "inline"
1182 2         8 my $disposition;
1183 2         15 my $ct = $self->{msg}->get_header("Content-Type");
1184 2 50 33     47 if (defined $ct && $ct ne '' && $ct !~ m{text/plain}i) {
      33        
1185 2         7 $disposition = "attachment";
1186 2         12 $report .= $self->_replace_tags($self->{conf}->{unsafe_report_template});
1187             # if we wanted to defang the attachment, this would be the place
1188             }
1189             else {
1190 0         0 $disposition = "inline";
1191             }
1192              
1193 2         7 my $type = "message/rfc822";
1194 2 50       21 $type = "text/plain" if $self->{conf}->{report_safe} > 1;
1195              
1196 2         10 my $description = $self->{conf}->{'encapsulated_content_description'};
1197              
1198             # Note: the message should end in blank line since mbox format wants
1199             # blank line at end and messages may be concatenated! In addition, the
1200             # x-spam-type parameter is fixed since we will use it later to recognize
1201             # original messages that can be extracted.
1202 2         44 $newmsg .= <<"EOM";
1203             MIME-Version: 1.0
1204             Content-Type: multipart/mixed; boundary="$boundary"
1205              
1206             This is a multi-part message in MIME format.
1207              
1208             --$boundary
1209             Content-Type: text/plain$report_charset
1210             Content-Disposition: inline
1211             Content-Transfer-Encoding: 8bit
1212              
1213             $report
1214              
1215             --$boundary
1216             Content-Type: $type; x-spam-type=original
1217             Content-Description: $description
1218             Content-Disposition: $disposition
1219             Content-Transfer-Encoding: 8bit
1220              
1221             EOM
1222              
1223 2         11 my $newmsgtrailer = "\n--$boundary--\n\n";
1224              
1225             # now fix line endings in both headers, report_safe body parts,
1226             # and new MIME boundaries and structure
1227 2         15 $self->_fixup_report_line_endings(\$newmsg);
1228 2         7 $self->_fixup_report_line_endings(\$newmsgtrailer);
1229 2         37 $newmsg .= $original.$newmsgtrailer;
1230              
1231 2         37 return $newmsg;
1232             }
1233              
1234             # rewrite the message in non-report_safe mode (just headers)
1235             # should not be called directly, use rewrite_mail instead
1236             #
1237             sub rewrite_no_report_safe {
1238 32     32 0 88 my ($self) = @_;
1239              
1240 32         84 my $ntag;
1241 32         64 my $pref_subject = 0;
1242              
1243             # put the pristine headers into an array
1244             # skip the X-Spam- headers, but allow the X-Spam-Prev headers to remain.
1245             # since there may be a missing header/body
1246             #
1247 32         157 my @pristine_headers = split(/^/m, $self->{msg}->get_pristine_header());
1248 32         198 for (my $line = 0; $line <= $#pristine_headers; $line++) {
1249 120 50       596 next unless ($pristine_headers[$line] =~ /^X-Spam-(?!Prev-)/i);
1250 0         0 splice @pristine_headers, $line, 1 while ($pristine_headers[$line] =~ /^(?:X-Spam-(?!Prev-)|[ \t])/i);
1251 0         0 $line--;
1252             }
1253 32         96 my $separator = '';
1254 32 50 33     433 if (@pristine_headers && $pristine_headers[$#pristine_headers] =~ /^\s*$/) {
1255 32         137 $separator = pop @pristine_headers;
1256             }
1257              
1258 32         107 my $addition = 'headers_ham';
1259              
1260 32 50       116 if($self->{is_spam})
1261             {
1262             # special-case: Subject lines. ensure one exists, if we're
1263             # supposed to mark it up.
1264 0         0 my $created_subject = 0;
1265 0         0 my $subject = $self->{msg}->get_pristine_header('Subject');
1266 0 0 0     0 if (!defined($subject) && $self->{is_spam}
      0        
1267             && exists $self->{conf}->{rewrite_header}->{'Subject'})
1268             {
1269 0         0 push(@pristine_headers, "Subject: \n");
1270 0         0 $created_subject = 1;
1271             }
1272              
1273             # Deal with header rewriting
1274 0         0 foreach (@pristine_headers) {
1275             # if we're not going to do a rewrite, skip this header!
1276 0 0       0 next if (!/^(From|Subject|To):/i);
1277 0         0 my $hdr = ucfirst(lc($1));
1278 0 0       0 next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1279              
1280             # pop the original version onto the end of the header array
1281 0 0       0 if ($created_subject) {
1282 0         0 push(@pristine_headers, "X-Spam-Prev-Subject: (nonexistent)\n");
1283             } else {
1284 0         0 push(@pristine_headers, "X-Spam-Prev-$_");
1285             }
1286              
1287             # Figure out the rewrite piece
1288 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{$hdr});
1289 0         0 $tag =~ s/\n/ /gs;
1290              
1291             # The tag should be a comment for this header ...
1292 0 0       0 $tag = "($tag)" if ($hdr =~ /^(?:From|To)$/);
1293              
1294 0 0 0     0 if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1295 0 0       0 if($self->{subjprefix} ne "") {
1296 0         0 $ntag = $self->_replace_tags($self->{subjprefix});
1297 0         0 $ntag =~ s/\n/ /gs;
1298 0         0 $ntag =~ s/\s+$//;
1299              
1300 0         0 local $1;
1301 0 0       0 if(defined $ntag) {
1302 0         0 s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1303             }
1304             }
1305             }
1306 0         0 s/^([^:]+:)[ \t]*(?:\Q${tag}\E )?/$1 ${tag} /i;
1307             }
1308              
1309 0         0 $addition = 'headers_spam';
1310             } else {
1311             # special-case: Subject lines. ensure one exists, if we're
1312             # supposed to mark it up.
1313 32         103 my $created_subject = 0;
1314 32         123 my $subject = $self->{msg}->get_pristine_header('Subject');
1315 32 50 33     280 if (!defined($subject)
1316             && exists $self->{conf}->{rewrite_header}->{'Subject'})
1317             {
1318 0         0 push(@pristine_headers, "Subject: \n");
1319 0         0 $created_subject = 1;
1320             }
1321              
1322             # Deal with header rewriting
1323 32         106 foreach (@pristine_headers) {
1324             # if we're not going to do a rewrite, skip this header!
1325 88 50       286 next if (!/^(Subject):/i);
1326 0         0 my $hdr = ucfirst(lc($1));
1327 0 0       0 next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1328              
1329 0 0 0     0 if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1330 0 0       0 if($self->{subjprefix} ne "") {
1331 0         0 $ntag = $self->_replace_tags($self->{subjprefix});
1332 0         0 $ntag =~ s/\n/ /gs;
1333 0         0 $ntag =~ s/\s+$//;
1334              
1335 0         0 local $1;
1336 0 0       0 if(defined $ntag) {
1337 0         0 s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1338             }
1339             }
1340             }
1341             }
1342              
1343             }
1344              
1345             # Break the pristine header set into two blocks; $new_hdrs_pre is the stuff
1346             # that we want to ensure comes before any SpamAssassin markup headers,
1347             # like the Return-Path header (see bug 3409).
1348             #
1349             # all the rest of the message headers (as left in @pristine_headers), is
1350             # to be placed after the SpamAssassin markup hdrs. Once one of those headers
1351             # is seen, all further headers go into that set; it's assumed that it's an
1352             # old copy of the header, or attempted spoofing, if it crops up halfway
1353             # through the headers.
1354              
1355 32         120 my $new_hdrs_pre = '';
1356 32 50 33     288 if (@pristine_headers && $pristine_headers[0] =~ /^Return-Path:/i) {
1357 0         0 $new_hdrs_pre .= shift(@pristine_headers);
1358 0   0     0 while (@pristine_headers && $pristine_headers[0] =~ /^[ \t]/) {
1359 0         0 $new_hdrs_pre .= shift(@pristine_headers);
1360             }
1361             }
1362 32         184 $new_hdrs_pre .= $self->_get_added_headers($addition);
1363              
1364             # fix up line endings appropriately
1365 32         240 my $newmsg = $new_hdrs_pre . join('',@pristine_headers) . $separator;
1366 32         163 $self->_fixup_report_line_endings(\$newmsg);
1367              
1368 32         166 return $newmsg.$self->{msg}->get_pristine_body();
1369             }
1370              
1371             sub qp_encode_header {
1372 168     168 0 541 my ($self, $text) = @_;
1373              
1374             # return unchanged if there are no 8-bit characters
1375 168 50       936 return $text if $text !~ tr/\x00-\x7F//c;
1376              
1377 0         0 my $cs = 'ISO-8859-1';
1378 0 0       0 if ($self->{report_charset}) {
1379 0         0 $cs = $self->{report_charset};
1380             }
1381              
1382 0         0 my @hexchars = split('', '0123456789abcdef');
1383 0         0 my $ord;
1384 0         0 local $1;
1385 0         0 $text =~ s{([\x80-\xff])}{
1386 0         0 $ord = ord $1;
1387 0         0 '='.$hexchars[($ord & 0xf0) >> 4].$hexchars[$ord & 0x0f]
1388             }ges;
1389              
1390 0         0 $text = '=?'.$cs.'?Q?'.$text.'?=';
1391              
1392 0         0 dbg("markup: encoding header in $cs: $text");
1393 0         0 return $text;
1394             }
1395              
1396             sub _process_header {
1397 168     168   562 my ($self, $hdr_name, $hdr_data) = @_;
1398              
1399 168         470 $hdr_data = $self->_replace_tags($hdr_data);
1400 168         488 $hdr_data =~ s/(?:\r?\n)+$//; # make sure there are no trailing newlines ...
1401              
1402 168 50       492 if ($self->{conf}->{fold_headers}) {
1403 168 50       428 if ($hdr_data =~ /\n/) {
1404 0         0 $hdr_data =~ s/\s*\n\s*/\n\t/g;
1405 0         0 return $hdr_data;
1406             }
1407             else {
1408             # use '!!' instead of ': ' so it doesn't wrap on the space
1409 168         629 my $hdr = "X-Spam-$hdr_name!!$hdr_data";
1410 168         592 $hdr = Mail::SpamAssassin::Util::wrap($hdr, "\t", "", 79, 0, '(?<=[\s,])');
1411             # make sure there are no blank lines in headers
1412             # buggy wrap might not prefix blank lines with \t, so use \s* (bug 7672)
1413 168         518 $hdr =~ s/^\s*\n//gm;
1414 168         1157 return (split (/!!/, $hdr, 2))[1]; # just return the data part
1415             }
1416             }
1417             else {
1418 0         0 $hdr_data =~ s/\n/ /g; # Can't have newlines in headers, unless folded
1419 0         0 return $hdr_data;
1420             }
1421             }
1422              
1423             sub _replace_tags {
1424 174     174   281 my $self = shift;
1425 174         392 my $text = shift;
1426              
1427             # default to leaving the original string in place, if we cannot find
1428             # a tag for it (bug 4793)
1429 174         651 local($1,$2,$3);
1430 174         1376 $text =~ s{(_(\w+?)(?:\((.*?)\))?_)}{
1431 430         1174 my $full = $1;
1432 430         1027 my $tag = $2;
1433 430         666 my $result;
1434 430 50       860 if ($tag =~ /^ADDEDHEADER(?:HAM|SPAM|)\z/) {
1435             # Bug 6278: break infinite recursion through _get_added_headers and
1436             # _get_tag on an attempt to use such tag in add_header template
1437             } else {
1438 430         908 $result = $self->get_tag_raw($tag,$3);
1439 430 50       1195 $result = join(' ',@$result) if ref $result eq 'ARRAY';
1440             }
1441 430 50       2674 defined $result ? $result : $full;
1442             }ge;
1443              
1444 174         874 return $text;
1445             }
1446              
1447             ###########################################################################
1448              
1449             # public API for plugins
1450              
1451             =item $status->action_depends_on_tags($tags, $code, @args)
1452              
1453             Enqueue the supplied subroutine reference C<$code>, to become runnable when
1454             all the specified tags become available. The C<$tags> may be a simple
1455             scalar - a tag name, or a listref of tag names. The subroutine C<&$code>
1456             when called will be passed a C<permessagestatus> object as its first argument,
1457             followed by the supplied (optional) list C<@args> .
1458              
1459             =cut
1460              
1461             sub action_depends_on_tags {
1462 81     81 1 309 my($self, $tags, $code, @args) = @_;
1463              
1464 81 50       328 ref $code eq 'CODE'
1465             or die "action_depends_on_tags: argument must be a subroutine ref";
1466              
1467             # tag names on which the given action depends
1468 81 50       384 my @dep_tags = !ref $tags ? uc $tags : map(uc($_),@$tags);
1469              
1470             # @{$self->{tagrun_subs}} list of all submitted subroutines
1471             # @{$self->{tagrun_actions}{$tag}} bitmask of action indices blocked by tag
1472             # $self->{tagrun_tagscnt}[$action_ind] count of tags still pending
1473              
1474             # store action details, obtain its index
1475 81         199 push(@{$self->{tagrun_subs}}, [$code,@args]);
  81         377  
1476 81         183 my $action_ind = $#{$self->{tagrun_subs}};
  81         237  
1477              
1478             # list dependency tag names which are not already satisfied
1479 81         134 my @blocking_tags;
1480 81         229 foreach (@dep_tags) {
1481 81         338 my $data = $self->get_tag($_);
1482 81 50 33     537 if (!defined $data || $data eq '') {
1483 81         336 push @blocking_tags, $_;
1484             }
1485             }
1486              
1487 81         321 $self->{tagrun_tagscnt}[$action_ind] = scalar @blocking_tags;
1488 81         427 $self->{tagrun_actions}{$_}[$action_ind] = 1 for @blocking_tags;
1489              
1490 81 50       267 if (@blocking_tags) {
1491 81         382 dbg("check: tagrun - action %s blocking on tags %s",
1492             $action_ind, join(', ',@blocking_tags));
1493             } else {
1494 0         0 dbg("check: tagrun - tag %s was ready, action %s runnable immediately: %s",
1495             join(', ',@dep_tags), $action_ind, join(', ',$code,@args));
1496 0         0 &$code($self, @args);
1497             }
1498             }
1499              
1500             # tag_is_ready() will be called by set_tag(), indicating that a given
1501             # tag just received its value, possibly unblocking an action routine
1502             # as declared by action_depends_on_tags().
1503             #
1504             # Well-behaving plugins should call set_tag() once when a tag is fully
1505             # assembled and ready. Multiple calls to set the same tag value are handled
1506             # gracefully, but may result in premature activation of a pending action.
1507             # Setting tag values by plugins should not be done directly but only through
1508             # the public API set_tag(), otherwise a pending action release may be missed.
1509             #
1510             sub tag_is_ready {
1511 897     897 0 1587 my($self, $tag) = @_;
1512 897         1405 $tag = uc $tag;
1513              
1514 897 50       1882 if (would_log('dbg', 'check')) {
1515 0         0 my $tag_val = $self->{tag_data}{$tag};
1516 0 0       0 dbg("check: tagrun - tag %s is now ready, value: %s",
    0          
1517             $tag, !defined $tag_val ? '<UNDEF>'
1518             : ref $tag_val ne 'ARRAY' ? $tag_val
1519             : 'ARY:[' . join(',',@$tag_val) . ']' );
1520             }
1521 897 50       3003 if (ref $self->{tagrun_actions}{$tag}) { # any action blocking on this tag?
1522 0         0 my $action_ind = 0;
1523 0         0 foreach my $action_pending (@{$self->{tagrun_actions}{$tag}}) {
  0         0  
1524 0 0       0 if ($action_pending) {
1525 0         0 $self->{tagrun_actions}{$tag}[$action_ind] = 0;
1526 0 0       0 if ($self->{tagrun_tagscnt}[$action_ind] <= 0) {
    0          
1527             # should not happen, warn and ignore
1528             warn "tagrun error: count for $action_ind is ".
1529 0         0 $self->{tagrun_tagscnt}[$action_ind]."\n";
1530             } elsif (! --($self->{tagrun_tagscnt}[$action_ind])) {
1531 0         0 my($code,@args) = @{$self->{tagrun_subs}[$action_ind]};
  0         0  
1532 0         0 dbg("check: tagrun - tag %s unblocking the action %s: %s",
1533             $tag, $action_ind, join(', ',$code,@args));
1534 0         0 &$code($self, @args);
1535             }
1536             }
1537 0         0 $action_ind++;
1538             }
1539             }
1540             }
1541              
1542             # debugging aid: show actions that are still pending, waiting for their
1543             # tags to receive a value
1544             #
1545             sub report_unsatisfied_actions {
1546 136     136 0 339 my($self) = @_;
1547 136         251 my @tags;
1548 136 100       505 @tags = keys %{$self->{tagrun_actions}} if ref $self->{tagrun_actions};
  98         415  
1549 136         619 for my $tag (@tags) {
1550             my @pending_actions = grep($self->{tagrun_actions}{$tag}[$_],
1551 79         225 (0 .. $#{$self->{tagrun_actions}{$tag}}));
  79         425  
1552 79 50       558 dbg("check: tagrun - tag %s is still blocking action %s",
1553             $tag, join(', ', @pending_actions)) if @pending_actions;
1554             }
1555             }
1556              
1557             =item $status->set_tag($tagname, $value)
1558              
1559             Set a template tag, as used in C<add_header>, report templates, etc.
1560             This API is intended for use by plugins. Tag names will be converted
1561             to an all-uppercase representation internally.
1562              
1563             C<$value> can be a simple scalar (string or number), or a reference to an
1564             array, in which case the public method get_tag will join array elements
1565             using a space as a separator, returning a single string for backward
1566             compatibility.
1567              
1568             C<$value> can also be a subroutine reference, which will be evaluated
1569             each time the template is expanded. The first argument passed by get_tag
1570             to a called subroutine will be a PerMsgStatus object (this module's object),
1571             followed by optional arguments provided a caller to get_tag.
1572              
1573             Note that perl supports closures, which means that variables set in the
1574             caller's scope can be accessed inside this C<sub>. For example:
1575              
1576             my $text = "hello world!";
1577             $status->set_tag("FOO", sub {
1578             my $pms = shift;
1579             return $text;
1580             });
1581              
1582             See C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details
1583             on how template tags are used.
1584              
1585             C<undef> will be returned if a tag by that name has not been defined.
1586              
1587             =cut
1588              
1589             sub set_tag {
1590 897     897 1 2267 my($self,$tag,$val) = @_;
1591 897         2928 $self->{tag_data}->{uc $tag} = $val;
1592 897         2217 $self->tag_is_ready($tag);
1593             }
1594              
1595             # public API for plugins
1596              
1597             =item $string = $status->get_tag($tagname)
1598              
1599             Get the current value of a template tag, as used in C<add_header>, report
1600             templates, etc. This API is intended for use by plugins. Tag names will be
1601             converted to an all-uppercase representation internally. See
1602             C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details on
1603             tags.
1604              
1605             C<undef> will be returned if a tag by that name has not been defined.
1606              
1607             =cut
1608              
1609             sub get_tag {
1610 95     95 1 334 my($self, $tag, @args) = @_;
1611              
1612 95 50       323 return if !defined $tag;
1613 95         239 $tag = uc $tag;
1614 95         225 my $data;
1615 95 100       544 if (exists $common_tags{$tag}) {
    50          
1616             # tag data from traditional pre-defined tag subroutines
1617 4         27 $data = $common_tags{$tag};
1618 4 50       34 $data = $data->($self,@args) if ref $data eq 'CODE';
1619 4 50       27 $data = join(' ',@$data) if ref $data eq 'ARRAY';
1620 4 50       20 $data = "" if !defined $data;
1621             } elsif (exists $self->{tag_data}->{$tag}) {
1622             # tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1623 91         267 $data = $self->{tag_data}->{$tag};
1624 91 50       284 $data = $data->($self,@args) if ref $data eq 'CODE';
1625 91 50       359 $data = join(' ',@$data) if ref $data eq 'ARRAY';
1626 91 50       372 $data = "" if !defined $data;
1627             }
1628 95         308 return $data;
1629             }
1630              
1631             =item $string = $status->get_tag_raw($tagname, @args)
1632              
1633             Similar to C<get_tag>, but keeps a tag name unchanged (does not uppercase it),
1634             and does not convert arrayref tag values into a single string.
1635              
1636             =cut
1637              
1638             sub get_tag_raw {
1639 430     430 1 1380 my($self, $tag, @args) = @_;
1640              
1641 430 50       1265 return if !defined $tag;
1642 430         609 my $data;
1643 430 100       1429 if (exists $common_tags{$tag}) {
    50          
1644             # tag data from traditional pre-defined tag subroutines
1645 362         860 $data = $common_tags{$tag};
1646 362 50       1683 $data = $data->($self,@args) if ref $data eq 'CODE';
1647 362 50       1043 $data = "" if !defined $data;
1648             } elsif (exists $self->{tag_data}->{$tag}) {
1649             # tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1650 68         212 $data = $self->{tag_data}->{$tag};
1651 68 50       1068 $data = $data->($self,@args) if ref $data eq 'CODE';
1652 68 50       229 $data = "" if !defined $data;
1653             }
1654 430         1277 return $data;
1655             }
1656              
1657             ###########################################################################
1658              
1659             # public API for plugins
1660              
1661             =item $status->set_spamd_result_item($subref)
1662              
1663             Set an entry for the spamd result log line. C<$subref> should be a code
1664             reference for a subroutine which will return a string in C<'name=VALUE'>
1665             format, similar to the other entries in the spamd result line:
1666              
1667             Jul 17 14:10:47 radish spamd[16670]: spamd: result: Y 22 - ALL_NATURAL,
1668             DATE_IN_FUTURE_03_06,DIET_1,DRUGS_ERECTILE,DRUGS_PAIN,
1669             TEST_FORGED_YAHOO_RCVD,TEST_INVALID_DATE,TEST_NOREALNAME,
1670             TEST_NORMAL_HTTP_TO_IP,UNDISC_RECIPS scantime=0.4,size=3138,user=jm,
1671             uid=1000,required_score=5.0,rhost=localhost,raddr=127.0.0.1,
1672             rport=33153,mid=<9PS291LhupY>,autolearn=spam
1673              
1674             C<name> and C<VALUE> must not contain C<=> or C<,> characters, as it
1675             is important that these log lines are easy to parse.
1676              
1677             The code reference will be called by spamd after the message has been scanned,
1678             and the C<PerMsgStatus::check()> method has returned.
1679              
1680             =cut
1681              
1682             sub set_spamd_result_item {
1683 81     81 1 267 my ($self, $ref) = @_;
1684 81         163 push @{$self->{spamd_result_log_items}}, $ref;
  81         342  
1685             }
1686              
1687             # called by spamd
1688             sub get_spamd_result_log_items {
1689 0     0 0 0 my ($self) = @_;
1690 0         0 my @ret;
1691 0         0 foreach my $ref (@{$self->{spamd_result_log_items}}) {
  0         0  
1692 0         0 push @ret, &$ref;
1693             }
1694 0         0 return @ret;
1695             }
1696              
1697             ###########################################################################
1698              
1699             sub _get_tag_value_for_yesno {
1700 36     36   126 my($self, $arg) = @_;
1701 36         76 my($arg_spam, $arg_ham);
1702 36 50       126 ($arg_spam, $arg_ham) = split(/,/, $arg, 2) if defined $arg;
1703 36 50       233 return $self->{is_spam} ? (defined $arg_spam ? $arg_spam : 'Yes')
    50          
    100          
1704             : (defined $arg_ham ? $arg_ham : 'No');
1705             }
1706              
1707             sub _get_tag_value_for_score {
1708 38     38   120 my ($self, $pad) = @_;
1709              
1710 38         305 my $score = sprintf("%2.1f", $self->{score});
1711 38         166 my $rscore = $self->_get_tag_value_for_required_score();
1712              
1713             #Change due to bug 6419 to use Util function for consistency with spamd
1714             #and PerMessageStatus
1715 38         274 $score = Mail::SpamAssassin::Util::get_tag_value_for_score($score, $rscore, $self->{is_spam});
1716              
1717             #$pad IS PROVIDED BY THE _SCORE(PAD)_ tag
1718 38 50 33     199 if (defined $pad && $pad =~ /^(0+| +)$/) {
1719 0         0 my $count = length($1) + 3 - length($score);
1720 0 0       0 $score = (substr($pad, 0, $count) . $score) if $count > 0;
1721             }
1722 38         123 return $score;
1723              
1724             }
1725              
1726             sub _get_tag_value_for_required_score {
1727 76     76   135 my $self = shift;
1728 76         457 return sprintf("%2.1f", $self->{conf}->{required_score});
1729             }
1730              
1731              
1732             ###########################################################################
1733              
1734             =item $status->finish ()
1735              
1736             Indicate that this C<$status> object is finished with, and can be destroyed.
1737              
1738             If you are using SpamAssassin in a persistent environment, or checking many
1739             mail messages from one C<Mail::SpamAssassin> factory, this method should be
1740             called to ensure Perl's garbage collection will clean up old status objects.
1741              
1742             =cut
1743              
1744             sub finish {
1745 136     136 1 25640 my ($self) = @_;
1746              
1747 136         846 $self->{main}->call_plugins ("per_msg_finish", {
1748             permsgstatus => $self
1749             });
1750              
1751 136         821 $self->report_unsatisfied_actions;
1752              
1753             # Delete out all of the members of $self. This will remove any direct
1754             # circular references and let the memory get reclaimed while also being more
1755             # efficient than a foreach() loop over the keys.
1756 136         311 %{$self} = ();
  136         4785  
1757             }
1758              
1759             sub finish_tests {
1760 0     0 0 0 my ($conf) = @_;
1761 0         0 foreach my $method (@TEMPORARY_METHODS) {
1762 0 0       0 if (defined &{$method}) {
  0         0  
1763 0         0 undef &{$method};
  0         0  
1764             }
1765             }
1766 0         0 @TEMPORARY_METHODS = (); # clear for next time
1767 0         0 %TEMPORARY_EVAL_GLUE_METHODS = ();
1768             }
1769              
1770              
1771             =item $name = $status->get_current_eval_rule_name()
1772              
1773             Return the name of the currently-running eval rule. C<undef> is
1774             returned if no eval rule is currently being run. Useful for plugins
1775             to determine the current rule name while inside an eval test function
1776             call.
1777              
1778             =cut
1779              
1780             sub get_current_eval_rule_name {
1781 0     0 1 0 my ($self) = @_;
1782 0         0 return $self->{current_rule_name};
1783             }
1784              
1785             ###########################################################################
1786              
1787             sub extract_message_metadata {
1788 96     96 0 260 my ($self) = @_;
1789              
1790 96         439 my $timer = $self->{main}->time_method("extract_message_metadata");
1791 96         637 $self->{msg}->extract_message_metadata($self);
1792              
1793 96         522 foreach my $item (qw(
1794             relays_trusted relays_trusted_str num_relays_trusted
1795             relays_untrusted relays_untrusted_str num_relays_untrusted
1796             relays_internal relays_internal_str num_relays_internal
1797             relays_external relays_external_str num_relays_external
1798             num_relays_unparseable last_trusted_relay_index
1799             last_internal_relay_index
1800             ))
1801             {
1802 1440         3368 $self->{$item} = $self->{msg}->{metadata}->{$item};
1803             }
1804              
1805             # TODO: International domain names (UTF-8) must be converted to
1806             # ASCII-compatible encoding (ACE) for the purpose of setting the
1807             # SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
1808             # (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
1809             #
1810 96         200 { local $1;
  96         349  
1811 96         691 my $addr = $self->get('EnvelopeFrom:addr', undef);
1812             # collect a FQDN, ignoring potential trailing WSP
1813 96 100 66     512 if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1814 1         14 $self->set_tag('SENDERDOMAIN', lc $1);
1815             }
1816             # TODO: the get ':addr' only returns the first address; this should be
1817             # augmented to be able to return all addresses in a header field, multiple
1818             # addresses in a From header field are allowed according to RFC 5322
1819 96         412 $addr = $self->get('From:addr', undef);
1820 96 100 66     1129 if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1821 57         667 $self->set_tag('AUTHORDOMAIN', lc $1);
1822             }
1823             }
1824              
1825 96         494 $self->set_tag('RELAYSTRUSTED', $self->{relays_trusted_str});
1826 96         313 $self->set_tag('RELAYSUNTRUSTED', $self->{relays_untrusted_str});
1827 96         316 $self->set_tag('RELAYSINTERNAL', $self->{relays_internal_str});
1828 96         309 $self->set_tag('RELAYSEXTERNAL', $self->{relays_external_str});
1829 96         468 $self->set_tag('LANGUAGES', $self->{msg}->get_metadata("X-Languages"));
1830              
1831             # This should happen before we get called, but just in case.
1832 96 100       408 if (!defined $self->{msg}->{metadata}->{html}) {
1833 95         403 $self->get_decoded_stripped_body_text_array();
1834             }
1835 96         351 $self->{html} = $self->{msg}->{metadata}->{html};
1836              
1837             # allow plugins to add more metadata, read the stuff that's there, etc.
1838 96         578 $self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
1839             }
1840              
1841             ###########################################################################
1842              
1843             =item $status->get_decoded_body_text_array ()
1844              
1845             Returns the message body, with B<base64> or B<quoted-printable> encodings
1846             decoded, and non-text parts or non-inline attachments stripped.
1847              
1848             This is the same result text as used in 'rawbody' rules.
1849              
1850             It is returned as an array of strings, with each string being a 2-4kB chunk
1851             of the body, split from boundaries if possible.
1852              
1853             =cut
1854              
1855             sub get_decoded_body_text_array {
1856 110     110 1 558 return $_[0]->{msg}->get_decoded_body_text_array();
1857             }
1858              
1859             =item $status->get_decoded_stripped_body_text_array ()
1860              
1861             Returns the message body, decoded (as described in
1862             get_decoded_body_text_array()), with HTML rendered, and with whitespace
1863             normalized.
1864              
1865             This is the same result text as used in 'body' rules.
1866              
1867             It will always render text/html.
1868              
1869             It is returned as an array of strings, with each string representing one
1870             'paragraph'. Paragraphs, in plain-text mails, are double-newline-separated
1871             blocks of multi-line text.
1872              
1873             =cut
1874              
1875             sub get_decoded_stripped_body_text_array {
1876 339     339 1 1387 return $_[0]->{msg}->get_rendered_body_text_array();
1877             }
1878              
1879             ###########################################################################
1880              
1881             =item $status->get (header_name [, default_value])
1882              
1883             Returns a message header, pseudo-header, real name or address.
1884             C<header_name> is the name of a mail header, such as 'Subject', 'To',
1885             etc. If C<default_value> is given, it will be used if the requested
1886             C<header_name> does not exist.
1887              
1888             Appending C<:raw> to the header name will inhibit decoding of quoted-printable
1889             or base-64 encoded strings.
1890              
1891             Appending a modifier C<:addr> to a header field name will cause everything
1892             except the first email address to be removed from the header field. It is
1893             mainly applicable to header fields 'From', 'Sender', 'To', 'Cc' along with
1894             their 'Resent-*' counterparts, and the 'Return-Path'. For example, all of
1895             the following will result in "example@foo":
1896              
1897             =over 4
1898              
1899             =item example@foo
1900              
1901             =item example@foo (Foo Blah)
1902              
1903             =item example@foo, example@bar
1904              
1905             =item display: example@foo (Foo Blah), example@bar ;
1906              
1907             =item Foo Blah <example@foo>
1908              
1909             =item "Foo Blah" <example@foo>
1910              
1911             =item "'Foo Blah'" <example@foo>
1912              
1913             =back
1914              
1915             Appending a modifier C<:name> to a header field name will cause everything
1916             except the first display name to be removed from the header field. It is
1917             mainly applicable to header fields containing a single mail address: 'From',
1918             'Sender', along with their 'Resent-From' and 'Resent-Sender' counterparts.
1919             For example, all of the following will result in "Foo Blah". One level of
1920             single quotes is stripped too, as it is often seen.
1921              
1922             =over 4
1923              
1924             =item example@foo (Foo Blah)
1925              
1926             =item example@foo (Foo Blah), example@bar
1927              
1928             =item display: example@foo (Foo Blah), example@bar ;
1929              
1930             =item Foo Blah <example@foo>
1931              
1932             =item "Foo Blah" <example@foo>
1933              
1934             =item "'Foo Blah'" <example@foo>
1935              
1936             =back
1937              
1938             There are several special pseudo-headers that can be specified:
1939              
1940             =over 4
1941              
1942             =item C<ALL> can be used to mean the text of all the message's headers.
1943             Each header is decoded and unfolded to single line, unless called with :raw.
1944              
1945             =item C<ALL-TRUSTED> can be used to mean the text of all the message's headers
1946             that could only have been added by trusted relays.
1947              
1948             =item C<ALL-INTERNAL> can be used to mean the text of all the message's headers
1949             that could only have been added by internal relays.
1950              
1951             =item C<ALL-UNTRUSTED> can be used to mean the text of all the message's
1952             headers that may have been added by untrusted relays. To make this
1953             pseudo-header more useful for header rules the 'Received' header that was added
1954             by the last trusted relay is included, even though it can be trusted.
1955              
1956             =item C<ALL-EXTERNAL> can be used to mean the text of all the message's headers
1957             that may have been added by external relays. Like C<ALL-UNTRUSTED> the
1958             'Received' header added by the last internal relay is included.
1959              
1960             =item C<ToCc> can be used to mean the contents of both the 'To' and 'Cc'
1961             headers.
1962              
1963             =item C<EnvelopeFrom> is the address used in the 'MAIL FROM:' phase of the SMTP
1964             transaction that delivered this message, if this data has been made available
1965             by the SMTP server.
1966              
1967             =item C<MESSAGEID> is a symbol meaning all Message-Id's found in the message;
1968             some mailing list software moves the real 'Message-Id' to 'Resent-Message-Id'
1969             or 'X-Message-Id', then uses its own one in the 'Message-Id' header. The value
1970             returned for this symbol is the text from all 3 headers, separated by newlines.
1971              
1972             =item C<X-Spam-Relays-Untrusted> is the generated metadata of untrusted relays
1973             the message has passed through
1974              
1975             =item C<X-Spam-Relays-Trusted> is the generated metadata of trusted relays
1976             the message has passed through
1977              
1978             =back
1979              
1980             =cut
1981              
1982             # only uses two arguments, ignores $defval
1983             sub _get {
1984 3494     3494   6762 my ($self, $request) = @_;
1985              
1986 3494         4490 my $result;
1987 3494         4509 my $getaddr = 0;
1988 3494         4236 my $getname = 0;
1989 3494         4281 my $getraw = 0;
1990              
1991             # special queries - process and strip modifiers
1992 3494 100       8504 if (index($request,':') >= 0) { # triage
1993 607         1500 local $1;
1994 607         4275 while ($request =~ s/:([^:]*)//) {
1995 607 100       2473 if ($1 eq 'raw') { $getraw = 1 }
  1 100       5  
    50          
1996 602         2155 elsif ($1 eq 'addr') { $getaddr = $getraw = 1 }
1997 4         18 elsif ($1 eq 'name') { $getname = 1 }
1998             }
1999             }
2000 3494         6120 my $request_lc = lc $request;
2001              
2002             # ALL: entire pristine or semi-raw headers
2003 3494 100       21319 if ($request eq 'ALL') {
    50          
    100          
    50          
    50          
    100          
    50          
    50          
    50          
    50          
    100          
    100          
2004             return ($getraw ? $self->{msg}->get_pristine_header()
2005 15 50       159 : $self->{msg}->get_all_headers(0));
2006             }
2007             # ALL-TRUSTED: entire trusted raw headers
2008             elsif ($request eq 'ALL-TRUSTED') {
2009             # '+1' since we added the received header even though it's not considered
2010             # trusted, so we know that those headers can be trusted too
2011             return $self->get_all_hdrs_in_rcvd_index_range(
2012 0         0 undef, $self->{last_trusted_relay_index}+1,
2013             undef, undef, $getraw);
2014             }
2015             # ALL-INTERNAL: entire internal raw headers
2016             elsif ($request eq 'ALL-INTERNAL') {
2017             # '+1' for the same reason as in ALL-TRUSTED above
2018             return $self->get_all_hdrs_in_rcvd_index_range(
2019 4         26 undef, $self->{last_internal_relay_index}+1,
2020             undef, undef, $getraw);
2021             }
2022             # ALL-UNTRUSTED: entire untrusted raw headers
2023             elsif ($request eq 'ALL-UNTRUSTED') {
2024             # '+1' for the same reason as in ALL-TRUSTED above
2025             return $self->get_all_hdrs_in_rcvd_index_range(
2026 0         0 $self->{last_trusted_relay_index}+1, undef,
2027             undef, undef, $getraw);
2028             }
2029             # ALL-EXTERNAL: entire external raw headers
2030             elsif ($request eq 'ALL-EXTERNAL') {
2031             # '+1' for the same reason as in ALL-TRUSTED above
2032             return $self->get_all_hdrs_in_rcvd_index_range(
2033 0         0 $self->{last_internal_relay_index}+1, undef,
2034             undef, undef, $getraw);
2035             }
2036             # EnvelopeFrom: the SMTP MAIL FROM: address
2037             elsif ($request_lc eq "\LEnvelopeFrom") {
2038 110         659 $result = $self->get_envelope_from();
2039             }
2040             # untrusted relays list, as string
2041             elsif ($request_lc eq "\LX-Spam-Relays-Untrusted") {
2042 0         0 $result = $self->{relays_untrusted_str};
2043             }
2044             # trusted relays list, as string
2045             elsif ($request_lc eq "\LX-Spam-Relays-Trusted") {
2046 0         0 $result = $self->{relays_trusted_str};
2047             }
2048             # external relays list, as string
2049             elsif ($request_lc eq "\LX-Spam-Relays-External") {
2050 0         0 $result = $self->{relays_external_str};
2051             }
2052             # internal relays list, as string
2053             elsif ($request_lc eq "\LX-Spam-Relays-Internal") {
2054 0         0 $result = $self->{relays_internal_str};
2055             }
2056             # ToCc: the combined recipients list
2057             elsif ($request_lc eq "\LToCc") {
2058 81         546 $result = join("\n", $self->{msg}->get_header('To', $getraw));
2059 81 100       373 if ($result ne '') {
2060 8         39 chomp $result;
2061 8 50       62 $result .= ", " if $result =~ /\S/;
2062             }
2063 81         338 $result .= join("\n", $self->{msg}->get_header('Cc', $getraw));
2064 81 100       459 $result = undef if $result eq '';
2065             }
2066             # MESSAGEID: handle lists which move the real message-id to another
2067             # header for resending.
2068             elsif ($request eq 'MESSAGEID') {
2069 42 50       547 $result = join("\n", grep { defined($_) && $_ ne '' }
2070             $self->{msg}->get_header('X-Message-Id', $getraw),
2071             $self->{msg}->get_header('Resent-Message-Id', $getraw),
2072             $self->{msg}->get_header('X-Original-Message-ID', $getraw),
2073 81         410 $self->{msg}->get_header('Message-Id', $getraw));
2074             }
2075             # a conventional header
2076             else {
2077             my @results = $getraw ? $self->{msg}->raw_header($request)
2078 3203 100       10249 : $self->{msg}->get_header($request);
2079             # dbg("message: get(%s)%s = %s",
2080             # $request, $getraw?'raw':'', join(", ",@results));
2081 3203 100       6154 if (@results) {
2082 357         1617 $result = join('', @results);
2083             } else { # metadata
2084 2846         7381 $result = $self->{msg}->get_metadata($request);
2085             }
2086             }
2087              
2088             # special queries
2089 3475 100 100     10864 if (defined $result && ($getaddr || $getname)) {
      100        
2090 142         476 local $1;
2091 142         443 $result =~ s/^[^:]+:(.*);\s*$/$1/gs; # 'undisclosed-recipients: ;'
2092 142         1042 $result =~ s/\s+/ /g; # reduce whitespace
2093 142         656 $result =~ s/^\s+//; # leading whitespace
2094 142         687 $result =~ s/\s+$//; # trailing whitespace
2095              
2096 142 100       438 if ($getaddr) {
    50          
2097             # Get the email address out of the header
2098             # All of these should result in "jm@foo":
2099             # jm@foo
2100             # jm@foo (Foo Blah)
2101             # jm@foo, jm@bar
2102             # display: jm@foo (Foo Blah), jm@bar ;
2103             # Foo Blah <jm@foo>
2104             # "Foo Blah" <jm@foo>
2105             # "'Foo Blah'" <jm@foo>
2106             #
2107             # strip out the (comments)
2108 138         335 $result =~ s/\s*\(.*?\)//g;
2109             # strip out the "quoted text", unless it's the only thing in the string
2110 138 100       539 if ($result !~ /^".*"$/) {
2111 137         376 $result =~ s/(?<!<)"[^"]*"(?!\@)//g; #" emacs
2112             }
2113             # Foo Blah <jm@xxx> or <jm@xxx>
2114 138         342 local $1;
2115 138         446 $result =~ s/^[^"<]*?<(.*?)>.*$/$1/;
2116             # multiple addresses on one line? remove all but first
2117 138         469 $result =~ s/,.*$//;
2118             }
2119             elsif ($getname) {
2120             # Get the display name out of the header
2121             # All of these should result in "Foo Blah":
2122             #
2123             # jm@foo (Foo Blah)
2124             # (Foo Blah) jm@foo
2125             # jm@foo (Foo Blah), jm@bar
2126             # display: jm@foo (Foo Blah), jm@bar ;
2127             # Foo Blah <jm@foo>
2128             # "Foo Blah" <jm@foo>
2129             # "'Foo Blah'" <jm@foo>
2130             #
2131 4         9 local $1;
2132             # does not handle mailbox-list or address-list or quotes well, to be improved
2133 4 100 66     53 if ($result =~ /^ \s* " (.*?) (?<!\\)" \s* < [^<>]* >/sx ||
    50          
2134             $result =~ /^ \s* (.*?) \s* < [^<>]* >/sx) {
2135 3         11 $result = $1; # display-name, RFC 5322
2136             # name-addr = [display-name] angle-addr
2137             # display-name = phrase
2138             # phrase = 1*word / obs-phrase
2139             # word = atom / quoted-string
2140             # obs-phrase = word *(word / "." / CFWS)
2141 3         27 $result =~ s{ " ( (?: [^"\\] | \\. )* ) " }
  0         0  
  0         0  
  0         0  
2142 3         17 { my $s=$1; $s=~s{\\(.)}{$1}gs; $s }gsxe;
2143             $result =~ s/\\"/"/gs;
2144             } elsif ($result =~ /^ [^(,]*? \( (.*?) \) /sx) { # legacy form
2145 0         0 # nested comments are not handled, to be improved
2146             $result = $1;
2147 1         4 } else { # no display name
2148             $result = '';
2149 4         19 }
2150             $result =~ s/^ \s* ' \s* (.*?) \s* ' \s* \z/$1/sx;
2151             }
2152 3475         9708 }
2153             return $result;
2154             }
2155              
2156             # optimized for speed
2157             # $_[0] is self
2158             # $_[1] is request
2159             # $_[2] is defval
2160 4048     4048 1 25475 sub get {
2161 4048         5129 my $cache = $_[0]->{c};
2162 4048 100       9121 my $found;
2163             if (exists $cache->{$_[1]}) {
2164             # return cache entry if it is known
2165 554         1403 # (measured hit/attempts rate on a production mailer is about 47%)
2166             $found = $cache->{$_[1]};
2167             } else {
2168 3494         6780 # fill in a cache entry
2169 3494         11537 $found = _get(@_);
2170             $cache->{$_[1]} = $found;
2171             }
2172             # if the requested header wasn't found, we should return a default value
2173             # as specified by the caller: if defval argument is present it represents
2174             # a default value even if undef; if defval argument is absent a default
2175 4048 100       26758 # value is an empty string for upwards compatibility
    100          
2176             return (defined $found ? $found : @_ > 2 ? $_[2] : '');
2177             }
2178              
2179             ###########################################################################
2180              
2181             # uri parsing from plain text:
2182             # The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
2183             # ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
2184             # URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
2185             # When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
2186             # of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
2187             # because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
2188             # The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
2189             # then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
2190             # likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
2191             # it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
2192             # 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
2193             #
2194             # We get the entire URI that would be linkified before dealing with it, in order to do the right thing
2195             # with URI-encodings and redirecting URIs.
2196             #
2197             # The delimiters for start of a URI in TBird are @(`{|[\"'<>,\s in OE they are ("<\s
2198             #
2199             # Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
2200             # TBird's end delimiters are )`{}|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
2201             # OE only uses space as a delimiter, but ignores [~!@#^&*()_+`-={}|[]:";'<>?,.]* at the end.
2202             #
2203             # Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
2204             # TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
2205             # delimiters include ( and '
2206             #
2207             # bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
2208              
2209 130     130   379 sub _tbirdurire {
2210             my ($self) = @_;
2211              
2212 130 50       497 # Cached?
2213             return $self->{tbirdurire} if exists $self->{tbirdurire};
2214              
2215 130         354 # a hybrid of tbird and oe's version of uri parsing
2216 130         255 my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2217 130         242 my $iso2022shift = "\x1b" . '\(.'; # bug 4522
2218 130         301 my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2219             my $nonASCII = '\x80-\xff';
2220              
2221 130         605 # schemeless uri start delimiter, combo of most punctuations and delims above
2222             my $scstartdelim = qr/[\!\"\#\$\&\'\(\)\*\+\,\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\x1b\xa0]/;
2223              
2224             # bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
2225 130         425 # it's a common thing to find in text
2226 130         365 my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
2227             my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
2228              
2229 130         403 # valid TLDs
2230             my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
2231              
2232             # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
2233 130         337748 # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
2234 130         2474 my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,2048})?/i;
2235 130         2173 my $uriknownscheme = qr/(?:(?:https?|ftp):\/\/|(?:www\d{0,2}|ftp)\.)[^$tbirdenddelim]{1,2048}/i;
2236             my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
2237 130         344459  
2238             $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
2239             (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
2240             (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
2241             (?:(?:^|(?<=$scstartdelim))($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/ix;
2242 130         1359  
2243             return $self->{tbirdurire};
2244             }
2245              
2246             =item $status->get_uri_list ()
2247              
2248             Returns an array of all unique URIs found in the message. It takes
2249             a combination of the URIs found in the rendered (decoded and HTML
2250             stripped) body and the URIs found when parsing the HTML in the message.
2251             Will also set $status->{uri_list} (the array as returned by this function).
2252              
2253             The returned array will include the "raw" URI as well as
2254             "slightly cooked" versions. For example, the single URI
2255             'http://%77&#00119;%77.example.com/' will get turned into:
2256             ( 'http://%77&#00119;%77.example.com/', 'http://www.example.com/' )
2257              
2258             =cut
2259              
2260 130     130 1 523 sub get_uri_list {
2261             my ($self) = @_;
2262              
2263 130 50       531 # use cached answer if available
2264 0         0 if (exists $self->{uri_list}) {
  0         0  
2265             return @{$self->{uri_list}};
2266             }
2267 130         249  
2268             my %uris;
2269             # $self->{redirect_num} = 0;
2270              
2271 130         259 # get URIs from text/HTML parsing
  531         1091  
2272 401 50       841 while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
2273 401         500 if ($info->{cleaned}) {
  401         748  
2274 540         1173 foreach (@{$info->{cleaned}}) {
2275             $uris{$_} = 1;
2276              
2277             # count redirection attempts and log it
2278             # if (my @http = m{\b(https?:/{0,2})}gi) {
2279             # $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
2280             # }
2281             }
2282             }
2283             }
2284 130         557  
  130         461  
2285             @{$self->{uri_list}} = keys %uris;
2286             # $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
2287 130         254  
  130         797  
2288             return @{$self->{uri_list}};
2289             }
2290              
2291             =item $status->get_uri_detail_list ()
2292              
2293             Returns a hash reference of all unique URIs found in the message and
2294             various data about where the URIs were found in the message. It takes a
2295             combination of the URIs found in the rendered (decoded and HTML stripped)
2296             body and the URIs found when parsing the HTML in the message. Will also
2297             set $status->{uri_detail_list} (the hash reference as returned by this
2298             function).
2299              
2300             The hash format looks something like this:
2301              
2302             raw_uri => {
2303             types => { a => 1, img => 1, parsed => 1, domainkeys => 1,
2304             unlinked => 1, schemeless => 1 },
2305             cleaned => [ canonicalized_uri ],
2306             anchor_text => [ "click here", "no click here" ],
2307             domains => { domain1 => 1, domain2 => 1 },
2308             hosts => { host1 => domain1, host2 => domain2 },
2309             }
2310              
2311             C<raw_uri> is whatever the URI was in the message itself
2312             (http://spamassassin.apache%2Eorg/). Uris parsed from text will be prefixed
2313             with scheme if missing (http://, mailto: etc). HTML uris are as found.
2314              
2315             C<types> is a hash of the HTML tags (lowercase) which referenced the
2316             raw_uri. I<parsed> is a faked type which specifies that the raw_uri was
2317             seen in the rendered text. I<domainkeys> is defined when raw_uri was found
2318             from DK/DKIM d= field. I<unlinked> is defined when it's assumed that MUA
2319             will not linkify uri (found in body without scheme or www. prefix).
2320             I<schemeless> is always added for uris without scheme, regardless of
2321             linkifying (i.e. email address found in body without mailto:).
2322              
2323             C<cleaned> is an array of the raw and canonicalized version of the raw_uri
2324             (http://spamassassin.apache%2Eorg/, https://spamassassin.apache.org/).
2325              
2326             C<anchor_text> is an array of the anchor text (text between <a> and
2327             </a>), if any, which linked to the URI.
2328              
2329             C<domains> is a hash of the domains found in the canonicalized URIs.
2330              
2331             C<hosts> is a hash of unstripped hostnames found in the canonicalized URIs
2332             as hash keys, with their domain part stored as a value of each hash entry.
2333              
2334             =cut
2335              
2336 535     535 1 934 sub get_uri_detail_list {
2337             my ($self) = @_;
2338              
2339             # process only once, use unique uri_detail_list_run flag,
2340 535 100       1114 # in case add_uri_detail_list has already been called
2341 405         1183 if ($self->{uri_detail_list_run}) {
2342             return $self->{uri_detail_list};
2343 130         330 }
2344             $self->{uri_detail_list_run} = 1;
2345 130         565  
2346             my $timer = $self->{main}->time_method("get_uri_detail_list");
2347              
2348 130         700 # process text parsed uris
2349             $self->_process_text_uri_list();
2350 130         888 # process html uris
2351             $self->_process_html_uri_list();
2352 130         738 # process dkim uris
2353             $self->_process_dkim_uri_list();
2354 130         811  
2355             return $self->{uri_detail_list};
2356             }
2357              
2358 130     130   330 sub _process_text_uri_list {
2359             my ($self) = @_;
2360              
2361 130         474 # Use decoded stripped body, which does not contain HTML
2362 130         577 my $textary = $self->get_decoded_stripped_body_text_array();
2363 130         328 my $tbirdurire = $self->_tbirdurire;
2364 130         672 my %seen;
2365             my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
2366 130         643  
2367             foreach my $text (@$textary) {
2368             # a workaround for [perl #69973] bug:
2369             # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
2370             # Bug 6225, regexp and string should both be utf8, or none of them;
2371             # untainting string also seems to avoid the crash
2372             #
2373 669         1852 # Bug 6225: untaint the string in an attempt to work around a perl crash
2374             local $_ = untaint_var($text);
2375 669         1857  
2376 669         67022 local($1,$2,$3);
2377 324   66     1526 while (/$tbirdurire/igo) {
2378 324         702 my $rawuri = $1||$2||$3;
2379 324 100       922 my $schost = $4;
    100          
2380 324         785 my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
2381 324         1940 $rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
2382             $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
2383 324 100       1143  
2384 305         864 next if exists $seen{$rawuri};
2385             $seen{$rawuri} = 1;
2386 305 50       738  
2387             dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
2388              
2389 305 100 100     894 # Quick ignore if schemeless host not valid
2390             next if defined $schost && !is_fqdn_valid($schost, 1);
2391              
2392             # Ignore cid: mid: as they can be mistaken for emails,
2393             # these should not be parsed from stripped body in any case.
2394 296 50       803 # Example: [cid:image001.png@01D4986E.E3459640]
2395             next if $rawuri =~ /^[cm]id:/i;
2396              
2397 296 50       895 # Ignore empty uris
2398             next if $rawuri =~ /^\w+:\/{0,2}$/i;
2399 296         807  
2400             my $types = {parsed => 1};
2401              
2402             # If it's a hostname that was just sitting out in the
2403             # open, without a protocol, and not inside of an HTML tag,
2404             # the we should add the proper protocol in front, rather
2405 296         531 # than using the base URI.
2406 296 100       1031 my $uri = $rawuri;
2407 140 100       613 if ($uri !~ /^(?:https?|ftp|mailto):/i) {
    100          
    100          
2408 1         3 if ($uri =~ /^ftp\./i) {
2409             $uri = "ftp://$uri";
2410             }
2411 59         166 elsif ($uri =~ /^www\d{0,2}\./i) {
2412             $uri = "http://$uri";
2413             }
2414             elsif (index($uri, '@') != -1) {
2415             # This is not linkified by MUAs: foo@bar%2Ecom
2416             # This IS linkified: foo@bar%2Ebar.com
2417             # And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
2418             # And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
2419             # ...
2420             # Skip if no dot found after @, tested without urldecoding,
2421 29 50       158 # quick skip for crap like Vi@gra.
2422 29 50       96 next unless $uri =~ /\@.+?\./;
2423 29         69 next if index($uri, '&nbsp;') != -1; # ignore garbled
2424 29         85 $uri =~ s/^(?:skype|e?-?mail)?:+//i; # strip common misparses
2425             $uri = "mailto:$uri";
2426             }
2427             else {
2428             # some spammers are using unschemed URIs to escape filters
2429             # flag that this is a URI that MUAs don't linkify so only use for RBLs
2430 51 50       124 # (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
2431 51         106 next if index($uri, '.') == -1; # skip unless dot found, garbage
2432 51         119 $uri = "http://$uri";
2433             $types->{unlinked} = 1;
2434             }
2435 140         296 # Mark any of those schemeless
2436             $types->{schemeless} = 1;
2437             }
2438 296 100       896  
2439             if ($uri =~ /^mailto:/i) {
2440 36 50       109 # MUAs linkify and urldecode mailto:foo%40bar%2Fcom
2441             $uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
2442 36 100       143 # Skip unless @ found after decoding, then check tld is valid
2443 34         90 next unless $uri =~ /\@([^?&>]*)/;
  34         69  
2444 34 100       133 my $host = $1; $host =~ s/(?:\%20)+$//; # strip trailing %20 from host
2445             next unless $self->{main}->{registryboundaries}->is_domain_valid($host);
2446             }
2447 291 50       628  
2448             dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
2449 291         843  
2450             $self->add_uri_detail_list($uri, $types, 'parsed', 1);
2451             }
2452             }
2453             }
2454              
2455 130     130   411 sub _process_html_uri_list {
2456             my ($self) = @_;
2457              
2458             # get URIs from HTML parsing
2459 130   100     1229 # use the metadata version since $self->{html} may not be setup
2460 130 50       559 my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
2461             $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
2462              
2463 130         314 # canonicalize the HTML parsed URIs
  257         1212  
2464 127 50       308 while(my($uri, $info) = each %{ $detail }) {
2465             if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
2466 127 100       274 # Need also to copy and uniq anchor text
2467 118         176 if (exists $info->{anchor_text}) {
2468 118         166 my %seen;
  118         437  
  118         260  
2469 118         188 foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
  118         469  
2470             push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
2471             }
2472             }
2473             }
2474             }
2475             }
2476              
2477 130     130   369 sub _process_dkim_uri_list {
2478             my ($self) = @_;
2479              
2480             # This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
2481             # this functionality
2482             # 2013-01-07
2483             # This functionality is re-enabled as a configuration option disabled by
2484             # default (bug 7087)
2485             # 2014-10-06
2486              
2487 130 50       575 # Look for the domain in DK/DKIM headers
2488 130         547 if ($self->{conf}->{parse_dkim_uris}) {
  260         1209  
2489             my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef ),
2490 130         697 $self->get('DKIM-Signature',undef) ));
2491 0         0 while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
2492 0         0 my $d = $1;
2493             $d =~ s/\s+//g;
2494 0         0 # prefix with domainkeys: so it doesn't merge with identical keys
2495             $self->add_uri_detail_list("domainkeys:$d",
2496             {'domainkeys'=>1, 'nocanon'=>1, 'noclean'=>1},
2497             'domainkeys', 1);
2498             }
2499             }
2500             }
2501              
2502             =item $status->add_uri_detail_list ($raw_uri, $types, $source, $valid_domain)
2503              
2504             Adds values to internal uri_detail_list. When used from Plugins, recommended
2505             to call from parsed_metadata (along with register_method_priority, -10) so
2506             other Plugins calling get_uri_detail_list() will see it.
2507              
2508             C<raw_uri> is the URI to be added. The only required parameter.
2509              
2510             C<types> is an optional hash reference, contents are added to
2511             uri_detail_list->{types} (see get_uri_detail_list for known keys).
2512             I<parsed> is default is no hash given. I<nocanon> does not run
2513             uri_list_canonicalize (no redirector, uri fixing). I<noclean> skips adding
2514             uri_detail_list->{cleaned}, so it would not be used in "uri" rule checks,
2515             but domain/hosts would still be used for URIBL/RBL purposes.
2516              
2517             C<source> is an optional simple string, only used for debug logging purposes
2518             to identify where uri originates from (default: "parsed").
2519              
2520             C<valid_domain> is an optional boolean (0/1). If true, uri will not be
2521             added unless hostname/domain is in valid format and contains a valid TLD.
2522             (default: 0)
2523              
2524             =cut
2525              
2526 418     418 1 952 sub add_uri_detail_list {
2527             my ($self, $uri, $types, $source, $valid_domain) = @_;
2528 418 50       911  
2529 418   50     822 $types = {'parsed' => 1} unless defined $types;
2530             $source ||= 'parsed';
2531 418         645  
2532 418         672 my (%domains, %hosts, %cleaned);
2533             my $udl = $self->{uri_detail_list};
2534 418         1616  
2535             dbg("uri: canonicalizing $source uri: $uri");
2536 418         642  
2537 418 50       875 my @uris;
2538 0         0 if ($types->{nocanon}) {
2539             push @uris, $uri;
2540 418         1571 } else {
2541             @uris = uri_list_canonicalize($self->{conf}->{redirector_patterns}, $uri);
2542 418         976 }
2543             foreach my $cleanuri (@uris) {
2544 558 50       1216 # Make sure all the URIs are nice and short
2545 0         0 if (length($cleanuri) > MAX_URI_LENGTH) {
2546 0         0 $self->{'uri_truncated'} = 1;
2547             $cleanuri = substr($cleanuri, 0, MAX_URI_LENGTH);
2548 558         1914 }
2549 558         1068 dbg("uri: cleaned uri: $cleanuri");
2550 558         1937 $cleaned{$cleanuri} = 1;
2551 558 100       1323 my ($domain, $host) = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
2552 445         1636 if (defined $domain) {
2553 445         1042 dbg("uri: added host: $host domain: $domain");
2554 445         1313 $domains{$domain} = 1;
2555             $hosts{$host} = $domain;
2556             }
2557             }
2558              
2559 418 50       873 # Bail out if no good uri found
2560             return unless %cleaned;
2561              
2562 418 100 100     1579 # Bail out if no domains/hosts found?
2563             return if $valid_domain && !%domains;
2564              
2565 402 50       876 # Merge cleaned
2566 402 100       1087 if (!$types->{noclean}) {
2567 1         3 if ($udl->{$uri}->{cleaned}) {
  1         4  
2568             $cleaned{$_} = 1 foreach (@{$udl->{$uri}->{cleaned}});
2569 402         825 }
  402         1389  
2570             @{$udl->{$uri}->{cleaned}} = keys %cleaned;
2571             }
2572              
2573 402         1550 # Domains/hosts (there might not be any)
2574 402         1404 $udl->{$uri}->{domains}->{$_} = 1 foreach keys %domains;
2575             $udl->{$uri}->{hosts}->{$_} = $hosts{$_} foreach keys %hosts;
2576              
2577 402         1771 # Types
2578             $udl->{$uri}->{types}->{$_} = 1 foreach keys %$types;
2579              
2580 402         725 # Invalidate uri_list cache
2581             delete $self->{uri_list};
2582 402         5317  
2583             return 1;
2584             }
2585              
2586              
2587             ###########################################################################
2588              
2589 0     0 0 0 sub ensure_rules_are_complete {
2590 0         0 my $self = shift;
2591             my $metarule = shift;
2592             # @_ is now the list of rules
2593 0         0  
2594             foreach my $r (@_) {
2595 0 0       0 # dbg("rules: meta rule depends on net rule $r");
2596             next if ($self->is_rule_complete($r));
2597 0         0  
2598 0         0 dbg("rules: meta rule $metarule depends on pending rule $r, blocking");
2599             my $timer = $self->{main}->time_method("wait_for_pending_rules");
2600 0         0  
2601 0         0 my $start = time;
2602 0         0 $self->harvest_until_rule_completes($r);
2603             my $elapsed = sprintf "%.2f", time - $start;
2604 0 0       0  
    0          
2605 0         0 if (!$self->is_rule_complete($r)) {
2606             dbg("rules: rule $r is still not complete; exited early?");
2607             }
2608 0         0 elsif ($elapsed > 0) {
2609             my $txt = "rules: $r took $elapsed seconds to complete, for $metarule";
2610 0 0       0 # Info only if something took over 1 sec to wait, prevent log flood
  0         0  
  0         0  
2611             if ($elapsed >= 1) { info($txt); } else { dbg($txt); }
2612             }
2613             }
2614             }
2615              
2616             ###########################################################################
2617              
2618             # use a separate sub here, for brevity
2619             # called out of generated eval
2620 0     0 0 0 sub handle_eval_rule_errors {
2621 0         0 my ($self, $rulename) = @_;
2622 0         0 warn "rules: failed to run $rulename test, skipping:\n\t($@)\n";
2623             $self->{rule_errors}++;
2624             }
2625              
2626 3773     3773 0 7254 sub register_plugin_eval_glue {
2627             my ($self, $function) = @_;
2628 3773 50       7675  
2629 0         0 if (!$function) {
2630 0         0 warn "rules: empty function name";
2631             return;
2632             }
2633              
2634 3773 100       46879 # only need to call this once per fn (globally)
2635 283         692 return if exists $TEMPORARY_EVAL_GLUE_METHODS{$function};
2636             $TEMPORARY_EVAL_GLUE_METHODS{$function} = undef;
2637              
2638 283 50       1021 # return if it's not an eval_plugin function
2639             return if (!exists $self->{conf}->{eval_plugins}->{$function});
2640              
2641             # return if it's been registered already
2642 283 50 33     2076 return if ($self->can ($function) &&
  0         0  
2643             defined &{'Mail::SpamAssassin::PerMsgStatus::'.$function});
2644 283         907  
2645             my $evalstr = <<"ENDOFEVAL";
2646             {
2647             package Mail::SpamAssassin::PerMsgStatus;
2648              
2649             sub $function {
2650             my (\$self) = shift;
2651             my \$plugin = \$self->{conf}->{eval_plugins}->{$function};
2652             return \$plugin->$function (\$self, \@_);
2653             }
2654              
2655             1;
2656             }
2657             ENDOFEVAL
2658 283 50   466 1 31832 eval $evalstr . '; 1' ## no critic
  466     101 1 1190  
  466     81 0 1138  
  466     81 0 1669  
  101     4 0 371  
  101     77 0 322  
  101     93 1 601  
  81     81 1 379  
  81     81 0 313  
  81     81 1 415  
  81     133 0 315  
  81     93 0 275  
  81     81 0 391  
  4     555 0 12  
  4     81 0 21  
  4     81 0 21  
  77     4 0 285  
  77     4 0 255  
  77     4 0 439  
  81     4 0 343  
  81     4 0 269  
  81     4 0 766  
  81     28 0 384  
  81     4 0 262  
  81     4 0 492  
  81     4 0 295  
  81     0 0 308  
  81     77 0 533  
  81     77 0 323  
  81     77 0 288  
  81     77 0 427  
  81     77 0 360  
  81     77 0 320  
  81     77 0 425  
  81         310  
  81         307  
  81         526  
  81         304  
  81         372  
  81         399  
  543         1375  
  543         1255  
  543         1944  
  81         321  
  81         269  
  81         492  
  81         317  
  81         258  
  81         440  
  4         13  
  4         24  
  4         24  
  4         13  
  4         16  
  4         40  
  4         17  
  4         17  
  4         19  
  4         16  
  4         29  
  4         24  
  4         11  
  4         17  
  4         29  
  4         19  
  4         21  
  4         28  
  28         77  
  28         74  
  28         105  
  4         19  
  4         20  
  4         31  
  4         19  
  4         22  
  4         31  
  4         14  
  4         16  
  4         37  
  0         0  
  0         0  
  0         0  
  77         330  
  77         299  
  77         456  
  77         282  
  77         276  
  77         395  
  77         296  
  77         269  
  77         454  
  77         316  
  89         359  
  89         463  
  89         367  
  77         286  
  77         479  
  77         299  
  77         261  
  77         447  
  77         307  
  77         303  
  89         460  
2659 0 0       0 or do {
  0         0  
2660 0         0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
2661 0         0 warn "rules: failed to run header tests, skipping some: $eval_stat\n";
2662             $self->{rule_errors}++;
2663             };
2664              
2665 283         4107 # ensure this method is deleted if finish_tests() is called
2666             push (@TEMPORARY_METHODS, $function);
2667             }
2668              
2669             ###########################################################################
2670              
2671             # note: only eval tests should store state in $self->{test_log_msgs};
2672             # pattern tests do not.
2673             #
2674             # the clearing of the test state is now inlined as:
2675             #
2676             # %{$self->{test_log_msgs}} = (); # clear test state
2677             #
2678             # except for this public API for plugin use:
2679              
2680             =item $status->clear_test_state()
2681              
2682             Clear test state, including test log messages from C<$status-E<gt>test_log()>.
2683              
2684             =cut
2685              
2686 12     0 1 39 sub clear_test_state {
2687 12         55 my ($self) = @_;
  12         58  
2688             %{$self->{test_log_msgs}} = ();
2689             }
2690              
2691             # internal API, called only by got_hit()
2692             # TODO: refactor and merge this into that function
2693 128     128   459 sub _handle_hit {
2694             my ($self, $rule, $score, $area, $ruletype, $desc) = @_;
2695 128         1090  
2696             $self->{main}->call_plugins ("hit_rule", {
2697             permsgstatus => $self,
2698             rulename => $rule,
2699             ruletype => $ruletype,
2700             score => $score
2701             });
2702              
2703 128 100       729 # ignore meta-match sub-rules.
  88         197  
  76         215  
  76         210  
2704             if ($rule =~ /^__/) { push(@{$self->{subtest_names_hit}}, $rule); return; }
2705              
2706 56 50       294 # this should not happen; warn about it
2707 4         15 if (!defined $score) {
2708 16         92 warn "rules: score undef for rule '$rule' in '$area' '$desc'";
2709             return;
2710             }
2711              
2712 56 50       279 # this should not happen; warn about NaN (bug 3364)
2713 4         19 if ($score != $score) {
2714 4         31 warn "rules: score '$score' for rule '$rule' in '$area' '$desc'";
2715             return;
2716             }
2717              
2718 56         268 # Add the rule hit to the score
2719             $self->{score} += $score;
2720 68         223  
  56         200  
2721 56   100     321 push(@{$self->{test_names_hit}}, $rule);
2722             $area ||= '';
2723 56 100 66     401  
2724 17         90 if ($score >= 10 || $score <= -10) {
2725             $score = sprintf("%4.0f", $score);
2726             }
2727 65         955 else {
2728             $score = sprintf("%4.1f", $score);
2729             }
2730              
2731             # save both summaries
2732             # TODO: this is slower than necessary, if we only need one
2733             $self->{tag_data}->{REPORT} .= sprintf ("* %s %s %s%s\n%s",
2734             $score, $rule, $area,
2735             $self->_wrap_desc($desc,
2736             4+length($rule)+length($score)+length($area), "* "),
2737 56 100       530 ($self->{test_log_msgs}->{TERSE} ?
2738             "* " . $self->{test_log_msgs}->{TERSE} : ''));
2739              
2740             $self->{tag_data}->{SUMMARY} .= sprintf ("%s %-22s %s%s\n%s",
2741             $score, $rule, $area,
2742             $self->_wrap_desc($desc,
2743 66   100     517 3+length($rule)+length($score)+length($area), " " x 28),
2744 142 50 33     788 ($self->{test_log_msgs}->{LONG} || ''));
2745 98         217 if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
2746             $self->{tag_data}->{SUBJPREFIX} = $self->{subjprefix};
2747             }
2748             }
2749              
2750 202     104   818 sub _wrap_desc {
2751             my ($self, $desc, $firstlinelength, $prefix) = @_;
2752 118         422  
2753 108         522 my $firstline = " " x $firstlinelength;
2754 108         577 my $wrapped = Mail::SpamAssassin::Util::wrap($desc, $prefix, $firstline, $self->{conf}->{report_wrap_width}, 0);
2755 108         1020 $wrapped =~ s/^\s+//s;
2756             $wrapped;
2757             }
2758              
2759             ###########################################################################
2760              
2761             =item $status->got_hit ($rulename, $desc_prepend [, name => value, ...])
2762              
2763             Register a hit against a rule in the ruleset.
2764              
2765             There are two mandatory arguments. These are C<$rulename>, the name of the rule
2766             that fired, and C<$desc_prepend>, which is a short string that will be
2767             prepended to the rules C<describe> string in output reports.
2768              
2769             In addition, callers can supplement that with the following optional
2770             data:
2771              
2772             =over 4
2773              
2774             =item score => $num
2775              
2776             Optional: the score to use for the rule hit. If unspecified,
2777             the value from the C<Mail::SpamAssassin::Conf> object's C<{scores}>
2778             hash will be used (a configured score), and in its absence the
2779             C<defscore> option value.
2780              
2781             =item defscore => $num
2782              
2783             Optional: the score to use for the rule hit if neither the
2784             option C<score> is provided, nor a configured score value is provided.
2785              
2786             =item value => $num
2787              
2788             Optional: the value to assign to the rule; the default value is C<1>.
2789             I<tflags multiple> rules use values of greater than 1 to indicate
2790             multiple hits. This value is accessible to meta rules.
2791              
2792             =item ruletype => $type
2793              
2794             Optional, but recommended: the rule type string. This is used in the
2795             C<hit_rule> plugin call, called by this method. If unset, I<'unknown'> is
2796             used.
2797              
2798             =item tflags => $string
2799              
2800             Optional: a string, i.e. a space-separated list of additional tflags
2801             to be appended to an existing list of flags in $self->{conf}->{tflags},
2802             such as: "nice noautolearn multiple". No syntax checks are performed.
2803              
2804             =item description => $string
2805              
2806             Optional: a custom rule description string. This is used in the
2807             C<hit_rule> plugin call, called by this method. If unset, the static
2808             description is used.
2809              
2810             =back
2811              
2812             Backward compatibility: the two mandatory arguments have been part of this API
2813             since SpamAssassin 2.x. The optional I<name=<gt>value> pairs, however, are a
2814             new addition in SpamAssassin 3.2.0.
2815              
2816             =cut
2817              
2818 132     132 1 2705 sub got_hit {
2819             my ($self, $rule, $area, %params) = @_;
2820 132         355  
2821             my $conf_ref = $self->{conf};
2822 170         259  
2823 170         305 my $dynamic_score_provided;
2824 170 100       390 my $score = $params{score};
2825 50         151 if (defined $score) { # overrides any configured scores
2826             $dynamic_score_provided = 1;
2827 178         666 } else {
2828 178 50       662 $score = $conf_ref->{scores}->{$rule};
2829             $score = $params{defscore} if !defined $score;
2830             }
2831              
2832             # adding a hit does nothing if we don't have a score -- we probably
2833 136 50       408 # shouldn't have run it in the first place
2834 8         29 if (!$score) {
  8         16  
2835 42         131 %{$self->{test_log_msgs}} = ();
2836             return;
2837             }
2838              
2839             # ensure that rule values always result in an *increase*
2840 128         286 # of $self->{tests_already_hit}->{$rule}:
2841 128 100 66     385 my $value = $params{value};
  164         319  
2842             if (!$value || $value <= 0) { $value = 1 }
2843 128         270  
2844 128         225 my $tflags_ref = $conf_ref->{tflags};
2845 170 50 33     507 my $tflags_add = $params{tflags};
2846             if (defined $tflags_add && $tflags_add ne '') {
2847 42 0 0     65 $_ = (!defined $_ || $_ eq '') ? $tflags_add : ($_ . ' ' . $tflags_add)
2848             for $tflags_ref->{$rule};
2849             };
2850 170   100     680  
2851             my $already_hit = $self->{tests_already_hit}->{$rule} || 0;
2852 170 100 50     564 # don't count hits multiple times, unless 'tflags multiple' is on
      66        
2853 42         186 if ($already_hit && ($tflags_ref->{$rule}||'') !~ /\bmultiple\b/) {
  0         0  
2854 42         305 %{$self->{test_log_msgs}} = ();
2855             return;
2856             }
2857 170         514  
2858             $self->{tests_already_hit}->{$rule} = $already_hit + $value;
2859              
2860 170   100     556 # default ruletype, if not specified:
2861             $params{ruletype} ||= 'unknown';
2862 170 100       626  
2863 0         0 if ($dynamic_score_provided) { # copy it to static for proper reporting
2864 84         234 $conf_ref->{scoreset}->[$_]->{$rule} = $score for (0..3);
2865             $conf_ref->{scores}->{$rule} = $score;
2866             }
2867 212         415  
2868 212 50       550 my $rule_descr = $params{description};
2869 136         530 if (defined $rule_descr) {
2870             $conf_ref->{descriptions}->{$rule} = $rule_descr; # save dynamic descr.
2871 264         5552 } else {
2872             $rule_descr = $conf_ref->get_description_for_rule($rule); # static
2873             }
2874             # Bug 6880 Set Rule Description to something that says no rule
2875 186 100 66     1326 #$rule_descr = $rule if !defined $rule_descr || $rule_descr eq '';
2876             $rule_descr = "No description available." if !defined $rule_descr || $rule_descr eq '';
2877 186 50       573  
2878 58         84 if(defined $self->{conf}->{rewrite_header}->{Subject}) {
2879 58 0       92 my $rule_subjprefix = $conf_ref->{subjprefix}->{$rule};
2880 58         120 if (defined $rule_subjprefix) {
2881 2 0       5 dbg("subjprefix: setting Subject prefix to $rule_subjprefix");
2882 68         179 if($self->{subjprefix} !~ /\Q$rule_subjprefix\E/) {
2883             $self->{subjprefix} .= $rule_subjprefix . " "; # save dynamic subject prefix.
2884             }
2885             }
2886             }
2887              
2888             $self->_handle_hit($rule,
2889             $score,
2890             $area,
2891 196         752 $params{ruletype},
2892             $rule_descr);
2893              
2894 186         479 # take care of duplicate rules, too (bug 5206)
2895 128 50 33     406 my $dups = $conf_ref->{duplicate_rules}->{$rule};
  12         29  
2896 12         64 if ($dups && @{$dups}) {
  70         156  
2897 58         163 foreach my $dup (@{$dups}) {
2898             $self->got_hit($dup, $area, %params);
2899             }
2900             }
2901 194         366  
  208         452  
2902 208         2039 %{$self->{test_log_msgs}} = (); # clear test logs
2903             return 1;
2904             }
2905              
2906             ###########################################################################
2907              
2908             # TODO: this needs API doc
2909 118     4 0 285 sub test_log {
2910 12         41 my ($self, $msg) = @_;
2911 80         297 local $1;
2912 70         253 while ($msg =~ s/^(.{30,48})\s//) {
2913             $self->_test_log_line ($1);
2914 20         98 }
2915             $self->_test_log_line ($msg);
2916             }
2917              
2918 20     4   66 sub _test_log_line {
2919             my ($self, $msg) = @_;
2920 20         114  
2921 50 50       99 $self->{test_log_msgs}->{TERSE} .= sprintf ("[%s]\n", $msg);
2922 50         193 if (length($msg) > 47) {
2923             $self->{test_log_msgs}->{LONG} .= sprintf ("%78s\n", "[$msg]");
2924 50         102 } else {
2925             $self->{test_log_msgs}->{LONG} .= sprintf ("%27s [%s]\n", "", $msg);
2926             }
2927             }
2928              
2929             ###########################################################################
2930              
2931             # helper for get(). Do not call directly, as get() caches its results
2932             # and this does not!
2933 96     98 0 309 sub get_envelope_from {
2934             my ($self) = @_;
2935              
2936             # bug 2142:
2937             # Get the SMTP MAIL FROM:, aka. the "envelope sender", if our
2938             # calling app has helpfully marked up the source message
2939             # with it. Various MTAs and calling apps each have their
2940             # own idea of what header to use for this! see
2941 96         171  
2942             my $envf;
2943              
2944             # Rely on the 'envelope-sender-header' header if the user has configured one.
2945             # Assume that because they have configured it, their MTA will always add it.
2946 144 50       492 # This will prevent us falling through and picking up inappropriate headers.
2947             if (defined $self->{conf}->{envelope_sender_header}) {
2948 50         101 # make sure we get the most recent copy - there can be only one EnvelopeSender.
2949             $envf = $self->get($self->{conf}->{envelope_sender_header}.":addr",undef);
2950 0 0 0     0 # ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
      0        
2951             goto ok if defined $envf && ($envf =~ /\@/ || $envf eq '');
2952 50 0       154 # Warn them if it's configured, but not there or not usable.
2953 50         220 if (defined $envf) {
2954             chomp $envf;
2955 50         153 dbg("message: envelope_sender_header '%s: %s' is not an FQDN, ignoring",
2956             $self->{conf}->{envelope_sender_header}, $envf);
2957             } else {
2958 0         0 dbg("message: envelope_sender_header '%s' not found in message",
2959             $self->{conf}->{envelope_sender_header});
2960             }
2961 0         0 # Couldn't get envelope-sender using the configured header.
2962             return;
2963             }
2964              
2965             # User hasn't given us a header to trust, so try to guess the sender.
2966              
2967             # use the "envelope-sender" string found in the Received headers,
2968             # if possible... use the last untrusted header, in case there's
2969 94         226 # trusted headers.
2970 106 100       390 my $lasthop = $self->{relays_untrusted}->[0];
2971             if (!defined $lasthop) {
2972             # no untrusted headers? in that case, the message is ALL_TRUSTED.
2973 98         257 # use the first trusted header (ie. the oldest, originating one).
2974             $lasthop = $self->{relays_trusted}->[-1];
2975             }
2976 144 100       575  
2977 83         222 if (defined $lasthop) {
2978             $envf = $lasthop->{envfrom};
2979             # TODO FIXME: Received.pm puts both null senders and absence-of-sender
2980 95 50 33     335 # into the relays array as '', so we can't distinguish them :(
2981 12         42 if ($envf && ($envf =~ /\@/)) {
2982             goto ok;
2983             }
2984             }
2985              
2986             # WARNING: a lot of list software adds an X-Sender for the original env-from
2987             # (including Yahoo! Groups). Unfortunately, fetchmail will pick it up and
2988             # reuse it as the env-from for *its* delivery -- even though the list
2989             # software had used a different env-from in the intervening delivery. Hence,
2990             # if this header is present, and there's a fetchmail sig in the Received
2991             # lines, we cannot trust any Envelope-From headers, since they're likely to
2992             # be incorrect fetchmail guesses.
2993 106 50       440  
2994 0         0 if ($self->get("X-Sender") =~ /\@/) {
2995 12 0       56 my $rcvd = join(' ', $self->get("Received"));
2996 62         187 if ($rcvd =~ /\(fetchmail/) {
2997 62         269 dbg("message: X-Sender and fetchmail signatures found, cannot trust envelope-from");
2998             return;
2999             }
3000             }
3001              
3002 156 50       1063 # procmailrc notes this (we now recommend adding it to Received instead)
3003             if ($envf = $self->get("X-Envelope-From")) {
3004             # heuristic: this could have been relayed via a list which then used
3005 54 0       115 # a *new* Envelope-from. check
3006 54         132 if ($self->get("ALL") =~ /^Received:.*?^X-Envelope-From:/smi) {
3007 54         189 dbg("message: X-Envelope-From header found after 1 or more Received lines, cannot trust envelope-from");
3008             return;
3009 33         189 } else {
3010             goto ok;
3011             }
3012             }
3013              
3014 156 50       532 # qmail, new-inject(1)
3015             if ($envf = $self->get("Envelope-Sender")) {
3016             # heuristic: this could have been relayed via a list which then used
3017 57 0       180 # a *new* Envelope-from. check
3018 57         291 if ($self->get("ALL") =~ /^Received:.*?^Envelope-Sender:/smi) {
3019             dbg("message: Envelope-Sender header found after 1 or more Received lines, cannot trust envelope-from");
3020 57         146 } else {
3021             goto ok;
3022             }
3023             }
3024              
3025             # Postfix, sendmail, amavisd-new, ...
3026             # RFC 2821 requires it:
3027             # When the delivery SMTP server makes the "final delivery" of a
3028             # message, it inserts a return-path line at the beginning of the mail
3029             # data. This use of return-path is required; mail systems MUST support
3030             # it. The return-path line preserves the information in the <reverse-
3031 106 100       476 # path> from the MAIL command.
3032             if ($envf = $self->get("Return-Path")) {
3033             # heuristic: this could have been relayed via a list which then used
3034 46 50       220 # a *new* Envelope-from. check
3035 4         11 if ($self->get("ALL") =~ /^Received:.*?^Return-Path:/smi) {
3036             dbg("message: Return-Path header found after 1 or more Received lines, cannot trust envelope-from");
3037 5         16 } else {
3038             goto ok;
3039             }
3040             }
3041              
3042 97         367 # give up.
3043             return;
3044 1         4  
3045             ok:
3046 1         9 $envf =~ s/^<*//s; # remove <
3047             $envf =~ s/>*\s*\z//s; # remove >, whitespace, newlines
3048 1         4  
3049             return $envf;
3050             }
3051              
3052             ###########################################################################
3053              
3054             # helper for get(ALL-*). get() caches its results, so don't call this
3055             # directly unless you need a range of headers not covered by the ALL-*
3056             # psuedo-headers!
3057              
3058             # Get all the headers found between an index range of received headers, the
3059             # index doesn't care if we could parse the received headers or not.
3060             # Use undef for the $start_rcvd or $end_rcvd numbers to start/end with the
3061             # first/last header in the message, otherwise indicate the index number you
3062             # want to start/end at. Set $include_start_rcvd or $include_end_rcvd to 0 to
3063             # indicate you don't want to include the received header found at the start or
3064             # end indexes... basically toggles between [s,e], [s,e), (s,e], (s,e).
3065 0     4 0 0 sub get_all_hdrs_in_rcvd_index_range {
3066             my ($self, $start_rcvd, $end_rcvd, $include_start_rcvd, $include_end_rcvd, $getraw) = @_;
3067              
3068 0 50 33     0 # prevent bad input causing us to return the first header found
3069             return if (defined $end_rcvd && $end_rcvd < 0);
3070 0 50       0  
3071 0 50       0 $include_start_rcvd = 1 unless defined $include_start_rcvd;
3072             $include_end_rcvd = 1 unless defined $include_end_rcvd;
3073 4         11  
3074 4         14 my $cur_rcvd_index = -1; # none found yet
3075             my $result = '';
3076 4         9  
3077 4 50       13 my @hdrs;
3078 0         0 if ($getraw) {
3079             @hdrs = $self->{msg}->get_pristine_header() =~ /^([^ \t].*?\n)(?![ \t])/smgi;
3080 0         0 } else {
3081             @hdrs = split(/^/m, $self->{msg}->get_all_headers(0));
3082             }
3083 0         0  
3084 4 50       29 foreach my $hdr (@hdrs) {
3085 0         0 if ($hdr =~ /^Received:/i) {
3086 0 0 0     0 $cur_rcvd_index++;
      0        
3087             next if (defined $start_rcvd && !$include_start_rcvd &&
3088 0 0 0     0 $start_rcvd == $cur_rcvd_index);
      0        
3089             last if (defined $end_rcvd && !$include_end_rcvd &&
3090             $end_rcvd == $cur_rcvd_index);
3091 0 50 33     0 }
    0 33        
      33        
      0        
3092             if ((!defined $start_rcvd || $start_rcvd <= $cur_rcvd_index) &&
3093 4         18 (!defined $end_rcvd || $cur_rcvd_index < $end_rcvd)) {
3094             $result .= $hdr;
3095             }
3096 0         0 elsif (defined $end_rcvd && $cur_rcvd_index == $end_rcvd) {
3097 0         0 $result .= $hdr;
3098             last;
3099             }
3100 0 50       0 }
3101             return ($result eq '' ? undef : $result);
3102             }
3103              
3104             ###########################################################################
3105 0     88 0 0  
3106             sub sa_die { Mail::SpamAssassin::sa_die(@_); }
3107              
3108             ###########################################################################
3109              
3110             =item $status->create_fulltext_tmpfile (fulltext_ref)
3111              
3112             This function creates a temporary file containing the passed scalar
3113             reference data (typically the full/pristine text of the message).
3114             This is typically used by external programs like pyzor and dccproc, to
3115             avoid hangs due to buffering issues. Methods that need this, should
3116             call $self->create_fulltext_tmpfile($fulltext) to retrieve the temporary
3117             filename; it will be created if it has not already been.
3118              
3119             Note: This can only be called once until $status->delete_fulltext_tmpfile() is
3120             called.
3121              
3122             =cut
3123              
3124 4     4 1 26 sub create_fulltext_tmpfile {
3125             my ($self, $fulltext) = @_;
3126 0 50       0  
3127 0         0 if (defined $self->{fulltext_tmpfile}) {
3128             return $self->{fulltext_tmpfile};
3129             }
3130 0         0  
3131 4 50       13 my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
3132             $tmpfh or die "failed to create a temporary file";
3133              
3134             # PerlIO's buffered print writes in 8 kB chunks - which can be slow.
3135             # print $tmpfh $$fulltext or die "error writing to $tmpf: $!";
3136             #
3137             # reducing the number of writes and bypassing extra buffering in PerlIO
3138 0         0 # speeds up writing of larger text by a factor of 2
3139 0         0 my $nwrites;
3140 0         0 for (my $ofs = 0; $ofs < length($$fulltext); $ofs += $nwrites) {
3141 4 50       23 $nwrites = $tmpfh->syswrite($$fulltext, length($$fulltext)-$ofs, $ofs);
3142             defined $nwrites or die "error writing to $tmpf: $!";
3143 0 50       0 }
3144             close $tmpfh or die "error closing $tmpf: $!";
3145 0         0  
3146             $self->{fulltext_tmpfile} = $tmpf;
3147 0         0  
3148             dbg("check: create_fulltext_tmpfile, written %d bytes to file %s",
3149             length($$fulltext), $tmpf);
3150 4         14  
3151             return $self->{fulltext_tmpfile};
3152             }
3153              
3154             =item $status->delete_fulltext_tmpfile ()
3155              
3156             Will cleanup after a $status->create_fulltext_tmpfile() call. Deletes the
3157             temporary file and uncaches the filename.
3158              
3159             =cut
3160              
3161 191     191 1 479 sub delete_fulltext_tmpfile {
3162 191 100       1149 my ($self) = @_;
3163 4 50       14 if (defined $self->{fulltext_tmpfile}) {
3164 4         12 if (!unlink $self->{fulltext_tmpfile}) {
3165             my $msg = sprintf("cannot unlink %s: %s", $self->{fulltext_tmpfile}, $!);
3166 4 0       7 # don't fuss too much if file is missing, perhaps it wasn't even created
  4         9  
  4         12  
3167             if ($! == ENOENT) { warn $msg } else { die $msg }
3168 0         0 }
3169             $self->{fulltext_tmpfile} = undef;
3170             }
3171             }
3172              
3173             ###########################################################################
3174              
3175 237     275 0 590 sub all_from_addrs {
3176             my ($self) = @_;
3177 237 100       692  
  171         364  
  154         1044  
3178             if (exists $self->{all_from_addrs}) { return @{$self->{all_from_addrs}}; }
3179 79         157  
3180             my @addrs;
3181              
3182 79         262 # Resent- headers take priority, if present. see bug 672
3183 96 50 33     534 my $resent = $self->get('Resent-From',undef);
3184 17         53 if (defined $resent && $resent =~ /\S/) {
3185             @addrs = $self->{main}->find_all_addrs_in_line ($resent);
3186             }
3187             else {
3188             # bug 2292: Used to use find_all_addrs_in_line() with the same
3189             # headers, but the would catch addresses in comments which caused
3190             # FNs for things like whitelist_from. Since all of these are From
3191             # headers, there should only be 1 address in each anyway (not exactly
3192             # true, RFC 2822 allows multiple addresses in a From header field),
3193             # so use the :addr code...
3194             # bug 3366: some addresses come in as 'foo@bar...', which is invalid.
3195             # so deal with the multiple periods.
3196 79         293 ## no critic
  39         237  
  43         265  
  395         926  
3197             @addrs = map { tr/././s; $_ } grep { $_ ne '' }
3198             ($self->get('From:addr'), # std
3199             $self->get('Envelope-Sender:addr'), # qmail: new-inject(1)
3200             $self->get('Resent-Sender:addr'), # procmailrc manpage
3201             $self->get('X-Envelope-From:addr'), # procmailrc manpage
3202             $self->get('EnvelopeFrom:addr')); # SMTP envelope
3203             # http://www.cs.tut.fi/~jkorpela/headers.html is useful here
3204             }
3205              
3206 83         328 # Remove duplicate addresses
  43         219  
3207 79         271 my %addrs = map { $_ => 1 } @addrs;
3208             @addrs = keys %addrs;
3209 83         492  
3210 83         380 dbg("eval: all '*From' addrs: " . join(" ", @addrs));
3211 83         755 $self->{all_from_addrs} = \@addrs;
3212             return @addrs;
3213             }
3214              
3215             =item all_from_addrs_domains
3216              
3217             This function returns all the various from addresses in a message using all_from_addrs()
3218             and then returns only the domain names.
3219              
3220             =cut
3221              
3222 4     50 1 26 sub all_from_addrs_domains {
3223             my ($self) = @_;
3224 4 0       62  
3225 4         278 if (exists $self->{all_from_addrs_domains}) {
  4         65  
3226             return @{$self->{all_from_addrs_domains}};
3227             }
3228              
3229             #TEST POINT - my @addrs = ("test.voipquotes2.net","test.voipquotes2.co.uk");
3230 4         32 #Start with all the normal from addrs
3231             my @addrs = all_from_addrs($self);
3232 4         29  
3233             dbg("eval: all '*From' addrs domains (before): " . join(" ", @addrs));
3234              
3235             #Take just the domain with a dummy localpart
3236 4         27 #removing invalid and duplicate domains
3237 12         28 my(%addrs_seen, @addrs_filtered);
3238 12         57 foreach my $a (@addrs) {
3239 4 0 0     235 my $domain = $self->{main}->{registryboundaries}->uri_to_domain($a);
3240 0         0 next if !defined $domain || $addrs_seen{lc $domain}++;
3241             push(@addrs_filtered, 'dummy@'.$domain);
3242             }
3243 0         0  
3244             dbg("eval: all '*From' addrs domains (after uri to domain): " .
3245             join(" ", @addrs_filtered));
3246 0         0  
3247             $self->{all_from_addrs_domains} = \@addrs_filtered;
3248 0         0  
3249             return @addrs_filtered;
3250             }
3251              
3252 160     240 0 393 sub all_to_addrs {
3253             my ($self) = @_;
3254 168 100       496  
  89         217  
  85         1460  
3255             if (exists $self->{all_to_addrs}) { return @{$self->{all_to_addrs}}; }
3256 87         185  
3257             my @addrs;
3258              
3259 83         269 # Resent- headers take priority, if present. see bug 672
3260 83 50       383 my $resent = join('', $self->get('Resent-To'), $self->get('Resent-Cc'));
3261 4         21 if ($resent =~ /\S/) {
3262             @addrs = $self->{main}->find_all_addrs_in_line($resent);
3263             } else {
3264             # OK, a fetchmail trick: try to find the recipient address from
3265             # the most recent 3 Received lines. This is required for sendmail,
3266             # since it does not add a helpful header like exim, qmail
3267             # or Postfix do.
3268 79         236 #
3269 83         377 my $rcvd = $self->get('Received');
3270 83         312 $rcvd =~ s/\n[ \t]+/ /gs;
3271             $rcvd =~ s/\n+/\n/gs;
3272 83         421  
  99         256  
3273 83         178 my @rcvdlines = split(/\n/, $rcvd, 4); pop @rcvdlines; # forget last one
3274 83         272 my @rcvdaddrs;
3275 47 100       233 foreach my $line (@rcvdlines) {
  6         33  
3276             if ($line =~ / for (\S+\@\S+);/) { push (@rcvdaddrs, $1); }
3277             }
3278              
3279 83         440 @addrs = $self->{main}->find_all_addrs_in_line (
3280             join('',
3281             join(" ", @rcvdaddrs)."\n",
3282             $self->get('To'), # std
3283             $self->get('Apparently-To'), # sendmail, from envelope
3284             $self->get('Delivered-To'), # Postfix, poss qmail
3285             $self->get('Envelope-Recipients'), # qmail: new-inject(1)
3286             $self->get('Apparently-Resent-To'), # procmailrc manpage
3287             $self->get('X-Envelope-To'), # procmailrc manpage
3288             $self->get('Envelope-To'), # exim
3289             $self->get('X-Delivered-To'), # procmail quick start
3290             $self->get('X-Original-To'), # procmail quick start
3291             $self->get('X-Rcpt-To'), # procmail quick start
3292             $self->get('X-Real-To'), # procmail quick start
3293             $self->get('Cc'))); # std
3294             # those are taken from various sources; thanks to Nancy McGough, who
3295             # noted some in <http://www.ii.com/internet/robots/procmail/qs/#envelope>
3296             }
3297 83         621  
3298 79         287 dbg("eval: all '*To' addrs: " . join(" ", @addrs));
3299 79         1958 $self->{all_to_addrs} = \@addrs;
3300             return @addrs;
3301              
3302             # http://www.cs.tut.fi/~jkorpela/headers.html is useful here, also
3303             # http://www.exim.org/pipermail/exim-users/Week-of-Mon-20001009/021672.html
3304             }
3305              
3306             ###########################################################################
3307              
3308             1;
3309             __END__
3310              
3311             =back
3312              
3313             =head1 SEE ALSO
3314              
3315             Mail::SpamAssassin(3)
3316             spamassassin(1)
3317