File Coverage

blib/lib/Mail/SpamAssassin/PerMsgStatus.pm
Criterion Covered Total %
statement 827 1188 69.6
branch 246 510 48.2
condition 85 217 39.1
subroutine 90 118 76.2
pod 47 89 52.8
total 1295 2122 61.0


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::PerMsgStatus - per-message status (spam or not-spam)
21              
22             =head1 SYNOPSIS
23              
24             my $spamtest = new Mail::SpamAssassin ({
25             'rules_filename' => '/etc/spamassassin.rules',
26             'userprefs_filename' => $ENV{HOME}.'/.spamassassin/user_prefs'
27             });
28             my $mail = $spamtest->parse();
29              
30             my $status = $spamtest->check ($mail);
31              
32             my $rewritten_mail;
33             if ($status->is_spam()) {
34             $rewritten_mail = $status->rewrite_mail ();
35             }
36             ...
37              
38              
39             =head1 DESCRIPTION
40              
41             The Mail::SpamAssassin C<check()> method returns an object of this
42             class. This object encapsulates all the per-message state.
43              
44             =head1 METHODS
45              
46             =over 4
47              
48             =cut
49              
50             package Mail::SpamAssassin::PerMsgStatus;
51              
52 40     40   274 use strict;
  40         90  
  40         1308  
53 40     40   237 use warnings;
  40         106  
  40         1493  
54 40     40   260 use re 'taint';
  40         100  
  40         1424  
55              
56 40     40   286 use Errno qw(ENOENT);
  40         97  
  40         2186  
57 40     40   286 use Time::HiRes qw(time);
  40         552  
  40         415  
58              
59 40     40   5227 use Mail::SpamAssassin::Constants qw(:sa);
  40         101  
  40         5629  
60 40     40   12797 use Mail::SpamAssassin::AsyncLoop;
  40         122  
  40         1264  
61 40     40   922 use Mail::SpamAssassin::Conf;
  40         94  
  40         1096  
62 40     40   233 use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize);
  40         75  
  40         2235  
63 40     40   13100 use Mail::SpamAssassin::Timeout;
  40         99  
  40         1280  
64 40     40   251 use Mail::SpamAssassin::Logger;
  40         76  
  40         42880  
65              
66             our @ISA = qw();
67              
68             # methods defined by the compiled ruleset; deleted in finish_tests()
69             our @TEMPORARY_METHODS;
70              
71             # methods defined by register_plugin_eval_glue(); deleted in finish_tests()
72             our %TEMPORARY_EVAL_GLUE_METHODS;
73              
74             ###########################################################################
75              
76             our %common_tags;
77              
78             BEGIN {
79             %common_tags = (
80              
81             YESNO => sub {
82 34         79 my $pms = shift;
83 34         168 $pms->_get_tag_value_for_yesno(@_);
84             },
85              
86             YESNOCAPS => sub {
87 2         5 my $pms = shift;
88 2         16 uc $pms->_get_tag_value_for_yesno(@_);
89             },
90              
91             SCORE => sub {
92 38         86 my $pms = shift;
93 38         176 $pms->_get_tag_value_for_score(@_);
94             },
95              
96             HITS => sub {
97 0         0 my $pms = shift;
98 0         0 $pms->_get_tag_value_for_score(@_);
99             },
100              
101             REQD => sub {
102 38         89 my $pms = shift;
103 38         133 $pms->_get_tag_value_for_required_score(@_);
104             },
105              
106             VERSION => \&Mail::SpamAssassin::Version,
107              
108 34         143 SUBVERSION => sub { $Mail::SpamAssassin::SUB_VERSION },
109              
110             RULESVERSION => sub {
111 0         0 my $pms = shift;
112 0         0 my $conf = $pms->{conf};
113 0         0 my @fnames;
114             @fnames =
115 0 0       0 keys %{$conf->{update_version}} if $conf->{update_version};
  0         0  
116 0 0       0 @fnames = sort @fnames if @fnames > 1;
117 0         0 join(',', map($conf->{update_version}{$_}, @fnames));
118             },
119              
120             HOSTNAME => sub {
121 38         99 my $pms = shift;
122             $pms->{conf}->{report_hostname} ||
123 38 50       397 Mail::SpamAssassin::Util::fq_hostname();
124             },
125              
126             REMOTEHOSTNAME => sub {
127 0         0 my $pms = shift;
128 0 0       0 $pms->{tag_data}->{'REMOTEHOSTNAME'} || "localhost";
129             },
130              
131             REMOTEHOSTADDR => sub {
132 0         0 my $pms = shift;
133 0 0       0 $pms->{tag_data}->{'REMOTEHOSTADDR'} || "127.0.0.1";
134             },
135              
136             LASTEXTERNALIP => sub {
137 0         0 my $pms = shift;
138 0         0 my $lasthop = $pms->{relays_external}->[0];
139 0 0       0 $lasthop ? $lasthop->{ip} : '';
140             },
141              
142             LASTEXTERNALRDNS => sub {
143 0         0 my $pms = shift;
144 0         0 my $lasthop = $pms->{relays_external}->[0];
145 0 0       0 $lasthop ? $lasthop->{rdns} : '';
146             },
147              
148             LASTEXTERNALHELO => sub {
149 0         0 my $pms = shift;
150 0         0 my $lasthop = $pms->{relays_external}->[0];
151 0 0       0 $lasthop ? $lasthop->{helo} : '';
152             },
153              
154             CONTACTADDRESS => sub {
155 4         9 my $pms = shift;
156 4         31 $pms->{conf}->{report_contact};
157             },
158              
159             BAYES => sub {
160 0         0 my $pms = shift;
161             defined $pms->{bayes_score} ? sprintf("%3.4f", $pms->{bayes_score})
162 0 0       0 : "0.5";
163             },
164              
165 0         0 DATE => sub { Mail::SpamAssassin::Util::time_to_rfc822_date() },
166              
167             STARS => sub {
168 34         67 my $pms = shift;
169 34   50     112 my $arg = (shift || "*");
170 34         112 my $length = int($pms->{score});
171 34 50       129 $length = 50 if $length > 50;
172             # avoid a perl 5.21 warning: "Negative repeat count does nothing"
173 34 100       143 $length > 0 ? $arg x $length : '';
174             },
175              
176             AUTOLEARN => sub {
177 34         84 my $pms = shift;
178 34         168 $pms->get_autolearn_status();
179             },
180              
181             AUTOLEARNSCORE => sub {
182 0         0 my $pms = shift;
183 0         0 $pms->get_autolearn_points();
184             },
185              
186             TESTS => sub {
187 34         83 my $pms = shift;
188 34   50     194 my $arg = (shift || ',');
189 34 100       70 join($arg, sort @{$pms->{test_names_hit}}) || "none";
  34         211  
190             },
191              
192             SUBTESTS => sub {
193 0         0 my $pms = shift;
194 0   0     0 my $arg = (shift || ',');
195 0 0       0 join($arg, sort @{$pms->{subtest_names_hit}}) || "none";
  0         0  
196             },
197              
198             TESTSSCORES => sub {
199 0         0 my $pms = shift;
200 0   0     0 my $arg = (shift || ",");
201 0         0 my $scores = $pms->{conf}->{scores};
202             join($arg, map($_ . "=" . ($scores->{$_} || '0'),
203 0 0 0     0 sort @{$pms->{test_names_hit}})) || "none";
  0         0  
204             },
205              
206             PREVIEW => sub {
207 4         10 my $pms = shift;
208 4         27 $pms->get_content_preview();
209             },
210              
211             REPORT => sub {
212 3         7 my $pms = shift;
213 3   50     29 "\n" . ($pms->{tag_data}->{REPORT} || "");
214             },
215              
216             HEADER => sub {
217 0         0 my $pms = shift;
218 0         0 my $hdr = shift;
219 0 0       0 return if !$hdr;
220 0         0 $pms->get($hdr,undef);
221             },
222              
223             TIMING => sub {
224 0         0 my $pms = shift;
225 0         0 $pms->{main}->timer_report();
226             },
227              
228             ADDEDHEADERHAM => sub {
229 0         0 my $pms = shift;
230 0         0 $pms->_get_added_headers('headers_ham');
231             },
232              
233             ADDEDHEADERSPAM => sub {
234 0         0 my $pms = shift;
235 0         0 $pms->_get_added_headers('headers_spam');
236             },
237              
238             ADDEDHEADER => sub {
239 0         0 my $pms = shift;
240             $pms->_get_added_headers(
241 0 0       0 $pms->{is_spam} ? 'headers_spam' : 'headers_ham');
242             },
243              
244 40     40   522755 );
245             }
246              
247             sub new {
248 142     142 0 649 my $class = shift;
249 142   33     907 $class = ref($class) || $class;
250 142         472 my ($main, $msg, $opts) = @_;
251              
252             my $self = {
253             'main' => $main,
254             'msg' => $msg,
255             'score' => 0,
256             'test_log_msgs' => { },
257             'test_names_hit' => [ ],
258             'subtest_names_hit' => [ ],
259             'spamd_result_log_items' => [ ],
260             'tests_already_hit' => { },
261             'c' => { },
262             'tag_data' => { },
263             'rule_errors' => 0,
264             'disable_auto_learning' => 0,
265             'auto_learn_status' => undef,
266             'auto_learn_force_status' => undef,
267             'conf' => $main->{conf},
268             'async' => Mail::SpamAssassin::AsyncLoop->new($main),
269             'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
270 142         2054 'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
271             };
272             #$self->{main}->{use_rule_subs} = 1;
273              
274             dbg("check: pms new, time limit in %.3f s",
275 142 50       1259 $self->{master_deadline} - time) if $self->{master_deadline};
276              
277 142 50 66     1101 if (defined $opts && $opts->{disable_auto_learning}) {
278 49         205 $self->{disable_auto_learning} = 1;
279             }
280              
281             # used with "mass-check --loghits"
282 142 50       553 if ($self->{main}->{save_pattern_hits}) {
283 0         0 $self->{save_pattern_hits} = 1;
284 0         0 $self->{pattern_hits} = { };
285             }
286              
287 142         308 delete $self->{should_log_rule_hits};
288 142         730 my $dbgcache = would_log('dbg', 'rules');
289 142 50 33     1212 if ($dbgcache || $self->{save_pattern_hits}) {
290 0         0 $self->{should_log_rule_hits} = 1;
291             }
292              
293             # known valid tags that might not get their entry in pms->{tag_data}
294             # in some circumstances
295 142         382 my $tag_data_ref = $self->{tag_data};
296 142         413 foreach (qw(SUMMARY REPORT RBL)) { $tag_data_ref->{$_} = '' }
  426         1449  
297 142         673 foreach (qw(AWL AWLMEAN AWLCOUNT AWLPRESCORE
298             DCCB DCCR DCCREP PYZOR DKIMIDENTITY DKIMDOMAIN
299             BAYESTC BAYESTCLEARNED BAYESTCSPAMMY BAYESTCHAMMY
300             HAMMYTOKENS SPAMMYTOKENS TOKENSUMMARY)) {
301 2414         5539 $tag_data_ref->{$_} = undef; # exist, but undefined
302             }
303              
304 142         519 bless ($self, $class);
305 142         501 $self;
306             }
307              
308             sub DESTROY {
309 141     141   3111560 my ($self) = shift;
310 141         344 local $@;
311 141         389 eval { $self->delete_fulltext_tmpfile() }; # Bug 5808
  141         671  
312             }
313              
314             ###########################################################################
315              
316             =item $status->check ()
317              
318             Runs the SpamAssassin rules against the message pointed to by the object.
319              
320             =cut
321              
322             sub check {
323 97     97 1 258 my ($self) = shift;
324 97         337 my $master_deadline = $self->{master_deadline};
325 97 50       322 if (!$master_deadline) {
326 0         0 $self->check_timed(@_);
327             } else {
328 97         1371 my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
329 97     97   1198 my $err = $t->run(sub { $self->check_timed(@_) });
  97         537  
330 96 50 33     1128 if (time > $master_deadline && !$self->{deadline_exceeded}) {
331 0         0 info("check: exceeded time limit in pms check");
332 0         0 $self->{deadline_exceeded} = 1;
333             }
334             }
335             }
336              
337             sub check_timed {
338 97     97 0 304 my ($self) = @_;
339 97         216 local ($_);
340              
341 97         370 $self->{learned_points} = 0;
342 97         286 $self->{body_only_points} = 0;
343 97         267 $self->{head_only_points} = 0;
344 97         215 $self->{score} = 0;
345              
346             # clear NetSet cache before every check to prevent it growing too large
347 97         326 foreach my $nset_name (qw(internal_networks trusted_networks msa_networks)) {
348 291         902 my $netset = $self->{conf}->{$nset_name};
349 291 50       1347 $netset->ditch_cache() if $netset;
350             }
351              
352 97         911 $self->{main}->call_plugins ("check_start", { permsgstatus => $self });
353              
354             # in order of slowness; fastest first, slowest last.
355             # we do ALL the tests, even if a spam triggers lots of them early on.
356             # this lets us see ludicrously spammish mails (score: 40) etc., which
357             # we can then immediately submit to spamblocking services.
358             #
359             # TODO: change this to do whitelist/blacklists first? probably a plan
360             # NOTE: definitely need AWL stuff last, for regression-to-mean of score
361              
362             # TVD: we may want to do more than just clearing out the headers, but ...
363 97         900 $self->{msg}->delete_header('X-Spam-.*');
364              
365             # Resident Mail::SpamAssassin code will possibly never change score
366             # sets, even if bayes becomes available. So we should do a quick check
367             # to see if we should go from {0,1} to {2,3}. We of course don't need
368             # to do this switch if we're already using bayes ... ;)
369 97         1074 my $set = $self->{conf}->get_score_set();
370 97 0 66     1378 if (($set & 2) == 0 && $self->{main}->{bayes_scanner} && $self->{main}->{bayes_scanner}->is_scan_available() && $self->{conf}->{use_bayes_rules}) {
      66        
      33        
371 0         0 dbg("check: scoreset $set but bayes is available, switching scoresets");
372 0         0 $self->{conf}->set_score_set ($set|2);
373             }
374 97         566 dbg("check: using scoreset $set in M:S:Pms");
375             # The primary check functionality occurs via a plugin call. For more
376             # information, please see: Mail::SpamAssassin::Plugin::Check
377 97 100       482 if (!$self->{main}->call_plugins ("check_main", { permsgstatus => $self }))
378             {
379             # did anything happen? if not, this is fatal
380 1 50       4 if (!$self->{main}->have_plugin("check_main")) {
381 1         21 die "check: no loaded plugin implements 'check_main': cannot scan!\n".
382             "Check that the necessary '.pre' files are in the config directory.\n".
383             "At a minimum, v320.pre loads the Check plugin which is required.\n";
384             }
385             }
386              
387             # delete temporary storage and memory allocation used during checking
388 96         760 $self->delete_fulltext_tmpfile();
389              
390             # now that we've finished checking the mail, clear out this cache
391             # to avoid unforeseen side-effects.
392 96         1256 $self->{c} = { };
393              
394             # Round the score to 3 decimal places to avoid rounding issues
395             # We assume required_score to be properly rounded already.
396             # add 0 to force it back to numeric representation instead of string.
397 96         1739 $self->{score} = (sprintf "%0.3f", $self->{score}) + 0;
398              
399             dbg("check: is spam? score=".$self->{score}.
400 96         974 " required=".$self->{conf}->{required_score});
401 96         658 dbg("check: tests=".$self->get_names_of_tests_hit());
402 96         574 dbg("check: subtests=".$self->get_names_of_subtests_hit());
403 96         502 $self->{is_spam} = $self->is_spam();
404              
405 96         558 $self->{main}->{resolver}->bgabort();
406 96         571 $self->{main}->call_plugins ("check_end", { permsgstatus => $self });
407              
408 96         1742 1;
409             }
410              
411             ###########################################################################
412              
413             =item $status->learn()
414              
415             After a mail message has been checked, this method can be called. If the score
416             is outside a certain range around the threshold, ie. if the message is judged
417             more-or-less definitely spam or definitely non-spam, it will be fed into
418             SpamAssassin's learning systems (currently the naive Bayesian classifier),
419             so that future similar mails will be caught.
420              
421             =cut
422              
423             sub learn {
424 96     96 1 247 my ($self) = shift;
425 96         218 my $master_deadline = $self->{master_deadline};
426 96 50       272 if (!$master_deadline) {
427 0         0 $self->learn_timed(@_);
428             } else {
429 96         516 my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
430 96     96   972 my $err = $t->run(sub { $self->learn_timed(@_) });
  96         691  
431 96 50 33     998 if (time > $master_deadline && !$self->{deadline_exceeded}) {
432 0         0 info("learn: exceeded time limit in pms learn");
433 0         0 $self->{deadline_exceeded} = 1;
434             }
435             }
436             }
437              
438             sub learn_timed {
439 96     96 0 250 my ($self) = @_;
440              
441 96 100 100     711 if (!$self->{conf}->{bayes_auto_learn} ||
      100        
442             !$self->{conf}->{use_bayes} ||
443             $self->{disable_auto_learning})
444             {
445 84         327 $self->{auto_learn_status} = "disabled";
446 84         1195 return;
447             }
448              
449 12         34 my ($isspam, $force_autolearn, $force_autolearn_names, $arrayref);
450 12         90 $arrayref = $self->{main}->call_plugins ("autolearn_discriminator", {
451             permsgstatus => $self
452             });
453              
454 12         35 $isspam = $arrayref->[0];
455 12         29 $force_autolearn = $arrayref->[1];
456 12         23 $force_autolearn_names = $arrayref->[2];
457              
458             #AUTOLEARN_FORCE FLAG INFORMATION
459 12 50 66     61 if (defined $force_autolearn and $force_autolearn > 0) {
460 0         0 $self->{auto_learn_force_status} = "yes";
461 0 0       0 if (defined $force_autolearn_names) {
462 0         0 $self->{auto_learn_force_status} .= " ($force_autolearn_names)";
463             }
464             } else {
465 12         34 $self->{auto_learn_force_status} = "no";
466             }
467              
468 12 100       48 if (!defined $isspam) {
469 10         23 $self->{auto_learn_status} = 'no';
470 10         138 return;
471             }
472              
473              
474 2         7 my $timer = $self->{main}->time_method("learn");
475              
476 2         12 $self->{main}->call_plugins ("autolearn", {
477             permsgstatus => $self,
478             isspam => $isspam
479             });
480              
481             # bug 3704: temporarily override learn's ability to re-learn a message
482 2         14 my $orig_learner = $self->{main}->init_learner({ "no_relearn" => 1 });
483              
484 2         2 my $eval_stat;
485             eval {
486 2         18 my $learnstatus = $self->{main}->learn ($self->{msg}, undef, $isspam, 0);
487 2 50       5 if ($learnstatus->did_learn()) {
488 0 0       0 $self->{auto_learn_status} = $isspam ? "spam" : "ham";
489             }
490             # This must wait until the did_learn call.
491 2         7 $learnstatus->finish();
492 2         14 $self->{main}->finish_learner(); # for now
493              
494 2 50       10 if (exists $self->{main}->{bayes_scanner}) {
495 2         10 $self->{main}->{bayes_scanner}->force_close();
496             }
497 2         17 1;
498 2 50       5 } or do {
499 0 0       0 $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
500             };
501              
502             # reset learner options to their original values
503 2         9 $self->{main}->init_learner($orig_learner);
504              
505 2 50       32 if (defined $eval_stat) {
506 0         0 dbg("learn: auto-learning failed: $eval_stat");
507 0         0 $self->{auto_learn_status} = "failed";
508             }
509             }
510              
511             =item $score = $status->get_autolearn_points()
512              
513             Return the message's score as computed for auto-learning. Certain tests are
514             ignored:
515              
516             - rules with tflags set to 'learn' (the Bayesian rules)
517              
518             - rules with tflags set to 'userconf' (user white/black-listing rules, etc)
519              
520             - rules with tflags set to 'noautolearn'
521              
522             Also note that auto-learning occurs using scores from either scoreset 0 or 1,
523             depending on what scoreset is used during message check. It is likely that the
524             message check and auto-learn scores will be different.
525              
526             =cut
527              
528             sub get_autolearn_points {
529 12     12 1 27 my ($self) = @_;
530 12         71 $self->_get_autolearn_points();
531 12         40 return $self->{autolearn_points};
532             }
533              
534             =item $score = $status->get_head_only_points()
535              
536             Return the message's score as computed for auto-learning, ignoring
537             all rules except for header-based ones.
538              
539             =cut
540              
541             sub get_head_only_points {
542 12     12 1 25 my ($self) = @_;
543 12         34 $self->_get_autolearn_points();
544 12         30 return $self->{head_only_points};
545             }
546              
547             =item $score = $status->get_learned_points()
548              
549             Return the message's score as computed for auto-learning, ignoring
550             all rules except for learning-based ones.
551              
552             =cut
553              
554             sub get_learned_points {
555 12     12 1 29 my ($self) = @_;
556 12         38 $self->_get_autolearn_points();
557 12         29 return $self->{learned_points};
558             }
559              
560             =item $score = $status->get_body_only_points()
561              
562             Return the message's score as computed for auto-learning, ignoring
563             all rules except for body-based ones.
564              
565             =cut
566              
567             sub get_body_only_points {
568 12     12 1 38 my ($self) = @_;
569 12         37 $self->_get_autolearn_points();
570 12         32 return $self->{body_only_points};
571             }
572              
573             =item $score = $status->get_autolearn_force_status()
574              
575             Return whether a message's score included any rules that are flagged as
576             autolearn_force.
577              
578             =cut
579              
580             sub get_autolearn_force_status {
581 12     12 1 34 my ($self) = @_;
582 12         36 $self->_get_autolearn_points();
583 12         31 return $self->{autolearn_force};
584             }
585              
586             =item $rule_names = $status->get_autolearn_force_names()
587              
588             Return a list of comma separated list of rule names if a message's
589             score included any rules that are flagged as autolearn_force.
590              
591             =cut
592              
593             sub get_autolearn_force_names {
594 12     12 1 24 my ($self) = @_;
595 12         29 my ($names);
596              
597 12         38 $self->_get_autolearn_points();
598 12         25 $names = $self->{autolearn_force_names};
599              
600 12 50       40 if (defined $names) {
601             #remove trailing comma
602 0         0 $names =~ s/,$//;
603             } else {
604 12         19 $names = "";
605             }
606              
607 12         30 return $names;
608             }
609              
610             sub _get_autolearn_points {
611 72     72   101 my ($self) = @_;
612              
613 72 100       134 return if (exists $self->{autolearn_points});
614             # ensure it only gets computed once, even if we return early
615 12         38 $self->{autolearn_points} = 0;
616              
617             # This function needs to use use sum($score[scoreset % 2]) not just {score}.
618             # otherwise we shift what we autolearn on and it gets really wierd. - tvd
619 12         48 my $orig_scoreset = $self->{conf}->get_score_set();
620 12         23 my $new_scoreset = $orig_scoreset;
621 12         31 my $scores = $self->{conf}->{scores};
622              
623 12 50       51 if (($orig_scoreset & 2) == 0) { # we don't need to recompute
624 12         50 dbg("learn: auto-learn: currently using scoreset $orig_scoreset");
625             }
626             else {
627 0         0 $new_scoreset = $orig_scoreset & ~2;
628 0         0 dbg("learn: auto-learn: currently using scoreset $orig_scoreset, recomputing score based on scoreset $new_scoreset");
629 0         0 $scores = $self->{conf}->{scoreset}->[$new_scoreset];
630             }
631              
632 12         38 my $tflags = $self->{conf}->{tflags};
633 12         32 my $points = 0;
634              
635             # Just in case this function is called multiple times, clear out the
636             # previous calculated values
637 12         33 $self->{learned_points} = 0;
638 12         23 $self->{body_only_points} = 0;
639 12         21 $self->{head_only_points} = 0;
640 12         29 $self->{autolearn_force} = 0;
641              
642 12         20 foreach my $test (@{$self->{test_names_hit}}) {
  12         61  
643             # According to the documentation, noautolearn, userconf, and learn
644             # rules are ignored for autolearning.
645 19 100       86 if (exists $tflags->{$test}) {
646 6 100       23 next if $tflags->{$test} =~ /\bnoautolearn\b/;
647 5 50       37 next if $tflags->{$test} =~ /\buserconf\b/;
648              
649             # Keep track of the learn points for an additional autolearn check.
650             # Use the original scoreset since it'll be 0 in sets 0 and 1.
651 0 0       0 if ($tflags->{$test} =~ /\blearn\b/) {
652             # we're guaranteed that the score will be defined
653 0         0 $self->{learned_points} += $self->{conf}->{scoreset}->[$orig_scoreset]->{$test};
654 0         0 next;
655             }
656              
657             #IF ANY RULES ARE AUTOLEARN FORCE, SET THAT FLAG
658 0 0       0 if ($tflags->{$test} =~ /\bautolearn_force\b/) {
659 0         0 $self->{autolearn_force}++;
660             #ADD RULE NAME TO LIST
661 0         0 $self->{autolearn_force_names}.="$test,";
662             }
663             }
664              
665             # ignore tests with 0 score (or undefined) in this scoreset
666 13 50       47 next if !$scores->{$test};
667              
668             # Go ahead and add points to the proper locations
669             # Changed logic because in testing, I was getting both head and body. Bug 5503
670 13 100       68 if ($self->{conf}->maybe_header_only ($test)) {
    50          
671 9         31 $self->{head_only_points} += $scores->{$test};
672 9         68 dbg("learn: auto-learn: adding head_only points $scores->{$test}");
673             } elsif ($self->{conf}->maybe_body_only ($test)) {
674 4         13 $self->{body_only_points} += $scores->{$test};
675 4         29 dbg("learn: auto-learn: adding body_only points $scores->{$test}");
676             } else {
677 0         0 dbg("learn: auto-learn: not considered head or body scores: $scores->{$test}");
678             }
679              
680 13         43 $points += $scores->{$test};
681             }
682              
683             # Figure out the final value we'll use for autolearning
684 12         134 $points = (sprintf "%0.3f", $points) + 0;
685 12         114 dbg("learn: auto-learn: message score: ".$self->{score}.", computed score for autolearn: $points");
686              
687 12         40 $self->{autolearn_points} = $points;
688             }
689              
690             ###########################################################################
691              
692             =item $isspam = $status->is_spam ()
693              
694             After a mail message has been checked, this method can be called. It will
695             return 1 for mail determined likely to be spam, 0 if it does not seem
696             spam-like.
697              
698             =cut
699              
700             sub is_spam {
701 99     99 1 282 my ($self) = @_;
702             # changed to test this so sub-tests can ask "is_spam" during a run
703 99         548 return ($self->{score} >= $self->{conf}->{required_score});
704             }
705              
706             ###########################################################################
707              
708             =item $list = $status->get_names_of_tests_hit ()
709              
710             After a mail message has been checked, this method can be called. It will
711             return a comma-separated string, listing all the symbolic test names
712             of the tests which were triggered by the mail.
713              
714             =cut
715              
716             sub get_names_of_tests_hit {
717 97     97 1 291 my ($self) = @_;
718              
719 97         212 return join(',', sort @{$self->{test_names_hit}});
  97         716  
720             }
721              
722             =item $list = $status->get_names_of_tests_hit_with_scores_hash ()
723              
724             After a mail message has been checked, this method can be called. It will
725             return a pointer to a hash for rule & score pairs for all the symbolic
726             test names and individual scores of the tests which were triggered by the mail.
727              
728             =cut
729              
730             sub get_names_of_tests_hit_with_scores_hash {
731 0     0 1 0 my ($self) = @_;
732              
733             #BASED ON CODE FOR TESTSSCORES TAG
734 0         0 my $scores = $self->{conf}->{scores};
735 0         0 my %testsscores;
736 0   0     0 $testsscores{$_} = $scores->{$_} || '0' for @{$self->{test_names_hit}};
  0         0  
737 0         0 return \%testsscores;
738             }
739              
740             =item $list = $status->get_names_of_tests_hit_with_scores ()
741              
742             After a mail message has been checked, this method can be called. It will
743             return a comma-separated string of rule=score pairs for all the symbolic
744             test names and individual scores of the tests which were triggered by the mail.
745              
746             =cut
747              
748             sub get_names_of_tests_hit_with_scores {
749 0     0 1 0 my ($self) = @_;
750              
751             #BASED ON CODE FOR TESTSSCORES TAG
752 0         0 my $scores = $self->{conf}->{scores};
753             return join(',', map($_ . '=' . ($scores->{$_} || '0'),
754 0   0     0 sort @{$self->{test_names_hit}})) || "none";
755             }
756              
757              
758             ###########################################################################
759              
760             =item $list = $status->get_names_of_subtests_hit ()
761              
762             After a mail message has been checked, this method can be called. It will
763             return a comma-separated string, listing all the symbolic test names of the
764             meta-rule sub-tests which were triggered by the mail. Sub-tests are the
765             normally-hidden rules, which score 0 and have names beginning with two
766             underscores, used in meta rules.
767              
768             =cut
769              
770             sub get_names_of_subtests_hit {
771 96     96 1 256 my ($self) = @_;
772              
773 96         262 return join(',', sort @{$self->{subtest_names_hit}});
  96         737  
774             }
775              
776             ###########################################################################
777              
778             =item $num = $status->get_score ()
779              
780             After a mail message has been checked, this method can be called. It will
781             return the message's score.
782              
783             =cut
784              
785             sub get_score {
786 0     0 1 0 my ($self) = @_;
787 0         0 return $self->{score};
788             }
789              
790             # left as backward compatibility
791             sub get_hits {
792 0     0 0 0 my ($self) = @_;
793 0         0 return $self->{score};
794             }
795              
796             ###########################################################################
797              
798             =item $num = $status->get_required_score ()
799              
800             After a mail message has been checked, this method can be called. It will
801             return the score required for a mail to be considered spam.
802              
803             =cut
804              
805             sub get_required_score {
806 0     0 1 0 my ($self) = @_;
807 0         0 return $self->{conf}->{required_score};
808             }
809              
810             # left as backward compatibility
811             sub get_required_hits {
812 0     0 0 0 my ($self) = @_;
813 0         0 return $self->{conf}->{required_score};
814             }
815              
816             ###########################################################################
817              
818             =item $num = $status->get_autolearn_status ()
819              
820             After a mail message has been checked, this method can be called. It will
821             return one of the following strings depending on whether the mail was
822             auto-learned or not: "ham", "no", "spam", "disabled", "failed", "unavailable".
823              
824             It also returns is flagged with auto_learn_force, it will also include the status
825             and the rules hit. For example: "autolearn_force=yes (AUTOLEARNTEST_BODY)"
826              
827             =cut
828              
829             sub get_autolearn_status {
830 34     34 1 82 my ($self) = @_;
831 34   50     139 my ($status) = $self->{auto_learn_status} || "unavailable";
832              
833 34 100       126 if (defined $self->{auto_learn_force_status}) {
834 2         9 $status .= " autolearn_force=".$self->{auto_learn_force_status};
835             }
836              
837 34         83 return $status;
838             }
839              
840             ###########################################################################
841              
842             =item $report = $status->get_report ()
843              
844             Deliver a "spam report" on the checked mail message. This contains details of
845             how many spam detection rules it triggered.
846              
847             The report is returned as a multi-line string, with the lines separated by
848             C<\n> characters.
849              
850             =cut
851              
852             sub get_report {
853 4     4 1 1132 my ($self) = @_;
854              
855 4 50       19 if (!exists $self->{'report'}) {
856 4         9 my $report;
857              
858 4         19 my $timer = $self->{main}->time_method("get_report");
859 4         18 $report = $self->{conf}->{report_template};
860 4   50     26 $report ||= '(no report template found)';
861              
862 4         26 $report = $self->_replace_tags($report);
863              
864 4         329 $report =~ s/\n*$/\n\n/s;
865 4         20 $self->{report} = $report;
866             }
867              
868 4         16 return $self->{report};
869             }
870              
871             ###########################################################################
872              
873             =item $preview = $status->get_content_preview ()
874              
875             Give a "preview" of the content.
876              
877             This is returned as a multi-line string, with the lines separated by C<\n>
878             characters, containing a fully-decoded, safe, plain-text sample of the first
879             few lines of the message body.
880              
881             =cut
882              
883             sub get_content_preview {
884 4     4 1 21 my ($self) = @_;
885              
886 4         12 my $str = '';
887 4         20 my $ary = $self->get_decoded_stripped_body_text_array();
888 4         10 shift @{$ary}; # drop the subject line
  4         11  
889              
890 4         11 my $numlines = 3;
891 4   33     38 while (length ($str) < 200 && @{$ary} && $numlines-- > 0) {
  16   66     106  
892 12         21 $str .= shift @{$ary};
  12         42  
893             }
894 4         9 undef $ary;
895              
896             # in case the last line was huge, trim it back to around 200 chars
897 4         25 local $1;
898 4         12 $str =~ s/^(.{200}).+$/$1 [...]/gm;
899 4         12 chomp ($str); $str .= "\n";
  4         9  
900              
901             # now, some tidy-ups that make things look a bit prettier
902 4         27 $str =~ s/-----Original Message-----.*$//gm;
903 4         18 $str =~ s/This is a multi-part message in MIME format\.//gs;
904 4         24 $str =~ s/[-_*.]{10,}//gs;
905 4         67 $str =~ s/\s+/ /gs;
906              
907             # add "Content preview:" ourselves, so that the text aligns
908             # correctly with the template -- then trim it off. We don't
909             # have to get this *exactly* right, but it's nicer if we
910             # make a bit of an effort ;)
911 4         27 $str = Mail::SpamAssassin::Util::wrap($str, " ", "Content preview: ", 75, 1);
912 4         39 $str =~ s/^Content preview:\s+//gs;
913              
914 4         22 return $str;
915             }
916              
917             ###########################################################################
918              
919             =item $msg = $status->get_message()
920              
921             Return the object representing the message being scanned.
922              
923             =cut
924              
925             sub get_message {
926 0     0 1 0 my ($self) = @_;
927 0         0 return $self->{msg};
928             }
929              
930             ###########################################################################
931              
932             =item $status->rewrite_mail ()
933              
934             Rewrite the mail message. This will at minimum add headers, and at
935             maximum MIME-encapsulate the message text, to reflect its spam or not-spam
936             status. The function will return a scalar of the rewritten message.
937              
938             The actual modifications depend on the configuration (see
939             C<Mail::SpamAssassin::Conf> for more information).
940              
941             The possible modifications are as follows:
942              
943             =over 4
944              
945             =item To:, From: and Subject: modification on spam mails
946              
947             Depending on the configuration, the To: and From: lines can have a
948             user-defined RFC 2822 comment appended for spam mail. The subject line
949             may have a user-defined string prepended to it for spam mail.
950              
951             =item X-Spam-* headers for all mails
952              
953             Depending on the configuration, zero or more headers with names
954             beginning with C<X-Spam-> will be added to mail depending on whether
955             it is spam or ham.
956              
957             =item spam message with report_safe
958              
959             If report_safe is set to true (1), then spam messages are encapsulated
960             into their own message/rfc822 MIME attachment without any modifications
961             being made.
962              
963             If report_safe is set to false (0), then the message will only have the
964             above headers added/modified.
965              
966             =back
967              
968             =cut
969              
970             sub rewrite_mail {
971 34     34 1 336 my ($self) = @_;
972              
973 34         142 my $timer = $self->{main}->time_method("rewrite_mail");
974 34   100     220 my $msg = $self->{msg}->get_mbox_separator() || '';
975              
976 34 100 66     171 if ($self->{is_spam} && $self->{conf}->{report_safe}) {
977 2         20 $msg .= $self->rewrite_report_safe();
978             }
979             else {
980 32         173 $msg .= $self->rewrite_no_report_safe();
981             }
982              
983 34         150 return $msg;
984             }
985              
986             # Make the line endings in the passed string reference appropriate
987             # for the original mail. Callers must note bug 5250: don't rewrite
988             # the message body, since that will corrupt 8bit attachments/MIME parts.
989             #
990             sub _fixup_report_line_endings {
991 36     36   124 my ($self, $strref) = @_;
992 36 50       175 if ($self->{msg}->{line_ending} ne "\n") {
993 0         0 $$strref =~ s/\r?\n/$self->{msg}->{line_ending}/gs;
994             }
995             }
996              
997             sub _get_added_headers {
998 34     34   96 my ($self, $which) = @_;
999 34         81 my $str = '';
1000             # use string appends to put this back together -- I finally benchmarked it.
1001             # join() is 56% of the speed of just using string appends. ;)
1002 34         83 foreach my $hf_ref (@{$self->{conf}->{$which}}) {
  34         179  
1003 168         579 my($hfname, $hfbody) = @$hf_ref;
1004 168         435 my $line = $self->_process_header($hfname,$hfbody);
1005 168         511 $line = $self->qp_encode_header($line);
1006 168         687 $str .= "X-Spam-$hfname: $line\n";
1007             }
1008 34         116 return $str;
1009             };
1010              
1011             # rewrite the message in report_safe mode
1012             # should not be called directly, use rewrite_mail instead
1013             #
1014             sub rewrite_report_safe {
1015 2     2 0 9 my ($self) = @_;
1016              
1017             # This is the original message. We do not want to make any modifications so
1018             # we may recover it if necessary. It will be put into the new message as a
1019             # message/rfc822 MIME part.
1020 2         12 my $original = $self->{msg}->get_pristine();
1021              
1022             # This is the new message.
1023 2         7 my $newmsg = '';
1024              
1025             # the report charset
1026 2         10 my $report_charset = "; charset=iso-8859-1";
1027 2 50       13 if ($self->{conf}->{report_charset}) {
1028 0         0 $report_charset = "; charset=" . $self->{conf}->{report_charset};
1029             }
1030              
1031             # the SpamAssassin report
1032 2         13 my $report = $self->get_report();
1033              
1034             # If there are any wide characters, need to MIME-encode in UTF-8
1035             # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
1036             # we could try converting to that charset if possible
1037 2 50 33     25 unless ($] < 5.008 || utf8::downgrade($report, 1)) {
1038 0         0 $report_charset = "; charset=utf-8";
1039 0         0 utf8::encode($report);
1040             }
1041              
1042             # get original headers, "pristine" if we can do it
1043 2         24 my $from = $self->{msg}->get_pristine_header("From");
1044 2         9 my $to = $self->{msg}->get_pristine_header("To");
1045 2         12 my $cc = $self->{msg}->get_pristine_header("Cc");
1046 2         11 my $subject = $self->{msg}->get_pristine_header("Subject");
1047 2         11 my $msgid = $self->{msg}->get_pristine_header('Message-Id');
1048 2         19 my $date = $self->{msg}->get_pristine_header("Date");
1049              
1050             # It'd be nice to do this with a foreach loop, but with only three
1051             # possibilities right now, it's easier not to...
1052              
1053 2 50       14 if (defined $self->{conf}->{rewrite_header}->{Subject}) {
1054 0 0       0 $subject = "\n" if !defined $subject;
1055 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{Subject});
1056 0         0 $tag =~ s/\n/ /gs; # strip tag's newlines
1057 0         0 $subject =~ s/^(?:\Q${tag}\E )?/${tag} /g; # For some reason the tag may already be there!?
1058             }
1059              
1060 2 50       9 if (defined $self->{conf}->{rewrite_header}->{To}) {
1061 0 0       0 $to = "\n" if !defined $to;
1062 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{To});
1063 0         0 $tag =~ s/\n/ /gs; # strip tag's newlines
1064 0         0 $to =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1065             }
1066              
1067 2 50       8 if (defined $self->{conf}->{rewrite_header}->{From}) {
1068 0 0       0 $from = "\n" if !defined $from;
1069 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{From});
1070 0         0 $tag =~ s/\n+//gs; # strip tag's newlines
1071 0         0 $from =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1072             }
1073              
1074             # add report headers to message
1075 2 50       13 $newmsg .= "From: $from" if defined $from;
1076 2 50       11 $newmsg .= "To: $to" if defined $to;
1077 2 50       8 $newmsg .= "Cc: $cc" if defined $cc;
1078 2 50       12 $newmsg .= "Subject: $subject" if defined $subject;
1079 2 50       20 $newmsg .= "Date: $date" if defined $date;
1080 2 50       23 $newmsg .= "Message-Id: $msgid" if defined $msgid;
1081 2         20 $newmsg .= $self->_get_added_headers('headers_spam');
1082              
1083 2 50       14 if (defined $self->{conf}->{report_safe_copy_headers}) {
1084 2         9 my %already_added = map { $_ => 1 } qw/from to cc subject date message-id/;
  12         44  
1085              
1086 2         6 foreach my $hdr (@{$self->{conf}->{report_safe_copy_headers}}) {
  2         12  
1087 0 0       0 next if exists $already_added{lc $hdr};
1088 0         0 my @hdrtext = $self->{msg}->get_pristine_header($hdr);
1089 0         0 $already_added{lc $hdr}++;
1090              
1091 0 0       0 if (lc $hdr eq "received") { # add Received at the top ...
1092 0         0 my $rhdr = "";
1093 0         0 foreach (@hdrtext) {
1094 0         0 $rhdr .= "$hdr: $_";
1095             }
1096 0         0 $newmsg = "$rhdr$newmsg";
1097             }
1098             else {
1099 0         0 foreach (@hdrtext) {
1100 0         0 $newmsg .= "$hdr: $_";
1101             }
1102             }
1103             }
1104             }
1105              
1106             # jm: add a SpamAssassin Received header to note markup time etc.
1107             # emulates the fetchmail style.
1108             # tvd: do this after report_safe_copy_headers so Received will be done correctly
1109 2         8 $newmsg = "Received: from localhost by " .
1110             Mail::SpamAssassin::Util::fq_hostname() . "\n" .
1111             "\twith SpamAssassin (version " .
1112             Mail::SpamAssassin::Version() . ");\n" .
1113             "\t" . Mail::SpamAssassin::Util::time_to_rfc822_date() . "\n" .
1114             $newmsg;
1115              
1116             # MIME boundary
1117 2         20 my $boundary = "----------=_" . sprintf("%08X.%08X",time,int(rand(2 ** 32)));
1118              
1119             # ensure it's unique, so we can't be attacked this way
1120 2         60 while ($original =~ /^\Q${boundary}\E(?:--)?$/m) {
1121 0         0 $boundary .= "/".sprintf("%08X",int(rand(2 ** 32)));
1122             }
1123              
1124             # determine whether Content-Disposition should be "attachment" or "inline"
1125 2         6 my $disposition;
1126 2         13 my $ct = $self->{msg}->get_header("Content-Type");
1127 2 50 33     73 if (defined $ct && $ct ne '' && $ct !~ m{text/plain}i) {
      33        
1128 2         11 $disposition = "attachment";
1129 2         11 $report .= $self->_replace_tags($self->{conf}->{unsafe_report_template});
1130             # if we wanted to defang the attachment, this would be the place
1131             }
1132             else {
1133 0         0 $disposition = "inline";
1134             }
1135              
1136 2         7 my $type = "message/rfc822";
1137 2 50       18 $type = "text/plain" if $self->{conf}->{report_safe} > 1;
1138              
1139 2         10 my $description = $self->{conf}->{'encapsulated_content_description'};
1140              
1141             # Note: the message should end in blank line since mbox format wants
1142             # blank line at end and messages may be concatenated! In addition, the
1143             # x-spam-type parameter is fixed since we will use it later to recognize
1144             # original messages that can be extracted.
1145 2         37 $newmsg .= <<"EOM";
1146             MIME-Version: 1.0
1147             Content-Type: multipart/mixed; boundary="$boundary"
1148              
1149             This is a multi-part message in MIME format.
1150              
1151             --$boundary
1152             Content-Type: text/plain$report_charset
1153             Content-Disposition: inline
1154             Content-Transfer-Encoding: 8bit
1155              
1156             $report
1157              
1158             --$boundary
1159             Content-Type: $type; x-spam-type=original
1160             Content-Description: $description
1161             Content-Disposition: $disposition
1162             Content-Transfer-Encoding: 8bit
1163              
1164             EOM
1165              
1166 2         12 my $newmsgtrailer = "\n--$boundary--\n\n";
1167              
1168             # now fix line endings in both headers, report_safe body parts,
1169             # and new MIME boundaries and structure
1170 2         12 $self->_fixup_report_line_endings(\$newmsg);
1171 2         6 $self->_fixup_report_line_endings(\$newmsgtrailer);
1172 2         44 $newmsg .= $original.$newmsgtrailer;
1173              
1174 2         33 return $newmsg;
1175             }
1176              
1177             # rewrite the message in non-report_safe mode (just headers)
1178             # should not be called directly, use rewrite_mail instead
1179             #
1180             sub rewrite_no_report_safe {
1181 32     32 0 87 my ($self) = @_;
1182              
1183             # put the pristine headers into an array
1184             # skip the X-Spam- headers, but allow the X-Spam-Prev headers to remain.
1185             # since there may be a missing header/body
1186             #
1187 32         189 my @pristine_headers = split(/^/m, $self->{msg}->get_pristine_header());
1188 32         208 for (my $line = 0; $line <= $#pristine_headers; $line++) {
1189 120 50       488 next unless ($pristine_headers[$line] =~ /^X-Spam-(?!Prev-)/i);
1190 0         0 splice @pristine_headers, $line, 1 while ($pristine_headers[$line] =~ /^(?:X-Spam-(?!Prev-)|[ \t])/i);
1191 0         0 $line--;
1192             }
1193 32         101 my $separator = '';
1194 32 50 33     349 if (@pristine_headers && $pristine_headers[$#pristine_headers] =~ /^\s*$/) {
1195 32         91 $separator = pop @pristine_headers;
1196             }
1197              
1198 32         85 my $addition = 'headers_ham';
1199              
1200 32 50       120 if($self->{is_spam})
1201             {
1202             # special-case: Subject lines. ensure one exists, if we're
1203             # supposed to mark it up.
1204 0         0 my $created_subject = 0;
1205 0         0 my $subject = $self->{msg}->get_pristine_header('Subject');
1206 0 0 0     0 if (!defined($subject) && $self->{is_spam}
      0        
1207             && exists $self->{conf}->{rewrite_header}->{'Subject'})
1208             {
1209 0         0 push(@pristine_headers, "Subject: \n");
1210 0         0 $created_subject = 1;
1211             }
1212              
1213             # Deal with header rewriting
1214 0         0 foreach (@pristine_headers) {
1215             # if we're not going to do a rewrite, skip this header!
1216 0 0       0 next if (!/^(From|Subject|To):/i);
1217 0         0 my $hdr = ucfirst(lc($1));
1218 0 0       0 next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1219              
1220             # pop the original version onto the end of the header array
1221 0 0       0 if ($created_subject) {
1222 0         0 push(@pristine_headers, "X-Spam-Prev-Subject: (nonexistent)\n");
1223             } else {
1224 0         0 push(@pristine_headers, "X-Spam-Prev-$_");
1225             }
1226              
1227             # Figure out the rewrite piece
1228 0         0 my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{$hdr});
1229 0         0 $tag =~ s/\n/ /gs;
1230              
1231             # The tag should be a comment for this header ...
1232 0 0       0 $tag = "($tag)" if ($hdr =~ /^(?:From|To)$/);
1233              
1234 0         0 local $1;
1235 0         0 s/^([^:]+:)[ \t]*(?:\Q${tag}\E )?/$1 ${tag} /i;
1236             }
1237              
1238 0         0 $addition = 'headers_spam';
1239             }
1240              
1241             # Break the pristine header set into two blocks; $new_hdrs_pre is the stuff
1242             # that we want to ensure comes before any SpamAssassin markup headers,
1243             # like the Return-Path header (see bug 3409).
1244             #
1245             # all the rest of the message headers (as left in @pristine_headers), is
1246             # to be placed after the SpamAssassin markup hdrs. Once one of those headers
1247             # is seen, all further headers go into that set; it's assumed that it's an
1248             # old copy of the header, or attempted spoofing, if it crops up halfway
1249             # through the headers.
1250              
1251 32         82 my $new_hdrs_pre = '';
1252 32 50 33     298 if (@pristine_headers && $pristine_headers[0] =~ /^Return-Path:/i) {
1253 0         0 $new_hdrs_pre .= shift(@pristine_headers);
1254 0   0     0 while (@pristine_headers && $pristine_headers[0] =~ /^[ \t]/) {
1255 0         0 $new_hdrs_pre .= shift(@pristine_headers);
1256             }
1257             }
1258 32         195 $new_hdrs_pre .= $self->_get_added_headers($addition);
1259              
1260             # fix up line endings appropriately
1261 32         158 my $newmsg = $new_hdrs_pre . join('',@pristine_headers) . $separator;
1262 32         195 $self->_fixup_report_line_endings(\$newmsg);
1263              
1264 32         164 return $newmsg.$self->{msg}->get_pristine_body();
1265             }
1266              
1267             sub qp_encode_header {
1268 168     168 0 375 my ($self, $text) = @_;
1269              
1270             # return unchanged if there are no 8-bit characters
1271 168 50       596 return $text if $text !~ tr/\x00-\x7F//c;
1272              
1273 0         0 my $cs = 'ISO-8859-1';
1274 0 0       0 if ($self->{report_charset}) {
1275 0         0 $cs = $self->{report_charset};
1276             }
1277              
1278 0         0 my @hexchars = split('', '0123456789abcdef');
1279 0         0 my $ord;
1280 0         0 local $1;
1281 0         0 $text =~ s{([\x80-\xff])}{
1282 0         0 $ord = ord $1;
1283 0         0 '='.$hexchars[($ord & 0xf0) >> 4].$hexchars[$ord & 0x0f]
1284             }ges;
1285              
1286 0         0 $text = '=?'.$cs.'?Q?'.$text.'?=';
1287              
1288 0         0 dbg("markup: encoding header in $cs: $text");
1289 0         0 return $text;
1290             }
1291              
1292             sub _process_header {
1293 168     168   373 my ($self, $hdr_name, $hdr_data) = @_;
1294              
1295 168         412 $hdr_data = $self->_replace_tags($hdr_data);
1296 168         358 $hdr_data =~ s/(?:\r?\n)+$//; # make sure there are no trailing newlines ...
1297              
1298 168 50       433 if ($self->{conf}->{fold_headers}) {
1299 168 50       352 if ($hdr_data =~ /\n/) {
1300 0         0 $hdr_data =~ s/\s*\n\s*/\n\t/g;
1301 0         0 return $hdr_data;
1302             }
1303             else {
1304             # use '!!' instead of ': ' so it doesn't wrap on the space
1305 168         421 my $hdr = "X-Spam-$hdr_name!!$hdr_data";
1306 168         530 $hdr = Mail::SpamAssassin::Util::wrap($hdr, "\t", "", 79, 0, '(?<=[\s,])');
1307 168         397 $hdr =~ s/^\t\n//gm;
1308 168         688 return (split (/!!/, $hdr, 2))[1]; # just return the data part
1309             }
1310             }
1311             else {
1312 0         0 $hdr_data =~ s/\n/ /g; # Can't have newlines in headers, unless folded
1313 0         0 return $hdr_data;
1314             }
1315             }
1316              
1317             sub _replace_tags {
1318 174     174   256 my $self = shift;
1319 174         283 my $text = shift;
1320              
1321             # default to leaving the original string in place, if we cannot find
1322             # a tag for it (bug 4793)
1323 174         632 local($1,$2,$3);
1324 174         1298 $text =~ s{(_(\w+?)(?:\((.*?)\))?_)}{
1325 430         1021 my $full = $1;
1326 430         690 my $tag = $2;
1327 430         529 my $result;
1328 430 50       749 if ($tag =~ /^ADDEDHEADER(?:HAM|SPAM|)\z/) {
1329             # Bug 6278: break infinite recursion through _get_added_headers and
1330             # _get_tag on an attempt to use such tag in add_header template
1331             } else {
1332 430         966 $result = $self->get_tag_raw($tag,$3);
1333 430 50       839 $result = join(' ',@$result) if ref $result eq 'ARRAY';
1334             }
1335 430 50       2170 defined $result ? $result : $full;
1336             }ge;
1337              
1338 174         713 return $text;
1339             }
1340              
1341             ###########################################################################
1342              
1343             # public API for plugins
1344              
1345             =item $status->action_depends_on_tags($tags, $code, @args)
1346              
1347             Enqueue the supplied subroutine reference C<$code>, to become runnable when
1348             all the specified tags become available. The C<$tags> may be a simple
1349             scalar - a tag name, or a listref of tag names. The subroutine C<&$code>
1350             when called will be passed a C<permessagestatus> object as its first argument,
1351             followed by the supplied (optional) list C<@args> .
1352              
1353             =cut
1354              
1355             sub action_depends_on_tags {
1356 0     0 1 0 my($self, $tags, $code, @args) = @_;
1357              
1358 0 0       0 ref $code eq 'CODE'
1359             or die "action_depends_on_tags: argument must be a subroutine ref";
1360              
1361             # tag names on which the given action depends
1362 0 0       0 my @dep_tags = !ref $tags ? uc $tags : map(uc($_),@$tags);
1363              
1364             # @{$self->{tagrun_subs}} list of all submitted subroutines
1365             # @{$self->{tagrun_actions}{$tag}} bitmask of action indices blocked by tag
1366             # $self->{tagrun_tagscnt}[$action_ind] count of tags still pending
1367              
1368             # store action details, obtain its index
1369 0         0 push(@{$self->{tagrun_subs}}, [$code,@args]);
  0         0  
1370 0         0 my $action_ind = $#{$self->{tagrun_subs}};
  0         0  
1371              
1372             # list dependency tag names which are not already satistied
1373             my @blocking_tags =
1374 0   0     0 grep(!defined $self->{tag_data}{$_} || $self->{tag_data}{$_} eq '',
1375             @dep_tags);
1376              
1377 0         0 $self->{tagrun_tagscnt}[$action_ind] = scalar @blocking_tags;
1378 0         0 $self->{tagrun_actions}{$_}[$action_ind] = 1 for @blocking_tags;
1379              
1380 0 0       0 if (@blocking_tags) {
1381 0         0 dbg("check: tagrun - action %s blocking on tags %s",
1382             $action_ind, join(', ',@blocking_tags));
1383             } else {
1384 0         0 dbg("check: tagrun - tag %s was ready, action %s runnable immediately: %s",
1385             join(', ',@dep_tags), $action_ind, join(', ',$code,@args));
1386 0         0 &$code($self, @args);
1387             }
1388             }
1389              
1390             # tag_is_ready() will be called by set_tag(), indicating that a given
1391             # tag just received its value, possibly unblocking an action routine
1392             # as declared by action_depends_on_tags().
1393             #
1394             # Well-behaving plugins should call set_tag() once when a tag is fully
1395             # assembled and ready. Multiple calls to set the same tag value are handled
1396             # gracefully, but may result in premature activation of a pending action.
1397             # Setting tag values by plugins should not be done directly but only through
1398             # the public API set_tag(), otherwise a pending action release may be missed.
1399             #
1400             sub tag_is_ready {
1401 658     658 0 1212 my($self, $tag) = @_;
1402 658         993 $tag = uc $tag;
1403              
1404 658 50       1413 if (would_log('dbg', 'check')) {
1405 0         0 my $tag_val = $self->{tag_data}{$tag};
1406 0 0       0 dbg("check: tagrun - tag %s is now ready, value: %s",
    0          
1407             $tag, !defined $tag_val ? '<UNDEF>'
1408             : ref $tag_val ne 'ARRAY' ? $tag_val
1409             : 'ARY:[' . join(',',@$tag_val) . ']' );
1410             }
1411 658 50       2022 if (ref $self->{tagrun_actions}{$tag}) { # any action blocking on this tag?
1412 0         0 my $action_ind = 0;
1413 0         0 foreach my $action_pending (@{$self->{tagrun_actions}{$tag}}) {
  0         0  
1414 0 0       0 if ($action_pending) {
1415 0         0 $self->{tagrun_actions}{$tag}[$action_ind] = 0;
1416 0 0       0 if ($self->{tagrun_tagscnt}[$action_ind] <= 0) {
    0          
1417             # should not happen, warn and ignore
1418             warn "tagrun error: count for $action_ind is ".
1419 0         0 $self->{tagrun_tagscnt}[$action_ind]."\n";
1420             } elsif (! --($self->{tagrun_tagscnt}[$action_ind])) {
1421 0         0 my($code,@args) = @{$self->{tagrun_subs}[$action_ind]};
  0         0  
1422 0         0 dbg("check: tagrun - tag %s unblocking the action %s: %s",
1423             $tag, $action_ind, join(', ',$code,@args));
1424 0         0 &$code($self, @args);
1425             }
1426             }
1427 0         0 $action_ind++;
1428             }
1429             }
1430             }
1431              
1432             # debugging aid: show actions that are still pending, waiting for their
1433             # tags to receive a value
1434             #
1435             sub report_unsatisfied_actions {
1436 124     124 0 334 my($self) = @_;
1437 124         296 my @tags;
1438 124 100       874 @tags = keys %{$self->{tagrun_actions}} if ref $self->{tagrun_actions};
  98         376  
1439 124         469 for my $tag (@tags) {
1440             my @pending_actions = grep($self->{tagrun_actions}{$tag}[$_],
1441 0         0 (0 .. $#{$self->{tagrun_actions}{$tag}}));
  0         0  
1442 0 0       0 dbg("check: tagrun - tag %s is still blocking action %s",
1443             $tag, join(', ', @pending_actions)) if @pending_actions;
1444             }
1445             }
1446              
1447             =item $status->set_tag($tagname, $value)
1448              
1449             Set a template tag, as used in C<add_header>, report templates, etc.
1450             This API is intended for use by plugins. Tag names will be converted
1451             to an all-uppercase representation internally.
1452              
1453             C<$value> can be a simple scalar (string or number), or a reference to an
1454             array, in which case the public method get_tag will join array elements
1455             using a space as a separator, returning a single string for backward
1456             compatibility.
1457              
1458             C<$value> can also be a subroutine reference, which will be evaluated
1459             each time the template is expanded. The first argument passed by get_tag
1460             to a called subroutine will be a PerMsgStatus object (this module's object),
1461             followed by optional arguments provided a caller to get_tag.
1462              
1463             Note that perl supports closures, which means that variables set in the
1464             caller's scope can be accessed inside this C<sub>. For example:
1465              
1466             my $text = "hello world!";
1467             $status->set_tag("FOO", sub {
1468             my $pms = shift;
1469             return $text;
1470             });
1471              
1472             See C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details
1473             on how template tags are used.
1474              
1475             C<undef> will be returned if a tag by that name has not been defined.
1476              
1477             =cut
1478              
1479             sub set_tag {
1480 658     658 1 1452 my($self,$tag,$val) = @_;
1481 658         2370 $self->{tag_data}->{uc $tag} = $val;
1482 658         1492 $self->tag_is_ready($tag);
1483             }
1484              
1485             # public API for plugins
1486              
1487             =item $string = $status->get_tag($tagname)
1488              
1489             Get the current value of a template tag, as used in C<add_header>, report
1490             templates, etc. This API is intended for use by plugins. Tag names will be
1491             converted to an all-uppercase representation internally. See
1492             C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details on
1493             tags.
1494              
1495             C<undef> will be returned if a tag by that name has not been defined.
1496              
1497             =cut
1498              
1499             sub get_tag {
1500 9     9 1 69 my($self, $tag, @args) = @_;
1501              
1502 9 50       35 return if !defined $tag;
1503 9         30 $tag = uc $tag;
1504 9         17 my $data;
1505 9 100       76 if (exists $common_tags{$tag}) {
    50          
1506             # tag data from traditional pre-defined tag subroutines
1507 3         9 $data = $common_tags{$tag};
1508 3 50       18 $data = $data->($self,@args) if ref $data eq 'CODE';
1509 3 50       23 $data = join(' ',@$data) if ref $data eq 'ARRAY';
1510 3 50       13 $data = "" if !defined $data;
1511             } elsif (exists $self->{tag_data}->{$tag}) {
1512             # tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1513 6         20 $data = $self->{tag_data}->{$tag};
1514 6 50       26 $data = $data->($self,@args) if ref $data eq 'CODE';
1515 6 50       25 $data = join(' ',@$data) if ref $data eq 'ARRAY';
1516 6 50       43 $data = "" if !defined $data;
1517             }
1518 9         39 return $data;
1519             }
1520              
1521             =item $string = $status->get_tag_raw($tagname, @args)
1522              
1523             Similar to C<get_tag>, but keeps a tag name unchanged (does not uppercase it),
1524             and does not convert arrayref tag values into a single string.
1525              
1526             =cut
1527              
1528             sub get_tag_raw {
1529 430     430 1 997 my($self, $tag, @args) = @_;
1530              
1531 430 50       835 return if !defined $tag;
1532 430         553 my $data;
1533 430 100       1141 if (exists $common_tags{$tag}) {
    50          
1534             # tag data from traditional pre-defined tag subroutines
1535 362         654 $data = $common_tags{$tag};
1536 362 50       1345 $data = $data->($self,@args) if ref $data eq 'CODE';
1537 362 50       768 $data = "" if !defined $data;
1538             } elsif (exists $self->{tag_data}->{$tag}) {
1539             # tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1540 68         165 $data = $self->{tag_data}->{$tag};
1541 68 50       216 $data = $data->($self,@args) if ref $data eq 'CODE';
1542 68 50       241 $data = "" if !defined $data;
1543             }
1544 430         887 return $data;
1545             }
1546              
1547             ###########################################################################
1548              
1549             # public API for plugins
1550              
1551             =item $status->set_spamd_result_item($subref)
1552              
1553             Set an entry for the spamd result log line. C<$subref> should be a code
1554             reference for a subroutine which will return a string in C<'name=VALUE'>
1555             format, similar to the other entries in the spamd result line:
1556              
1557             Jul 17 14:10:47 radish spamd[16670]: spamd: result: Y 22 - ALL_NATURAL,
1558             DATE_IN_FUTURE_03_06,DIET_1,DRUGS_ERECTILE,DRUGS_PAIN,
1559             TEST_FORGED_YAHOO_RCVD,TEST_INVALID_DATE,TEST_NOREALNAME,
1560             TEST_NORMAL_HTTP_TO_IP,UNDISC_RECIPS scantime=0.4,size=3138,user=jm,
1561             uid=1000,required_score=5.0,rhost=localhost,raddr=127.0.0.1,
1562             rport=33153,mid=<9PS291LhupY>,autolearn=spam
1563              
1564             C<name> and C<VALUE> must not contain C<=> or C<,> characters, as it
1565             is important that these log lines are easy to parse.
1566              
1567             The code reference will be called by spamd after the message has been scanned,
1568             and the C<PerMsgStatus::check()> method has returned.
1569              
1570             =cut
1571              
1572             sub set_spamd_result_item {
1573 0     0 1 0 my ($self, $ref) = @_;
1574 0         0 push @{$self->{spamd_result_log_items}}, $ref;
  0         0  
1575             }
1576              
1577             # called by spamd
1578             sub get_spamd_result_log_items {
1579 0     0 0 0 my ($self) = @_;
1580 0         0 my @ret;
1581 0         0 foreach my $ref (@{$self->{spamd_result_log_items}}) {
  0         0  
1582 0         0 push @ret, &$ref;
1583             }
1584 0         0 return @ret;
1585             }
1586              
1587             ###########################################################################
1588              
1589             sub _get_tag_value_for_yesno {
1590 36     36   123 my($self, $arg) = @_;
1591 36         84 my($arg_spam, $arg_ham);
1592 36 50       124 ($arg_spam, $arg_ham) = split(/,/, $arg, 2) if defined $arg;
1593 36 50       183 return $self->{is_spam} ? (defined $arg_spam ? $arg_spam : 'Yes')
    50          
    100          
1594             : (defined $arg_ham ? $arg_ham : 'No');
1595             }
1596              
1597             sub _get_tag_value_for_score {
1598 38     38   325 my ($self, $pad) = @_;
1599              
1600 38         357 my $score = sprintf("%2.1f", $self->{score});
1601 38         155 my $rscore = $self->_get_tag_value_for_required_score();
1602              
1603             #Change due to bug 6419 to use Util function for consistency with spamd
1604             #and PerMessageStatus
1605 38         251 $score = Mail::SpamAssassin::Util::get_tag_value_for_score($score, $rscore, $self->{is_spam});
1606              
1607             #$pad IS PROVIDED BY THE _SCORE(PAD)_ tag
1608 38 50 33     158 if (defined $pad && $pad =~ /^(0+| +)$/) {
1609 0         0 my $count = length($1) + 3 - length($score);
1610 0 0       0 $score = (substr($pad, 0, $count) . $score) if $count > 0;
1611             }
1612 38         102 return $score;
1613              
1614             }
1615              
1616             sub _get_tag_value_for_required_score {
1617 76     76   135 my $self = shift;
1618 76         409 return sprintf("%2.1f", $self->{conf}->{required_score});
1619             }
1620              
1621              
1622             ###########################################################################
1623              
1624             =item $status->finish ()
1625              
1626             Indicate that this C<$status> object is finished with, and can be destroyed.
1627              
1628             If you are using SpamAssassin in a persistent environment, or checking many
1629             mail messages from one C<Mail::SpamAssassin> factory, this method should be
1630             called to ensure Perl's garbage collection will clean up old status objects.
1631              
1632             =cut
1633              
1634             sub finish {
1635 124     124 1 27284 my ($self) = @_;
1636              
1637 124         768 $self->{main}->call_plugins ("per_msg_finish", {
1638             permsgstatus => $self
1639             });
1640              
1641 124         774 $self->report_unsatisfied_actions;
1642              
1643             # Delete out all of the members of $self. This will remove any direct
1644             # circular references and let the memory get reclaimed while also being more
1645             # efficient than a foreach() loop over the keys.
1646 124         256 %{$self} = ();
  124         3226  
1647             }
1648              
1649             sub finish_tests {
1650 0     0 0 0 my ($conf) = @_;
1651 0         0 foreach my $method (@TEMPORARY_METHODS) {
1652 0 0       0 if (defined &{$method}) {
  0         0  
1653 0         0 undef &{$method};
  0         0  
1654             }
1655             }
1656 0         0 @TEMPORARY_METHODS = (); # clear for next time
1657 0         0 %TEMPORARY_EVAL_GLUE_METHODS = ();
1658             }
1659              
1660              
1661             =item $name = $status->get_current_eval_rule_name()
1662              
1663             Return the name of the currently-running eval rule. C<undef> is
1664             returned if no eval rule is currently being run. Useful for plugins
1665             to determine the current rule name while inside an eval test function
1666             call.
1667              
1668             =cut
1669              
1670             sub get_current_eval_rule_name {
1671 0     0 1 0 my ($self) = @_;
1672 0         0 return $self->{current_rule_name};
1673             }
1674              
1675             ###########################################################################
1676              
1677             sub extract_message_metadata {
1678 96     96 0 593 my ($self) = @_;
1679              
1680 96         457 my $timer = $self->{main}->time_method("extract_message_metadata");
1681 96         732 $self->{msg}->extract_message_metadata($self);
1682              
1683 96         606 foreach my $item (qw(
1684             relays_trusted relays_trusted_str num_relays_trusted
1685             relays_untrusted relays_untrusted_str num_relays_untrusted
1686             relays_internal relays_internal_str num_relays_internal
1687             relays_external relays_external_str num_relays_external
1688             num_relays_unparseable last_trusted_relay_index
1689             last_internal_relay_index
1690             ))
1691             {
1692 1440         3270 $self->{$item} = $self->{msg}->{metadata}->{$item};
1693             }
1694              
1695             # TODO: International domain names (UTF-8) must be converted to
1696             # ASCII-compatible encoding (ACE) for the purpose of setting the
1697             # SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
1698             # (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
1699             #
1700 96         306 { local $1;
  96         327  
1701 96         782 my $addr = $self->get('EnvelopeFrom:addr', undef);
1702             # collect a FQDN, ignoring potential trailing WSP
1703 96 100 66     396 if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1704 1         5 $self->set_tag('SENDERDOMAIN', lc $1);
1705             }
1706             # TODO: the get ':addr' only returns the first address; this should be
1707             # augmented to be able to return all addresses in a header field, multiple
1708             # addresses in a From header field are allowed according to RFC 5322
1709 96         312 $addr = $self->get('From:addr', undef);
1710 96 100 66     1019 if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1711 57         582 $self->set_tag('AUTHORDOMAIN', lc $1);
1712             }
1713             }
1714              
1715 96         600 $self->set_tag('RELAYSTRUSTED', $self->{relays_trusted_str});
1716 96         302 $self->set_tag('RELAYSUNTRUSTED', $self->{relays_untrusted_str});
1717 96         343 $self->set_tag('RELAYSINTERNAL', $self->{relays_internal_str});
1718 96         542 $self->set_tag('RELAYSEXTERNAL', $self->{relays_external_str});
1719 96         321 my $lasthop = $self->{relays_external}->[0];
1720 96 100       300 if ($lasthop) {
1721 14         51 $self->set_tag('LASTEXTERNALIP', $lasthop->{ip});
1722             }
1723 96         590 $self->set_tag('LANGUAGES', $self->{msg}->get_metadata("X-Languages"));
1724              
1725             # This should happen before we get called, but just in case.
1726 96 100       452 if (!defined $self->{msg}->{metadata}->{html}) {
1727 95         516 $self->get_decoded_stripped_body_text_array();
1728             }
1729 96         391 $self->{html} = $self->{msg}->{metadata}->{html};
1730              
1731             # allow plugins to add more metadata, read the stuff that's there, etc.
1732 96         628 $self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
1733             }
1734              
1735             ###########################################################################
1736              
1737             =item $status->get_decoded_body_text_array ()
1738              
1739             Returns the message body, with B<base64> or B<quoted-printable> encodings
1740             decoded, and non-text parts or non-inline attachments stripped.
1741              
1742             It is returned as an array of strings, with each string representing
1743             one newline-separated line of the body.
1744              
1745             =cut
1746              
1747             sub get_decoded_body_text_array {
1748 98     98 1 414 return $_[0]->{msg}->get_decoded_body_text_array();
1749             }
1750              
1751             =item $status->get_decoded_stripped_body_text_array ()
1752              
1753             Returns the message body, decoded (as described in
1754             get_decoded_body_text_array()), with HTML rendered, and with whitespace
1755             normalized.
1756              
1757             It will always render text/html, and will use a heuristic to determine if other
1758             text/* parts should be considered text/html.
1759              
1760             It is returned as an array of strings, with each string representing one
1761             'paragraph'. Paragraphs, in plain-text mails, are double-newline-separated
1762             blocks of multi-line text.
1763              
1764             =cut
1765              
1766             sub get_decoded_stripped_body_text_array {
1767 327     327 1 1345 return $_[0]->{msg}->get_rendered_body_text_array();
1768             }
1769              
1770             ###########################################################################
1771              
1772             =item $status->get (header_name [, default_value])
1773              
1774             Returns a message header, pseudo-header, real name or address.
1775             C<header_name> is the name of a mail header, such as 'Subject', 'To',
1776             etc. If C<default_value> is given, it will be used if the requested
1777             C<header_name> does not exist.
1778              
1779             Appending C<:raw> to the header name will inhibit decoding of quoted-printable
1780             or base-64 encoded strings.
1781              
1782             Appending a modifier C<:addr> to a header field name will cause everything
1783             except the first email address to be removed from the header field. It is
1784             mainly applicable to header fields 'From', 'Sender', 'To', 'Cc' along with
1785             their 'Resent-*' counterparts, and the 'Return-Path'. For example, all of
1786             the following will result in "example@foo":
1787              
1788             =over 4
1789              
1790             =item example@foo
1791              
1792             =item example@foo (Foo Blah)
1793              
1794             =item example@foo, example@bar
1795              
1796             =item display: example@foo (Foo Blah), example@bar ;
1797              
1798             =item Foo Blah <example@foo>
1799              
1800             =item "Foo Blah" <example@foo>
1801              
1802             =item "'Foo Blah'" <example@foo>
1803              
1804             =back
1805              
1806             Appending a modifier C<:name> to a header field name will cause everything
1807             except the first display name to be removed from the header field. It is
1808             mainly applicable to header fields containing a single mail address: 'From',
1809             'Sender', along with their 'Resent-From' and 'Resent-Sender' counterparts.
1810             For example, all of the following will result in "Foo Blah". One level of
1811             single quotes is stripped too, as it is often seen.
1812              
1813             =over 4
1814              
1815             =item example@foo (Foo Blah)
1816              
1817             =item example@foo (Foo Blah), example@bar
1818              
1819             =item display: example@foo (Foo Blah), example@bar ;
1820              
1821             =item Foo Blah <example@foo>
1822              
1823             =item "Foo Blah" <example@foo>
1824              
1825             =item "'Foo Blah'" <example@foo>
1826              
1827             =back
1828              
1829             There are several special pseudo-headers that can be specified:
1830              
1831             =over 4
1832              
1833             =item C<ALL> can be used to mean the text of all the message's headers.
1834              
1835             =item C<ALL-TRUSTED> can be used to mean the text of all the message's headers
1836             that could only have been added by trusted relays.
1837              
1838             =item C<ALL-INTERNAL> can be used to mean the text of all the message's headers
1839             that could only have been added by internal relays.
1840              
1841             =item C<ALL-UNTRUSTED> can be used to mean the text of all the message's
1842             headers that may have been added by untrusted relays. To make this
1843             pseudo-header more useful for header rules the 'Received' header that was added
1844             by the last trusted relay is included, even though it can be trusted.
1845              
1846             =item C<ALL-EXTERNAL> can be used to mean the text of all the message's headers
1847             that may have been added by external relays. Like C<ALL-UNTRUSTED> the
1848             'Received' header added by the last internal relay is included.
1849              
1850             =item C<ToCc> can be used to mean the contents of both the 'To' and 'Cc'
1851             headers.
1852              
1853             =item C<EnvelopeFrom> is the address used in the 'MAIL FROM:' phase of the SMTP
1854             transaction that delivered this message, if this data has been made available
1855             by the SMTP server.
1856              
1857             =item C<MESSAGEID> is a symbol meaning all Message-Id's found in the message;
1858             some mailing list software moves the real 'Message-Id' to 'Resent-Message-Id'
1859             or 'X-Message-Id', then uses its own one in the 'Message-Id' header. The value
1860             returned for this symbol is the text from all 3 headers, separated by newlines.
1861              
1862             =item C<X-Spam-Relays-Untrusted> is the generated metadata of untrusted relays
1863             the message has passed through
1864              
1865             =item C<X-Spam-Relays-Trusted> is the generated metadata of trusted relays
1866             the message has passed through
1867              
1868             =back
1869              
1870             =cut
1871              
1872             # only uses two arguments, ignores $defval
1873             sub _get {
1874 3368     3368   5933 my ($self, $request) = @_;
1875              
1876 3368         3765 my $result;
1877 3368         4131 my $getaddr = 0;
1878 3368         3733 my $getname = 0;
1879 3368         3796 my $getraw = 0;
1880              
1881             # special queries - process and strip modifiers
1882 3368 100       7095 if (index($request,':') >= 0) { # triage
1883 596         1455 local $1;
1884 596         3726 while ($request =~ s/:([^:]*)//) {
1885 596 100       2170 if ($1 eq 'raw') { $getraw = 1 }
  12 50       42  
    0          
1886 584         1843 elsif ($1 eq 'addr') { $getaddr = $getraw = 1 }
1887 0         0 elsif ($1 eq 'name') { $getname = 1 }
1888             }
1889             }
1890 3368         5368 my $request_lc = lc $request;
1891              
1892             # ALL: entire pristine or semi-raw headers
1893 3368 100       16354 if ($request eq 'ALL') {
    50          
    50          
    50          
    50          
    100          
    50          
    50          
    50          
    50          
    100          
    100          
1894             $result = $getraw ? $self->{msg}->get_pristine_header()
1895 11 50       78 : $self->{msg}->get_all_headers(1);
1896             }
1897             # ALL-TRUSTED: entire trusted raw headers
1898             elsif ($request eq 'ALL-TRUSTED') {
1899             # '+1' since we added the received header even though it's not considered
1900             # trusted, so we know that those headers can be trusted too
1901             return $self->get_all_hdrs_in_rcvd_index_range(
1902 0         0 undef, $self->{last_trusted_relay_index}+1);
1903             }
1904             # ALL-INTERNAL: entire internal raw headers
1905             elsif ($request eq 'ALL-INTERNAL') {
1906             # '+1' for the same reason as in ALL-TRUSTED above
1907             return $self->get_all_hdrs_in_rcvd_index_range(
1908 0         0 undef, $self->{last_internal_relay_index}+1);
1909             }
1910             # ALL-UNTRUSTED: entire untrusted raw headers
1911             elsif ($request eq 'ALL-UNTRUSTED') {
1912             # '+1' for the same reason as in ALL-TRUSTED above
1913             return $self->get_all_hdrs_in_rcvd_index_range(
1914 0         0 $self->{last_trusted_relay_index}+1, undef);
1915             }
1916             # ALL-EXTERNAL: entire external raw headers
1917             elsif ($request eq 'ALL-EXTERNAL') {
1918             # '+1' for the same reason as in ALL-TRUSTED above
1919             return $self->get_all_hdrs_in_rcvd_index_range(
1920 0         0 $self->{last_internal_relay_index}+1, undef);
1921             }
1922             # EnvelopeFrom: the SMTP MAIL FROM: address
1923             elsif ($request_lc eq "\LEnvelopeFrom") {
1924 110         720 $result = $self->get_envelope_from();
1925             }
1926             # untrusted relays list, as string
1927             elsif ($request_lc eq "\LX-Spam-Relays-Untrusted") {
1928 0         0 $result = $self->{relays_untrusted_str};
1929             }
1930             # trusted relays list, as string
1931             elsif ($request_lc eq "\LX-Spam-Relays-Trusted") {
1932 0         0 $result = $self->{relays_trusted_str};
1933             }
1934             # external relays list, as string
1935             elsif ($request_lc eq "\LX-Spam-Relays-External") {
1936 0         0 $result = $self->{relays_external_str};
1937             }
1938             # internal relays list, as string
1939             elsif ($request_lc eq "\LX-Spam-Relays-Internal") {
1940 0         0 $result = $self->{relays_internal_str};
1941             }
1942             # ToCc: the combined recipients list
1943             elsif ($request_lc eq "\LToCc") {
1944 77         372 $result = join("\n", $self->{msg}->get_header('To', $getraw));
1945 77 100       309 if ($result ne '') {
1946 5         17 chomp $result;
1947 5 50       30 $result .= ", " if $result =~ /\S/;
1948             }
1949 77         289 $result .= join("\n", $self->{msg}->get_header('Cc', $getraw));
1950 77 100       390 $result = undef if $result eq '';
1951             }
1952             # MESSAGEID: handle lists which move the real message-id to another
1953             # header for resending.
1954             elsif ($request eq 'MESSAGEID') {
1955 38 50       358 $result = join("\n", grep { defined($_) && $_ ne '' }
1956             $self->{msg}->get_header('X-Message-Id', $getraw),
1957             $self->{msg}->get_header('Resent-Message-Id', $getraw),
1958             $self->{msg}->get_header('X-Original-Message-ID', $getraw),
1959 77         374 $self->{msg}->get_header('Message-Id', $getraw));
1960             }
1961             # a conventional header
1962             else {
1963             my @results = $getraw ? $self->{msg}->raw_header($request)
1964 3093 100       8696 : $self->{msg}->get_header($request);
1965             # dbg("message: get(%s)%s = %s",
1966             # $request, $getraw?'raw':'', join(", ",@results));
1967 3093 100       4926 if (@results) {
1968 330         1093 $result = join('', @results);
1969             } else { # metadata
1970 2763         6353 $result = $self->{msg}->get_metadata($request);
1971             }
1972             }
1973              
1974             # special queries
1975 3368 100 66     7739 if (defined $result && ($getaddr || $getname)) {
      66        
1976 132         436 local $1;
1977 132         383 $result =~ s/^[^:]+:(.*);\s*$/$1/gs; # 'undisclosed-recipients: ;'
1978 132         818 $result =~ s/\s+/ /g; # reduce whitespace
1979 132         580 $result =~ s/^\s+//; # leading whitespace
1980 132         721 $result =~ s/\s+$//; # trailing whitespace
1981              
1982 132 50       391 if ($getaddr) {
    0          
1983             # Get the email address out of the header
1984             # All of these should result in "jm@foo":
1985             # jm@foo
1986             # jm@foo (Foo Blah)
1987             # jm@foo, jm@bar
1988             # display: jm@foo (Foo Blah), jm@bar ;
1989             # Foo Blah <jm@foo>
1990             # "Foo Blah" <jm@foo>
1991             # "'Foo Blah'" <jm@foo>
1992             #
1993             # strip out the (comments)
1994 132         315 $result =~ s/\s*\(.*?\)//g;
1995             # strip out the "quoted text", unless it's the only thing in the string
1996 132 100       412 if ($result !~ /^".*"$/) {
1997 131         402 $result =~ s/(?<!<)"[^"]*"(?!\@)//g; #" emacs
1998             }
1999             # Foo Blah <jm@xxx> or <jm@xxx>
2000 132         349 local $1;
2001 132         364 $result =~ s/^[^"<]*?<(.*?)>.*$/$1/;
2002             # multiple addresses on one line? remove all but first
2003 132         375 $result =~ s/,.*$//;
2004             }
2005             elsif ($getname) {
2006             # Get the display name out of the header
2007             # All of these should result in "Foo Blah":
2008             #
2009             # jm@foo (Foo Blah)
2010             # (Foo Blah) jm@foo
2011             # jm@foo (Foo Blah), jm@bar
2012             # display: jm@foo (Foo Blah), jm@bar ;
2013             # Foo Blah <jm@foo>
2014             # "Foo Blah" <jm@foo>
2015             # "'Foo Blah'" <jm@foo>
2016             #
2017 0         0 local $1;
2018             # does not handle mailbox-list or address-list well, to be improved
2019 0 0       0 if ($result =~ /^ \s* (.*?) \s* < [^<>]* >/sx) {
    0          
2020 0         0 $result = $1; # display-name, RFC 5322
2021             # name-addr = [display-name] angle-addr
2022             # display-name = phrase
2023             # phrase = 1*word / obs-phrase
2024             # word = atom / quoted-string
2025             # obs-phrase = word *(word / "." / CFWS)
2026 0         0 $result =~ s{ " ( (?: [^"\\] | \\. )* ) " }
  0         0  
  0         0  
  0         0  
2027             { my $s=$1; $s=~s{\\(.)}{$1}gs; $s }gsxe;
2028             } elsif ($result =~ /^ [^(,]*? \( (.*?) \) /sx) { # legacy form
2029 0         0 # nested comments are not handled, to be improved
2030             $result = $1;
2031 0         0 } else { # no display name
2032             $result = '';
2033 0         0 }
2034             $result =~ s/^ \s* ' \s* (.*?) \s* ' \s* \z/$1/sx;
2035             }
2036 3368         5523 }
2037             return $result;
2038             }
2039              
2040             # optimized for speed
2041             # $_[0] is self
2042             # $_[1] is request
2043             # $_[2] is defval
2044 3894     3894 1 17277 sub get {
2045 3894         4426 my $cache = $_[0]->{c};
2046 3894 100       7309 my $found;
2047             if (exists $cache->{$_[1]}) {
2048             # return cache entry if it is known
2049 526         978 # (measured hit/attempts rate on a production mailer is about 47%)
2050             $found = $cache->{$_[1]};
2051             } else {
2052 3368         5734 # fill in a cache entry
2053 3368         9433 $found = _get(@_);
2054             $cache->{$_[1]} = $found;
2055             }
2056             # if the requested header wasn't found, we should return a default value
2057             # as specified by the caller: if defval argument is present it represents
2058             # a default value even if undef; if defval argument is absent a default
2059 3894 100       21414 # value is an empty string for upwards compatibility
    100          
2060             return (defined $found ? $found : @_ > 2 ? $_[2] : '');
2061             }
2062              
2063             ###########################################################################
2064              
2065             # uri parsing from plain text:
2066             # The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
2067             # ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
2068             # URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
2069             # When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
2070             # of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
2071             # because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
2072             # The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
2073             # then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
2074             # likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
2075             # it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
2076             # 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
2077             #
2078             # We get the entire URI that would be linkified before dealing with it, in order to do the right thing
2079             # with URI-encodings and redirecting URIs.
2080             #
2081             # The delimiters for start of a URI in TBird are @(`{|[\"'<>,\s in OE they are ("<\s
2082             #
2083             # Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
2084             # TBird's end delimiters are )`{}|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
2085             # OE only uses space as a delimiter, but ignores [~!@#^&*()_+`-={}|[]:";'<>?,.]* at the end.
2086             #
2087             # Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
2088             # TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
2089             # delimiters include ( and '
2090             #
2091             # bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
2092              
2093 130     130   372 sub _tbirdurire {
2094             my ($self) = @_;
2095              
2096 130 50       509 # Cached?
2097             return $self->{tbirdurire} if $self->{tbirdurire};
2098              
2099 130         353 # a hybrid of tbird and oe's version of uri parsing
2100 130         272 my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b"; # The \x1b as per bug 4522
2101 130         261 my $iso2022shift = "\x1b" . '\(.'; # bug 4522
2102 130         361 my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b"; # The \x1b as per bug 4522
2103             my $nonASCII = '\x80-\xff';
2104              
2105             # bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
2106 130         373 # it's a common thing to find in text
2107 130         321 my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
2108             my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
2109              
2110 130         475 # valid TLDs
2111             my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
2112              
2113             # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
2114 130         31780 # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
2115 130         1425 my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
2116 130         1338 my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io;
2117             my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
2118 130         30238  
2119             $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
2120             (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
2121             (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
2122             (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/xo;
2123 130         642  
2124             return $self->{tbirdurire};
2125             }
2126              
2127             =item $status->get_uri_list ()
2128              
2129             Returns an array of all unique URIs found in the message. It takes
2130             a combination of the URIs found in the rendered (decoded and HTML
2131             stripped) body and the URIs found when parsing the HTML in the message.
2132             Will also set $status->{uri_list} (the array as returned by this function).
2133              
2134             The returned array will include the "raw" URI as well as
2135             "slightly cooked" versions. For example, the single URI
2136             'http://%77&#00119;%77.example.com/' will get turned into:
2137             ( 'http://%77&#00119;%77.example.com/', 'http://www.example.com/' )
2138              
2139             =cut
2140              
2141 130     130 1 477 sub get_uri_list {
2142             my ($self) = @_;
2143              
2144 130 50       529 # use cached answer if available
2145 0         0 if (defined $self->{uri_list}) {
  0         0  
2146             return @{$self->{uri_list}};
2147             }
2148 130         235  
2149             my @uris;
2150             # $self->{redirect_num} = 0;
2151              
2152 130         272 # get URIs from HTML parsing
  1383         2071  
2153 1253 50       2034 while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
2154 1253         1161 if ($info->{cleaned}) {
  1253         1654  
2155 1675         2471 foreach (@{$info->{cleaned}}) {
2156             push(@uris, $_);
2157              
2158             # count redirection attempts and log it
2159             # if (my @http = m{\b(https?:/{0,2})}gi) {
2160             # $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
2161             # }
2162             }
2163             }
2164             }
2165 130         469  
2166             $self->{uri_list} = \@uris;
2167             # $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
2168 130         923  
2169             return @uris;
2170             }
2171              
2172             =item $status->get_uri_detail_list ()
2173              
2174             Returns a hash reference of all unique URIs found in the message and
2175             various data about where the URIs were found in the message. It takes a
2176             combination of the URIs found in the rendered (decoded and HTML stripped)
2177             body and the URIs found when parsing the HTML in the message. Will also
2178             set $status->{uri_detail_list} (the hash reference as returned by this
2179             function). This function will also set $status->{uri_domain_count} (count of
2180             unique domains).
2181              
2182             The hash format looks something like this:
2183              
2184             raw_uri => {
2185             types => { a => 1, img => 1, parsed => 1 },
2186             cleaned => [ canonicalized_uri ],
2187             anchor_text => [ "click here", "no click here" ],
2188             domains => { domain1 => 1, domain2 => 1 },
2189             }
2190              
2191             C<raw_uri> is whatever the URI was in the message itself
2192             (http://spamassassin.apache%2Eorg/).
2193              
2194             C<types> is a hash of the HTML tags (lowercase) which referenced
2195             the raw_uri. I<parsed> is a faked type which specifies that the
2196             raw_uri was seen in the rendered text.
2197              
2198             C<cleaned> is an array of the raw and canonicalized version of the raw_uri
2199             (http://spamassassin.apache%2Eorg/, http://spamassassin.apache.org/).
2200              
2201             C<anchor_text> is an array of the anchor text (text between <a> and
2202             </a>), if any, which linked to the URI.
2203              
2204             C<domains> is a hash of the domains found in the canonicalized URIs.
2205              
2206             C<hosts> is a hash of unstripped hostnames found in the canonicalized URIs
2207             as hash keys, with their domain part stored as a value of each hash entry.
2208              
2209             =cut
2210              
2211 1387     1387 1 1703 sub get_uri_detail_list {
2212             my ($self) = @_;
2213              
2214 1387 100       2133 # use cached answer if available
2215 1257         3065 if (defined $self->{uri_detail_list}) {
2216             return $self->{uri_detail_list};
2217             }
2218 130         553  
2219             my $timer = $self->{main}->time_method("get_uri_detail_list");
2220 130         503  
2221             $self->{uri_domain_count} = 0;
2222              
2223 130         736 # do this so we're sure metadata->html is setup
  1306         2530  
2224             my %parsed = map { $_ => 'parsed' } $self->_get_parsed_uri_list();
2225              
2226              
2227             # This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
2228             # this functionality
2229             # 2013-01-07
2230             # This functionality is re-enabled as a configuration option disabled by
2231             # default (bug 7087)
2232             # 2014-10-06
2233              
2234 130 50       613 # Look for the domain in DK/DKIM headers
2235 130         550 if ( $self->{conf}->{parse_dkim_uris} ) {
  260         777  
2236             my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef),
2237 130         560 $self->get('DKIM-Signature',undef) ));
2238 0         0 while ($dk =~ /\bd\s*=\s*([^;]+)/g) {
2239 0         0 my $dom = $1;
2240 0         0 $dom =~ s/\s+//g;
2241             $parsed{$dom} = 'domainkeys';
2242             }
2243             }
2244              
2245             # get URIs from HTML parsing
2246 130   100     968 # use the metadata version since $self->{html} may not be setup
2247 130 50       609 my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} || { };
2248             $self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
2249              
2250 130         375 # don't keep dereferencing ...
2251             my $redirector_patterns = $self->{conf}->{redirector_patterns};
2252              
2253 130         340 # canonicalize the HTML parsed URIs
  257         1026  
2254 127         304 while(my($uri, $info) = each %{ $detail }) {
2255 127         282 my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
2256             $info->{cleaned} = \@tmp;
2257 127         187  
2258 239         626 foreach (@tmp) {
2259 239 100 66     997 my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
      100        
2260             if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
2261 111         263 # unstripped full host name as a key, and its domain part as a value
2262 111 50 33     444 $info->{hosts}->{$host} = $domain;
      33        
2263 111         186 if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}) {
2264 111         203 $info->{domains}->{$domain} = 1; # stripped to domain boundary
2265             $self->{uri_domain_count}++;
2266             }
2267             }
2268             }
2269 127 50       300  
2270 0         0 if (would_log('dbg', 'uri') == 2) {
2271 0         0 dbg("uri: html uri found, $uri");
2272 0         0 foreach my $nuri (@tmp) {
2273             dbg("uri: cleaned html uri, $nuri");
2274 0 0 0     0 }
2275 0         0 if ($info->{hosts} && $info->{domains}) {
  0         0  
2276 0         0 for my $host (keys %{$info->{hosts}}) {
2277             dbg("uri: html host %s, domain %s", $host, $info->{hosts}->{$host});
2278             }
2279             }
2280             }
2281             }
2282              
2283 130         685 # canonicalize the text parsed URIs
2284 1126         3477 while (my($uri, $type) = each %parsed) {
2285 1126         1372 $detail->{$uri}->{types}->{$type} = 1;
2286             my $info = $detail->{$uri};
2287 1126         1219  
2288             my @uris;
2289 1126 50       1701  
2290 1126 50       1543 if (!exists $info->{cleaned}) {
2291 1126         2151 if ($type eq 'parsed') {
2292             @uris = uri_list_canonicalize($redirector_patterns, $uri);
2293             }
2294 0         0 else {
2295             @uris = ( $uri );
2296 1126         2045 }
2297             $info->{cleaned} = \@uris;
2298 1126         1640  
2299 1436         3571 foreach (@uris) {
2300 1436 100 66     6527 my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
      100        
2301             if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
2302 1126         2430 # unstripped full host name as a key, and its domain part as a value
2303 1126 50 33     4549 $info->{hosts}->{$host} = $domain;
      33        
2304 1126         2164 if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}){
2305 1126         1900 $info->{domains}->{$domain} = 1;
2306             $self->{uri_domain_count}++;
2307             }
2308             }
2309             }
2310             }
2311 1126 50       2464  
2312 0         0 if (would_log('dbg', 'uri') == 2) {
2313 0         0 dbg("uri: parsed uri found of type $type, $uri");
2314 0         0 foreach my $nuri (@uris) {
2315             dbg("uri: cleaned parsed uri, $nuri");
2316 0 0 0     0 }
2317 0         0 if ($info->{hosts} && $info->{domains}) {
  0         0  
2318 0         0 for my $host (keys %{$info->{hosts}}) {
2319             dbg("uri: parsed host %s, domain %s", $host, $info->{hosts}->{$host});
2320             }
2321             }
2322             }
2323             }
2324              
2325 130         437 # setup the cache
2326             $self->{uri_detail_list} = $detail;
2327 130         886  
2328             return $detail;
2329             }
2330              
2331 130     130   358 sub _get_parsed_uri_list {
2332             my ($self) = @_;
2333              
2334 130 50       588 # use cached answer if available
2335             unless (defined $self->{parsed_uri_list}) {
2336             # TVD: we used to use decoded_body which is fine, except then we'll
2337             # try parsing URLs out of HTML, which is what the HTML code is going
2338             # to do (note: we know the HTML parsing occurs, because we call for the
2339             # rendered text which does HTML parsing...) trying to get URLs out of
2340             # HTML w/out parsing causes issues, so let's not do it.
2341             # also, if we allow $textary to be passed in, we need to invalidate
2342 130         488 # the cache first. fyi.
2343 130         400 my $textary = $self->get_decoded_stripped_body_text_array();
2344             my $redirector_patterns = $self->{conf}->{redirector_patterns};
2345 130         500  
2346 130         0 my ($rulename, $pat, @uris);
2347 130         572 my $text;
2348             my $tbirdurire = $self->_tbirdurire;
2349 130         582  
2350             for my $entry (@$textary) {
2351              
2352             # a workaround for [perl #69973] bug:
2353             # Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
2354             # Bug 6225, regexp and string should both be utf8, or none of them;
2355             # untainting string also seems to avoid the crash
2356             #
2357 672         1714 # Bug 6225: untaint the string in an attempt to work around a perl crash
2358             local $_ = untaint_var($entry);
2359 672         1623  
2360 672         62668 local($1,$2,$3);
2361 831   66     2791 while (/$tbirdurire/igo) {
2362 831         1348 my $rawuri = $1||$2||$3;
2363 831         3148 $rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
2364             $rawuri =~ s/[-~!@#^&*()_+=:;\'?,.]*$//; # remove trailing string of punctuations that TBird ignores
2365 831 100       1758 # skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
2366             next if $rawuri =~ /^(?:(?:https?|ftp|mailto):(?:\/\/)?)?[a-z\d.-]*\.\./i;
2367              
2368             # If it's a hostname that was just sitting out in the
2369             # open, without a protocol, and not inside of an HTML tag,
2370             # the we should add the proper protocol in front, rather
2371 829         1043 # than using the base URI.
2372 829         871 my $uri = $rawuri;
2373 829 100       1620 my $rblonly;
2374 664 100       1761 if ($uri !~ /^(?:https?|ftp|mailto|javascript|file):/i) {
    100          
    100          
2375 1         3 if ($uri =~ /^ftp\./i) {
2376             $uri = "ftp://$uri";
2377             }
2378 316         542 elsif ($uri =~ /^www\d{0,2}\./i) {
2379             $uri = "http://$uri";
2380             }
2381 34         98 elsif ($uri =~ /\@/) {
2382             $uri = "mailto:$uri";
2383             }
2384             else {
2385 313         340 # some spammers are using unschemed URIs to escape filters
2386 313         524 $rblonly = 1; # flag that this is a URI that MUAs don't linkify so only use for RBLs
2387             $uri = "http://$uri";
2388             }
2389             }
2390 829 100       1682  
2391             if ($uri =~ /^mailto:/i) {
2392 39 50       100 # skip a mail link that does not have a valid TLD or other than one @ after decoding any URLEncoded characters
2393 39 100       140 $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-F]|7[0-9a-eA-E])/);
2394 37         126 next if ($uri !~ /^[^@]+@[^@]+$/);
2395 37 100       105 my $domuri = $self->{main}->{registryboundaries}->uri_to_domain($uri);
2396 35         425 next unless $domuri;
2397 35 100       114 push (@uris, $rawuri);
2398             push (@uris, $uri) unless ($rawuri eq $uri);
2399             }
2400 825 100       2945  
2401             next unless ($uri =~/^(?:https?|ftp):/i); # at this point only valid if one or the other of these
2402 790         1768  
2403 790         1004 my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
2404 790         1245 my $goodurifound = 0;
2405 800         2254 foreach my $cleanuri (@tmp) {
2406 800 100       1436 my $domain = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
2407             if ($domain) {
2408 777 100       1091 # bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
2409 312         564 if ($rblonly) {
2410 312         1301 local $1;
2411             $cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/i;
2412 777         1186 }
2413 777         1151 push (@uris, $cleanuri);
2414             $goodurifound = 1;
2415             }
2416 790 100       1177 }
2417 774 100       5699 next unless $goodurifound;
2418             push @uris, $rawuri unless $rblonly;
2419             }
2420             }
2421              
2422 130         567 # Make sure all the URIs are nice and short
2423 1306 50       1772 foreach my $uri ( @uris ) {
2424 0         0 if (length $uri > MAX_URI_LENGTH) {
2425 0         0 $self->{'uri_truncated'} = 1;
2426             $uri = substr $uri, 0, MAX_URI_LENGTH;
2427             }
2428             }
2429              
2430 130         614 # setup the cache and return
2431             $self->{parsed_uri_list} = \@uris;
2432             }
2433 130         260  
  130         720  
2434             return @{$self->{parsed_uri_list}};
2435             }
2436              
2437             ###########################################################################
2438              
2439 0     0 0 0 sub ensure_rules_are_complete {
2440 0         0 my $self = shift;
2441             my $metarule = shift;
2442             # @_ is now the list of rules
2443 0         0  
2444             foreach my $r (@_) {
2445 0 0       0 # dbg("rules: meta rule depends on net rule $r");
2446             next if ($self->is_rule_complete($r));
2447 0         0  
2448 0         0 dbg("rules: meta rule $metarule depends on pending rule $r, blocking");
2449             my $timer = $self->{main}->time_method("wait_for_pending_rules");
2450 0         0  
2451 0         0 my $start = time;
2452 0         0 $self->harvest_until_rule_completes($r);
2453             my $elapsed = time - $start;
2454 0 0       0  
    0          
2455 0         0 if (!$self->is_rule_complete($r)) {
2456             dbg("rules: rule $r is still not complete; exited early?");
2457             }
2458 0         0 elsif ($elapsed > 0) {
2459             info("rules: $r took $elapsed seconds to complete, for $metarule");
2460             }
2461             }
2462             }
2463              
2464             ###########################################################################
2465              
2466             # use a separate sub here, for brevity
2467             # called out of generated eval
2468 0     0 0 0 sub handle_eval_rule_errors {
2469 0         0 my ($self, $rulename) = @_;
2470 0         0 warn "rules: failed to run $rulename test, skipping:\n\t($@)\n";
2471             $self->{rule_errors}++;
2472             }
2473              
2474 3281     3281 0 6058 sub register_plugin_eval_glue {
2475             my ($self, $function) = @_;
2476 3281 50       5724  
2477 0         0 if (!$function) {
2478 0         0 warn "rules: empty function name";
2479             return;
2480             }
2481              
2482 3281 100       39208 # only need to call this once per fn (globally)
2483 220         483 return if exists $TEMPORARY_EVAL_GLUE_METHODS{$function};
2484             $TEMPORARY_EVAL_GLUE_METHODS{$function} = undef;
2485              
2486 220 50       848 # return if it's not an eval_plugin function
2487             return if (!exists $self->{conf}->{eval_plugins}->{$function});
2488              
2489             # return if it's been registered already
2490 220 50 33     1582 return if ($self->can ($function) &&
  0         0  
2491             defined &{'Mail::SpamAssassin::PerMsgStatus::'.$function});
2492 220         737  
2493             my $evalstr = <<"ENDOFEVAL";
2494             {
2495             package Mail::SpamAssassin::PerMsgStatus;
2496              
2497             sub $function {
2498             my (\$self) = shift;
2499             my \$plugin = \$self->{conf}->{eval_plugins}->{$function};
2500             return \$plugin->$function (\$self, \@_);
2501             }
2502              
2503             1;
2504             }
2505             ENDOFEVAL
2506 220 50   462 1 24229 eval $evalstr . '; 1' ## no critic
  462     77 1 1102  
  462     77 0 1034  
  462     77 0 1509  
  77     0 0 317  
  77     77 0 303  
  77     89 1 442  
  77     77 1 266  
  77     77 0 318  
  77     165 1 445  
  77     159 1 289  
  77     123 1 284  
  77     145 0 453  
  0     593 0 0  
  0     119 0 0  
  0     122 0 0  
  77     4 1 291  
  77     0 1 270  
  77     0 0 454  
  77     0 1 293  
  77     8 0 310  
  77     0 0 549  
  77     0 0 346  
  77     0 0 294  
  77     77 0 482  
  77     77 0 267  
  77     77 0 286  
  77     77 0 549  
  165     77 0 473  
  165     77 0 477  
  165         630  
  119         347  
  119         356  
  119         521  
  111         394  
  111         489  
  111         596  
  77         320  
  77         288  
  77         376  
  573         1404  
  539         1304  
  539         1751  
  111         372  
  77         303  
  77         567  
  111         361  
  111         318  
  111         638  
  34         174  
  34         112  
  0         0  
  34         234  
  34         133  
  34         121  
  68         161  
  68         141  
  68         183  
  68         240  
  68         461  
  42         79  
  42         64  
  42         51  
  42         62  
  42         77  
  0         0  
  42         144  
  42         77  
  42         78  
  0         0  
  0         0  
  0         0  
  42         58  
  119         402  
  119         340  
  119         515  
  119         327  
  119         350  
  77         466  
  119         436  
  119         429  
  85         517  
  85         303  
  85         299  
  123         565  
  123         482  
  123         372  
  77         465  
  77         300  
  111         313  
  111         506  
2507 0 0       0 or do {
  0         0  
2508 0         0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
2509 0         0 warn "rules: failed to run header tests, skipping some: $eval_stat\n";
2510             $self->{rule_errors}++;
2511             };
2512              
2513 220         2802 # ensure this method is deleted if finish_tests() is called
2514             push (@TEMPORARY_METHODS, $function);
2515             }
2516              
2517             ###########################################################################
2518              
2519             # note: only eval tests should store state in $self->{test_log_msgs};
2520             # pattern tests do not.
2521             #
2522             # the clearing of the test state is now inlined as:
2523             #
2524             # %{$self->{test_log_msgs}} = (); # clear test state
2525             #
2526             # except for this public API for plugin use:
2527              
2528             =item $status->clear_test_state()
2529              
2530             Clear test state, including test log messages from C<$status-E<gt>test_log()>.
2531              
2532             =cut
2533              
2534 0     0 1 0 sub clear_test_state {
2535 34         95 my ($self) = @_;
  34         96  
2536             %{$self->{test_log_msgs}} = ();
2537             }
2538              
2539             # internal API, called only by get_hit()
2540             # TODO: refactor and merge this into that function
2541 174     128   503 sub _handle_hit {
2542             my ($self, $rule, $score, $area, $ruletype, $desc) = @_;
2543 174         1012  
2544             $self->{main}->call_plugins ("hit_rule", {
2545             permsgstatus => $self,
2546             rulename => $rule,
2547             ruletype => $ruletype,
2548             score => $score
2549             });
2550              
2551 174 100       790 # ignore meta-match sub-rules.
  88         207  
  76         206  
  76         156  
2552             if ($rule =~ /^__/) { push(@{$self->{subtest_names_hit}}, $rule); return; }
2553              
2554 52 50       185 # this should not happen; warn about it
2555 46         111 if (!defined $score) {
2556 34         82 warn "rules: score undef for rule '$rule' in '$area' '$desc'";
2557             return;
2558             }
2559              
2560 86 50       312 # this should not happen; warn about NaN (bug 3364)
2561 42         63 if ($score != $score) {
2562 42         106 warn "rules: score '$score' for rule '$rule' in '$area' '$desc'";
2563             return;
2564             }
2565              
2566 106         299 # Add the rule hit to the score
2567             $self->{score} += $score;
2568 55         123  
  94         276  
2569 97   100     417 push(@{$self->{test_names_hit}}, $rule);
2570             $area ||= '';
2571 97 100 66     438  
2572 58         137 if ($score >= 10 || $score <= -10) {
2573             $score = sprintf("%4.0f", $score);
2574             }
2575 51         819 else {
2576             $score = sprintf("%4.1f", $score);
2577             }
2578              
2579             # save both summaries
2580             # TODO: this is slower than necessary, if we only need one
2581             $self->{tag_data}->{REPORT} .= sprintf ("* %s %s %s%s\n%s",
2582             $score, $rule, $area,
2583             $self->_wrap_desc($desc,
2584             4+length($rule)+length($score)+length($area), "* "),
2585 97 100       555 ($self->{test_log_msgs}->{TERSE} ?
2586             "* " . $self->{test_log_msgs}->{TERSE} : ''));
2587              
2588             $self->{tag_data}->{SUMMARY} .= sprintf ("%s %-22s %s%s\n%s",
2589             $score, $rule, $area,
2590             $self->_wrap_desc($desc,
2591 56   100     346 3+length($rule)+length($score)+length($area), " " x 28),
2592             ($self->{test_log_msgs}->{LONG} || ''));
2593             }
2594              
2595 120     104   378 sub _wrap_desc {
2596             my ($self, $desc, $firstlinelength, $prefix) = @_;
2597 118         378  
2598 104         484 my $firstline = " " x $firstlinelength;
2599 114         524 my $wrapped = Mail::SpamAssassin::Util::wrap($desc, $prefix, $firstline, $self->{conf}->{report_wrap_width}, 0);
2600 106         845 $wrapped =~ s/^\s+//s;
2601             $wrapped;
2602             }
2603              
2604             ###########################################################################
2605              
2606             =item $status->got_hit ($rulename, $desc_prepend [, name => value, ...])
2607              
2608             Register a hit against a rule in the ruleset.
2609              
2610             There are two mandatory arguments. These are C<$rulename>, the name of the rule
2611             that fired, and C<$desc_prepend>, which is a short string that will be
2612             prepended to the rules C<describe> string in output reports.
2613              
2614             In addition, callers can supplement that with the following optional
2615             data:
2616              
2617             =over 4
2618              
2619             =item score => $num
2620              
2621             Optional: the score to use for the rule hit. If unspecified,
2622             the value from the C<Mail::SpamAssassin::Conf> object's C<{scores}>
2623             hash will be used (a configured score), and in its absence the
2624             C<defscore> option value.
2625              
2626             =item defscore => $num
2627              
2628             Optional: the score to use for the rule hit if neither the
2629             option C<score> is provided, nor a configured score value is provided.
2630              
2631             =item value => $num
2632              
2633             Optional: the value to assign to the rule; the default value is C<1>.
2634             I<tflags multiple> rules use values of greater than 1 to indicate
2635             multiple hits. This value is accessible to meta rules.
2636              
2637             =item ruletype => $type
2638              
2639             Optional, but recommended: the rule type string. This is used in the
2640             C<hit_rule> plugin call, called by this method. If unset, I<'unknown'> is
2641             used.
2642              
2643             =item tflags => $string
2644              
2645             Optional: a string, i.e. a space-separated list of additional tflags
2646             to be appended to an existing list of flags in $self->{conf}->{tflags},
2647             such as: "nice noautolearn multiple". No syntax checks are performed.
2648              
2649             =item description => $string
2650              
2651             Optional: a custom rule description string. This is used in the
2652             C<hit_rule> plugin call, called by this method. If unset, the static
2653             description is used.
2654              
2655             =back
2656              
2657             Backward compatibility: the two mandatory arguments have been part of this API
2658             since SpamAssassin 2.x. The optional I<name=<gt>value> pairs, however, are a
2659             new addition in SpamAssassin 3.2.0.
2660              
2661             =cut
2662              
2663 138     128 1 2789 sub got_hit {
2664             my ($self, $rule, $area, %params) = @_;
2665 138         450  
2666             my $conf_ref = $self->{conf};
2667 138         248  
2668 128         273 my $dynamic_score_provided;
2669 132 50       291 my $score = $params{score};
2670 4         16 if (defined $score) { # overrides any configured scores
2671             $dynamic_score_provided = 1;
2672 132         281 } else {
2673 132 50       328 $score = $conf_ref->{scores}->{$rule};
2674             $score = $params{defscore} if !defined $score;
2675             }
2676              
2677             # adding a hit does nothing if we don't have a score -- we probably
2678 128 50       291 # shouldn't have run it in the first place
2679 0         0 if (!$score) {
  0         0  
2680 4         24 %{$self->{test_log_msgs}} = ();
2681             return;
2682             }
2683              
2684             # ensure that rule values always result in an *increase*
2685 128         235 # of $self->{tests_already_hit}->{$rule}:
2686 128 100 66     408 my $value = $params{value};
  122         190  
2687             if (!$value || $value <= 0) { $value = 1 }
2688 128         228  
2689 132         237 my $tflags_ref = $conf_ref->{tflags};
2690 128 50 33     348 my $tflags_add = $params{tflags};
2691             if (defined $tflags_add && $tflags_add ne '') {
2692 0 0 0     0 $_ = (!defined $_ || $_ eq '') ? $tflags_add : ($_ . ' ' . $tflags_add)
2693             for $tflags_ref->{$rule};
2694             };
2695 128   100     527  
2696             my $already_hit = $self->{tests_already_hit}->{$rule} || 0;
2697 128 100 50     342 # don't count hits multiple times, unless 'tflags multiple' is on
      66        
2698 4         15 if ($already_hit && ($tflags_ref->{$rule}||'') !~ /\bmultiple\b/) {
  0         0  
2699 0         0 %{$self->{test_log_msgs}} = ();
2700             return;
2701             }
2702 128         331  
2703             $self->{tests_already_hit}->{$rule} = $already_hit + $value;
2704              
2705 132   100     387 # default ruletype, if not specified:
2706             $params{ruletype} ||= 'unknown';
2707 128 50       319  
2708 0         0 if ($dynamic_score_provided) { # copy it to static for proper reporting
2709 0         0 $conf_ref->{scoreset}->[$_]->{$rule} = $score for (0..3);
2710             $conf_ref->{scores}->{$rule} = $score;
2711             }
2712 132         287  
2713 128 50       262 my $rule_descr = $params{description};
2714 0         0 if (defined $rule_descr) {
2715             $conf_ref->{descriptions}->{$rule} = $rule_descr; # save dynamic descr.
2716 128         481 } else {
2717             $rule_descr = $conf_ref->get_description_for_rule($rule); # static
2718             }
2719             # Bug 6880 Set Rule Description to something that says no rule
2720 168 100 66     817 #$rule_descr = $rule if !defined $rule_descr || $rule_descr eq '';
2721             $rule_descr = "No description available." if !defined $rule_descr || $rule_descr eq '';
2722              
2723             $self->_handle_hit($rule,
2724             $score,
2725             $area,
2726 168         5665 $params{ruletype},
2727             $rule_descr);
2728              
2729 128         344 # take care of duplicate rules, too (bug 5206)
2730 128 50 33     356 my $dups = $conf_ref->{duplicate_rules}->{$rule};
  0         0  
2731 0         0 if ($dups && @{$dups}) {
  0         0  
2732 0         0 foreach my $dup (@{$dups}) {
2733             $self->got_hit($dup, $area, %params);
2734             }
2735             }
2736 140         645  
  140         309  
2737 128         1788 %{$self->{test_log_msgs}} = (); # clear test logs
2738             return 1;
2739             }
2740              
2741             ###########################################################################
2742              
2743             # TODO: this needs API doc
2744 0     0 0 0 sub test_log {
2745 12         34 my ($self, $msg) = @_;
2746 12         63 local $1;
2747 12         70 while ($msg =~ s/^(.{30,48})\s//) {
2748             $self->_test_log_line ($1);
2749 0         0 }
2750             $self->_test_log_line ($msg);
2751             }
2752              
2753 12     0   61 sub _test_log_line {
2754             my ($self, $msg) = @_;
2755 22         58  
2756 22 50       60 $self->{test_log_msgs}->{TERSE} .= sprintf ("[%s]\n", $msg);
2757 60         114 if (length($msg) > 47) {
2758             $self->{test_log_msgs}->{LONG} .= sprintf ("%78s\n", "[$msg]");
2759 12         87 } else {
2760             $self->{test_log_msgs}->{LONG} .= sprintf ("%27s [%s]\n", "", $msg);
2761             }
2762             }
2763              
2764             ###########################################################################
2765              
2766             # helper for get(). Do not call directly, as get() caches its results
2767             # and this does not!
2768 116     94 0 324 sub get_envelope_from {
2769             my ($self) = @_;
2770              
2771             # bug 2142:
2772             # Get the SMTP MAIL FROM:, aka. the "envelope sender", if our
2773             # calling app has helpfully marked up the source message
2774             # with it. Various MTAs and calling apps each have their
2775             # own idea of what header to use for this! see
2776 106         244  
2777             my $envf;
2778              
2779             # Rely on the 'envelope-sender-header' header if the user has configured one.
2780             # Assume that because they have configured it, their MTA will always add it.
2781 106 50       450 # This will prevent us falling through and picking up inappropriate headers.
2782             if (defined $self->{conf}->{envelope_sender_header}) {
2783 12         40 # make sure we get the most recent copy - there can be only one EnvelopeSender.
2784             $envf = $self->get($self->{conf}->{envelope_sender_header}.":addr",undef);
2785 12 0 0     91 # ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
      0        
2786             goto ok if defined $envf && ($envf =~ /\@/ || $envf eq '');
2787 0 0       0 # Warn them if it's configured, but not there or not usable.
2788 0         0 if (defined $envf) {
2789             chomp $envf;
2790 0         0 dbg("message: envelope_sender_header '%s: %s' is not an FQDN, ignoring",
2791             $self->{conf}->{envelope_sender_header}, $envf);
2792             } else {
2793 0         0 dbg("message: envelope_sender_header '%s' not found in message",
2794             $self->{conf}->{envelope_sender_header});
2795             }
2796 8         22 # Couldn't get envelope-sender using the configured header.
2797             return;
2798             }
2799              
2800             # User hasn't given us a header to trust, so try to guess the sender.
2801              
2802             # use the "envelope-sender" string found in the Received headers,
2803             # if possible... use the last untrusted header, in case there's
2804 102         330 # trusted headers.
2805 94 100       280 my $lasthop = $self->{relays_untrusted}->[0];
2806             if (!defined $lasthop) {
2807             # no untrusted headers? in that case, the message is ALL_TRUSTED.
2808 86         235 # use the first trusted header (ie. the oldest, originating one).
2809             $lasthop = $self->{relays_trusted}->[-1];
2810             }
2811 94 100       266  
2812 33         98 if (defined $lasthop) {
2813             $envf = $lasthop->{envfrom};
2814             # TODO FIXME: Received.pm puts both null senders and absence-of-sender
2815 33 50 33     143 # into the relays array as '', so we can't distinguish them :(
2816 0         0 if ($envf && ($envf =~ /\@/)) {
2817             goto ok;
2818             }
2819             }
2820              
2821             # WARNING: a lot of list software adds an X-Sender for the original env-from
2822             # (including Yahoo! Groups). Unfortunately, fetchmail will pick it up and
2823             # reuse it as the env-from for *its* delivery -- even though the list
2824             # software had used a different env-from in the intervening delivery. Hence,
2825             # if this header is present, and there's a fetchmail sig in the Received
2826             # lines, we cannot trust any Envelope-From headers, since they're likely to
2827             # be incorrect fetchmail guesses.
2828 94 50       429  
2829 0         0 if ($self->get("X-Sender") =~ /\@/) {
2830 12 0       38 my $rcvd = join(' ', $self->get("Received"));
2831 12         47 if ($rcvd =~ /\(fetchmail/) {
2832 0         0 dbg("message: X-Sender and fetchmail signatures found, cannot trust envelope-from");
2833             return;
2834             }
2835             }
2836              
2837 94 50       353 # procmailrc notes this (we now recommend adding it to Received instead)
2838             if ($envf = $self->get("X-Envelope-From")) {
2839             # heuristic: this could have been relayed via a list which then used
2840 12 0       28 # a *new* Envelope-from. check
2841 12         40 if ($self->get("ALL:raw") =~ /^Received:.*^X-Envelope-From:/smi) {
2842 12         45 dbg("message: X-Envelope-From header found after 1 or more Received lines, cannot trust envelope-from");
2843             return;
2844 0         0 } else {
2845             goto ok;
2846             }
2847             }
2848              
2849 106 50       447 # qmail, new-inject(1)
2850             if ($envf = $self->get("Envelope-Sender")) {
2851             # heuristic: this could have been relayed via a list which then used
2852 12 0       80 # a *new* Envelope-from. check
2853 12         141 if ($self->get("ALL:raw") =~ /^Received:.*^Envelope-Sender:/smi) {
2854             dbg("message: Envelope-Sender header found after 1 or more Received lines, cannot trust envelope-from");
2855 12         65 } else {
2856             goto ok;
2857             }
2858             }
2859              
2860             # Postfix, sendmail, amavisd-new, ...
2861             # RFC 2821 requires it:
2862             # When the delivery SMTP server makes the "final delivery" of a
2863             # message, it inserts a return-path line at the beginning of the mail
2864             # data. This use of return-path is required; mail systems MUST support
2865             # it. The return-path line preserves the information in the <reverse-
2866 106 100       345 # path> from the MAIL command.
2867             if ($envf = $self->get("Return-Path")) {
2868             # heuristic: this could have been relayed via a list which then used
2869 13 50       28 # a *new* Envelope-from. check
2870 12         49 if ($self->get("ALL:raw") =~ /^Received:.*^Return-Path:/smi) {
2871             dbg("message: Return-Path header found after 1 or more Received lines, cannot trust envelope-from");
2872 31         145 } else {
2873             goto ok;
2874             }
2875             }
2876              
2877 113         311 # give up.
2878             return;
2879 13         81  
2880             ok:
2881 13         89 $envf =~ s/^<*//s; # remove <
2882             $envf =~ s/>*\s*\z//s; # remove >, whitespace, newlines
2883 13         40  
2884             return $envf;
2885             }
2886              
2887             ###########################################################################
2888              
2889             # helper for get(ALL-*). get() caches its results, so don't call this
2890             # directly unless you need a range of headers not covered by the ALL-*
2891             # psuedo-headers!
2892              
2893             # Get all the headers found between an index range of received headers, the
2894             # index doesn't care if we could parse the received headers or not.
2895             # Use undef for the $start_rcvd or $end_rcvd numbers to start/end with the
2896             # first/last header in the message, otherwise indicate the index number you
2897             # want to start/end at. Set $include_start_rcvd or $include_end_rcvd to 0 to
2898             # indicate you don't want to include the received header found at the start or
2899             # end indexes... basically toggles between [s,e], [s,e), (s,e], (s,e).
2900 12     0 0 69 sub get_all_hdrs_in_rcvd_index_range {
2901             my ($self, $start_rcvd, $end_rcvd, $include_start_rcvd, $include_end_rcvd) = @_;
2902              
2903 0 0 0     0 # prevent bad input causing us to return the first header found
2904             return if (defined $end_rcvd && $end_rcvd < 0);
2905 0 0       0  
2906 0 0       0 $include_start_rcvd = 1 unless defined $include_start_rcvd;
2907             $include_end_rcvd = 1 unless defined $include_end_rcvd;
2908 0         0  
2909 0         0 my $cur_rcvd_index = -1; # none found yet
2910             my $result = '';
2911 0         0  
2912 0 0       0 foreach my $hdr (split(/^/m, $self->{msg}->get_pristine_header())) {
2913 0         0 if ($hdr =~ /^Received:/i) {
2914 0 0 0     0 $cur_rcvd_index++;
      0        
2915             next if (defined $start_rcvd && !$include_start_rcvd &&
2916 0 0 0     0 $start_rcvd == $cur_rcvd_index);
      0        
2917             last if (defined $end_rcvd && !$include_end_rcvd &&
2918             $end_rcvd == $cur_rcvd_index);
2919 0 0 0     0 }
    0 0        
      0        
      0        
2920             if ((!defined $start_rcvd || $start_rcvd <= $cur_rcvd_index) &&
2921 0         0 (!defined $end_rcvd || $cur_rcvd_index < $end_rcvd)) {
2922             $result .= $hdr."\n";
2923             }
2924 0         0 elsif (defined $end_rcvd && $cur_rcvd_index == $end_rcvd) {
2925 0         0 $result .= $hdr."\n";
2926             last;
2927             }
2928 0 0       0 }
2929             return ($result eq '' ? undef : $result);
2930             }
2931              
2932             ###########################################################################
2933 0     0 0 0  
2934             sub sa_die { Mail::SpamAssassin::sa_die(@_); }
2935              
2936             ###########################################################################
2937              
2938             =item $status->create_fulltext_tmpfile (fulltext_ref)
2939              
2940             This function creates a temporary file containing the passed scalar
2941             reference data (typically the full/pristine text of the message).
2942             This is typically used by external programs like pyzor and dccproc, to
2943             avoid hangs due to buffering issues. Methods that need this, should
2944             call $self->create_fulltext_tmpfile($fulltext) to retrieve the temporary
2945             filename; it will be created if it has not already been.
2946              
2947             Note: This can only be called once until $status->delete_fulltext_tmpfile() is
2948             called.
2949              
2950             =cut
2951              
2952 0     0 1 0 sub create_fulltext_tmpfile {
2953             my ($self, $fulltext) = @_;
2954 0 0       0  
2955 0         0 if (defined $self->{fulltext_tmpfile}) {
2956             return $self->{fulltext_tmpfile};
2957             }
2958 0         0  
2959 0 0       0 my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
2960             $tmpfh or die "failed to create a temporary file";
2961              
2962             # PerlIO's buffered print writes in 8 kB chunks - which can be slow.
2963             # print $tmpfh $$fulltext or die "error writing to $tmpf: $!";
2964             #
2965             # reducing the number of writes and bypassing extra buffering in PerlIO
2966 0         0 # speeds up writing of larger text by a factor of 2
2967 0         0 my $nwrites;
2968 0         0 for (my $ofs = 0; $ofs < length($$fulltext); $ofs += $nwrites) {
2969 0 0       0 $nwrites = $tmpfh->syswrite($$fulltext, length($$fulltext)-$ofs, $ofs);
2970             defined $nwrites or die "error writing to $tmpf: $!";
2971 0 0       0 }
2972             close $tmpfh or die "error closing $tmpf: $!";
2973 0         0  
2974             $self->{fulltext_tmpfile} = $tmpf;
2975 0         0  
2976             dbg("check: create_fulltext_tmpfile, written %d bytes to file %s",
2977             length($$fulltext), $tmpf);
2978 0         0  
2979             return $self->{fulltext_tmpfile};
2980             }
2981              
2982             =item $status->delete_fulltext_tmpfile ()
2983              
2984             Will cleanup after a $status->create_fulltext_tmpfile() call. Deletes the
2985             temporary file and uncaches the filename.
2986              
2987             =cut
2988              
2989 189     189 1 494 sub delete_fulltext_tmpfile {
2990 189 50       1534 my ($self) = @_;
2991 0 0       0 if (defined $self->{fulltext_tmpfile}) {
2992 0         0 if (!unlink $self->{fulltext_tmpfile}) {
2993             my $msg = sprintf("cannot unlink %s: %s", $self->{fulltext_tmpfile}, $!);
2994 0 0       0 # don't fuss too much if file is missing, perhaps it wasn't even created
  0         0  
  0         0  
2995             if ($! == ENOENT) { warn $msg } else { die $msg }
2996 0         0 }
2997             $self->{fulltext_tmpfile} = undef;
2998             }
2999             }
3000              
3001             ###########################################################################
3002              
3003 233     233 0 432 sub all_from_addrs {
3004             my ($self) = @_;
3005 233 100       605  
  154         226  
  154         977  
3006             if (exists $self->{all_from_addrs}) { return @{$self->{all_from_addrs}}; }
3007 79         129  
3008             my @addrs;
3009              
3010 79         261 # Resent- headers take priority, if present. see bug 672
3011 79 50 33     377 my $resent = $self->get('Resent-From',undef);
3012 0         0 if (defined $resent && $resent =~ /\S/) {
3013             @addrs = $self->{main}->find_all_addrs_in_line ($resent);
3014             }
3015             else {
3016             # bug 2292: Used to use find_all_addrs_in_line() with the same
3017             # headers, but the would catch addresses in comments which caused
3018             # FNs for things like whitelist_from. Since all of these are From
3019             # headers, there should only be 1 address in each anyway (not exactly
3020             # true, RFC 2822 allows multiple addresses in a From header field),
3021             # so use the :addr code...
3022             # bug 3366: some addresses come in as 'foo@bar...', which is invalid.
3023             # so deal with the multiple periods.
3024 79         281 ## no critic
  39         154  
  39         176  
  395         832  
3025             @addrs = map { tr/././s; $_ } grep { $_ ne '' }
3026             ($self->get('From:addr'), # std
3027             $self->get('Envelope-Sender:addr'), # qmail: new-inject(1)
3028             $self->get('Resent-Sender:addr'), # procmailrc manpage
3029             $self->get('X-Envelope-From:addr'), # procmailrc manpage
3030             $self->get('EnvelopeFrom:addr')); # SMTP envelope
3031             # http://www.cs.tut.fi/~jkorpela/headers.html is useful here
3032             }
3033              
3034 79         261 # Remove duplicate addresses
  39         171  
3035 79         277 my %addrs = map { $_ => 1 } @addrs;
3036             @addrs = keys %addrs;
3037 79         477  
3038 79         375 dbg("eval: all '*From' addrs: " . join(" ", @addrs));
3039 79         515 $self->{all_from_addrs} = \@addrs;
3040             return @addrs;
3041             }
3042              
3043             =item all_from_addrs_domains
3044              
3045             This function returns all the various from addresses in a message using all_from_addrs()
3046             and then returns only the domain names.
3047              
3048             =cut
3049              
3050 0     0 1 0 sub all_from_addrs_domains {
3051             my ($self) = @_;
3052 0 0       0  
3053 0         0 if (exists $self->{all_from_addrs_domains}) {
  0         0  
3054             return @{$self->{all_from_addrs_domains}};
3055             }
3056              
3057             #TEST POINT - my @addrs = ("test.voipquotes2.net","test.voipquotes2.co.uk");
3058 0         0 #Start with all the normal from addrs
3059             my @addrs = all_from_addrs($self);
3060 0         0  
3061             dbg("eval: all '*From' addrs domains (before): " . join(" ", @addrs));
3062              
3063             #Take just the domain with a dummy localpart
3064 0         0 #removing invalid and duplicate domains
3065 0         0 my(%addrs_seen, @addrs_filtered);
3066 0         0 foreach my $a (@addrs) {
3067 0 0 0     0 my $domain = $self->{main}->{registryboundaries}->uri_to_domain($a);
3068 0         0 next if !defined $domain || $addrs_seen{lc $domain}++;
3069             push(@addrs_filtered, 'dummy@'.$domain);
3070             }
3071 0         0  
3072             dbg("eval: all '*From' addrs domains (after uri to domain): " .
3073             join(" ", @addrs_filtered));
3074 0         0  
3075             $self->{all_from_addrs_domains} = \@addrs_filtered;
3076 0         0  
3077             return @addrs_filtered;
3078             }
3079              
3080 156     156 0 337 sub all_to_addrs {
3081             my ($self) = @_;
3082 156 100       531  
  77         155  
  77         1283  
3083             if (exists $self->{all_to_addrs}) { return @{$self->{all_to_addrs}}; }
3084 79         151  
3085             my @addrs;
3086              
3087 79         277 # Resent- headers take priority, if present. see bug 672
3088 79 50       290 my $resent = join('', $self->get('Resent-To'), $self->get('Resent-Cc'));
3089 0         0 if ($resent =~ /\S/) {
3090             @addrs = $self->{main}->find_all_addrs_in_line($resent);
3091             } else {
3092             # OK, a fetchmail trick: try to find the recipient address from
3093             # the most recent 3 Received lines. This is required for sendmail,
3094             # since it does not add a helpful header like exim, qmail
3095             # or Postfix do.
3096 79         244 #
3097 79         275 my $rcvd = $self->get('Received');
3098 79         295 $rcvd =~ s/\n[ \t]+/ /gs;
3099             $rcvd =~ s/\n+/\n/gs;
3100 79         366  
  79         173  
3101 79         131 my @rcvdlines = split(/\n/, $rcvd, 4); pop @rcvdlines; # forget last one
3102 79         262 my @rcvdaddrs;
3103 43 100       227 foreach my $line (@rcvdlines) {
  2         7  
3104             if ($line =~ / for (\S+\@\S+);/) { push (@rcvdaddrs, $1); }
3105             }
3106              
3107 79         398 @addrs = $self->{main}->find_all_addrs_in_line (
3108             join('',
3109             join(" ", @rcvdaddrs)."\n",
3110             $self->get('To'), # std
3111             $self->get('Apparently-To'), # sendmail, from envelope
3112             $self->get('Delivered-To'), # Postfix, poss qmail
3113             $self->get('Envelope-Recipients'), # qmail: new-inject(1)
3114             $self->get('Apparently-Resent-To'), # procmailrc manpage
3115             $self->get('X-Envelope-To'), # procmailrc manpage
3116             $self->get('Envelope-To'), # exim
3117             $self->get('X-Delivered-To'), # procmail quick start
3118             $self->get('X-Original-To'), # procmail quick start
3119             $self->get('X-Rcpt-To'), # procmail quick start
3120             $self->get('X-Real-To'), # procmail quick start
3121             $self->get('Cc'))); # std
3122             # those are taken from various sources; thanks to Nancy McGough, who
3123             # noted some in <http://www.ii.com/internet/robots/procmail/qs/#envelope>
3124             }
3125 79         508  
3126 79         267 dbg("eval: all '*To' addrs: " . join(" ", @addrs));
3127 79         1781 $self->{all_to_addrs} = \@addrs;
3128             return @addrs;
3129              
3130             # http://www.cs.tut.fi/~jkorpela/headers.html is useful here, also
3131             # http://www.exim.org/pipermail/exim-users/Week-of-Mon-20001009/021672.html
3132             }
3133              
3134             ###########################################################################
3135              
3136             1;
3137             __END__
3138              
3139             =back
3140              
3141             =head1 SEE ALSO
3142              
3143             C<Mail::SpamAssassin>
3144             C<spamassassin>
3145