File Coverage

blib/lib/Mail/SpamAssassin/PerMsgStatus.pm

Criterion	Covered	Total	%
statement	958	1278	74.9
branch	284	560	50.7
condition	91	224	40.6
subroutine	111	125	88.8
pod	43	94	45.7
total	1487	2281	65.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::PerMsgStatus - per-message status (spam or not-spam)
21
22							=head1 SYNOPSIS
23
24							my $spamtest = new Mail::SpamAssassin ({
25							'rules_filename' => '/etc/spamassassin.rules',
26							'userprefs_filename' => $ENV{HOME}.'/.spamassassin/user_prefs'
27							});
28							my $mail = $spamtest->parse();
29
30							my $status = $spamtest->check ($mail);
31
32							my $rewritten_mail;
33							if ($status->is_spam()) {
34							$rewritten_mail = $status->rewrite_mail ();
35							}
36							...
37
38
39							=head1 DESCRIPTION
40
41							The Mail::SpamAssassin C<check()> method returns an object of this
42							class. This object encapsulates all the per-message state.
43
44							=head1 METHODS
45
46							=over 4
47
48							=cut
49
50
51							use strict;
52	41			41		251	use warnings;
	41					74
	41					1177
53	41			41		208	use re 'taint';
	41					102
	41					1200
54	41			41		262
	41					96
	41					1346
55							use Errno qw(ENOENT);
56	41			41		237	use Time::HiRes qw(time);
	41					109
	41					1976
57	41			41		271
	41					101
	41					512
58							use Mail::SpamAssassin::Constants qw(:sa);
59	41			41		5211	use Mail::SpamAssassin::AsyncLoop;
	41					121
	41					5493
60	41			41		11901	use Mail::SpamAssassin::Conf;
	41					1425
	41					1304
61	41			41		299	use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize is_fqdn_valid);
	41					679
	41					988
62	41			41		195	use Mail::SpamAssassin::Timeout;
	41					83
	41					2362
63	41			41		13263	use Mail::SpamAssassin::Logger;
	41					113
	41					1209
64	41			41		240
	41					72
	41					42723
65							our @ISA = qw();
66
67							# methods defined by the compiled ruleset; deleted in finish_tests()
68							our @TEMPORARY_METHODS;
69
70							# methods defined by register_plugin_eval_glue(); deleted in finish_tests()
71							our %TEMPORARY_EVAL_GLUE_METHODS;
72
73							###########################################################################
74
75							our %common_tags;
76
77							BEGIN {
78							%common_tags = (
79
80							YESNO => sub {
81							my $pms = shift;
82	34					71	$pms->_get_tag_value_for_yesno(@_);
83	34					135	},
84
85							YESNOCAPS => sub {
86							my $pms = shift;
87	2					5	uc $pms->_get_tag_value_for_yesno(@_);
88	2					10	},
89
90							SCORE => sub {
91							my $pms = shift;
92	38					103	$pms->_get_tag_value_for_score(@_);
93	38					132	},
94
95							HITS => sub {
96							my $pms = shift;
97	0					0	$pms->_get_tag_value_for_score(@_);
98	0					0	},
99
100							REQD => sub {
101							my $pms = shift;
102	38					64	$pms->_get_tag_value_for_required_score(@_);
103	38					176	},
104
105							VERSION => \&Mail::SpamAssassin::Version,
106
107							SUBVERSION => sub { $Mail::SpamAssassin::SUB_VERSION },
108	34					88
109							RULESVERSION => sub {
110							my $pms = shift;
111	0					0	my $conf = $pms->{conf};
112	0					0	my @fnames;
113	0					0	@fnames =
114							keys %{$conf->{update_version}} if $conf->{update_version};
115	0	0				0	@fnames = sort @fnames if @fnames > 1;
	0					0
116	0	0				0	join(',', map($conf->{update_version}{$_}, @fnames));
117	0					0	},
118
119							HOSTNAME => sub {
120							my $pms = shift;
121	39					84	$pms->{conf}->{report_hostname} \|\|
122							Mail::SpamAssassin::Util::fq_hostname();
123	39	50				282	},
124
125							REMOTEHOSTNAME => sub {
126							my $pms = shift;
127	0					0	$pms->{tag_data}->{'REMOTEHOSTNAME'} \|\| "localhost";
128	0	0				0	},
129
130							REMOTEHOSTADDR => sub {
131							my $pms = shift;
132	0					0	$pms->{tag_data}->{'REMOTEHOSTADDR'} \|\| "127.0.0.1";
133	0	0				0	},
134
135							LASTEXTERNALIP => sub {
136							my $pms = shift;
137	0					0	my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
138	0					0	$lasthop ? $lasthop->{ip} : '';
139	0	0				0	},
140
141							LASTEXTERNALRDNS => sub {
142							my $pms = shift;
143	0					0	my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
144	0					0	$lasthop ? $lasthop->{rdns} : '';
145	0	0				0	},
146
147							LASTEXTERNALHELO => sub {
148							my $pms = shift;
149	0					0	my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
150	0					0	$lasthop ? $lasthop->{helo} : '';
151	0	0				0	},
152
153							CONTACTADDRESS => sub {
154							my $pms = shift;
155	4					10	$pms->{conf}->{report_contact};
156	4					16	},
157
158							BAYES => sub {
159							my $pms = shift;
160	0					0	defined $pms->{bayes_score} ? sprintf("%3.4f", $pms->{bayes_score})
161							: "0.5";
162	0	0				0	},
163
164							DATE => sub { Mail::SpamAssassin::Util::time_to_rfc822_date() },
165	0					0
166							STARS => sub {
167							my $pms = shift;
168	34					71	my $arg = (shift \|\| "*");
169	34		50			132	my $length = int($pms->{score});
170	34					93	$length = 50 if $length > 50;
171	34	50				95	# avoid a perl 5.21 warning: "Negative repeat count does nothing"
172							$length > 0 ? $arg x $length : '';
173	34	100				143	},
174
175							AUTOLEARN => sub {
176							my $pms = shift;
177	34					97	$pms->get_autolearn_status();
178	34					150	},
179
180							AUTOLEARNSCORE => sub {
181							my $pms = shift;
182	0					0	$pms->get_autolearn_points();
183	0					0	},
184
185							TESTS => sub {
186							my $pms = shift;
187	34					61	my $arg = (shift \|\| ',');
188	34		50			143	join($arg, sort @{$pms->{test_names_hit}}) \|\| "none";
189	34	100				73	},
	34					226
190
191							SUBTESTS => sub {
192							my $pms = shift;
193	0					0	my $arg = (shift \|\| ',');
194	0		0			0	join($arg, sort @{$pms->{subtest_names_hit}}) \|\| "none";
195	0	0				0	},
	0					0
196
197							SUBTESTSCOLLAPSED => sub {
198							my $pms = shift;
199	0					0	my $arg = (shift \|\| ',');
200	0		0			0	my (@subtests) = $pms->get_names_of_subtests_hit("collapsed");
201	0					0	join($arg, sort @subtests) \|\| "none";
202	0	0				0	},
203
204							TESTSSCORES => sub {
205							my $pms = shift;
206	0					0	my $arg = (shift \|\| ",");
207	0		0			0	my $scores = $pms->{conf}->{scores};
208	0					0	join($arg, map($_ . "=" . ($scores->{$_} \|\| '0'),
209							sort @{$pms->{test_names_hit}})) \|\| "none";
210	0	0	0			0	},
	0					0
211
212							PREVIEW => sub {
213							my $pms = shift;
214	4					9	$pms->get_content_preview();
215	4					24	},
216
217							REPORT => sub {
218							my $pms = shift;
219	3					7	"\n" . ($pms->{tag_data}->{REPORT} \|\| "");
220	3		50			25	},
221
222							SUBJPREFIX => sub {
223							my $pms = shift;
224	0					0	($pms->{tag_data}->{SUBJPREFIX} \|\| "");
225	0	0				0	},
226
227							HEADER => sub {
228							my $pms = shift;
229	0					0	my $hdr = shift;
230	0					0	return if !$hdr;
231	0	0				0	$pms->get($hdr,undef);
232	0					0	},
233
234							TIMING => sub {
235							my $pms = shift;
236	0					0	$pms->{main}->timer_report();
237	0					0	},
238
239							ADDEDHEADERHAM => sub {
240							my $pms = shift;
241	0					0	$pms->_get_added_headers('headers_ham');
242	0					0	},
243
244							ADDEDHEADERSPAM => sub {
245							my $pms = shift;
246	0					0	$pms->_get_added_headers('headers_spam');
247	0					0	},
248
249							ADDEDHEADER => sub {
250							my $pms = shift;
251	0					0	$pms->_get_added_headers(
252							$pms->{is_spam} ? 'headers_spam' : 'headers_ham');
253	0	0				0	},
254
255							);
256	41			41		525826	}
257
258							my $class = shift;
259							$class = ref($class) \|\| $class;
260	155			155	0	760	my ($main, $msg, $opts) = @_;
261	155		33			931
262	155					465	my $self = {
263							'main' => $main,
264							'msg' => $msg,
265							'score' => 0,
266							'test_log_msgs' => { },
267							'test_names_hit' => [ ],
268							'subtest_names_hit' => [ ],
269							'spamd_result_log_items' => [ ],
270							'tests_already_hit' => { },
271							'c' => { },
272							'tag_data' => { },
273							'rule_errors' => 0,
274							'disable_auto_learning' => 0,
275							'auto_learn_status' => undef,
276							'auto_learn_force_status' => undef,
277							'conf' => $main->{conf},
278							'async' => Mail::SpamAssassin::AsyncLoop->new($main),
279							'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
280							'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
281							'uri_detail_list' => { },
282	155					2112	'subjprefix' => "",
283							};
284
285							dbg("check: pms new, time limit in %.3f s",
286							$self->{master_deadline} - time) if $self->{master_deadline};
287
288	155	50				1502	if (defined $opts && $opts->{disable_auto_learning}) {
289							$self->{disable_auto_learning} = 1;
290	155	50	66			747	}
291	49					123
292							# used with "mass-check --loghits"
293							if ($self->{main}->{save_pattern_hits}) {
294							$self->{save_pattern_hits} = 1;
295	155	50				505	$self->{pattern_hits} = { };
296	0					0	}
297	0					0
298							delete $self->{should_log_rule_hits};
299							my $dbgcache = would_log('dbg', 'rules');
300	155					277	if ($dbgcache \|\| $self->{save_pattern_hits}) {
301	155					641	$self->{should_log_rule_hits} = 1;
302	155	50	33			1016	}
303	0					0
304							# known valid tags that might not get their entry in pms->{tag_data}
305							# in some circumstances
306							my $tag_data_ref = $self->{tag_data};
307							foreach (qw(SUMMARY REPORT SUBJPREFIX RBL)) { $tag_data_ref->{$_} = '' }
308	155					342	foreach (qw(AWL AWLMEAN AWLCOUNT AWLPRESCORE
309	155					431	DCCB DCCR DCCREP PYZOR DKIMIDENTITY DKIMDOMAIN DKIMSELECTOR
	620					1553
310	155					531	BAYESTC BAYESTCLEARNED BAYESTCSPAMMY BAYESTCHAMMY
311							HAMMYTOKENS SPAMMYTOKENS TOKENSUMMARY)) {
312							$tag_data_ref->{$_} = undef; # exist, but undefined
313							}
314	2790					6106
315							bless ($self, $class);
316							$self;
317	155					458	}
318	155					570
319							my ($self) = shift;
320							local $@;
321							eval { $self->delete_fulltext_tmpfile() }; # Bug 5808
322	152			152		3164707	}
323	152					331
324	152					409	###########################################################################
	152					718
325
326							=item $status->check ()
327
328							Runs the SpamAssassin rules against the message pointed to by the object.
329
330							=cut
331
332							my ($self) = shift;
333							my $master_deadline = $self->{master_deadline};
334							if (!$master_deadline) {
335							$self->check_timed(@_);
336	97			97	1	248	} else {
337	97					321	my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
338	97	50				277	my $err = $t->run(sub { $self->check_timed(@_) });
339	0					0	if (time > $master_deadline && !$self->{deadline_exceeded}) {
340							info("check: exceeded time limit in pms check");
341	97					1130	$self->{deadline_exceeded} = 1;
342	97			97		1022	}
	97					442
343	96	50	33			929	}
344	0					0	}
345	0					0
346							my ($self) = @_;
347							local ($_);
348
349							$self->{learned_points} = 0;
350							$self->{body_only_points} = 0;
351	97			97	0	297	$self->{head_only_points} = 0;
352	97					207	$self->{score} = 0;
353
354	97					401	# clear NetSet cache before every check to prevent it growing too large
355	97					366	foreach my $nset_name (qw(internal_networks trusted_networks msa_networks)) {
356	97					230	my $netset = $self->{conf}->{$nset_name};
357	97					218	$netset->ditch_cache() if $netset;
358							}
359
360	97					292	$self->{main}->call_plugins ("check_start", { permsgstatus => $self });
361	291					702
362	291	50				1167	# in order of slowness; fastest first, slowest last.
363							# we do ALL the tests, even if a spam triggers lots of them early on.
364							# this lets us see ludicrously spammish mails (score: 40) etc., which
365	97					647	# we can then immediately submit to spamblocking services.
366							#
367							# TODO: change this to do whitelist/blacklists first? probably a plan
368							# NOTE: definitely need AWL stuff last, for regression-to-mean of score
369
370							# TVD: we may want to do more than just clearing out the headers, but ...
371							$self->{msg}->delete_header('X-Spam-.*');
372
373							# Resident Mail::SpamAssassin code will possibly never change score
374							# sets, even if bayes becomes available. So we should do a quick check
375							# to see if we should go from {0,1} to {2,3}. We of course don't need
376	97					648	# to do this switch if we're already using bayes ... ;)
377							my $set = $self->{conf}->get_score_set();
378							if (($set & 2) == 0 && $self->{main}->{bayes_scanner} && $self->{main}->{bayes_scanner}->is_scan_available() && $self->{conf}->{use_bayes_rules}) {
379							dbg("check: scoreset $set but bayes is available, switching scoresets");
380							$self->{conf}->set_score_set ($set\|2);
381							}
382	97					533	dbg("check: using scoreset $set in M:S:Pms");
383	97	0	66			1078	# The primary check functionality occurs via a plugin call. For more
			66
			33
384	0					0	# information, please see: Mail::SpamAssassin::Plugin::Check
385	0					0	if (!$self->{main}->call_plugins ("check_main", { permsgstatus => $self }))
386							{
387	97					557	# did anything happen? if not, this is fatal
388							if (!$self->{main}->have_plugin("check_main")) {
389							die "check: no loaded plugin implements 'check_main': cannot scan!\n".
390	97	100				398	"Check that the necessary '.pre' files are in the config directory.\n".
391							"At a minimum, v320.pre loads the Check plugin which is required.\n";
392							}
393	1	50				10	}
394	1					21
395							# delete temporary storage and memory allocation used during checking
396							$self->delete_fulltext_tmpfile();
397
398							# now that we've finished checking the mail, clear out this cache
399							# to avoid unforeseen side-effects.
400							$self->{c} = { };
401	96					702
402							# Round the score to 3 decimal places to avoid rounding issues
403							# We assume required_score to be properly rounded already.
404							# add 0 to force it back to numeric representation instead of string.
405	96					1055	$self->{score} = (sprintf "%0.3f", $self->{score}) + 0;
406
407							dbg("check: is spam? score=".$self->{score}.
408							" required=".$self->{conf}->{required_score});
409							dbg("check: tests=".$self->get_names_of_tests_hit());
410	96					1420	dbg("check: subtests=".$self->get_names_of_subtests_hit("dbg"));
411							$self->{is_spam} = $self->is_spam();
412
413	96					892	$self->{main}->{resolver}->bgabort();
414	96					555	$self->{main}->call_plugins ("check_end", { permsgstatus => $self });
415	96					497
416	96					378	1;
417							}
418	96					527
419	96					473	###########################################################################
420
421	96					1592	=item $status->learn()
422
423							After a mail message has been checked, this method can be called. If the score
424							is outside a certain range around the threshold, ie. if the message is judged
425							more-or-less definitely spam or definitely non-spam, it will be fed into
426							SpamAssassin's learning systems (currently the naive Bayesian classifier),
427							so that future similar mails will be caught.
428
429							=cut
430
431							my ($self) = shift;
432							my $master_deadline = $self->{master_deadline};
433							if (!$master_deadline) {
434							$self->learn_timed(@_);
435							} else {
436							my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
437	96			96	1	203	my $err = $t->run(sub { $self->learn_timed(@_) });
438	96					186	if (time > $master_deadline && !$self->{deadline_exceeded}) {
439	96	50				273	info("learn: exceeded time limit in pms learn");
440	0					0	$self->{deadline_exceeded} = 1;
441							}
442	96					414	}
443	96			96		840	}
	96					602
444	96	50	33			816
445	0					0	my ($self) = @_;
446	0					0
447							if (!$self->{conf}->{bayes_auto_learn} \|\|
448							!$self->{conf}->{use_bayes} \|\|
449							$self->{disable_auto_learning})
450							{
451							$self->{auto_learn_status} = "disabled";
452	96			96	0	267	return;
453							}
454	96	100	100			692
			100
455							my ($isspam, $force_autolearn, $force_autolearn_names, $arrayref);
456							$arrayref = $self->{main}->call_plugins ("autolearn_discriminator", {
457							permsgstatus => $self
458	84					262	});
459	84					1124
460							$isspam = $arrayref->[0];
461							$force_autolearn = $arrayref->[1];
462	12					36	$force_autolearn_names = $arrayref->[2];
463	12					61
464							#AUTOLEARN_FORCE FLAG INFORMATION
465							if (defined $force_autolearn and $force_autolearn > 0) {
466							$self->{auto_learn_force_status} = "yes";
467	12					48	if (defined $force_autolearn_names) {
468	12					24	$self->{auto_learn_force_status} .= " ($force_autolearn_names)";
469	12					19	}
470							} else {
471							$self->{auto_learn_force_status} = "no";
472	12	50	66			70	}
473	0					0
474	0	0				0	if (!defined $isspam) {
475	0					0	$self->{auto_learn_status} = 'no';
476							return;
477							}
478	12					28
479
480							my $timer = $self->{main}->time_method("learn");
481	12	100				59
482	10					28	$self->{main}->call_plugins ("autolearn", {
483	10					144	permsgstatus => $self,
484							isspam => $isspam
485							});
486
487	2					11	# bug 3704: temporarily override learn's ability to re-learn a message
488							my $orig_learner = $self->{main}->init_learner({ "no_relearn" => 1 });
489	2					11
490							my $eval_stat;
491							eval {
492							my $learnstatus = $self->{main}->learn ($self->{msg}, undef, $isspam, 0);
493							if ($learnstatus->did_learn()) {
494							$self->{auto_learn_status} = $isspam ? "spam" : "ham";
495	2					13	}
496							# This must wait until the did_learn call.
497	2					5	$learnstatus->finish();
498							$self->{main}->finish_learner(); # for now
499	2					12
500	2	50				8	if (exists $self->{main}->{bayes_scanner}) {
501	0	0				0	$self->{main}->{bayes_scanner}->force_close();
502							}
503							1;
504	2					8	} or do {
505	2					11	$eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
506							};
507	2	50				6
508	2					9	# reset learner options to their original values
509							$self->{main}->init_learner($orig_learner);
510	2					12
511	2	50				3	if (defined $eval_stat) {
512	0	0				0	dbg("learn: auto-learning failed: $eval_stat");
	0					0
513							$self->{auto_learn_status} = "failed";
514							}
515							}
516	2					9
517							=item $score = $status->get_autolearn_points()
518	2	50				52
519	0					0	Return the message's score as computed for auto-learning. Certain tests are
520	0					0	ignored:
521
522							- rules with tflags set to 'learn' (the Bayesian rules)
523
524							- rules with tflags set to 'userconf' (user white/black-listing rules, etc)
525
526							- rules with tflags set to 'noautolearn'
527
528							Also note that auto-learning occurs using scores from either scoreset 0 or 1,
529							depending on what scoreset is used during message check. It is likely that the
530							message check and auto-learn scores will be different.
531
532							=cut
533
534							my ($self) = @_;
535							$self->_get_autolearn_points();
536							return $self->{autolearn_points};
537							}
538
539							=item $score = $status->get_head_only_points()
540
541							Return the message's score as computed for auto-learning, ignoring
542	12			12	1	28	all rules except for header-based ones.
543	12					65
544	12					49	=cut
545
546							my ($self) = @_;
547							$self->_get_autolearn_points();
548							return $self->{head_only_points};
549							}
550
551							=item $score = $status->get_learned_points()
552
553							Return the message's score as computed for auto-learning, ignoring
554							all rules except for learning-based ones.
555	12			12	1	30
556	12					30	=cut
557	12					44
558							my ($self) = @_;
559							$self->_get_autolearn_points();
560							return $self->{learned_points};
561							}
562
563							=item $score = $status->get_body_only_points()
564
565							Return the message's score as computed for auto-learning, ignoring
566							all rules except for body-based ones.
567
568	12			12	1	32	=cut
569	12					31
570	12					34	my ($self) = @_;
571							$self->_get_autolearn_points();
572							return $self->{body_only_points};
573							}
574
575							=item $score = $status->get_autolearn_force_status()
576
577							Return whether a message's score included any rules that are flagged as
578							autolearn_force.
579
580							=cut
581	12			12	1	36
582	12					36	my ($self) = @_;
583	12					35	$self->_get_autolearn_points();
584							return $self->{autolearn_force};
585							}
586
587							=item $rule_names = $status->get_autolearn_force_names()
588
589							Return a list of comma separated list of rule names if a message's
590							score included any rules that are flagged as autolearn_force.
591
592							=cut
593
594	12			12	1	31	my ($self) = @_;
595	12					35	my ($names);
596	12					29
597							$self->_get_autolearn_points();
598							$names = $self->{autolearn_force_names};
599
600							if (defined $names) {
601							#remove trailing comma
602							$names =~ s/,$//;
603							} else {
604							$names = "";
605							}
606
607	12			12	1	24	return $names;
608	12					27	}
609
610	12					33	my ($self) = @_;
611	12					32
612							return if (exists $self->{autolearn_points});
613	12	50				30	# ensure it only gets computed once, even if we return early
614							$self->{autolearn_points} = 0;
615	0					0
616							# This function needs to use use sum($score[scoreset % 2]) not just {score}.
617	12					28	# otherwise we shift what we autolearn on and it gets really weird. - tvd
618							my $orig_scoreset = $self->{conf}->get_score_set();
619							my $new_scoreset = $orig_scoreset;
620	12					29	my $scores = $self->{conf}->{scores};
621
622							if (($orig_scoreset & 2) == 0) { # we don't need to recompute
623							dbg("learn: auto-learn: currently using scoreset $orig_scoreset");
624	72			72		106	}
625							else {
626	72	100				159	$new_scoreset = $orig_scoreset & ~2;
627							dbg("learn: auto-learn: currently using scoreset $orig_scoreset, recomputing score based on scoreset $new_scoreset");
628	12					49	$scores = $self->{conf}->{scoreset}->[$new_scoreset];
629							}
630
631							my $tflags = $self->{conf}->{tflags};
632	12					64	my $points = 0;
633	12					27
634	12					26	# Just in case this function is called multiple times, clear out the
635							# previous calculated values
636	12	50				54	$self->{learned_points} = 0;
637	12					59	$self->{body_only_points} = 0;
638							$self->{head_only_points} = 0;
639							$self->{autolearn_force} = 0;
640	0					0
641	0					0	foreach my $test (@{$self->{test_names_hit}}) {
642	0					0	# According to the documentation, noautolearn, userconf, and learn
643							# rules are ignored for autolearning.
644							if (exists $tflags->{$test}) {
645	12					39	next if $tflags->{$test} =~ /\bnoautolearn\b/;
646	12					23	next if $tflags->{$test} =~ /\buserconf\b/;
647
648							# Keep track of the learn points for an additional autolearn check.
649							# Use the original scoreset since it'll be 0 in sets 0 and 1.
650	12					29	if ($tflags->{$test} =~ /\blearn\b/) {
651	12					26	# we're guaranteed that the score will be defined
652	12					18	$self->{learned_points} += $self->{conf}->{scoreset}->[$orig_scoreset]->{$test};
653	12					33	next;
654							}
655	12					21
	12					60
656							#IF ANY RULES ARE AUTOLEARN FORCE, SET THAT FLAG
657							if ($tflags->{$test} =~ /\bautolearn_force\b/) {
658	19	100				73	$self->{autolearn_force}++;
659	6	100				26	#ADD RULE NAME TO LIST
660	5	50				50	$self->{autolearn_force_names}.="$test,";
661							}
662							}
663
664	0	0				0	# ignore tests with 0 score (or undefined) in this scoreset
665							next if !$scores->{$test};
666	0					0
667	0					0	# Go ahead and add points to the proper locations
668							# Changed logic because in testing, I was getting both head and body. Bug 5503
669							if ($self->{conf}->maybe_header_only ($test)) {
670							$self->{head_only_points} += $scores->{$test};
671	0	0				0	dbg("learn: auto-learn: adding head_only points $scores->{$test}");
672	0					0	} elsif ($self->{conf}->maybe_body_only ($test)) {
673							$self->{body_only_points} += $scores->{$test};
674	0					0	dbg("learn: auto-learn: adding body_only points $scores->{$test}");
675							} else {
676							dbg("learn: auto-learn: not considered head or body scores: $scores->{$test}");
677							}
678
679	13	50				65	$points += $scores->{$test};
680							}
681
682							# Figure out the final value we'll use for autolearning
683	13	100				65	$points = (sprintf "%0.3f", $points) + 0;
		50
684	9					38	dbg("learn: auto-learn: message score: ".$self->{score}.", computed score for autolearn: $points");
685	9					75
686							$self->{autolearn_points} = $points;
687	4					13	}
688	4					28
689							###########################################################################
690	0					0
691							=item $isspam = $status->is_spam ()
692
693	13					49	After a mail message has been checked, this method can be called. It will
694							return 1 for mail determined likely to be spam, 0 if it does not seem
695							spam-like.
696
697	12					160	=cut
698	12					131
699							my ($self) = @_;
700	12					44	# changed to test this so sub-tests can ask "is_spam" during a run
701							return ($self->{score} >= $self->{conf}->{required_score});
702							}
703
704							###########################################################################
705
706							=item $list = $status->get_names_of_tests_hit ()
707
708							After a mail message has been checked, this method can be called. It will
709							return a comma-separated string, listing all the symbolic test names
710							of the tests which were triggered by the mail.
711
712							=cut
713
714	99			99	1	198	my ($self) = @_;
715
716	99					462	return join(',', sort @{$self->{test_names_hit}});
717							}
718
719							=item $list = $status->get_names_of_tests_hit_with_scores_hash ()
720
721							After a mail message has been checked, this method can be called. It will
722							return a pointer to a hash for rule & score pairs for all the symbolic
723							test names and individual scores of the tests which were triggered by the mail.
724
725							=cut
726
727							my ($self) = @_;
728
729							#BASED ON CODE FOR TESTSSCORES TAG
730	97			97	1	260	my $scores = $self->{conf}->{scores};
731							my %testsscores;
732	97					169	$testsscores{$_} = $scores->{$_} \|\| '0' for @{$self->{test_names_hit}};
	97					653
733							return \%testsscores;
734							}
735
736							=item $list = $status->get_names_of_tests_hit_with_scores ()
737
738							After a mail message has been checked, this method can be called. It will
739							return a comma-separated string of rule=score pairs for all the symbolic
740							test names and individual scores of the tests which were triggered by the mail.
741
742							=cut
743
744	0			0	1	0	my ($self) = @_;
745
746							#BASED ON CODE FOR TESTSSCORES TAG
747	0					0	my $scores = $self->{conf}->{scores};
748	0					0	return join(',', map($_ . '=' . ($scores->{$_} \|\| '0'),
749	0		0			0	sort @{$self->{test_names_hit}})) \|\| "none";
	0					0
750	0					0	}
751
752
753							###########################################################################
754
755							=item $list = $status->get_names_of_subtests_hit ()
756
757							After a mail message has been checked, this method can be called. It will
758							return a comma-separated string, listing all the symbolic test names of the
759							meta-rule sub-tests which were triggered by the mail. Sub-tests are the
760							normally-hidden rules, which score 0 and have names beginning with two
761							underscores, used in meta rules.
762	0			0	1	0
763							If a parameter of collapsed or dbg is passed, the output will be a condensed
764							array of sub-tests with multiple hits reduced to one entry.
765	0					0
766							If the parameter of dbg is passed, the output will be a condensed string of
767	0		0			0	sub-tests with multiple hits reduced to one entry with the number of hits
768							in parentheses. Some information is also added at the end regarding the
769							multiple hits.
770
771							=cut
772
773							my ($self, $mode) = @_;
774
775							if (defined $mode && ($mode eq 'dbg' \|\| $mode eq 'collapsed')) {
776							# This routine prints only one instance of a subrule hit with a count of how many times it hit if greater than 1
777							my $total_hits = scalar(@{$self->{subtest_names_hit}});
778							return '' if !$total_hits;
779
780							my %subtest_names_hit;
781							$subtest_names_hit{$_}++ foreach @{$self->{subtest_names_hit}};
782
783							my @subtests = sort keys %subtest_names_hit;
784							my $deduplicated_hits = scalar(@subtests);
785
786							my @result;
787							foreach my $rule (@subtests) {
788							if ($subtest_names_hit{$rule} > 1) {
789							push @result, "$rule($subtest_names_hit{$rule})";
790							} else {
791							push @result, $rule;
792	96			96	1	270	}
793							}
794	96	50	33			766
			33
795							if ($mode eq 'dbg') {
796	96					224	return join(',', @result)." (Total Subtest Hits: $total_hits / Deduplicated Total Hits: $deduplicated_hits)";
	96					242
797	96	100				517	} else {
798							return join(',', @result);
799	42					69	}
800	42					100	} else {
	42					232
801							# Return the simpler string with duplicates and commas
802	42					265	return join(',', sort @{$self->{subtest_names_hit}});
803	42					94	}
804							}
805	42					68
806	42					139	###########################################################################
807	84	50				201
808	0					0	=item $num = $status->get_score ()
809
810	84					165	After a mail message has been checked, this method can be called. It will
811							return the message's score.
812
813							=cut
814	42	50				135
815	42					363	my ($self) = @_;
816							return $self->{score};
817	0					0	}
818
819							# left as backward compatibility
820							my ($self) = @_;
821	0					0	return $self->{score};
	0					0
822							}
823
824							###########################################################################
825
826							=item $num = $status->get_required_score ()
827
828							After a mail message has been checked, this method can be called. It will
829							return the score required for a mail to be considered spam.
830
831							=cut
832
833							my ($self) = @_;
834							return $self->{conf}->{required_score};
835	0			0	1	0	}
836	0					0
837							# left as backward compatibility
838							my ($self) = @_;
839							return $self->{conf}->{required_score};
840							}
841	0			0	0	0
842	0					0	###########################################################################
843
844							=item $num = $status->get_autolearn_status ()
845
846							After a mail message has been checked, this method can be called. It will
847							return one of the following strings depending on whether the mail was
848							auto-learned or not: "ham", "no", "spam", "disabled", "failed", "unavailable".
849
850							It also returns is flagged with auto_learn_force, it will also include the status
851							and the rules hit. For example: "autolearn_force=yes (AUTOLEARNTEST_BODY)"
852
853							=cut
854
855	0			0	1	0	my ($self) = @_;
856	0					0	my ($status) = $self->{auto_learn_status} \|\| "unavailable";
857
858							if (defined $self->{auto_learn_force_status}) {
859							$status .= " autolearn_force=".$self->{auto_learn_force_status};
860							}
861	0			0	0	0
862	0					0	return $status;
863							}
864
865							###########################################################################
866
867							=item $report = $status->get_report ()
868
869							Deliver a "spam report" on the checked mail message. This contains details of
870							how many spam detection rules it triggered.
871
872							The report is returned as a multi-line string, with the lines separated by
873							C<\n> characters.
874
875							=cut
876
877							my ($self) = @_;
878
879	34			34	1	77	if (!exists $self->{'report'}) {
880	34		50			117	my $report;
881
882	34	100				94	my $timer = $self->{main}->time_method("get_report");
883	2					8	$report = $self->{conf}->{report_template};
884							$report \|\|= '(no report template found)';
885
886	34					85	$report = $self->_replace_tags($report);
887
888							$report =~ s/\n*$/\n\n/s;
889							$self->{report} = $report;
890							}
891
892							return $self->{report};
893							}
894
895							###########################################################################
896
897							=item $preview = $status->get_content_preview ()
898
899							Give a "preview" of the content.
900
901							This is returned as a multi-line string, with the lines separated by C<\n>
902	4			4	1	1626	characters, containing a fully-decoded, safe, plain-text sample of the first
903							few lines of the message body.
904	4	50				17
905	4					6	=cut
906
907	4					22	my ($self) = @_;
908	4					17
909	4		50			24	my $str = '';
910							my @ary = @{$self->get_decoded_stripped_body_text_array()};
911	4					31	shift @ary; # drop the subject line
912
913	4					289	my $numlines = 3;
914	4					32	while (length ($str) < 200 && @ary && $numlines-- > 0) {
915							$str .= shift @ary;
916							}
917	4					16
918							# in case the last line was huge, trim it back to around 200 chars
919							local $1;
920							$str =~ s/^(.{200}).+$/$1 [...]/gm;
921							chomp ($str); $str .= "\n";
922
923							# now, some tidy-ups that make things look a bit prettier
924							$str =~ s/-----Original Message-----.*$//gm;
925							$str =~ s/This is a multi-part message in MIME format\.//gs;
926							$str =~ s/[-_*.]{10,}//gs;
927							$str =~ s/\s+/ /gs;
928
929							# add "Content preview:" ourselves, so that the text aligns
930							# correctly with the template -- then trim it off. We don't
931							# have to get this exactly right, but it's nicer if we
932							# make a bit of an effort ;)
933	4			4	1	11	$str = Mail::SpamAssassin::Util::wrap($str, " ", "Content preview: ", 75, 1);
934							$str =~ s/^Content preview:\s+//gs;
935	4					21
936	4					15	return $str;
	4					23
937	4					9	}
938
939	4					9	###########################################################################
940	4		33			47
			66
941	12					127	=item $msg = $status->get_message()
942
943							Return the object representing the message being scanned.
944
945	4					15	=cut
946	4					23
947	4					10	my ($self) = @_;
	4					19
948							return $self->{msg};
949							}
950	4					13
951	4					11	###########################################################################
952	4					20
953	4					53	=item $status->rewrite_mail ()
954
955							Rewrite the mail message. This will at minimum add headers, and at
956							maximum MIME-encapsulate the message text, to reflect its spam or not-spam
957							status. The function will return a scalar of the rewritten message.
958
959	4					24	The actual modifications depend on the configuration (see
960	4					34	C<Mail::SpamAssassin::Conf> for more information).
961
962	4					29	The possible modifications are as follows:
963
964							=over 4
965
966							=item To:, From: and Subject: modification on spam mails
967
968							Depending on the configuration, the To: and From: lines can have a
969							user-defined RFC 2822 comment appended for spam mail. The subject line
970							may have a user-defined string prepended to it for spam mail.
971
972							=item X-Spam-* headers for all mails
973
974	0			0	1	0	Depending on the configuration, zero or more headers with names
975	0					0	beginning with C<X-Spam-> will be added to mail depending on whether
976							it is spam or ham.
977
978							=item spam message with report_safe
979
980							If report_safe is set to true (1), then spam messages are encapsulated
981							into their own message/rfc822 MIME attachment without any modifications
982							being made.
983
984							If report_safe is set to false (0), then the message will only have the
985							above headers added/modified.
986
987							=back
988
989							=cut
990
991							my ($self) = @_;
992
993							my $timer = $self->{main}->time_method("rewrite_mail");
994							my $msg = $self->{msg}->get_mbox_separator() \|\| '';
995
996							if ($self->{is_spam} && $self->{conf}->{report_safe}) {
997							$msg .= $self->rewrite_report_safe();
998							}
999							else {
1000							$msg .= $self->rewrite_no_report_safe();
1001							}
1002
1003							return $msg;
1004							}
1005
1006							# Make the line endings in the passed string reference appropriate
1007							# for the original mail. Callers must note bug 5250: don't rewrite
1008							# the message body, since that will corrupt 8bit attachments/MIME parts.
1009							#
1010							my ($self, $strref) = @_;
1011							if ($self->{msg}->{line_ending} ne "\n") {
1012							$$strref =~ s/\r?\n/$self->{msg}->{line_ending}/gs;
1013							}
1014							}
1015
1016							my ($self, $which) = @_;
1017							my $str = '';
1018							# use string appends to put this back together -- I finally benchmarked it.
1019	34			34	1	263	# join() is 56% of the speed of just using string appends. ;)
1020							foreach my $hf_ref (@{$self->{conf}->{$which}}) {
1021	34					125	my($hfname, $hfbody) = @$hf_ref;
1022	34		100			129	my $line = $self->_process_header($hfname,$hfbody);
1023							$line = $self->qp_encode_header($line);
1024	34	100	66			144	$str .= "X-Spam-$hfname: $line\n";
1025	2					11	}
1026							return $str;
1027							};
1028	32					120
1029							# rewrite the message in report_safe mode
1030							# should not be called directly, use rewrite_mail instead
1031	34					147	#
1032							my ($self) = @_;
1033
1034							my $tag;
1035
1036							# This is the original message. We do not want to make any modifications so
1037							# we may recover it if necessary. It will be put into the new message as a
1038							# message/rfc822 MIME part.
1039	36			36		77	my $original = $self->{msg}->get_pristine();
1040	36	50				149
1041	0					0	# This is the new message.
1042							my $newmsg = '';
1043
1044							# the report charset
1045							my $report_charset = "; charset=iso-8859-1";
1046	34			34		83	if ($self->{conf}->{report_charset}) {
1047	34					55	$report_charset = "; charset=" . $self->{conf}->{report_charset};
1048							}
1049
1050	34					63	# the SpamAssassin report
	34					152
1051	168					629	my $report = $self->get_report();
1052	168					376
1053	168					481	# If there are any wide characters, need to MIME-encode in UTF-8
1054	168					829	# TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
1055							# we could try converting to that charset if possible
1056	34					115	unless ($] < 5.008 \|\| utf8::downgrade($report, 1)) {
1057							$report_charset = "; charset=utf-8";
1058							utf8::encode($report);
1059							}
1060
1061							# get original headers, "pristine" if we can do it
1062							my $from = $self->{msg}->get_pristine_header("From");
1063	2			2	0	6	my $to = $self->{msg}->get_pristine_header("To");
1064							my $cc = $self->{msg}->get_pristine_header("Cc");
1065	2					4	my $subject = $self->{msg}->get_pristine_header("Subject");
1066							my $msgid = $self->{msg}->get_pristine_header('Message-Id');
1067							my $date = $self->{msg}->get_pristine_header("Date");
1068
1069							# It'd be nice to do this with a foreach loop, but with only three
1070	2					10	# possibilities right now, it's easier not to...
1071
1072							if (defined $self->{conf}->{rewrite_header}->{Subject}) {
1073	2					5	# Add a prefix to the subject if needed
1074							$subject = "\n" if !defined $subject;
1075							if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
1076	2					7	$tag = $self->_replace_tags($self->{subjprefix});
1077	2	50				9	$tag =~ s/\n/ /gs;
1078	0					0	$subject = $tag . $subject;
1079							}
1080							# Add a SPAM prefix
1081							$tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{Subject});
1082	2					9	$tag =~ s/\n/ /gs; # strip tag's newlines
1083							$subject =~ s/^(?:\Q${tag}\E )?/${tag} /g; # For some reason the tag may already be there!?
1084							}
1085
1086							if (defined $self->{conf}->{rewrite_header}->{To}) {
1087	2	50	33			18	$to = "\n" if !defined $to;
1088	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{To});
1089	0					0	$tag =~ s/\n/ /gs; # strip tag's newlines
1090							$to =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1091							}
1092
1093	2					14	if (defined $self->{conf}->{rewrite_header}->{From}) {
1094	2					9	$from = "\n" if !defined $from;
1095	2					10	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{From});
1096	2					9	$tag =~ s/\n+//gs; # strip tag's newlines
1097	2					14	$from =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1098	2					10	}
1099
1100							# add report headers to message
1101							$newmsg .= "From: $from" if defined $from;
1102							$newmsg .= "To: $to" if defined $to;
1103	2	50				11	$newmsg .= "Cc: $cc" if defined $cc;
1104							$newmsg .= "Subject: $subject" if defined $subject;
1105	0	0				0	$newmsg .= "Date: $date" if defined $date;
1106	0	0	0			0	$newmsg .= "Message-Id: $msgid" if defined $msgid;
1107	0					0	$newmsg .= $self->_get_added_headers('headers_spam');
1108	0					0
1109	0					0	if (defined $self->{conf}->{report_safe_copy_headers}) {
1110							my %already_added = map { $_ => 1 } qw/from to cc subject date message-id/;
1111
1112	0					0	foreach my $hdr (@{$self->{conf}->{report_safe_copy_headers}}) {
1113	0					0	next if exists $already_added{lc $hdr};
1114	0					0	my @hdrtext = $self->{msg}->get_pristine_header($hdr);
1115							$already_added{lc $hdr}++;
1116
1117	2	50				17	if (lc $hdr eq "received") { # add Received at the top ...
1118	0	0				0	my $rhdr = "";
1119	0					0	foreach (@hdrtext) {
1120	0					0	$rhdr .= "$hdr: $_";
1121	0					0	}
1122							$newmsg = "$rhdr$newmsg";
1123							}
1124	2	50				9	else {
1125	0	0				0	foreach (@hdrtext) {
1126	0					0	$newmsg .= "$hdr: $_";
1127	0					0	}
1128	0					0	}
1129							}
1130							}
1131
1132	2	50				14	# jm: add a SpamAssassin Received header to note markup time etc.
1133	2	50				17	# emulates the fetchmail style.
1134	2	50				6	# tvd: do this after report_safe_copy_headers so Received will be done correctly
1135	2	50				9	$newmsg = "Received: from localhost by " .
1136	2	50				11	Mail::SpamAssassin::Util::fq_hostname() . "\n" .
1137	2	50				16	"\twith SpamAssassin (version " .
1138	2					11	Mail::SpamAssassin::Version() . ");\n" .
1139							"\t" . Mail::SpamAssassin::Util::time_to_rfc822_date() . "\n" .
1140	2	50				12	$newmsg;
1141	2					6
	12					24
1142							# MIME boundary
1143	2					6	my $boundary = "----------=_" . sprintf("%08X.%08X",time,int(rand(2 ** 32)));
	2					10
1144	0	0				0
1145	0					0	# ensure it's unique, so we can't be attacked this way
1146	0					0	while ($original =~ /^\Q${boundary}\E(?:--)?$/m) {
1147							$boundary .= "/".sprintf("%08X",int(rand(2 ** 32)));
1148	0	0				0	}
1149	0					0
1150	0					0	# determine whether Content-Disposition should be "attachment" or "inline"
1151	0					0	my $disposition;
1152							my $ct = $self->{msg}->get_header("Content-Type");
1153	0					0	if (defined $ct && $ct ne '' && $ct !~ m{text/plain}i) {
1154							$disposition = "attachment";
1155							$report .= $self->_replace_tags($self->{conf}->{unsafe_report_template});
1156	0					0	# if we wanted to defang the attachment, this would be the place
1157	0					0	}
1158							else {
1159							$disposition = "inline";
1160							}
1161
1162							my $type = "message/rfc822";
1163							$type = "text/plain" if $self->{conf}->{report_safe} > 1;
1164
1165							my $description = $self->{conf}->{'encapsulated_content_description'};
1166	2					7
1167							# Note: the message should end in blank line since mbox format wants
1168							# blank line at end and messages may be concatenated! In addition, the
1169							# x-spam-type parameter is fixed since we will use it later to recognize
1170							# original messages that can be extracted.
1171							$newmsg .= <<"EOM";
1172							MIME-Version: 1.0
1173							Content-Type: multipart/mixed; boundary="$boundary"
1174	2					19
1175							This is a multi-part message in MIME format.
1176
1177	2					58	--$boundary
1178	0					0	Content-Type: text/plain$report_charset
1179							Content-Disposition: inline
1180							Content-Transfer-Encoding: 8bit
1181
1182	2					6	$report
1183	2					11
1184	2	50	33			108	--$boundary
			33
1185	2					7	Content-Type: $type; x-spam-type=original
1186	2					10	Content-Description: $description
1187							Content-Disposition: $disposition
1188							Content-Transfer-Encoding: 8bit
1189
1190	0					0	EOM
1191
1192							my $newmsgtrailer = "\n--$boundary--\n\n";
1193	2					6
1194	2	50				9	# now fix line endings in both headers, report_safe body parts,
1195							# and new MIME boundaries and structure
1196	2					7	$self->_fixup_report_line_endings(\$newmsg);
1197							$self->_fixup_report_line_endings(\$newmsgtrailer);
1198							$newmsg .= $original.$newmsgtrailer;
1199
1200							return $newmsg;
1201							}
1202	2					43
1203							# rewrite the message in non-report_safe mode (just headers)
1204							# should not be called directly, use rewrite_mail instead
1205							#
1206							my ($self) = @_;
1207
1208							my $ntag;
1209							my $pref_subject = 0;
1210
1211							# put the pristine headers into an array
1212							# skip the X-Spam- headers, but allow the X-Spam-Prev headers to remain.
1213							# since there may be a missing header/body
1214							#
1215							my @pristine_headers = split(/^/m, $self->{msg}->get_pristine_header());
1216							for (my $line = 0; $line <= $#pristine_headers; $line++) {
1217							next unless ($pristine_headers[$line] =~ /^X-Spam-(?!Prev-)/i);
1218							splice @pristine_headers, $line, 1 while ($pristine_headers[$line] =~ /^(?:X-Spam-(?!Prev-)\|[ \t])/i);
1219							$line--;
1220							}
1221							my $separator = '';
1222							if (@pristine_headers && $pristine_headers[$#pristine_headers] =~ /^\s*$/) {
1223	2					11	$separator = pop @pristine_headers;
1224							}
1225
1226							my $addition = 'headers_ham';
1227	2					8
1228	2					6	if($self->{is_spam})
1229	2					54	{
1230							# special-case: Subject lines. ensure one exists, if we're
1231	2					34	# supposed to mark it up.
1232							my $created_subject = 0;
1233							my $subject = $self->{msg}->get_pristine_header('Subject');
1234							if (!defined($subject) && $self->{is_spam}
1235							&& exists $self->{conf}->{rewrite_header}->{'Subject'})
1236							{
1237							push(@pristine_headers, "Subject: \n");
1238	32			32	0	64	$created_subject = 1;
1239							}
1240	32					55
1241	32					47	# Deal with header rewriting
1242							foreach (@pristine_headers) {
1243							# if we're not going to do a rewrite, skip this header!
1244							next if (!/^(From\|Subject\|To):/i);
1245							my $hdr = ucfirst(lc($1));
1246							next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1247	32					132
1248	32					155	# pop the original version onto the end of the header array
1249	120	50				426	if ($created_subject) {
1250	0					0	push(@pristine_headers, "X-Spam-Prev-Subject: (nonexistent)\n");
1251	0					0	} else {
1252							push(@pristine_headers, "X-Spam-Prev-$_");
1253	32					63	}
1254	32	50	33			333
1255	32					90	# Figure out the rewrite piece
1256							my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{$hdr});
1257							$tag =~ s/\n/ /gs;
1258	32					78
1259							# The tag should be a comment for this header ...
1260	32	50				91	$tag = "($tag)" if ($hdr =~ /^(?:From\|To)$/);
1261
1262							if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1263							if($self->{subjprefix} ne "") {
1264	0					0	$ntag = $self->_replace_tags($self->{subjprefix});
1265	0					0	$ntag =~ s/\n/ /gs;
1266	0	0	0			0	$ntag =~ s/\s+$//;
			0
1267
1268							local $1;
1269	0					0	if(defined $ntag) {
1270	0					0	s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1271							}
1272							}
1273							}
1274	0					0	s/^([^:]+:)[ \t]*(?:\Q${tag}\E )?/$1 ${tag} /i;
1275							}
1276	0	0				0
1277	0					0	$addition = 'headers_spam';
1278	0	0				0	} else {
1279							# special-case: Subject lines. ensure one exists, if we're
1280							# supposed to mark it up.
1281	0	0				0	my $created_subject = 0;
1282	0					0	my $subject = $self->{msg}->get_pristine_header('Subject');
1283							if (!defined($subject)
1284	0					0	&& exists $self->{conf}->{rewrite_header}->{'Subject'})
1285							{
1286							push(@pristine_headers, "Subject: \n");
1287							$created_subject = 1;
1288	0					0	}
1289	0					0
1290							# Deal with header rewriting
1291							foreach (@pristine_headers) {
1292	0	0				0	# if we're not going to do a rewrite, skip this header!
1293							next if (!/^(Subject):/i);
1294	0	0	0			0	my $hdr = ucfirst(lc($1));
1295	0	0				0	next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1296	0					0
1297	0					0	if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1298	0					0	if($self->{subjprefix} ne "") {
1299							$ntag = $self->_replace_tags($self->{subjprefix});
1300	0					0	$ntag =~ s/\n/ /gs;
1301	0	0				0	$ntag =~ s/\s+$//;
1302	0					0
1303							local $1;
1304							if(defined $ntag) {
1305							s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1306	0					0	}
1307							}
1308							}
1309	0					0	}
1310
1311							}
1312
1313	32					92	# Break the pristine header set into two blocks; $new_hdrs_pre is the stuff
1314	32					113	# that we want to ensure comes before any SpamAssassin markup headers,
1315	32	50	33			242	# like the Return-Path header (see bug 3409).
1316							#
1317							# all the rest of the message headers (as left in @pristine_headers), is
1318	0					0	# to be placed after the SpamAssassin markup hdrs. Once one of those headers
1319	0					0	# is seen, all further headers go into that set; it's assumed that it's an
1320							# old copy of the header, or attempted spoofing, if it crops up halfway
1321							# through the headers.
1322
1323	32					107	my $new_hdrs_pre = '';
1324							if (@pristine_headers && $pristine_headers[0] =~ /^Return-Path:/i) {
1325	88	50				219	$new_hdrs_pre .= shift(@pristine_headers);
1326	0					0	while (@pristine_headers && $pristine_headers[0] =~ /^[ \t]/) {
1327	0	0				0	$new_hdrs_pre .= shift(@pristine_headers);
1328							}
1329	0	0	0			0	}
1330	0	0				0	$new_hdrs_pre .= $self->_get_added_headers($addition);
1331	0					0
1332	0					0	# fix up line endings appropriately
1333	0					0	my $newmsg = $new_hdrs_pre . join('',@pristine_headers) . $separator;
1334							$self->_fixup_report_line_endings(\$newmsg);
1335	0					0
1336	0	0				0	return $newmsg.$self->{msg}->get_pristine_body();
1337	0					0	}
1338
1339							my ($self, $text) = @_;
1340
1341							# return unchanged if there are no 8-bit characters
1342							return $text if $text !~ tr/\x00-\x7F//c;
1343
1344							my $cs = 'ISO-8859-1';
1345							if ($self->{report_charset}) {
1346							$cs = $self->{report_charset};
1347							}
1348
1349							my @hexchars = split('', '0123456789abcdef');
1350							my $ord;
1351							local $1;
1352							$text =~ s{([\x80-\xff])}{
1353							$ord = ord $1;
1354							'='.$hexchars[($ord & 0xf0) >> 4].$hexchars[$ord & 0x0f]
1355	32					87	}ges;
1356	32	50	33			217
1357	0					0	$text = '=?'.$cs.'?Q?'.$text.'?=';
1358	0		0			0
1359	0					0	dbg("markup: encoding header in $cs: $text");
1360							return $text;
1361							}
1362	32					147
1363							my ($self, $hdr_name, $hdr_data) = @_;
1364
1365	32					220	$hdr_data = $self->_replace_tags($hdr_data);
1366	32					156	$hdr_data =~ s/(?:\r?\n)+$//; # make sure there are no trailing newlines ...
1367
1368	32					149	if ($self->{conf}->{fold_headers}) {
1369							if ($hdr_data =~ /\n/) {
1370							$hdr_data =~ s/\s\n\s/\n\t/g;
1371							return $hdr_data;
1372	168			168	0	436	}
1373							else {
1374							# use '!!' instead of ': ' so it doesn't wrap on the space
1375	168	50				697	my $hdr = "X-Spam-$hdr_name!!$hdr_data";
1376							$hdr = Mail::SpamAssassin::Util::wrap($hdr, "\t", "", 79, 0, '(?<=[\s,])');
1377	0					0	# make sure there are no blank lines in headers
1378	0	0				0	# buggy wrap might not prefix blank lines with \t, so use \s* (bug 7672)
1379	0					0	$hdr =~ s/^\s*\n//gm;
1380							return (split (/!!/, $hdr, 2))[1]; # just return the data part
1381							}
1382	0					0	}
1383	0					0	else {
1384	0					0	$hdr_data =~ s/\n/ /g; # Can't have newlines in headers, unless folded
1385	0					0	return $hdr_data;
1386	0					0	}
1387	0					0	}
1388
1389							my $self = shift;
1390	0					0	my $text = shift;
1391
1392	0					0	# default to leaving the original string in place, if we cannot find
1393	0					0	# a tag for it (bug 4793)
1394							local($1,$2,$3);
1395							$text =~ s{(_(\w+?)(?:$(.*?)$)?_)}{
1396							my $full = $1;
1397	168			168		477	my $tag = $2;
1398							my $result;
1399	168					364	if ($tag =~ /^ADDEDHEADER(?:HAM\|SPAM\|)\z/) {
1400	168					382	# Bug 6278: break infinite recursion through _get_added_headers and
1401							# _get_tag on an attempt to use such tag in add_header template
1402	168	50				404	} else {
1403	168	50				316	$result = $self->get_tag_raw($tag,$3);
1404	0					0	$result = join(' ',@$result) if ref $result eq 'ARRAY';
1405	0					0	}
1406							defined $result ? $result : $full;
1407							}ge;
1408
1409	168					504	return $text;
1410	168					437	}
1411
1412							###########################################################################
1413	168					417
1414	168					856	# public API for plugins
1415
1416							=item $status->action_depends_on_tags($tags, $code, @args)
1417
1418	0					0	Enqueue the supplied subroutine reference C<$code>, to become runnable when
1419	0					0	all the specified tags become available. The C<$tags> may be a simple
1420							scalar - a tag name, or a listref of tag names. The subroutine C<&$code>
1421							when called will be passed a C<permessagestatus> object as its first argument,
1422							followed by the supplied (optional) list C<@args> .
1423
1424	174			174		255	=cut
1425	174					303
1426							my($self, $tags, $code, @args) = @_;
1427
1428							ref $code eq 'CODE'
1429	174					502	or die "action_depends_on_tags: argument must be a subroutine ref";
1430	174					1024
1431	430					934	# tag names on which the given action depends
1432	430					803	my @dep_tags = !ref $tags ? uc $tags : map(uc($_),@$tags);
1433	430					517
1434	430	50				700	# @{$self->{tagrun_subs}} list of all submitted subroutines
1435							# @{$self->{tagrun_actions}{$tag}} bitmask of action indices blocked by tag
1436							# $self->{tagrun_tagscnt}[$action_ind] count of tags still pending
1437
1438	430					712	# store action details, obtain its index
1439	430	50				995	push(@{$self->{tagrun_subs}}, [$code,@args]);
1440							my $action_ind = $#{$self->{tagrun_subs}};
1441	430	50				2098
1442							# list dependency tag names which are not already satisfied
1443							my @blocking_tags;
1444	174					677	foreach (@dep_tags) {
1445							my $data = $self->get_tag($_);
1446							if (!defined $data \|\| $data eq '') {
1447							push @blocking_tags, $_;
1448							}
1449							}
1450
1451							$self->{tagrun_tagscnt}[$action_ind] = scalar @blocking_tags;
1452							$self->{tagrun_actions}{$_}[$action_ind] = 1 for @blocking_tags;
1453
1454							if (@blocking_tags) {
1455							dbg("check: tagrun - action %s blocking on tags %s",
1456							$action_ind, join(', ',@blocking_tags));
1457							} else {
1458							dbg("check: tagrun - tag %s was ready, action %s runnable immediately: %s",
1459							join(', ',@dep_tags), $action_ind, join(', ',$code,@args));
1460							&$code($self, @args);
1461							}
1462	81			81	1	232	}
1463
1464	81	50				260	# tag_is_ready() will be called by set_tag(), indicating that a given
1465							# tag just received its value, possibly unblocking an action routine
1466							# as declared by action_depends_on_tags().
1467							#
1468	81	50				317	# Well-behaving plugins should call set_tag() once when a tag is fully
1469							# assembled and ready. Multiple calls to set the same tag value are handled
1470							# gracefully, but may result in premature activation of a pending action.
1471							# Setting tag values by plugins should not be done directly but only through
1472							# the public API set_tag(), otherwise a pending action release may be missed.
1473							#
1474							my($self, $tag) = @_;
1475	81					120	$tag = uc $tag;
	81					319
1476	81					139
	81					220
1477							if (would_log('dbg', 'check')) {
1478							my $tag_val = $self->{tag_data}{$tag};
1479	81					135	dbg("check: tagrun - tag %s is now ready, value: %s",
1480	81					181	$tag, !defined $tag_val ? '<UNDEF>'
1481	81					284	: ref $tag_val ne 'ARRAY' ? $tag_val
1482	81	50	33			408	: 'ARY:[' . join(',',@$tag_val) . ']' );
1483	81					222	}
1484							if (ref $self->{tagrun_actions}{$tag}) { # any action blocking on this tag?
1485							my $action_ind = 0;
1486							foreach my $action_pending (@{$self->{tagrun_actions}{$tag}}) {
1487	81					250	if ($action_pending) {
1488	81					301	$self->{tagrun_actions}{$tag}[$action_ind] = 0;
1489							if ($self->{tagrun_tagscnt}[$action_ind] <= 0) {
1490	81	50				205	# should not happen, warn and ignore
1491	81					357	warn "tagrun error: count for $action_ind is ".
1492							$self->{tagrun_tagscnt}[$action_ind]."\n";
1493							} elsif (! --($self->{tagrun_tagscnt}[$action_ind])) {
1494	0					0	my($code,@args) = @{$self->{tagrun_subs}[$action_ind]};
1495							dbg("check: tagrun - tag %s unblocking the action %s: %s",
1496	0					0	$tag, $action_ind, join(', ',$code,@args));
1497							&$code($self, @args);
1498							}
1499							}
1500							$action_ind++;
1501							}
1502							}
1503							}
1504
1505							# debugging aid: show actions that are still pending, waiting for their
1506							# tags to receive a value
1507							#
1508							my($self) = @_;
1509							my @tags;
1510							@tags = keys %{$self->{tagrun_actions}} if ref $self->{tagrun_actions};
1511	897			897	0	1345	for my $tag (@tags) {
1512	897					1164	my @pending_actions = grep($self->{tagrun_actions}{$tag}[$_],
1513							(0 .. $#{$self->{tagrun_actions}{$tag}}));
1514	897	50				1759	dbg("check: tagrun - tag %s is still blocking action %s",
1515	0					0	$tag, join(', ', @pending_actions)) if @pending_actions;
1516	0	0				0	}
		0
1517							}
1518
1519							=item $status->set_tag($tagname, $value)
1520
1521	897	50				2695	Set a template tag, as used in C<add_header>, report templates, etc.
1522	0					0	This API is intended for use by plugins. Tag names will be converted
1523	0					0	to an all-uppercase representation internally.
	0					0
1524	0	0				0
1525	0					0	C<$value> can be a simple scalar (string or number), or a reference to an
1526	0	0				0	array, in which case the public method get_tag will join array elements
		0
1527							using a space as a separator, returning a single string for backward
1528							compatibility.
1529	0					0
1530							C<$value> can also be a subroutine reference, which will be evaluated
1531	0					0	each time the template is expanded. The first argument passed by get_tag
	0					0
1532	0					0	to a called subroutine will be a PerMsgStatus object (this module's object),
1533							followed by optional arguments provided a caller to get_tag.
1534	0					0
1535							Note that perl supports closures, which means that variables set in the
1536							caller's scope can be accessed inside this C<sub>. For example:
1537	0					0
1538							my $text = "hello world!";
1539							$status->set_tag("FOO", sub {
1540							my $pms = shift;
1541							return $text;
1542							});
1543
1544							See C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details
1545							on how template tags are used.
1546	136			136	0	305
1547	136					212	C<undef> will be returned if a tag by that name has not been defined.
1548	136	100				424
	98					399
1549	136					476	=cut
1550
1551	79					183	my($self,$tag,$val) = @_;
	79					412
1552	79	50				506	$self->{tag_data}->{uc $tag} = $val;
1553							$self->tag_is_ready($tag);
1554							}
1555
1556							# public API for plugins
1557
1558							=item $string = $status->get_tag($tagname)
1559
1560							Get the current value of a template tag, as used in C<add_header>, report
1561							templates, etc. This API is intended for use by plugins. Tag names will be
1562							converted to an all-uppercase representation internally. See
1563							C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details on
1564							tags.
1565
1566							C<undef> will be returned if a tag by that name has not been defined.
1567
1568							=cut
1569
1570							my($self, $tag, @args) = @_;
1571
1572							return if !defined $tag;
1573							$tag = uc $tag;
1574							my $data;
1575							if (exists $common_tags{$tag}) {
1576							# tag data from traditional pre-defined tag subroutines
1577							$data = $common_tags{$tag};
1578							$data = $data->($self,@args) if ref $data eq 'CODE';
1579							$data = join(' ',@$data) if ref $data eq 'ARRAY';
1580							$data = "" if !defined $data;
1581							} elsif (exists $self->{tag_data}->{$tag}) {
1582							# tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1583							$data = $self->{tag_data}->{$tag};
1584							$data = $data->($self,@args) if ref $data eq 'CODE';
1585							$data = join(' ',@$data) if ref $data eq 'ARRAY';
1586							$data = "" if !defined $data;
1587							}
1588							return $data;
1589							}
1590	897			897	1	1831
1591	897					6869	=item $string = $status->get_tag_raw($tagname, @args)
1592	897					1838
1593							Similar to C<get_tag>, but keeps a tag name unchanged (does not uppercase it),
1594							and does not convert arrayref tag values into a single string.
1595
1596							=cut
1597
1598							my($self, $tag, @args) = @_;
1599
1600							return if !defined $tag;
1601							my $data;
1602							if (exists $common_tags{$tag}) {
1603							# tag data from traditional pre-defined tag subroutines
1604							$data = $common_tags{$tag};
1605							$data = $data->($self,@args) if ref $data eq 'CODE';
1606							$data = "" if !defined $data;
1607							} elsif (exists $self->{tag_data}->{$tag}) {
1608							# tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1609							$data = $self->{tag_data}->{$tag};
1610	95			95	1	263	$data = $data->($self,@args) if ref $data eq 'CODE';
1611							$data = "" if !defined $data;
1612	95	50				240	}
1613	95					192	return $data;
1614	95					112	}
1615	95	100				598
		50
1616							###########################################################################
1617	4					12
1618	4	50				29	# public API for plugins
1619	4	50				15
1620	4	50				14	=item $status->set_spamd_result_item($subref)
1621
1622							Set an entry for the spamd result log line. C<$subref> should be a code
1623	91					195	reference for a subroutine which will return a string in C<'name=VALUE'>
1624	91	50				257	format, similar to the other entries in the spamd result line:
1625	91	50				215
1626	91	50				287	Jul 17 14:10:47 radish spamd[16670]: spamd: result: Y 22 - ALL_NATURAL,
1627							DATE_IN_FUTURE_03_06,DIET_1,DRUGS_ERECTILE,DRUGS_PAIN,
1628	95					239	TEST_FORGED_YAHOO_RCVD,TEST_INVALID_DATE,TEST_NOREALNAME,
1629							TEST_NORMAL_HTTP_TO_IP,UNDISC_RECIPS scantime=0.4,size=3138,user=jm,
1630							uid=1000,required_score=5.0,rhost=localhost,raddr=127.0.0.1,
1631							rport=33153,mid=<9PS291LhupY>,autolearn=spam
1632
1633							C<name> and C<VALUE> must not contain C<=> or C<,> characters, as it
1634							is important that these log lines are easy to parse.
1635
1636							The code reference will be called by spamd after the message has been scanned,
1637							and the C<PerMsgStatus::check()> method has returned.
1638
1639	430			430	1	1075	=cut
1640
1641	430	50				942	my ($self, $ref) = @_;
1642	430					480	push @{$self->{spamd_result_log_items}}, $ref;
1643	430	100				1110	}
		50
1644
1645	362					576	# called by spamd
1646	362	50				1161	my ($self) = @_;
1647	362	50				759	my @ret;
1648							foreach my $ref (@{$self->{spamd_result_log_items}}) {
1649							push @ret, &$ref;
1650	68					164	}
1651	68	50				185	return @ret;
1652	68	50				181	}
1653
1654	430					968	###########################################################################
1655
1656							my($self, $arg) = @_;
1657							my($arg_spam, $arg_ham);
1658							($arg_spam, $arg_ham) = split(/,/, $arg, 2) if defined $arg;
1659							return $self->{is_spam} ? (defined $arg_spam ? $arg_spam : 'Yes')
1660							: (defined $arg_ham ? $arg_ham : 'No');
1661							}
1662
1663							my ($self, $pad) = @_;
1664
1665							my $score = sprintf("%2.1f", $self->{score});
1666							my $rscore = $self->_get_tag_value_for_required_score();
1667
1668							#Change due to bug 6419 to use Util function for consistency with spamd
1669							#and PerMessageStatus
1670							$score = Mail::SpamAssassin::Util::get_tag_value_for_score($score, $rscore, $self->{is_spam});
1671
1672							#$pad IS PROVIDED BY THE _SCORE(PAD)_ tag
1673							if (defined $pad && $pad =~ /^(0+\| +)$/) {
1674							my $count = length($1) + 3 - length($score);
1675							$score = (substr($pad, 0, $count) . $score) if $count > 0;
1676							}
1677							return $score;
1678
1679							}
1680
1681							my $self = shift;
1682							return sprintf("%2.1f", $self->{conf}->{required_score});
1683	81			81	1	223	}
1684	81					130
	81					307
1685
1686							###########################################################################
1687
1688							=item $status->finish ()
1689	0			0	0	0
1690	0					0	Indicate that this C<$status> object is finished with, and can be destroyed.
1691	0					0
	0					0
1692	0					0	If you are using SpamAssassin in a persistent environment, or checking many
1693							mail messages from one C<Mail::SpamAssassin> factory, this method should be
1694	0					0	called to ensure Perl's garbage collection will clean up old status objects.
1695
1696							=cut
1697
1698							my ($self) = @_;
1699
1700	36			36		102	$self->{main}->call_plugins ("per_msg_finish", {
1701	36					69	permsgstatus => $self
1702	36	50				108	});
1703	36	50				163
		50
		100
1704							$self->report_unsatisfied_actions;
1705
1706							# Delete out all of the members of $self. This will remove any direct
1707							# circular references and let the memory get reclaimed while also being more
1708	38			38		93	# efficient than a foreach() loop over the keys.
1709							%{$self} = ();
1710	38					246	}
1711	38					142
1712							my ($conf) = @_;
1713							foreach my $method (@TEMPORARY_METHODS) {
1714							if (defined &{$method}) {
1715	38					180	undef &{$method};
1716							}
1717							}
1718	38	50	33			128	@TEMPORARY_METHODS = (); # clear for next time
1719	0					0	%TEMPORARY_EVAL_GLUE_METHODS = ();
1720	0	0				0	}
1721
1722	38					96
1723							=item $name = $status->get_current_eval_rule_name()
1724
1725							Return the name of the currently-running eval rule. C<undef> is
1726							returned if no eval rule is currently being run. Useful for plugins
1727	76			76		104	to determine the current rule name while inside an eval test function
1728	76					319	call.
1729
1730							=cut
1731
1732							my ($self) = @_;
1733							return $self->{current_rule_name};
1734							}
1735
1736							###########################################################################
1737
1738							my ($self) = @_;
1739
1740							my $timer = $self->{main}->time_method("extract_message_metadata");
1741							$self->{msg}->extract_message_metadata($self);
1742
1743							foreach my $item (qw(
1744							relays_trusted relays_trusted_str num_relays_trusted
1745	136			136	1	26053	relays_untrusted relays_untrusted_str num_relays_untrusted
1746							relays_internal relays_internal_str num_relays_internal
1747	136					701	relays_external relays_external_str num_relays_external
1748							num_relays_unparseable last_trusted_relay_index
1749							last_internal_relay_index
1750							))
1751	136					867	{
1752							$self->{$item} = $self->{msg}->{metadata}->{$item};
1753							}
1754
1755							# TODO: International domain names (UTF-8) must be converted to
1756	136					268	# ASCII-compatible encoding (ACE) for the purpose of setting the
	136					3855
1757							# SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
1758							# (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
1759							#
1760	0			0	0	0	{ local $1;
1761	0					0	my $addr = $self->get('EnvelopeFrom:addr', undef);
1762	0	0				0	# collect a FQDN, ignoring potential trailing WSP
	0					0
1763	0					0	if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
	0					0
1764							$self->set_tag('SENDERDOMAIN', lc $1);
1765							}
1766	0					0	# TODO: the get ':addr' only returns the first address; this should be
1767	0					0	# augmented to be able to return all addresses in a header field, multiple
1768							# addresses in a From header field are allowed according to RFC 5322
1769							$addr = $self->get('From:addr', undef);
1770							if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1771							$self->set_tag('AUTHORDOMAIN', lc $1);
1772							}
1773							}
1774
1775							$self->set_tag('RELAYSTRUSTED', $self->{relays_trusted_str});
1776							$self->set_tag('RELAYSUNTRUSTED', $self->{relays_untrusted_str});
1777							$self->set_tag('RELAYSINTERNAL', $self->{relays_internal_str});
1778							$self->set_tag('RELAYSEXTERNAL', $self->{relays_external_str});
1779							$self->set_tag('LANGUAGES', $self->{msg}->get_metadata("X-Languages"));
1780
1781	0			0	1	0	# This should happen before we get called, but just in case.
1782	0					0	if (!defined $self->{msg}->{metadata}->{html}) {
1783							$self->get_decoded_stripped_body_text_array();
1784							}
1785							$self->{html} = $self->{msg}->{metadata}->{html};
1786
1787							# allow plugins to add more metadata, read the stuff that's there, etc.
1788	96			96	0	228	$self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
1789							}
1790	96					366
1791	96					536	###########################################################################
1792
1793	96					501	=item $status->get_decoded_body_text_array ()
1794
1795							Returns the message body, with B<base64> or B<quoted-printable> encodings
1796							decoded, and non-text parts or non-inline attachments stripped.
1797
1798							This is the same result text as used in 'rawbody' rules.
1799
1800							It is returned as an array of strings, with each string being a 2-4kB chunk
1801							of the body, split from boundaries if possible.
1802	1440					3012
1803							=cut
1804
1805							return $_[0]->{msg}->get_decoded_body_text_array();
1806							}
1807
1808							=item $status->get_decoded_stripped_body_text_array ()
1809
1810	96					198	Returns the message body, decoded (as described in
	96					307
1811	96					654	get_decoded_body_text_array()), with HTML rendered, and with whitespace
1812							normalized.
1813	96	100	66			505
1814	1					7	This is the same result text as used in 'body' rules.
1815
1816							It will always render text/html.
1817
1818							It is returned as an array of strings, with each string representing one
1819	96					295	'paragraph'. Paragraphs, in plain-text mails, are double-newline-separated
1820	96	100	66			1001	blocks of multi-line text.
1821	57					611
1822							=cut
1823
1824							return $_[0]->{msg}->get_rendered_body_text_array();
1825	96					403	}
1826	96					355
1827	96					339	###########################################################################
1828	96					332
1829	96					419	=item $status->get (header_name [, default_value])
1830
1831							Returns a message header, pseudo-header, real name or address.
1832	96	100				396	C<header_name> is the name of a mail header, such as 'Subject', 'To',
1833	95					361	etc. If C<default_value> is given, it will be used if the requested
1834							C<header_name> does not exist.
1835	96					334
1836							Appending C<:raw> to the header name will inhibit decoding of quoted-printable
1837							or base-64 encoded strings.
1838	96					545
1839							Appending a modifier C<:addr> to a header field name will cause everything
1840							except the first email address to be removed from the header field. It is
1841							mainly applicable to header fields 'From', 'Sender', 'To', 'Cc' along with
1842							their 'Resent-*' counterparts, and the 'Return-Path'. For example, all of
1843							the following will result in "example@foo":
1844
1845							=over 4
1846
1847							=item example@foo
1848
1849							=item example@foo (Foo Blah)
1850
1851							=item example@foo, example@bar
1852
1853							=item display: example@foo (Foo Blah), example@bar ;
1854
1855							=item Foo Blah <example@foo>
1856	110			110	1	598
1857							=item "Foo Blah" <example@foo>
1858
1859							=item "'Foo Blah'" <example@foo>
1860
1861							=back
1862
1863							Appending a modifier C<:name> to a header field name will cause everything
1864							except the first display name to be removed from the header field. It is
1865							mainly applicable to header fields containing a single mail address: 'From',
1866							'Sender', along with their 'Resent-From' and 'Resent-Sender' counterparts.
1867							For example, all of the following will result in "Foo Blah". One level of
1868							single quotes is stripped too, as it is often seen.
1869
1870							=over 4
1871
1872							=item example@foo (Foo Blah)
1873
1874							=item example@foo (Foo Blah), example@bar
1875
1876	340			340	1	1331	=item display: example@foo (Foo Blah), example@bar ;
1877
1878							=item Foo Blah <example@foo>
1879
1880							=item "Foo Blah" <example@foo>
1881
1882							=item "'Foo Blah'" <example@foo>
1883
1884							=back
1885
1886							There are several special pseudo-headers that can be specified:
1887
1888							=over 4
1889
1890							=item C<ALL> can be used to mean the text of all the message's headers.
1891							Each header is decoded and unfolded to single line, unless called with :raw.
1892
1893							=item C<ALL-TRUSTED> can be used to mean the text of all the message's headers
1894							that could only have been added by trusted relays.
1895
1896							=item C<ALL-INTERNAL> can be used to mean the text of all the message's headers
1897							that could only have been added by internal relays.
1898
1899							=item C<ALL-UNTRUSTED> can be used to mean the text of all the message's
1900							headers that may have been added by untrusted relays. To make this
1901							pseudo-header more useful for header rules the 'Received' header that was added
1902							by the last trusted relay is included, even though it can be trusted.
1903
1904							=item C<ALL-EXTERNAL> can be used to mean the text of all the message's headers
1905							that may have been added by external relays. Like C<ALL-UNTRUSTED> the
1906							'Received' header added by the last internal relay is included.
1907
1908							=item C<ToCc> can be used to mean the contents of both the 'To' and 'Cc'
1909							headers.
1910
1911							=item C<EnvelopeFrom> is the address used in the 'MAIL FROM:' phase of the SMTP
1912							transaction that delivered this message, if this data has been made available
1913							by the SMTP server.
1914
1915							=item C<MESSAGEID> is a symbol meaning all Message-Id's found in the message;
1916							some mailing list software moves the real 'Message-Id' to 'Resent-Message-Id'
1917							or 'X-Message-Id', then uses its own one in the 'Message-Id' header. The value
1918							returned for this symbol is the text from all 3 headers, separated by newlines.
1919
1920							=item C<X-Spam-Relays-Untrusted> is the generated metadata of untrusted relays
1921							the message has passed through
1922
1923							=item C<X-Spam-Relays-Trusted> is the generated metadata of trusted relays
1924							the message has passed through
1925
1926							=back
1927
1928							=cut
1929
1930							# only uses two arguments, ignores $defval
1931							my ($self, $request) = @_;
1932
1933							my $result;
1934							my $getaddr = 0;
1935							my $getname = 0;
1936							my $getraw = 0;
1937
1938							# special queries - process and strip modifiers
1939							if (index($request,':') >= 0) { # triage
1940							local $1;
1941							while ($request =~ s/:([^:]*)//) {
1942							if ($1 eq 'raw') { $getraw = 1 }
1943							elsif ($1 eq 'addr') { $getaddr = $getraw = 1 }
1944							elsif ($1 eq 'name') { $getname = 1 }
1945							}
1946							}
1947							my $request_lc = lc $request;
1948
1949							# ALL: entire pristine or semi-raw headers
1950							if ($request eq 'ALL') {
1951							return ($getraw ? $self->{msg}->get_pristine_header()
1952							: $self->{msg}->get_all_headers(0));
1953							}
1954							# ALL-TRUSTED: entire trusted raw headers
1955							elsif ($request eq 'ALL-TRUSTED') {
1956							# '+1' since we added the received header even though it's not considered
1957							# trusted, so we know that those headers can be trusted too
1958							return $self->get_all_hdrs_in_rcvd_index_range(
1959							undef, $self->{last_trusted_relay_index}+1,
1960							undef, undef, $getraw);
1961							}
1962							# ALL-INTERNAL: entire internal raw headers
1963							elsif ($request eq 'ALL-INTERNAL') {
1964							# '+1' for the same reason as in ALL-TRUSTED above
1965							return $self->get_all_hdrs_in_rcvd_index_range(
1966							undef, $self->{last_internal_relay_index}+1,
1967							undef, undef, $getraw);
1968							}
1969							# ALL-UNTRUSTED: entire untrusted raw headers
1970							elsif ($request eq 'ALL-UNTRUSTED') {
1971							# '+1' for the same reason as in ALL-TRUSTED above
1972							return $self->get_all_hdrs_in_rcvd_index_range(
1973							$self->{last_trusted_relay_index}+1, undef,
1974							undef, undef, $getraw);
1975							}
1976							# ALL-EXTERNAL: entire external raw headers
1977							elsif ($request eq 'ALL-EXTERNAL') {
1978							# '+1' for the same reason as in ALL-TRUSTED above
1979							return $self->get_all_hdrs_in_rcvd_index_range(
1980							$self->{last_internal_relay_index}+1, undef,
1981							undef, undef, $getraw);
1982							}
1983							# EnvelopeFrom: the SMTP MAIL FROM: address
1984	3306			3306		5921	elsif ($request_lc eq "\LEnvelopeFrom") {
1985							$result = $self->get_envelope_from();
1986	3306					3548	}
1987	3306					3662	# untrusted relays list, as string
1988	3306					3482	elsif ($request_lc eq "\LX-Spam-Relays-Untrusted") {
1989	3306					3418	$result = $self->{relays_untrusted_str};
1990							}
1991							# trusted relays list, as string
1992	3306	100				6628	elsif ($request_lc eq "\LX-Spam-Relays-Trusted") {
1993	747					1560	$result = $self->{relays_trusted_str};
1994	747					4135	}
1995	747	100				2386	# external relays list, as string
	1	100				5
		50
1996	742					2211	elsif ($request_lc eq "\LX-Spam-Relays-External") {
1997	4					11	$result = $self->{relays_external_str};
1998							}
1999							# internal relays list, as string
2000	3306					4739	elsif ($request_lc eq "\LX-Spam-Relays-Internal") {
2001							$result = $self->{relays_internal_str};
2002							}
2003	3306	100				17256	# ToCc: the combined recipients list
		50
		100
		50
		50
		100
		50
		50
		50
		50
		100
		100
2004							elsif ($request_lc eq "\LToCc") {
2005	15	50				94	$result = join("\n", $self->{msg}->get_header('To', $getraw));
2006							if ($result ne '') {
2007							chomp $result;
2008							$result .= ", " if $result =~ /\S/;
2009							}
2010							$result .= join("\n", $self->{msg}->get_header('Cc', $getraw));
2011							$result = undef if $result eq '';
2012	0					0	}
2013							# MESSAGEID: handle lists which move the real message-id to another
2014							# header for resending.
2015							elsif ($request eq 'MESSAGEID') {
2016							$result = join("\n", grep { defined($_) && $_ ne '' }
2017							$self->{msg}->get_header('X-Message-Id', $getraw),
2018							$self->{msg}->get_header('Resent-Message-Id', $getraw),
2019	4					23	$self->{msg}->get_header('X-Original-Message-ID', $getraw),
2020							$self->{msg}->get_header('Message-Id', $getraw));
2021							}
2022							# a conventional header
2023							else {
2024							my @results = $getraw ? $self->{msg}->raw_header($request)
2025							: $self->{msg}->get_header($request);
2026	0					0	# dbg("message: get(%s)%s = %s",
2027							# $request, $getraw?'raw':'', join(", ",@results));
2028							if (@results) {
2029							$result = join('', @results);
2030							} else { # metadata
2031							$result = $self->{msg}->get_metadata($request);
2032							}
2033	0					0	}
2034
2035							# special queries
2036							if (defined $result && ($getaddr \|\| $getname)) {
2037							local $1;
2038	110					557	$result =~ s/^[^:]+:(.);\s$/$1/gs; # 'undisclosed-recipients: ;'
2039							$result =~ s/\s+/ /g; # reduce whitespace
2040							$result =~ s/^\s+//; # leading whitespace
2041							$result =~ s/\s+$//; # trailing whitespace
2042	0					0
2043							if ($getaddr) {
2044							# Get the email address out of the header
2045							# All of these should result in "jm@foo":
2046	0					0	# jm@foo
2047							# jm@foo (Foo Blah)
2048							# jm@foo, jm@bar
2049							# display: jm@foo (Foo Blah), jm@bar ;
2050	0					0	# Foo Blah <jm@foo>
2051							# "Foo Blah" <jm@foo>
2052							# "'Foo Blah'" <jm@foo>
2053							#
2054	0					0	# strip out the (comments)
2055							$result =~ s/\s$.?$//g;
2056							# strip out the "quoted text", unless it's the only thing in the string
2057							if ($result !~ /^".*"$/) {
2058	81					317	$result =~ s/(?<!<)"[^"]*"(?!\@)//g; #" emacs
2059	81	100				303	}
2060	8					24	# Foo Blah <jm@xxx> or <jm@xxx>
2061	8	50				48	local $1;
2062							$result =~ s/^[^"<]?<(.?)>.*$/$1/;
2063	81					257	# multiple addresses on one line? remove all but first
2064	81	100				237	$result =~ s/,.*$//;
2065							}
2066							elsif ($getname) {
2067							# Get the display name out of the header
2068							# All of these should result in "Foo Blah":
2069	42	50				592	#
2070							# jm@foo (Foo Blah)
2071							# (Foo Blah) jm@foo
2072							# jm@foo (Foo Blah), jm@bar
2073	81					352	# display: jm@foo (Foo Blah), jm@bar ;
2074							# Foo Blah <jm@foo>
2075							# "Foo Blah" <jm@foo>
2076							# "'Foo Blah'" <jm@foo>
2077							#
2078	3015	100				8793	local $1;
2079							# does not handle mailbox-list or address-list or quotes well, to be improved
2080							if ($result =~ /^ \s* " (.?) (?<!\\)" \s < [^<>]* >/sx \|\|
2081	3015	100				4727	$result =~ /^ \s* (.?) \s < [^<>]* >/sx) {
2082	357					1399	$result = $1; # display-name, RFC 5322
2083							# name-addr = [display-name] angle-addr
2084	2658					5795	# display-name = phrase
2085							# phrase = 1*word / obs-phrase
2086							# word = atom / quoted-string
2087							# obs-phrase = word *(word / "." / CFWS)
2088							$result =~ s{ " ( (?: [^"\\] \| \\. )* ) " }
2089	3287	100	100			9131	{ my $s=$1; $s=~s{\\(.)}{$1}gs; $s }gsxe;
			100
2090	153					400	$result =~ s/\\"/"/gs;
2091	153					349	} elsif ($result =~ /^ [^(,]? $ (.?) $ /sx) { # legacy form
2092	153					902	# nested comments are not handled, to be improved
2093	153					616	$result = $1;
2094	153					598	} else { # no display name
2095							$result = '';
2096	153	100				359	}
		50
2097							$result =~ s/^ \s* ' \s* (.?) \s ' \s* \z/$1/sx;
2098							}
2099							}
2100							return $result;
2101							}
2102
2103							# optimized for speed
2104							# $_[0] is self
2105							# $_[1] is request
2106							# $_[2] is defval
2107							my $cache = $_[0]->{c};
2108	149					429	my $found;
2109							if (exists $cache->{$_[1]}) {
2110	149	100				481	# return cache entry if it is known
2111	148					316	# (measured hit/attempts rate on a production mailer is about 47%)
2112							$found = $cache->{$_[1]};
2113							} else {
2114	149					299	# fill in a cache entry
2115	149					410	$found = _get(@_);
2116							$cache->{$_[1]} = $found;
2117	149					449	}
2118							# if the requested header wasn't found, we should return a default value
2119							# as specified by the caller: if defval argument is present it represents
2120							# a default value even if undef; if defval argument is absent a default
2121							# value is an empty string for upwards compatibility
2122							return (defined $found ? $found : @_ > 2 ? $_[2] : '');
2123							}
2124
2125							###########################################################################
2126
2127							# uri parsing from plain text:
2128							# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
2129							# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
2130							# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
2131	4					7	# When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
2132							# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
2133	4	100	66			111	# because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
		50
2134							# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
2135	3					11	# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
2136							# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
2137							# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
2138							# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
2139							#
2140							# We get the entire URI that would be linkified before dealing with it, in order to do the right thing
2141	3					23	# with URI-encodings and redirecting URIs.
	0					0
	0					0
	0					0
2142	3					14	#
2143							# The delimiters for start of a URI in TBird are @(`{\|[\"'<>,\s in OE they are ("<\s
2144							#
2145	0					0	# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
2146							# TBird's end delimiters are )`{}\|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
2147	1					5	# OE only uses space as a delimiter, but ignores [~!@#^&()_+`-={}\|[]:";'<>?,.] at the end.
2148							#
2149	4					12	# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
2150							# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
2151							# delimiters include ( and '
2152	3287					7911	#
2153							# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
2154
2155							my ($self) = @_;
2156
2157							# Cached?
2158							return $self->{tbirdurire} if exists $self->{tbirdurire};
2159
2160	4131			4131	1	18447	# a hybrid of tbird and oe's version of uri parsing
2161	4131					4313	my $tbirdstartdelim = '><"\'`,{[(\|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2162	4131	100				7606	my $iso2022shift = "\x1b" . '\(.'; # bug 4522
2163							my $tbirdenddelim = '><"`}\]{[\|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2164							my $nonASCII = '\x80-\xff';
2165	825					1503
2166							# schemeless uri start delimiter, combo of most punctuations and delims above
2167							my $scstartdelim = qr/[\!\"\#\$\&\'\*\+\,\/\:\;\<\=\>\?\@\[\\\]\^\`\{\\|\}\~\s\x1b\xa0]/;
2168	3306					5411
2169	3306					9278	# bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
2170							# it's a common thing to find in text
2171							my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
2172							my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
2173
2174							# valid TLDs
2175	4131	100				24452	my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
		100
2176
2177							# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
2178							# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
2179							my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,2048})?/i;
2180							my $uriknownscheme = qr/(?:(?:https?\|ftp):\/\/\|(?:www\d{0,2}\|ftp)\.)[^$tbirdenddelim]{1,2048}/i;
2181							my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}\|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
2182
2183							$self->{tbirdurire} = qr/(?:\b\|(?<=$iso2022shift)\|(?<=[$tbirdstartdelim]))
2184							(?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]\|\z))) \|
2185							(?:($urimailscheme)(?=(?:[$tbirdenddelimemail]\|\z))) \|
2186							(?:(?:^\|(?<=$scstartdelim))($urischemeless)(?=(?:[$tbirdenddelim]\|\z))))/ix;
2187
2188							return $self->{tbirdurire};
2189							}
2190
2191							=item $status->get_uri_list ()
2192
2193							Returns an array of all unique URIs found in the message. It takes
2194							a combination of the URIs found in the rendered (decoded and HTML
2195							stripped) body and the URIs found when parsing the HTML in the message.
2196							Will also set $status->{uri_list} (the array as returned by this function).
2197
2198							The returned array will include the "raw" URI as well as
2199							"slightly cooked" versions. For example, the single URI
2200							'http://%77w%77.example.com/' will get turned into:
2201							( 'http://%77w%77.example.com/', 'http://www.example.com/' )
2202
2203							=cut
2204
2205							my ($self) = @_;
2206
2207							# use cached answer if available
2208							if (exists $self->{uri_list}) {
2209	131			131		331	return @{$self->{uri_list}};
2210							}
2211
2212	131	50				405	my %uris;
2213							# $self->{redirect_num} = 0;
2214
2215	131					338	# get URIs from text/HTML parsing
2216	131					236	while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
2217	131					230	if ($info->{cleaned}) {
2218	131					321	foreach (@{$info->{cleaned}}) {
2219							$uris{$_} = 1;
2220
2221	131					537	# count redirection attempts and log it
2222							# if (my @http = m{\b(https?:/{0,2})}gi) {
2223							# $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
2224							# }
2225	131					461	}
2226	131					377	}
2227							}
2228
2229	131					421	@{$self->{uri_list}} = keys %uris;
2230							# $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
2231
2232							return @{$self->{uri_list}};
2233	131					333584	}
2234	131					2483
2235	131					1908	=item $status->get_uri_detail_list ()
2236
2237	131					338651	Returns a hash reference of all unique URIs found in the message and
2238							various data about where the URIs were found in the message. It takes a
2239							combination of the URIs found in the rendered (decoded and HTML stripped)
2240							body and the URIs found when parsing the HTML in the message. Will also
2241							set $status->{uri_detail_list} (the hash reference as returned by this
2242	131					1786	function).
2243
2244							The hash format looks something like this:
2245
2246							raw_uri => {
2247							types => { a => 1, img => 1, parsed => 1, domainkeys => 1,
2248							unlinked => 1, schemeless => 1 },
2249							cleaned => [ canonicalized_uri ],
2250							anchor_text => [ "click here", "no click here" ],
2251							domains => { domain1 => 1, domain2 => 1 },
2252							hosts => { host1 => domain1, host2 => domain2 },
2253							}
2254
2255							C<raw_uri> is whatever the URI was in the message itself
2256							(http://spamassassin.apache%2Eorg/). Uris parsed from text will be prefixed
2257							with scheme if missing (http://, mailto: etc). HTML uris are as found.
2258
2259							C<types> is a hash of the HTML tags (lowercase) which referenced the
2260	131			131	1	462	raw_uri. I<parsed> is a faked type which specifies that the raw_uri was
2261							seen in the rendered text. I<domainkeys> is defined when raw_uri was found
2262							from DK/DKIM d= field. I<unlinked> is defined when it's assumed that MUA
2263	131	50				472	will not linkify uri (found in body without scheme or www. prefix).
2264	0					0	I<schemeless> is always added for uris without scheme, regardless of
	0					0
2265							linkifying (i.e. email address found in body without mailto:).
2266
2267	131					240	C<cleaned> is an array of the raw and canonicalized version of the raw_uri
2268							(http://spamassassin.apache%2Eorg/, https://spamassassin.apache.org/).
2269
2270							C<anchor_text> is an array of the anchor text (text between <a> and
2271	131					249	</a>), if any, which linked to the URI.
	619					1109
2272	488	50				835
2273	488					494	C<domains> is a hash of the domains found in the canonicalized URIs.
	488					675
2274	699					1247
2275							C<hosts> is a hash of unstripped hostnames found in the canonicalized URIs
2276							as hash keys, with their domain part stored as a value of each hash entry.
2277
2278							=cut
2279
2280							my ($self) = @_;
2281
2282							# process only once, use unique uri_detail_list_run flag,
2283							# in case add_uri_detail_list has already been called
2284	131					500	if ($self->{uri_detail_list_run}) {
	131					470
2285							return $self->{uri_detail_list};
2286							}
2287	131					247	$self->{uri_detail_list_run} = 1;
	131					766
2288
2289							my $timer = $self->{main}->time_method("get_uri_detail_list");
2290
2291							# process text parsed uris
2292							$self->_process_text_uri_list();
2293							# process html uris
2294							$self->_process_html_uri_list();
2295							# process dkim uris
2296							$self->_process_dkim_uri_list();
2297
2298							return $self->{uri_detail_list};
2299							}
2300
2301							my ($self) = @_;
2302
2303							# Use decoded stripped body, which does not contain HTML
2304							my $textary = $self->get_decoded_stripped_body_text_array();
2305							my $tbirdurire = $self->_tbirdurire;
2306							my %seen;
2307							my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
2308
2309							foreach my $text (@$textary) {
2310							# a workaround for [perl #69973] bug:
2311							# Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
2312							# Bug 6225, regexp and string should both be utf8, or none of them;
2313							# untainting string also seems to avoid the crash
2314							#
2315							# Bug 6225: untaint the string in an attempt to work around a perl crash
2316							local $_ = untaint_var($text);
2317
2318							local($1,$2,$3);
2319							while (/$tbirdurire/igo) {
2320							my $rawuri = $1\|\|$2\|\|$3;
2321							my $schost = $4;
2322							my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
2323							$rawuri =~ s/(^[^(])\).$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
2324							$rawuri =~ s/[-~!@#^&()_+=:;\'?,.]$//; # remove trailing string of punctuations that TBird ignores
2325
2326							next if exists $seen{$rawuri};
2327							$seen{$rawuri} = 1;
2328
2329							dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
2330
2331							# Quick ignore if schemeless host not valid
2332							next if defined $schost && !is_fqdn_valid($schost, 1);
2333
2334							# Ignore cid: mid: as they can be mistaken for emails,
2335							# these should not be parsed from stripped body in any case.
2336	623			623	1	912	# Example: [cid:image001.png@01D4986E.E3459640]
2337							next if $rawuri =~ /^[cm]id:/i;
2338
2339							# Ignore empty uris
2340	623	100				1120	next if $rawuri =~ /^\w+:\/{0,2}$/i;
2341	492					1128
2342							my $types = {parsed => 1};
2343	131					300
2344							# If it's a hostname that was just sitting out in the
2345	131					563	# open, without a protocol, and not inside of an HTML tag,
2346							# the we should add the proper protocol in front, rather
2347							# than using the base URI.
2348	131					814	my $uri = $rawuri;
2349							if ($uri !~ /^(?:https?\|ftp\|mailto):/i) {
2350	131					728	if ($uri =~ /^ftp\./i) {
2351							$uri = "ftp://$uri";
2352	131					598	}
2353							elsif ($uri =~ /^www\d{0,2}\./i) {
2354	131					737	$uri = "http://$uri";
2355							}
2356							elsif (index($uri, '@') != -1) {
2357							# This is not linkified by MUAs: foo@bar%2Ecom
2358	131			131		322	# This IS linkified: foo@bar%2Ebar.com
2359							# And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
2360							# And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
2361	131					436	# ...
2362	131					540	# Skip if no dot found after @, tested without urldecoding,
2363	131					314	# quick skip for crap like Vi@gra.
2364	131					679	next unless $uri =~ /\@.+?\./;
2365							next if index($uri, ' ') != -1; # ignore garbled
2366	131					645	$uri =~ s/^(?:skype\|e?-?mail)?:+//i; # strip common misparses
2367							$uri = "mailto:$uri";
2368							}
2369							else {
2370							# some spammers are using unschemed URIs to escape filters
2371							# flag that this is a URI that MUAs don't linkify so only use for RBLs
2372							# (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
2373	672					1730	next if index($uri, '.') == -1; # skip unless dot found, garbage
2374							$uri = "http://$uri";
2375	672					1618	$types->{unlinked} = 1;
2376	672					56723	}
2377	325		66			1201	# Mark any of those schemeless
2378	325					561	$types->{schemeless} = 1;
2379	325	100				772	}
		100
2380	325					685
2381	325					1601	if ($uri =~ /^mailto:/i) {
2382							# MUAs linkify and urldecode mailto:foo%40bar%2Fcom
2383	325	100				939	$uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
2384	306					668	# Skip unless @ found after decoding, then check tld is valid
2385							next unless $uri =~ /\@([^?&>]*)/;
2386	306	50				500	my $host = $1; $host =~ s/(?:\%20)+$//; # strip trailing %20 from host
2387							next unless $self->{main}->{registryboundaries}->is_domain_valid($host);
2388							}
2389	306	100	100			756
2390							dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
2391
2392							$self->add_uri_detail_list($uri, $types, 'parsed', 1);
2393							}
2394	297	50				653	}
2395							}
2396
2397	297	50				746	my ($self) = @_;
2398
2399	297					623	# get URIs from HTML parsing
2400							# use the metadata version since $self->{html} may not be setup
2401							my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} \|\| { };
2402							$self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
2403
2404							# canonicalize the HTML parsed URIs
2405	297					435	while(my($uri, $info) = each %{ $detail }) {
2406	297	100				825	if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
2407	140	100				501	# Need also to copy and uniq anchor text
		100
		100
2408	1					4	if (exists $info->{anchor_text}) {
2409							my %seen;
2410							foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
2411	59					143	push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
2412							}
2413							}
2414							}
2415							}
2416							}
2417
2418							my ($self) = @_;
2419
2420							# This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
2421	29	50				115	# this functionality
2422	29	50				76	# 2013-01-07
2423	29					48	# This functionality is re-enabled as a configuration option disabled by
2424	29					66	# default (bug 7087)
2425							# 2014-10-06
2426
2427							# Look for the domain in DK/DKIM headers
2428							if ($self->{conf}->{parse_dkim_uris}) {
2429							my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef ),
2430	51	50				113	$self->get('DKIM-Signature',undef) ));
2431	51					89	while ($dk =~ /\bd\s=\s([^;]+)/g) {
2432	51					96	my $d = $1;
2433							$d =~ s/\s+//g;
2434							# prefix with domainkeys: so it doesn't merge with identical keys
2435	140					225	$self->add_uri_detail_list("domainkeys:$d",
2436							{'domainkeys'=>1, 'nocanon'=>1, 'noclean'=>1},
2437							'domainkeys', 1);
2438	297	100				703	}
2439							}
2440	36	50				92	}
2441
2442	36	100				116	=item $status->add_uri_detail_list ($raw_uri, $types, $source, $valid_domain)
2443	34					64
	34					62
2444	34	100				114	Adds values to internal uri_detail_list. When used from Plugins, recommended
2445							to call from parsed_metadata (along with register_method_priority, -10) so
2446							other Plugins calling get_uri_detail_list() will see it.
2447	292	50				487
2448							C<raw_uri> is the URI to be added. The only required parameter.
2449	292					796
2450							C<types> is an optional hash reference, contents are added to
2451							uri_detail_list->{types} (see get_uri_detail_list for known keys).
2452							I<parsed> is default is no hash given. I<nocanon> does not run
2453							uri_list_canonicalize (no redirector, uri fixing). I<noclean> skips adding
2454							uri_detail_list->{cleaned}, so it would not be used in "uri" rule checks,
2455	131			131		386	but domain/hosts would still be used for URIBL/RBL purposes.
2456
2457							C<source> is an optional simple string, only used for debug logging purposes
2458							to identify where uri originates from (default: "parsed").
2459	131		100			1166
2460	131	50				469	C<valid_domain> is an optional boolean (0/1). If true, uri will not be
2461							added unless hostname/domain is in valid format and contains a valid TLD.
2462							(default: 0)
2463	131					254
	344					1333
2464	213	50				442	=cut
2465
2466	213	100				403	my ($self, $uri, $types, $source, $valid_domain) = @_;
2467	119					138
2468	119					137	$types = {'parsed' => 1} unless defined $types;
	203					503
	119					213
2469	119					147	$source \|\|= 'parsed';
	119					410
2470
2471							my (%domains, %hosts, %cleaned);
2472							my $udl = $self->{uri_detail_list};
2473
2474							dbg("uri: canonicalizing $source uri: $uri");
2475
2476							my @uris;
2477	131			131		304	if ($types->{nocanon}) {
2478							push @uris, $uri;
2479							} else {
2480							@uris = uri_list_canonicalize($self->{conf}->{redirector_patterns}, $uri);
2481							}
2482							foreach my $cleanuri (@uris) {
2483							# Make sure all the URIs are nice and short
2484							if (length($cleanuri) > MAX_URI_LENGTH) {
2485							$self->{'uri_truncated'} = 1;
2486							$cleanuri = substr($cleanuri, 0, MAX_URI_LENGTH);
2487	131	50				490	}
2488	131					525	dbg("uri: cleaned uri: $cleanuri");
	262					833
2489							$cleaned{$cleanuri} = 1;
2490	131					629	my ($domain, $host) = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
2491	0					0	if (defined $domain) {
2492	0					0	dbg("uri: added host: $host domain: $domain");
2493							$domains{$domain} = 1;
2494	0					0	$hosts{$host} = $domain;
2495							}
2496							}
2497
2498							# Bail out if no good uri found
2499							return unless %cleaned;
2500
2501							# Bail out if no domains/hosts found?
2502							return if $valid_domain && !%domains;
2503
2504							# Merge cleaned
2505							if (!$types->{noclean}) {
2506							if ($udl->{$uri}->{cleaned}) {
2507							$cleaned{$_} = 1 foreach (@{$udl->{$uri}->{cleaned}});
2508							}
2509							@{$udl->{$uri}->{cleaned}} = keys %cleaned;
2510							}
2511
2512							# Domains/hosts (there might not be any)
2513							$udl->{$uri}->{domains}->{$_} = 1 foreach keys %domains;
2514							$udl->{$uri}->{hosts}->{$_} = $hosts{$_} foreach keys %hosts;
2515
2516							# Types
2517							$udl->{$uri}->{types}->{$_} = 1 foreach keys %$types;
2518
2519							# Invalidate uri_list cache
2520							delete $self->{uri_list};
2521
2522							return 1;
2523							}
2524
2525
2526	505			505	1	1008	###########################################################################
2527
2528	505	50				832	my $self = shift;
2529	505		50			848	my $metarule = shift;
2530							# @_ is now the list of rules
2531	505					614
2532	505					664	foreach my $r (@_) {
2533							# dbg("rules: meta rule depends on net rule $r");
2534	505					1590	next if ($self->is_rule_complete($r));
2535
2536	505					641	dbg("rules: meta rule $metarule depends on pending rule $r, blocking");
2537	505	50				833	my $timer = $self->{main}->time_method("wait_for_pending_rules");
2538	0					0
2539							my $start = time;
2540	505					1455	$self->harvest_until_rule_completes($r);
2541							my $elapsed = sprintf "%.2f", time - $start;
2542	505					924
2543							if (!$self->is_rule_complete($r)) {
2544	717	50				1294	dbg("rules: rule $r is still not complete; exited early?");
2545	0					0	}
2546	0					0	elsif ($elapsed > 0) {
2547							my $txt = "rules: $r took $elapsed seconds to complete, for $metarule";
2548	717					1998	# Info only if something took over 1 sec to wait, prevent log flood
2549	717					1059	if ($elapsed >= 1) { info($txt); } else { dbg($txt); }
2550	717					2075	}
2551	717	100				1437	}
2552	574					1764	}
2553	574					1146
2554	574					1354	###########################################################################
2555
2556							# use a separate sub here, for brevity
2557							# called out of generated eval
2558							my ($self, $rulename) = @_;
2559	505	50				878	warn "rules: failed to run $rulename test, skipping:\n\t($@)\n";
2560							$self->{rule_errors}++;
2561							}
2562	505	100	100			1427
2563							my ($self, $function) = @_;
2564
2565	489	50				861	if (!$function) {
2566	489	100				1127	warn "rules: empty function name";
2567	1					3	return;
	1					4
2568							}
2569	489					833
	489					1408
2570							# only need to call this once per fn (globally)
2571							return if exists $TEMPORARY_EVAL_GLUE_METHODS{$function};
2572							$TEMPORARY_EVAL_GLUE_METHODS{$function} = undef;
2573	489					1489
2574	489					1411	# return if it's not an eval_plugin function
2575							return if (!exists $self->{conf}->{eval_plugins}->{$function});
2576
2577	489					1680	# return if it's been registered already
2578							return if ($self->can ($function) &&
2579							defined &{'Mail::SpamAssassin::PerMsgStatus::'.$function});
2580	489					712
2581							my $evalstr = <<"ENDOFEVAL";
2582	489					4613	{
2583							package Mail::SpamAssassin::PerMsgStatus;
2584
2585							sub $function {
2586							my (\$self) = shift;
2587							my \$plugin = \$self->{conf}->{eval_plugins}->{$function};
2588							return \$plugin->$function (\$self, \@_);
2589	0			0	0	0	}
2590	0					0
2591							1;
2592							}
2593	0					0	ENDOFEVAL
2594							eval $evalstr . '; 1' ## no critic
2595	0	0				0	or do {
2596							my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
2597	0					0	warn "rules: failed to run header tests, skipping some: $eval_stat\n";
2598	0					0	$self->{rule_errors}++;
2599							};
2600	0					0
2601	0					0	# ensure this method is deleted if finish_tests() is called
2602	0					0	push (@TEMPORARY_METHODS, $function);
2603							}
2604	0	0				0
		0
2605	0					0	###########################################################################
2606
2607							# note: only eval tests should store state in $self->{test_log_msgs};
2608	0					0	# pattern tests do not.
2609							#
2610	0	0				0	# the clearing of the test state is now inlined as:
	0					0
	0					0
2611							#
2612							# %{$self->{test_log_msgs}} = (); # clear test state
2613							#
2614							# except for this public API for plugin use:
2615
2616							=item $status->clear_test_state()
2617
2618							Clear test state, including test log messages from C<$status-E<gt>test_log()>.
2619
2620	0			0	0	0	=cut
2621	0					0
2622	0					0	my ($self) = @_;
2623							%{$self->{test_log_msgs}} = ();
2624							}
2625
2626	3773			3773	0	5877	# internal API, called only by got_hit()
2627							# TODO: refactor and merge this into that function
2628	3773	50				5813	my ($self, $rule, $score, $area, $ruletype, $desc) = @_;
2629	0					0
2630	0					0	$self->{main}->call_plugins ("hit_rule", {
2631							permsgstatus => $self,
2632							rulename => $rule,
2633							ruletype => $ruletype,
2634	3773	100				39582	score => $score
2635	283					616	});
2636
2637							# ignore meta-match sub-rules.
2638	283	50				946	if ($rule =~ /^__/) { push(@{$self->{subtest_names_hit}}, $rule); return; }
2639
2640							# this should not happen; warn about it
2641							if (!defined $score) {
2642	283	50	33			1682	warn "rules: score undef for rule '$rule' in '$area' '$desc'";
	0					0
2643							return;
2644	283					775	}
2645
2646							# this should not happen; warn about NaN (bug 3364)
2647							if ($score != $score) {
2648							warn "rules: score '$score' for rule '$rule' in '$area' '$desc'";
2649							return;
2650							}
2651
2652							# Add the rule hit to the score
2653							$self->{score} += $score;
2654
2655							push(@{$self->{test_names_hit}}, $rule);
2656							$area \|\|= '';
2657
2658	283	50		466	1	27814	if ($score >= 10 \|\| $score <= -10) {
	466			101	1	958
	466			81	0	923
	466			81	0	1448
	101			4	0	256
	101			77	0	263
	101			93	1	383
	81			81	1	250
	81			81	0	223
	81			81	1	302
	81			134	0	249
	81			93	0	251
	81			81	0	327
	4			555	0	10
	4			81	0	11
	4			81	0	58
	77			4	0	226
	77			4	0	569
	77			4	0	371
	81			4	0	256
	81			4	0	224
	81			4	0	452
	81			28	0	231
	81			4	0	235
	81			4	0	348
	81			4	0	252
	81			0	0	247
	81			77	0	415
	81			77	0	249
	81			77	0	240
	81			77	0	339
	81			77	0	235
	81			77	0	259
	81			77	0	368
	81					249
	81					204
	81					383
	81					240
	81					216
	81					339
	543					1138
	543					1019
	543					1685
	81					230
	81					240
	81					474
	81					267
	81					226
	81					370
	4					12
	4					12
	4					17
	4					9
	4					12
	4					19
	4					10
	4					15
	4					20
	4					11
	4					11
	4					17
	4					11
	4					18
	4					24
	4					11
	4					12
	4					16
	28					78
	28					64
	28					78
	4					33
	4					17
	4					20
	4					12
	4					17
	4					21
	4					8
	4					11
	4					19
	0					0
	0					0
	0					0
	77					212
	77					212
	77					323
	77					239
	77					219
	77					392
	77					233
	77					208
	77					378
	77					209
	89					267
	89					365
	89					255
	89					306
	77					335
	77					266
	77					207
	77					383
	77					222
	77					208
	77					377
2659	0	0				0	$score = sprintf("%4.0f", $score);
	0					0
2660	0					0	}
2661	0					0	else {
2662							$score = sprintf("%4.1f", $score);
2663							}
2664
2665	283					3504	# save both summaries
2666							# TODO: this is slower than necessary, if we only need one
2667							$self->{tag_data}->{REPORT} .= sprintf ("* %s %s %s%s\n%s",
2668							$score, $rule, $area,
2669							$self->_wrap_desc($desc,
2670							4+length($rule)+length($score)+length($area), "* "),
2671							($self->{test_log_msgs}->{TERSE} ?
2672							"* " . $self->{test_log_msgs}->{TERSE} : ''));
2673
2674							$self->{tag_data}->{SUMMARY} .= sprintf ("%s %-22s %s%s\n%s",
2675							$score, $rule, $area,
2676							$self->_wrap_desc($desc,
2677							3+length($rule)+length($score)+length($area), " " x 28),
2678							($self->{test_log_msgs}->{LONG} \|\| ''));
2679							if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
2680							$self->{tag_data}->{SUBJPREFIX} = $self->{subjprefix};
2681							}
2682							}
2683
2684							my ($self, $desc, $firstlinelength, $prefix) = @_;
2685
2686	0			0	1	0	my $firstline = " " x $firstlinelength;
2687	0					0	my $wrapped = Mail::SpamAssassin::Util::wrap($desc, $prefix, $firstline, $self->{conf}->{report_wrap_width}, 0);
	0					0
2688							$wrapped =~ s/^\s+//s;
2689							$wrapped;
2690							}
2691
2692							###########################################################################
2693	140			128		416
2694							=item $status->got_hit ($rulename, $desc_prepend [, name => value, ...])
2695	140					836
2696							Register a hit against a rule in the ruleset.
2697
2698							There are two mandatory arguments. These are C<$rulename>, the name of the rule
2699							that fired, and C<$desc_prepend>, which is a short string that will be
2700							prepended to the rules C<describe> string in output reports.
2701
2702							In addition, callers can supplement that with the following optional
2703	140	100				657	data:
	88					166
	88					205
	88					232
2704
2705							=over 4
2706	56	50				207
2707	4					16	=item score => $num
2708	4					19
2709							Optional: the score to use for the rule hit. If unspecified,
2710							the value from the C<Mail::SpamAssassin::Conf> object's C<{scores}>
2711							hash will be used (a configured score), and in its absence the
2712	56	50				245	C<defscore> option value.
2713	4					11
2714	16					64	=item defscore => $num
2715
2716							Optional: the score to use for the rule hit if neither the
2717							option C<score> is provided, nor a configured score value is provided.
2718	56					179
2719							=item value => $num
2720	56					114
	56					168
2721	56		100			320	Optional: the value to assign to the rule; the default value is C<1>.
2722							I<tflags multiple> rules use values of greater than 1 to indicate
2723	56	100	66			327	multiple hits. This value is accessible to meta rules.
2724	17					87
2725							=item ruletype => $type
2726
2727	55					730	Optional, but recommended: the rule type string. This is used in the
2728							C<hit_rule> plugin call, called by this method. If unset, I<'unknown'> is
2729							used.
2730
2731							=item tflags => $string
2732
2733							Optional: a string, i.e. a space-separated list of additional tflags
2734							to be appended to an existing list of flags in $self->{conf}->{tflags},
2735							such as: "nice noautolearn multiple". No syntax checks are performed.
2736
2737	56	100				427	=item description => $string
2738
2739							Optional: a custom rule description string. This is used in the
2740							C<hit_rule> plugin call, called by this method. If unset, the static
2741							description is used.
2742
2743	56		100			371	=back
2744	140	50	33			636
2745	88					151	Backward compatibility: the two mandatory arguments have been part of this API
2746							since SpamAssassin 2.x. The optional I<name=<gt>value> pairs, however, are a
2747							new addition in SpamAssassin 3.2.0.
2748
2749							=cut
2750	192			104		646
2751							my ($self, $rule, $area, %params) = @_;
2752	108					344
2753	120					496	my $conf_ref = $self->{conf};
2754	108					522
2755	108					939	my $dynamic_score_provided;
2756							my $score = $params{score};
2757							if (defined $score) { # overrides any configured scores
2758							$dynamic_score_provided = 1;
2759							} else {
2760							$score = $conf_ref->{scores}->{$rule};
2761							$score = $params{defscore} if !defined $score;
2762							}
2763
2764							# adding a hit does nothing if we don't have a score -- we probably
2765							# shouldn't have run it in the first place
2766							if (!$score) {
2767							%{$self->{test_log_msgs}} = ();
2768							return;
2769							}
2770
2771							# ensure that rule values always result in an increase
2772							# of $self->{tests_already_hit}->{$rule}:
2773							my $value = $params{value};
2774							if (!$value \|\| $value <= 0) { $value = 1 }
2775
2776							my $tflags_ref = $conf_ref->{tflags};
2777							my $tflags_add = $params{tflags};
2778							if (defined $tflags_add && $tflags_add ne '') {
2779							$_ = (!defined $_ \|\| $_ eq '') ? $tflags_add : ($_ . ' ' . $tflags_add)
2780							for $tflags_ref->{$rule};
2781							};
2782
2783							my $already_hit = $self->{tests_already_hit}->{$rule} \|\| 0;
2784							# don't count hits multiple times, unless 'tflags multiple' is on
2785							if ($already_hit && ($tflags_ref->{$rule}\|\|'') !~ /\bmultiple\b/) {
2786							%{$self->{test_log_msgs}} = ();
2787							return;
2788							}
2789
2790							$self->{tests_already_hit}->{$rule} = $already_hit + $value;
2791
2792							# default ruletype, if not specified:
2793							$params{ruletype} \|\|= 'unknown';
2794
2795							if ($dynamic_score_provided) { # copy it to static for proper reporting
2796							$conf_ref->{scoreset}->[$_]->{$rule} = $score for (0..3);
2797							$conf_ref->{scores}->{$rule} = $score;
2798							}
2799
2800							my $rule_descr = $params{description};
2801							if (defined $rule_descr) {
2802							$conf_ref->{descriptions}->{$rule} = $rule_descr; # save dynamic descr.
2803							} else {
2804							$rule_descr = $conf_ref->get_description_for_rule($rule); # static
2805							}
2806							# Bug 6880 Set Rule Description to something that says no rule
2807							#$rule_descr = $rule if !defined $rule_descr \|\| $rule_descr eq '';
2808							$rule_descr = "No description available." if !defined $rule_descr \|\| $rule_descr eq '';
2809
2810							if(defined $self->{conf}->{rewrite_header}->{Subject}) {
2811							my $rule_subjprefix = $conf_ref->{subjprefix}->{$rule};
2812							if (defined $rule_subjprefix) {
2813							dbg("subjprefix: setting Subject prefix to $rule_subjprefix");
2814							if($self->{subjprefix} !~ /\Q$rule_subjprefix\E/) {
2815							$self->{subjprefix} .= $rule_subjprefix . " "; # save dynamic subject prefix.
2816							}
2817							}
2818	132			132	1	2563	}
2819
2820	132					308	$self->_handle_hit($rule,
2821							$score,
2822	170					243	$area,
2823	182					301	$params{ruletype},
2824	180	100				440	$rule_descr);
2825	50					98
2826							# take care of duplicate rules, too (bug 5206)
2827	188					680	my $dups = $conf_ref->{duplicate_rules}->{$rule};
2828	188	50				635	if ($dups && @{$dups}) {
2829							foreach my $dup (@{$dups}) {
2830							$self->got_hit($dup, $area, %params);
2831							}
2832							}
2833	146	50				376
2834	10					24	%{$self->{test_log_msgs}} = (); # clear test logs
	10					24
2835	42					92	return 1;
2836							}
2837
2838							###########################################################################
2839
2840	128					233	# TODO: this needs API doc
2841	128	100	66			356	my ($self, $msg) = @_;
	164					268
2842							local $1;
2843	128					197	while ($msg =~ s/^(.{30,48})\s//) {
2844	128					197	$self->_test_log_line ($1);
2845	170	50	33			392	}
2846							$self->_test_log_line ($msg);
2847	42	0	0			50	}
2848
2849							my ($self, $msg) = @_;
2850	170		100			587
2851							$self->{test_log_msgs}->{TERSE} .= sprintf ("[%s]\n", $msg);
2852	170	100	50			485	if (length($msg) > 47) {
			66
2853	42					158	$self->{test_log_msgs}->{LONG} .= sprintf ("%78s\n", "[$msg]");
	0					0
2854	42					253	} else {
2855							$self->{test_log_msgs}->{LONG} .= sprintf ("%27s [%s]\n", "", $msg);
2856							}
2857	170					417	}
2858
2859							###########################################################################
2860	170		100			483
2861							# helper for get()
2862	170	100				485	my ($self) = @_;
2863	0					0
2864	84					172	# Cached?
2865							return $self->{envelopefrom} if exists $self->{envelopefrom};
2866
2867	212					327	# bug 2142:
2868	212	50				479	# Get the SMTP MAIL FROM:, aka. the "envelope sender", if our
2869	84					281	# calling app has helpfully marked up the source message
2870							# with it. Various MTAs and calling apps each have their
2871	212					1002	# own idea of what header to use for this! see
2872
2873							my $envf;
2874
2875	186	100	66			873	# Rely on the 'envelope-sender-header' header if the user has configured one.
2876							# Assume that because they have configured it, their MTA will always add it.
2877	186	50				448	# This will prevent us falling through and picking up inappropriate headers.
2878	58					69	if (defined $self->{conf}->{envelope_sender_header}) {
2879	58	0				80	# make sure we get the most recent copy - there can be only one EnvelopeSender.
2880	58					106	$envf = $self->get($self->{conf}->{envelope_sender_header}.":addr",undef);
2881	2	0				4	# ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
2882	56					104	if (defined $envf && (index($envf, '@') > 0 \|\| $envf eq '')) {
2883							dbg("message: using envelope_sender_header '%s' as EnvelopeFrom: '%s'",
2884							$self->{conf}->{envelope_sender_header}, $envf);
2885							$self->{envelopefrom} = $envf;
2886							return $envf;
2887							}
2888							# Warn them if it's configured, but not there or not usable.
2889							if (defined $envf) {
2890							dbg("message: envelope_sender_header '%s': '%s' is not valid, ignoring",
2891	184					636	$self->{conf}->{envelope_sender_header}, $envf);
2892							} else {
2893							dbg("message: envelope_sender_header '%s' not found in message",
2894	186					403	$self->{conf}->{envelope_sender_header});
2895	128	50	33			333	}
	0					0
2896	0					0	# Couldn't get envelope-sender using the configured header.
	111					267
2897	111					4798	$self->{envelopefrom} = undef;
2898							return;
2899							}
2900
2901	182					243	# User hasn't given us a header to trust, so try to guess the sender.
	186					339
2902	186					1676
2903							# use the "envelope-sender" string found in the Received headers,
2904							# if possible... use the last untrusted header, in case there's
2905							# trusted headers.
2906							my $lasthop = $self->{relays_untrusted}->[0];
2907							my $lasthop_str = 'last untrusted';
2908							if (!defined $lasthop) {
2909	58			4	0	113	# no untrusted headers? in that case, the message is ALL_TRUSTED.
2910	0					0	# use the first trusted header (ie. the oldest, originating one).
2911	58					166	$lasthop = $self->{relays_trusted}->[-1];
2912	70					182	$lasthop_str = 'first trusted';
2913							}
2914	20					52
2915							if (defined $lasthop) {
2916							$envf = $lasthop->{envfrom};
2917							# ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
2918	8			4		16	if (defined $envf && (index($envf, '@') > 0 \|\| $envf eq '')) {
2919							dbg("message: using $lasthop_str relay envelope-from as EnvelopeFrom: '$envf'");
2920	8					22	$self->{envelopefrom} = $envf;
2921	62	50				108	return $envf;
2922	62					228	}
2923							}
2924	62					156
2925							# WARNING: a lot of list software adds an X-Sender for the original env-from
2926							# (including Yahoo! Groups). Unfortunately, fetchmail will pick it up and
2927							# reuse it as the env-from for its delivery -- even though the list
2928							# software had used a different env-from in the intervening delivery. Hence,
2929							# if this header is present, and there's a fetchmail sig in the Received
2930							# lines, we cannot trust any Envelope-From headers, since they're likely to
2931							# be incorrect fetchmail guesses.
2932	96			98	0	241
2933							if (index($self->get("X-Sender"), '@') != -1) {
2934							my $rcvd = join(' ', $self->get("Received"));
2935	108	50				364	if (index($rcvd, '(fetchmail') != -1) {
2936							dbg("message: X-Sender and fetchmail signatures found, cannot trust envelope-from");
2937							$self->{envelopefrom} = undef;
2938							return;
2939							}
2940							}
2941
2942							# procmailrc notes this (we now recommend adding it to Received instead)
2943	166					275	if (defined($envf = $self->get("X-Envelope-From:addr",undef))) {
2944							# heuristic: this could have been relayed via a list which then used
2945							# a new Envelope-from. check
2946							if ($self->get("ALL") =~ /^Received:.*?^X-Envelope-From:/smi) {
2947							dbg("message: X-Envelope-From header found after 1 or more Received lines, cannot trust envelope-from");
2948	166	50				556	$self->{envelopefrom} = undef;
2949							return;
2950	60					139	} else {
2951							dbg("message: using X-Envelope-From header as EnvelopeFrom: '$envf'");
2952	62	0	0			174	$self->{envelopefrom} = $envf;
			0
2953							return $envf;
2954	72					247	}
2955	62					156	}
2956	12					69
2957							# qmail, new-inject(1)
2958							if (defined($envf = $self->get("Envelope-Sender:addr",undef))) {
2959	12	0				53	# heuristic: this could have been relayed via a list which then used
2960							# a new Envelope-from. check
2961	12					80	if ($self->get("ALL") =~ /^Received:.*?^Envelope-Sender:/smi) {
2962							dbg("message: Envelope-Sender header found after 1 or more Received lines, cannot trust envelope-from");
2963							} else {
2964	0					0	dbg("message: using Envelope-Sender header as EnvelopeFrom: '$envf'");
2965							$self->{envelopefrom} = $envf;
2966							return $envf;
2967	0					0	}
2968	50					130	}
2969
2970							# Postfix, sendmail, amavisd-new, ...
2971							# RFC 2821 requires it:
2972							# When the delivery SMTP server makes the "final delivery" of a
2973							# message, it inserts a return-path line at the beginning of the mail
2974							# data. This use of return-path is required; mail systems MUST support
2975							# it. The return-path line preserves the information in the <reverse-
2976	144					311	# path> from the MAIL command.
2977	144					339	if (defined($envf = $self->get("Return-Path:addr",undef))) {
2978	94	100				261	# heuristic: this could have been relayed via a list which then used
2979							# a new Envelope-from. check
2980							if ($self->get("ALL") =~ /^Received:.*?^Return-Path:/smi) {
2981	86					211	dbg("message: Return-Path header found after 1 or more Received lines, cannot trust envelope-from");
2982	86					165	} else {
2983							dbg("message: using Return-Path header as EnvelopeFrom: '$envf'");
2984							$self->{envelopefrom} = $envf;
2985	94	100				234	return $envf;
2986	83					132	}
2987							}
2988	83	0	0			181
			33
2989	50					642	# give up.
2990	42					67	$self->{envelopefrom} = undef;
2991	42					86	return;
2992							}
2993
2994							###########################################################################
2995
2996							# helper for get(ALL-*). get() caches its results, so don't call this
2997							# directly unless you need a range of headers not covered by the ALL-*
2998							# psuedo-headers!
2999
3000							# Get all the headers found between an index range of received headers, the
3001							# index doesn't care if we could parse the received headers or not.
3002							# Use undef for the $start_rcvd or $end_rcvd numbers to start/end with the
3003	148	50				460	# first/last header in the message, otherwise indicate the index number you
3004	15					50	# want to start/end at. Set $include_start_rcvd or $include_end_rcvd to 0 to
3005	42	0				76	# indicate you don't want to include the received header found at the start or
3006	45					71	# end indexes... basically toggles between [s,e], [s,e), (s,e], (s,e).
3007	57					144	my ($self, $start_rcvd, $end_rcvd, $include_start_rcvd, $include_end_rcvd, $getraw) = @_;
3008	57					134
3009							# prevent bad input causing us to return the first header found
3010							return if (defined $end_rcvd && $end_rcvd < 0);
3011
3012							$include_start_rcvd = 1 unless defined $include_start_rcvd;
3013	106	50				457	$include_end_rcvd = 1 unless defined $include_end_rcvd;
3014
3015							my $cur_rcvd_index = -1; # none found yet
3016	45	0				174	my $result = '';
3017	16					47
3018	16					101	my @hdrs;
3019	16					125	if ($getraw) {
3020							@hdrs = $self->{msg}->get_pristine_header() =~ /^([^ \t].*?\n)(?![ \t])/smgi;
3021	16					100	} else {
3022	12					372	@hdrs = split(/^/m, $self->{msg}->get_all_headers(0));
3023	12					25	}
3024
3025							foreach my $hdr (@hdrs) {
3026							if ($hdr =~ /^Received:/i) {
3027							$cur_rcvd_index++;
3028	106	50				754	next if (defined $start_rcvd && !$include_start_rcvd &&
3029							$start_rcvd == $cur_rcvd_index);
3030							last if (defined $end_rcvd && !$include_end_rcvd &&
3031	30	0				164	$end_rcvd == $cur_rcvd_index);
3032	20					74	}
3033							if ((!defined $start_rcvd \|\| $start_rcvd <= $cur_rcvd_index) &&
3034	12					94	(!defined $end_rcvd \|\| $cur_rcvd_index < $end_rcvd)) {
3035	12					116	$result .= $hdr;
3036	12					40	}
3037							elsif (defined $end_rcvd && $cur_rcvd_index == $end_rcvd) {
3038							$result .= $hdr;
3039							last;
3040							}
3041							}
3042							return ($result eq '' ? undef : $result);
3043							}
3044
3045							###########################################################################
3046
3047	106	100				429
3048							###########################################################################
3049
3050	1	50				4	=item $status->create_fulltext_tmpfile (fulltext_ref)
3051	4					11
3052							This function creates a temporary file containing the passed scalar
3053	5					14	reference data (typically the full/pristine text of the message).
3054	5					14	This is typically used by external programs like pyzor and dccproc, to
3055	5					13	avoid hangs due to buffering issues. Methods that need this, should
3056							call $self->create_fulltext_tmpfile($fulltext) to retrieve the temporary
3057							filename; it will be created if it has not already been.
3058
3059							Note: This can only be called once until $status->delete_fulltext_tmpfile() is
3060	97					324	called.
3061	97					244
3062							=cut
3063
3064							my ($self, $fulltext) = @_;
3065
3066							if (defined $self->{fulltext_tmpfile}) {
3067							return $self->{fulltext_tmpfile};
3068							}
3069
3070							my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
3071							$tmpfh or die "failed to create a temporary file";
3072
3073							# PerlIO's buffered print writes in 8 kB chunks - which can be slow.
3074							# print $tmpfh $$fulltext or die "error writing to $tmpf: $!";
3075							#
3076							# reducing the number of writes and bypassing extra buffering in PerlIO
3077							# speeds up writing of larger text by a factor of 2
3078	0			4	0	0	my $nwrites;
3079							for (my $ofs = 0; $ofs < length($$fulltext); $ofs += $nwrites) {
3080							$nwrites = $tmpfh->syswrite($$fulltext, length($$fulltext)-$ofs, $ofs);
3081	0	50	33			0	defined $nwrites or die "error writing to $tmpf: $!";
3082							}
3083	0	50				0	close $tmpfh or die "error closing $tmpf: $!";
3084	0	50				0
3085							$self->{fulltext_tmpfile} = $tmpf;
3086	0					0
3087	4					19	dbg("check: create_fulltext_tmpfile, written %d bytes to file %s",
3088							length($$fulltext), $tmpf);
3089	0					0
3090	0	50				0	return $self->{fulltext_tmpfile};
3091	0					0	}
3092
3093	0					0	=item $status->delete_fulltext_tmpfile ()
3094
3095							Will cleanup after a $status->create_fulltext_tmpfile() call. Deletes the
3096	0					0	temporary file and uncaches the filename.
3097	4	50				13
3098	0					0	=cut
3099	0	0	0			0
			0
3100							my ($self) = @_;
3101	0	0	0			0	if (defined $self->{fulltext_tmpfile}) {
			0
3102							if (!unlink $self->{fulltext_tmpfile}) {
3103							my $msg = sprintf("cannot unlink %s: %s", $self->{fulltext_tmpfile}, $!);
3104	0	50	33			0	# don't fuss too much if file is missing, perhaps it wasn't even created
		0	33
			33
			0
3105							if ($! == ENOENT) { warn $msg } else { die $msg }
3106	0					0	}
3107							$self->{fulltext_tmpfile} = undef;
3108							}
3109	0					0	}
3110	0					0
3111							###########################################################################
3112
3113	4	50				10	my ($self) = @_;
3114
3115							if (exists $self->{all_from_addrs}) { return @{$self->{all_from_addrs}}; }
3116
3117							my @addrs;
3118	0			88	0	0
3119							# Resent- headers take priority, if present. see bug 672
3120							my $resent = $self->get('Resent-From',undef);
3121							if (defined $resent && $resent =~ /\S/) {
3122							@addrs = $self->{main}->find_all_addrs_in_line ($resent);
3123							}
3124							else {
3125							# bug 2292: Used to use find_all_addrs_in_line() with the same
3126							# headers, but the would catch addresses in comments which caused
3127							# FNs for things like whitelist_from. Since all of these are From
3128							# headers, there should only be 1 address in each anyway (not exactly
3129							# true, RFC 2822 allows multiple addresses in a From header field),
3130							# so use the :addr code...
3131							# bug 3366: some addresses come in as 'foo@bar...', which is invalid.
3132							# so deal with the multiple periods.
3133							## no critic
3134							@addrs = map { tr/././s; $_ } grep { $_ ne '' }
3135							($self->get('From:addr'), # std
3136							$self->get('Envelope-Sender:addr'), # qmail: new-inject(1)
3137	0			4	1	0	$self->get('Resent-Sender:addr'), # procmailrc manpage
3138							$self->get('X-Envelope-From:addr'), # procmailrc manpage
3139	0	50				0	$self->get('EnvelopeFrom:addr')); # SMTP envelope
3140	0					0	# http://www.cs.tut.fi/~jkorpela/headers.html is useful here
3141							}
3142
3143	0					0	# Remove duplicate addresses
3144	4	50				17	my %addrs = map { $_ => 1 } @addrs;
3145							@addrs = keys %addrs;
3146
3147							dbg("eval: all '*From' addrs: " . join(" ", @addrs));
3148							$self->{all_from_addrs} = \@addrs;
3149							return @addrs;
3150							}
3151	0					0
3152	0					0	=item all_from_addrs_domains
3153	0					0
3154	0	50				0	This function returns all the various from addresses in a message using all_from_addrs()
3155							and then returns only the domain names.
3156	0	50				0
3157							=cut
3158	4					9
3159							my ($self) = @_;
3160	4					9
3161							if (exists $self->{all_from_addrs_domains}) {
3162							return @{$self->{all_from_addrs_domains}};
3163	4					12	}
3164
3165							#TEST POINT - my @addrs = ("test.voipquotes2.net","test.voipquotes2.co.uk");
3166							#Start with all the normal from addrs
3167							my @addrs = all_from_addrs($self);
3168
3169							dbg("eval: all '*From' addrs domains (before): " . join(" ", @addrs));
3170
3171							#Take just the domain with a dummy localpart
3172							#removing invalid and duplicate domains
3173							my(%addrs_seen, @addrs_filtered);
3174	191			191	1	408	foreach my $a (@addrs) {
3175	191	100				1254	my $domain = $self->{main}->{registryboundaries}->uri_to_domain($a);
3176	4	50				8	next if !defined $domain \|\| $addrs_seen{lc $domain}++;
3177	4					6	push(@addrs_filtered, 'dummy@'.$domain);
3178							}
3179	4	0				6
	4					4
	4					9
3180							dbg("eval: all '*From' addrs domains (after uri to domain): " .
3181	0					0	join(" ", @addrs_filtered));
3182
3183							$self->{all_from_addrs_domains} = \@addrs_filtered;
3184
3185							return @addrs_filtered;
3186							}
3187
3188	237			275	0	433	my ($self) = @_;
3189
3190	237	100				577	if (exists $self->{all_to_addrs}) { return @{$self->{all_to_addrs}}; }
	171					282
	154					923
3191
3192	79					146	my @addrs;
3193
3194							# Resent- headers take priority, if present. see bug 672
3195	79					218	my $resent = join('', $self->get('Resent-To'), $self->get('Resent-Cc'));
3196	96	50	33			411	if ($resent =~ /\S/) {
3197	17					39	@addrs = $self->{main}->find_all_addrs_in_line($resent);
3198							} else {
3199							# OK, a fetchmail trick: try to find the recipient address from
3200							# the most recent 3 Received lines. This is required for sendmail,
3201							# since it does not add a helpful header like exim, qmail
3202							# or Postfix do.
3203							#
3204							my $rcvd = $self->get('Received');
3205							$rcvd =~ s/\n[ \t]+/ /gs;
3206							$rcvd =~ s/\n+/\n/gs;
3207
3208							my @rcvdlines = split(/\n/, $rcvd, 4); pop @rcvdlines; # forget last one
3209	79					218	my @rcvdaddrs;
	39					162
	43					216
	395					784
3210							foreach my $line (@rcvdlines) {
3211							if ($line =~ / for (\S+\@\S+);/) { push (@rcvdaddrs, $1); }
3212							}
3213
3214							@addrs = $self->{main}->find_all_addrs_in_line (
3215							join('',
3216							join(" ", @rcvdaddrs)."\n",
3217							$self->get('To'), # std
3218							$self->get('Apparently-To'), # sendmail, from envelope
3219	83					215	$self->get('Delivered-To'), # Postfix, poss qmail
	43					177
3220	79					219	$self->get('Envelope-Recipients'), # qmail: new-inject(1)
3221							$self->get('Apparently-Resent-To'), # procmailrc manpage
3222	83					415	$self->get('X-Envelope-To'), # procmailrc manpage
3223	83					262	$self->get('Envelope-To'), # exim
3224	83					506	$self->get('X-Delivered-To'), # procmail quick start
3225							$self->get('X-Original-To'), # procmail quick start
3226							$self->get('X-Rcpt-To'), # procmail quick start
3227							$self->get('X-Real-To'), # procmail quick start
3228							$self->get('Cc'))); # std
3229							# those are taken from various sources; thanks to Nancy McGough, who
3230							# noted some in <http://www.ii.com/internet/robots/procmail/qs/#envelope>
3231							}
3232
3233							dbg("eval: all '*To' addrs: " . join(" ", @addrs));
3234							$self->{all_to_addrs} = \@addrs;
3235	4			50	1	18	return @addrs;
3236
3237	4	0				72	# http://www.cs.tut.fi/~jkorpela/headers.html is useful here, also
3238	4					220	# http://www.exim.org/pipermail/exim-users/Week-of-Mon-20001009/021672.html
	4					52
3239							}
3240
3241							###########################################################################
3242
3243	4					13	1;
3244
3245	4					20	=back
3246
3247							=head1 SEE ALSO
3248
3249	4					20	Mail::SpamAssassin(3)
3250	12					20	spamassassin(1)
3251	12					51