File Coverage

blib/lib/Mail/SpamAssassin/RegistryBoundaries.pm
Criterion Covered Total %
statement 85 90 94.4
branch 42 54 77.7
condition 4 15 26.6
subroutine 11 11 100.0
pod 3 5 60.0
total 145 175 82.8


line stmt bran cond sub pod time code
1             # The (extremely complex) rules for domain delegation.
2              
3             # <@LICENSE>
4             # Licensed to the Apache Software Foundation (ASF) under one or more
5             # contributor license agreements. See the NOTICE file distributed with
6             # this work for additional information regarding copyright ownership.
7             # The ASF licenses this file to you under the Apache License, Version 2.0
8             # (the "License"); you may not use this file except in compliance with
9             # the License. You may obtain a copy of the License at:
10             #
11             # http://www.apache.org/licenses/LICENSE-2.0
12             #
13             # Unless required by applicable law or agreed to in writing, software
14             # distributed under the License is distributed on an "AS IS" BASIS,
15             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16             # See the License for the specific language governing permissions and
17             # limitations under the License.
18             # </@LICENSE>
19              
20             =head1 NAME
21              
22             Mail::SpamAssassin::RegistryBoundaries - domain delegation rules
23              
24             =cut
25              
26              
27             use strict;
28 41     41   279 use warnings;
  41         88  
  41         1361  
29 41     41   262 # use bytes;
  41         114  
  41         1467  
30             use re 'taint';
31 41     41   256  
  41         108  
  41         2356  
32             our @ISA = qw();
33              
34             use Mail::SpamAssassin::Logger;
35 41     41   278 use Mail::SpamAssassin::Constants qw(:ip);
  41         100  
  41         2682  
36 41     41   274 use Mail::SpamAssassin::Util qw(is_fqdn_valid);
  41         122  
  41         5015  
37 41     41   300  
  41         90  
  41         45237  
38             my $IP_ADDRESS = IP_ADDRESS;
39              
40             # called from SpamAssassin->init() to create $self->{util_rb}
41             my $class = shift;
42             $class = ref($class) || $class;
43 91     91 0 233  
44 91   33     623 my ($main) = @_;
45             my $self = {
46 91         243 'main' => $main,
47             'conf' => $main->{conf},
48             };
49             bless ($self, $class);
50 91         387  
51 91         397 # Initialize valid_tlds_re for schemeless uri parsing, FreeMail etc
52             if ($self->{conf}->{valid_tlds} && %{$self->{conf}->{valid_tlds}}) {
53             # International domain names are already in ASCII-compatible encoding (ACE)
54 91 50 33     711 my $tlds =
  91         457  
55             '(?<![a-zA-Z0-9-])(?:'. # make sure tld starts at boundary
56             join('|', keys %{$self->{conf}->{valid_tlds}}).
57             ')(?!(?:[a-zA-Z0-9-]|\.[a-zA-Z0-9]))'; # make sure it ends
58 91         256 # Perl 5.10+ trie optimizes lists, no need for fancy regex optimizing
  91         35828  
59             if (eval { $self->{valid_tlds_re} = qr/$tlds/i; 1; }) {
60             dbg("config: registryboundaries: %d tlds loaded",
61 91 50       3412 scalar keys %{$self->{conf}->{valid_tlds}});
  91         449117  
  91         7149  
62             } else {
63 91         227 warn "config: registryboundaries: failed to compile valid_tlds_re: $@\n";
  91         953  
64             $self->{valid_tlds_re} = qr/no_tlds_defined/;
65 0         0 }
66 0         0 }
67             else {
68             # Failsafe in case no tlds defined, we don't want this to match everything..
69             $self->{valid_tlds_re} = qr/no_tlds_defined/;
70             warn "config: registryboundaries: no tlds defined, need to run sa-update\n"
71 0         0 if !$self->{main}->{ignore_site_cf_files};
72             }
73 0 0       0  
74             $self;
75             }
76 91         596  
77             # This is required because the .us domain is nuts. See split_domain.
78             our %US_STATES = qw(
79             ak 1 al 1 ar 1 az 1 ca 1 co 1 ct 1 dc 1 de 1 fl 1 ga 1 gu 1 hi 1 ia 1 id 1 il 1 in 1 ks 1 ky 1 la 1 ma 1 md 1 me 1 mi 1
80             mn 1 mo 1 ms 1 mt 1 nc 1 nd 1 ne 1 nh 1 nj 1 nm 1 nv 1 ny 1 oh 1 ok 1 or 1 pa 1 pr 1 ri 1 sc 1 sd 1 tn 1 tx 1 ut 1 va 1
81             vi 1 vt 1 wa 1 wi 1 wv 1 wy 1
82             );
83              
84             ###########################################################################
85              
86             =head1 METHODS
87              
88             =over 4
89              
90             =item ($hostname, $domain) = split_domain ($fqdn)
91              
92             Cut a fully-qualified hostname into the hostname part and the domain
93             part, splitting at the DNS registry boundary.
94              
95             Examples:
96              
97             "www.foo.com" => ( "www", "foo.com" )
98             "www.foo.co.uk" => ( "www", "foo.co.uk" )
99              
100             =cut
101              
102             my $self = shift;
103             my $domain = lc shift;
104              
105 550     550 1 621 my $hostname = '';
106 550         808  
107             if (defined $domain && $domain ne '') {
108 550         759 # www..spamassassin.org -> www.spamassassin.org
109             $domain =~ tr/././s;
110 550 50 33     1616  
111             # leading/trailing dots
112 550         872 $domain =~ s/^\.+//;
113             $domain =~ s/\.+$//;
114              
115 550         815 # Split scalar domain into components
116 550         1024 my @domparts = split(/\./, $domain);
117             my @hostname;
118              
119 550         1115 while (@domparts > 1) { # go until we find the TLD
120 550         641 if (@domparts == 4) {
121             if ($domparts[3] eq 'us' &&
122 550         975 (($domparts[0] eq 'pvt' && $domparts[1] eq 'k12') ||
123 885 100       2125 ($domparts[0] =~ /^c[io]$/)))
    100          
    100          
124 27 0 0     69 {
      33        
125             # http://www.neustar.us/policies/docs/rfc_1480.txt
126             # "Fire-Dept.CI.Los-Angeles.CA.US"
127             # "<school-name>.PVT.K12.<state>.US"
128             last if ($US_STATES{$domparts[2]});
129             }
130             }
131 0 0       0 elsif (@domparts == 3) {
132             # http://www.neustar.us/policies/docs/rfc_1480.txt
133             # demon.co.uk
134             # esc.edu.ar
135             # [^\.]+\.${US_STATES}\.us
136             if ($domparts[2] eq 'us') {
137             last if ($US_STATES{$domparts[1]});
138             }
139 302 100       529 else {
140 5 50       23 my $temp = join(".", @domparts);
141             last if ($self->{conf}->{three_level_domains}{$temp});
142             }
143 297         590 }
144 297 100       761 elsif (@domparts == 2) {
145             # co.uk, etc.
146             my $temp = join(".", @domparts);
147             last if ($self->{conf}->{two_level_domains}{$temp});
148             }
149 545         879 push(@hostname, shift @domparts);
150 545 100       1332 }
151              
152 860         1880 # Look for a sub-delegated TLD
153             # use @domparts to skip trying to match on TLDs that can't possibly
154             # match, but keep in mind that the hostname can be blank, so 4TLD needs 4,
155             # 3TLD needs 3, 2TLD needs 2 ...
156             #
157             unshift @domparts, pop @hostname if @hostname;
158             $domain = join(".", @domparts);
159             $hostname = join(".", @hostname);
160 550 100       1184 }
161 550         952  
162 550         987 ($hostname, $domain);
163             }
164              
165 550         1312 ###########################################################################
166              
167             =item $domain = trim_domain($fqdn)
168              
169             Cut a fully-qualified hostname into the hostname part and the domain
170             part, returning just the domain.
171              
172             Examples:
173              
174             "www.foo.com" => "foo.com"
175             "www.foo.co.uk" => "foo.co.uk"
176              
177             =cut
178              
179             my $self = shift;
180             my $domain = shift;
181              
182             my ($host, $dom) = $self->split_domain($domain);
183 550     550 1 40606 return $dom;
184 550         639 }
185              
186 550         985 ###########################################################################
187 550         1020  
188             =item $ok = is_domain_valid($dom)
189              
190             Return C<1> if the domain is valid, C<undef> otherwise. A valid domain
191             (a) does not contain whitespace, (b) contains at least one dot, and (c)
192             uses a valid TLD or ccTLD.
193              
194             =back
195              
196             =cut
197              
198             my ($self, $dom) = @_;
199              
200             return 0 unless defined $dom;
201              
202             # domains don't have whitespace
203 607     607 1 1016 return 0 if ($dom =~ /\s/);
204              
205 607 50       983 # ensure it ends in a known-valid TLD, and has at least 1 dot
206             return 0 unless ($dom =~ /\.([^.]+)$/);
207             return 0 unless ($self->{conf}->{valid_tlds}{lc $1});
208 607 50       1201  
209             return 1; # nah, it's ok.
210             }
211 607 100       1885  
212 605 100       2119 #
213              
214 572         1250 my $self = shift;
215             my $uri = lc shift;
216              
217             # Javascript is not going to help us, so return.
218             # Likewise ignore cid, file
219             return if ($uri =~ /^(?:javascript|cid|file):/);
220 749     749 0 13016  
221 749         1303 if ($uri =~ s/^mailto://) { # handle mailto: specially
222             $uri =~ s/\?.*//; # drop parameters ?subject= etc
223             # note above, Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
224             # uri_list_canonicalize should have made versions without ? &
225 749 100       2015 # Keep testing with & here just in case..
226             return if $uri =~ /\@.*?\@/; # abort if multiple @
227 743 100       1472 return unless $uri =~ s/.*@//; # drop username or abort
228 42         89 } else {
229             $uri =~ s{^[a-z]+:/{0,2}}{}gs; # drop the protocol
230             # strip path, CGI params, fragment. note: bug 4213 shows that "&" should
231             # *not* be likewise stripped here -- it's permitted in hostnames by
232 42 100       103 # some common MUAs!
233 39 100       146 $uri =~ s{[/?#].*}{}gs;
234             $uri =~ s{^[^/]*\@}{}gs; # drop username/passwd
235 701         2054 $uri =~ s{:\d*$}{}gs; # port, bug 4191: sometimes the # is missing
236             }
237              
238             # skip undecoded URIs if the encoded bits shouldn't be.
239 701         1408 # we'll see the decoded version as well. see url_encode()
240 701         942 return if $uri =~ /\%(?:2[1-9a-f]|[3-6][0-9a-f]|7[0-9a-e])/;
241 701         1027  
242             my $host = $uri; # unstripped/full domain name
243             my $domain = $host;
244              
245             # keep IPs intact
246 735 100       1314 if ($host !~ /^$IP_ADDRESS$/) {
247             # check that it's a valid hostname/fqdn
248 734         937 return unless is_fqdn_valid($host);
249 734         915 # ignore invalid TLDs
250             return unless $self->is_domain_valid($host);
251             # get rid of hostname part of domain, understanding delegation
252 734 100       9116 $domain = $self->trim_domain($host);
253             }
254 679 100       1965
255             # $uri is now the domain only, optionally return unstripped host name
256 573 100       1213 return !wantarray ? $domain : ($domain, $host);
257             }
258 541         915  
259             1;
260