File Coverage

blib/lib/Mail/SpamAssassin/RegistryBoundaries.pm
Criterion Covered Total %
statement 66 68 97.0
branch 32 40 80.0
condition 3 12 25.0
subroutine 8 8 100.0
pod 3 5 60.0
total 112 133 84.2


line stmt bran cond sub pod time code
1             # The (extremely complex) rules for domain delegation.
2              
3             # <@LICENSE>
4             # Licensed to the Apache Software Foundation (ASF) under one or more
5             # contributor license agreements. See the NOTICE file distributed with
6             # this work for additional information regarding copyright ownership.
7             # The ASF licenses this file to you under the Apache License, Version 2.0
8             # (the "License"); you may not use this file except in compliance with
9             # the License. You may obtain a copy of the License at:
10             #
11             # http://www.apache.org/licenses/LICENSE-2.0
12             #
13             # Unless required by applicable law or agreed to in writing, software
14             # distributed under the License is distributed on an "AS IS" BASIS,
15             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16             # See the License for the specific language governing permissions and
17             # limitations under the License.
18             # </@LICENSE>
19              
20             =head1 NAME
21              
22             Mail::SpamAssassin::RegistryBoundaries - domain delegation rules
23              
24             =cut
25              
26             package Mail::SpamAssassin::RegistryBoundaries;
27              
28 40     40   337 use strict;
  40         103  
  40         1268  
29 40     40   240 use warnings;
  40         112  
  40         1290  
30             # use bytes;
31 40     40   234 use re 'taint';
  40         122  
  40         39656  
32              
33             our @ISA = qw();
34              
35             # called from SpamAssassin->init() to create $self->{util_rb}
36             sub new {
37 81     81 0 267 my $class = shift;
38 81   33     589 $class = ref($class) || $class;
39              
40 81         247 my ($main) = @_;
41             my $self = {
42             'main' => $main,
43             'conf' => $main->{conf},
44 81         422 };
45 81         243 bless ($self, $class);
46              
47             # Initialize valid_tlds_re for schemeless uri parsing, FreeMail etc
48 81 50       464 if ($self->{conf}->{valid_tlds}) {
49 81         202 my $tlds = join('|', keys %{$self->{conf}->{valid_tlds}});
  81         16273  
50             # Perl 5.10+ trie optimizes lists, no need for fancy regex optimizing
51 81         214268 $self->{valid_tlds_re} = qr/(?:$tlds)/i;
52             }
53             else {
54             # Failsafe in case no tlds defined, we don't want this to match everything..
55 0         0 $self->{valid_tlds_re} = qr/no_tlds_defined/;
56             }
57              
58 81         3812 $self;
59             }
60              
61             # This is required because the .us domain is nuts. See split_domain.
62             our %US_STATES = qw(
63             ak 1 al 1 ar 1 az 1 ca 1 co 1 ct 1 dc 1 de 1 fl 1 ga 1 gu 1 hi 1 ia 1 id 1 il 1 in 1 ks 1 ky 1 la 1 ma 1 md 1 me 1 mi 1
64             mn 1 mo 1 ms 1 mt 1 nc 1 nd 1 ne 1 nh 1 nj 1 nm 1 nv 1 ny 1 oh 1 ok 1 or 1 pa 1 pr 1 ri 1 sc 1 sd 1 tn 1 tx 1 ut 1 va 1
65             vi 1 vt 1 wa 1 wi 1 wv 1 wy 1
66             );
67              
68             ###########################################################################
69              
70             =head1 METHODS
71              
72             =over 4
73              
74             =item ($hostname, $domain) = split_domain ($fqdn)
75              
76             Cut a fully-qualified hostname into the hostname part and the domain
77             part, splitting at the DNS registry boundary.
78              
79             Examples:
80              
81             "www.foo.com" => ( "www", "foo.com" )
82             "www.foo.co.uk" => ( "www", "foo.co.uk" )
83              
84             =cut
85              
86             sub split_domain {
87 2460     2460 1 2635 my $self = shift;
88 2460         2865 my $domain = lc shift;
89              
90 2460         2653 my $hostname = '';
91              
92 2460 50 33     6566 if (defined $domain && $domain ne '') {
93             # www..spamassassin.org -> www.spamassassin.org
94 2460         3283 $domain =~ tr/././s;
95              
96             # leading/trailing dots
97 2460         2989 $domain =~ s/^\.+//;
98 2460         4481 $domain =~ s/\.+$//;
99              
100             # Split scalar domain into components
101 2460         5494 my @domparts = split(/\./, $domain);
102 2460         2701 my @hostname;
103              
104 2460         4040 while (@domparts > 1) { # go until we find the TLD
105 3958 100       8139 if (@domparts == 4) {
    100          
    100          
106 40 0 0     106 if ($domparts[3] eq 'us' &&
      33        
107             (($domparts[0] eq 'pvt' && $domparts[1] eq 'k12') ||
108             ($domparts[0] =~ /^c[io]$/)))
109             {
110             # http://www.neustar.us/policies/docs/rfc_1480.txt
111             # "Fire-Dept.CI.Los-Angeles.CA.US"
112             # "<school-name>.PVT.K12.<state>.US"
113 0 0       0 last if ($US_STATES{$domparts[2]});
114             }
115             }
116             elsif (@domparts == 3) {
117             # http://www.neustar.us/policies/docs/rfc_1480.txt
118             # demon.co.uk
119             # esc.edu.ar
120             # [^\.]+\.${US_STATES}\.us
121 1508 100       2150 if ($domparts[2] eq 'us') {
122 10 50       33 last if ($US_STATES{$domparts[1]});
123             }
124             else {
125 1498         2498 my $temp = join(".", @domparts);
126 1498 100       3403 last if ($self->{conf}->{three_level_domains}{$temp});
127             }
128             }
129             elsif (@domparts == 2) {
130             # co.uk, etc.
131 2395         3421 my $temp = join(".", @domparts);
132 2395 100       5623 last if ($self->{conf}->{two_level_domains}{$temp});
133             }
134 3934         7471 push(@hostname, shift @domparts);
135             }
136              
137             # Look for a sub-delegated TLD
138             # use @domparts to skip trying to match on TLDs that can't possibly
139             # match, but keep in mind that the hostname can be blank, so 4TLD needs 4,
140             # 3TLD needs 3, 2TLD needs 2 ...
141             #
142 2460 100       4937 unshift @domparts, pop @hostname if @hostname;
143 2460         3931 $domain = join(".", @domparts);
144 2460         3490 $hostname = join(".", @hostname);
145             }
146              
147 2460         5146 ($hostname, $domain);
148             }
149              
150             ###########################################################################
151              
152             =item $domain = trim_domain($fqdn)
153              
154             Cut a fully-qualified hostname into the hostname part and the domain
155             part, returning just the domain.
156              
157             Examples:
158              
159             "www.foo.com" => "foo.com"
160             "www.foo.co.uk" => "foo.co.uk"
161              
162             =cut
163              
164             sub trim_domain {
165 2460     2460 1 35573 my $self = shift;
166 2460         2736 my $domain = shift;
167              
168 2460         3628 my ($host, $dom) = $self->split_domain($domain);
169 2460         3811 return $dom;
170             }
171              
172             ###########################################################################
173              
174             =item $ok = is_domain_valid($dom)
175              
176             Return C<1> if the domain is valid, C<undef> otherwise. A valid domain
177             (a) does not contain whitespace, (b) contains at least one dot, and (c)
178             uses a valid TLD or ccTLD.
179              
180             =back
181              
182             =cut
183              
184             sub is_domain_valid {
185 2451     2451 1 2854 my $self = shift;
186 2451         3055 my $dom = lc shift;
187              
188             # domains don't have whitespace
189 2451 50       4660 return 0 if ($dom =~ /\s/);
190              
191             # ensure it ends in a known-valid TLD, and has at least 1 dot
192 2451 100       6871 return 0 unless ($dom =~ /\.([^.]+)$/);
193 2391 100       6367 return 0 unless ($self->{conf}->{valid_tlds}{$1});
194              
195 2353         3653 return 1; # nah, it's ok.
196             }
197              
198             #
199              
200             sub uri_to_domain {
201 2538     2538 0 12974 my $self = shift;
202 2538         4143 my $uri = lc shift;
203              
204             # Javascript is not going to help us, so return.
205 2538 100       4362 return if ($uri =~ /^javascript:/);
206              
207 2537         3500 $uri =~ s{\#.*$}{}gs; # drop fragment
208 2537         7149 $uri =~ s{^[a-z]+:/{0,2}}{}gs; # drop the protocol
209 2537         3728 $uri =~ s{^[^/]*\@}{}gs; # username/passwd
210              
211             # strip path and CGI params. note: bug 4213 shows that "&" should
212             # *not* be likewise stripped here -- it's permitted in hostnames by
213             # some common MUAs!
214 2537         3677 $uri =~ s{[/?].*$}{}gs;
215              
216 2537         2837 $uri =~ s{:\d*$}{}gs; # port, bug 4191: sometimes the # is missing
217              
218             # skip undecoded URIs if the encoded bits shouldn't be.
219             # we'll see the decoded version as well. see url_encode()
220 2537 100       3750 return if $uri =~ /\%(?:2[1-9a-f]|[3-6][0-9a-f]|7[0-9a-e])/;
221              
222 2536         2963 my $host = $uri; # unstripped/full domain name
223              
224             # keep IPs intact
225 2536 100       4205 if ($uri !~ /^\d+\.\d+\.\d+\.\d+$/) {
226             # get rid of hostname part of domain, understanding delegation
227 2451         3831 $uri = $self->trim_domain($uri);
228              
229             # ignore invalid domains
230 2451 100       3877 return unless ($self->is_domain_valid($uri));
231             }
232            
233             # $uri is now the domain only, optionally return unstripped host name
234 2438 100       6803 return !wantarray ? $uri : ($uri, $host);
235             }
236              
237             1;
238