File Coverage

blib/lib/Mail/SpamAssassin/Constants.pm
Criterion Covered Total %
statement 84 84 100.0
branch n/a
condition n/a
subroutine 27 27 100.0
pod n/a
total 111 111 100.0


line stmt bran cond sub pod time code
1             # Constants used in many parts of the SpamAssassin codebase.
2             #
3             # TODO! we need to reimplement parts of the RESERVED regexp!
4              
5             # <@LICENSE>
6             # Licensed to the Apache Software Foundation (ASF) under one or more
7             # contributor license agreements. See the NOTICE file distributed with
8             # this work for additional information regarding copyright ownership.
9             # The ASF licenses this file to you under the Apache License, Version 2.0
10             # (the "License"); you may not use this file except in compliance with
11             # the License. You may obtain a copy of the License at:
12             #
13             # http://www.apache.org/licenses/LICENSE-2.0
14             #
15             # Unless required by applicable law or agreed to in writing, software
16             # distributed under the License is distributed on an "AS IS" BASIS,
17             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18             # See the License for the specific language governing permissions and
19             # limitations under the License.
20             # </@LICENSE>
21              
22             package Mail::SpamAssassin::Constants;
23              
24 40     40   452 use strict;
  40         85  
  40         1365  
25 40     40   296 use warnings;
  40         105  
  40         1417  
26 40     40   224 use re 'taint';
  40         77  
  40         1560  
27              
28 40     40   236 use Exporter ();
  40         75  
  40         7342  
29             our @ISA = qw(Exporter);
30              
31             our(@BAYES_VARS, @IP_VARS, @SA_VARS, %EXPORT_TAGS, @EXPORT_OK);
32              
33             # NOTE: Unless you need these to be available at BEGIN time, you are better off declaring them outside of a BEGIN block with a simple "our" statement.
34             BEGIN {
35 40     40   218 @IP_VARS = qw(
36             IP_IN_RESERVED_RANGE IP_PRIVATE LOCALHOST IPV4_ADDRESS IP_ADDRESS
37             );
38 40         4550 @BAYES_VARS = qw(
39             DUMP_MAGIC DUMP_TOKEN DUMP_BACKUP
40             );
41             # These are generic constants that may be used across several modules
42 40         166 @SA_VARS = qw(
43             HARVEST_DNSBL_PRIORITY MBX_SEPARATOR
44             MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
45             MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
46             CHARSETS_LIKELY_TO_FP_AS_CAPS MAX_URI_LENGTH RULENAME_RE IS_RULENAME
47             META_RULES_MATCHING_RE
48             );
49              
50 40         495 %EXPORT_TAGS = (
51             bayes => [ @BAYES_VARS ],
52             ip => [ @IP_VARS ],
53             sa => [ @SA_VARS ],
54             all => [ @BAYES_VARS, @IP_VARS, @SA_VARS ],
55             );
56              
57 40         1352 @EXPORT_OK = ( @BAYES_VARS, @IP_VARS, @SA_VARS );
58             }
59              
60             # BAYES_VARS
61 40     40   262 use constant DUMP_MAGIC => 1;
  40         96  
  40         2761  
62 40     40   252 use constant DUMP_TOKEN => 2;
  40         72  
  40         2127  
63 40     40   257 use constant DUMP_SEEN => 4;
  40         87  
  40         2116  
64 40     40   233 use constant DUMP_BACKUP => 8;
  40         96  
  40         14012  
65              
66             # IP_VARS
67             # ---------------------------------------------------------------------------
68             # Initialize a regexp for private IPs, i.e. ones that could be
69             # used inside a company and be the first or second relay hit by
70             # a message. Some companies use these internally and translate
71             # them using a NAT firewall. These are listed in the RBL as invalid
72             # originators -- which is true, if you receive the mail directly
73             # from them; however we do not, so we should ignore them.
74             #
75             # sources:
76             # IANA = <https://www.iana.org/numbers>,
77             # 5735 = <https://tools.ietf.org/html/rfc5735>
78             # 6598 = <https://tools.ietf.org/html/rfc6598>
79             # 4193 = <https://tools.ietf.org/html/rfc4193>
80             # CYMRU = <https://www.team-cymru.com/bogon-reference.html>
81             #
82             # This includes:
83             # host-local address space 127.0.0.0/8 and ::1,
84             # link-local address space 169.254.0.0/16 and fe80::/10,
85             # private-use address space 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16,
86             # TODO: Unique Local Unicast Addresses fc00::/7 (RFC 4193)
87             # shared address space 100.64.0.0/10 (RFC 6598 - for use in CGN),
88             # IPv4-mapped IPv6 address ::ffff:0:0/96 (RFC 3513)
89             #
90 40         2430 use constant IP_PRIVATE => qr{^(?:
91             (?: # IPv4 addresses
92             10| # 10.0.0.0/8 Private Use (5735, 1918)
93             127| # 127.0.0.0/8 Host-local (5735, 1122)
94             169\.254| # 169.254.0.0/16 Link-local (5735, 3927)
95             172\.(?:1[6-9]|2[0-9]|3[01])| # 172.16.0.0/12 Private Use (5735, 1918)
96             192\.168| # 192.168.0.0/16 Private Use (5735, 1918)
97             100\.(?:6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7]) # 100.64.0.0/10 CGN (6598)
98             )\..*
99             |
100             (?: # IPv6 addresses
101             # don't use \b here, it hits on :'s
102             (?:IPv6: # with optional prefix
103             | (?<![a-f0-9:])
104             )
105             (?:
106             # IPv4 mapped in IPv6
107             # note the colon after the 12th byte in each here
108             (?:
109             # first 6 groups (12 bytes) all written out, none omitted
110             (?:0{1,4}:){5} ffff:
111             |
112             # leading zeros omitted (note {0,4} not {1,4})
113             ::(?:0{1,4}:){0,4} ffff:
114             |
115             # trailing zeros (in the first 6) omitted
116             (?:0{1,4}:){1,4}: ffff:
117             |
118             # 0000 in second up to (including) fifth omitted
119             0{1,4}::(?:0{1,4}:){1,3} ffff:
120             |
121             # 0000 in third up to (including) fifth omitted
122             (?:0{1,4}:){2}:0{1,2}: ffff:
123             |
124             # 0000 in fourth up to (including) fifth omitted
125             (?:0{1,4}:){3}:0: ffff:
126             |
127             # 0000 in fifth omitted
128             (?:0{1,4}:){4}: ffff:
129             )
130             # and the IPv4 address appended to all of the 12 bytes above
131             (?:
132             10|
133             127|
134             169\.254|
135             172\.(?:1[6-9]|2[0-9]|3[01])|
136             192\.168|
137             100\.(?:6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])
138             )\..*
139              
140             | # or IPv6 link-local address space, fe80::/10
141             fe[89ab][0-9a-f]:.*
142              
143             | # or the host-local ::1 addr, as a pure IPv6 address
144              
145             # all 8 (16 bytes) of them present
146             (?:0{1,4}:){7} 0{0,3}1
147             |
148             # leading zeros omitted
149             :(?::0{1,4}){0,6}: 0{0,3}1
150             |
151             # 0000 in second up to (including) seventh omitted
152             0{1,4}:(?::0{1,4}){0,5}: 0{0,3}1
153             |
154             # 0000 in third up to (including) seventh omitted
155             (?:0{1,4}:){2}(?::0{1,4}){0,4}: 0{0,3}1
156             |
157             # 0000 in fourth up to (including) seventh omitted
158             (?:0{1,4}:){3}(?::0{1,4}){0,3}: 0{0,3}1
159             |
160             # 0000 in fifth up to (including) seventh omitted
161             (?:0{1,4}:){4}(?::0{1,4}){0,2}: 0{0,3}1
162             |
163             # 0000 in sixth up to (including) seventh omitted
164             (?:0{1,4}:){5}(?::0{1,4}){0,1}: 0{0,3}1
165             |
166             # 0000 in seventh omitted
167             (?:0{1,4}:){6}: 0{0,3}1
168             )
169             (?![a-f0-9:])
170             )
171 40     40   301 )}oxi;
  40         86  
172              
173             # backward compatibility
174 40     40   257 use constant IP_IN_RESERVED_RANGE => IP_PRIVATE;
  40         78  
  40         9599  
175              
176             # ---------------------------------------------------------------------------
177             # match the various ways of saying "localhost".
178              
179 40         5873 use constant LOCALHOST => qr/
180             (?:
181             # as a string
182             localhost(?:\.localdomain)?
183             |
184             \b(?<!:) # ensure no "::" IPv6 marker before this one
185             # plain IPv4
186             127\.0\.0\.1 \b
187             |
188             # IPv6 addresses
189             # don't use \b here, it hits on :'s
190             (?:IPv6: # with optional prefix
191             | (?<![a-f0-9:])
192             )
193             (?:
194             # IPv4 mapped in IPv6
195             # note the colon after the 12th byte in each here
196             (?:
197             # first 6 groups (12 bytes) all written out, none omitted
198             (?:0{1,4}:){5} ffff:
199             |
200             # leading zeros omitted (note {0,4} not {1,4})
201             ::(?:0{1,4}:){0,4} ffff:
202             |
203             # trailing zeros (in the first 6) omitted
204             (?:0{1,4}:){1,4}: ffff:
205             |
206             # 0000 in second up to (including) fifth omitted
207             0{1,4}::(?:0{1,4}:){1,3} ffff:
208             |
209             # 0000 in third up to (including) fifth omitted
210             (?:0{1,4}:){2}:0{1,2}: ffff:
211             |
212             # 0000 in fourth up to (including) fifth omitted
213             (?:0{1,4}:){3}:0: ffff:
214             |
215             # 0000 in fifth omitted
216             (?:0{1,4}:){4}: ffff:
217             )
218             # and the IPv4 address appended to all of the 12 bytes above
219             127\.0\.0\.1 # no \b, we check later
220              
221             | # or (separately) a pure IPv6 address
222              
223             # all 8 (16 bytes) of them present
224             (?:0{1,4}:){7} 0{0,3}1
225             |
226             # leading zeros omitted
227             :(?::0{1,4}){0,6}: 0{0,3}1
228             |
229             # 0000 in second up to (including) seventh omitted
230             0{1,4}:(?::0{1,4}){0,5}: 0{0,3}1
231             |
232             # 0000 in third up to (including) seventh omitted
233             (?:0{1,4}:){2}(?::0{1,4}){0,4}: 0{0,3}1
234             |
235             # 0000 in fourth up to (including) seventh omitted
236             (?:0{1,4}:){3}(?::0{1,4}){0,3}: 0{0,3}1
237             |
238             # 0000 in fifth up to (including) seventh omitted
239             (?:0{1,4}:){4}(?::0{1,4}){0,2}: 0{0,3}1
240             |
241             # 0000 in sixth up to (including) seventh omitted
242             (?:0{1,4}:){5}(?::0{1,4}){0,1}: 0{0,3}1
243             |
244             # 0000 in seventh omitted
245             (?:0{1,4}:){6}: 0{0,3}1
246             )
247             (?![a-f0-9:])
248             )
249 40     40   279 /oxi;
  40         79  
250              
251             # ---------------------------------------------------------------------------
252             # an IP address, in IPv4 format only.
253             #
254 40         16246 use constant IPV4_ADDRESS => qr/\b
255             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
256             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
257             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
258             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)
259 40     40   276 \b/ox;
  40         70  
260              
261             # ---------------------------------------------------------------------------
262             # an IP address, in IPv4, IPv4-mapped-in-IPv6, or IPv6 format. NOTE: cannot
263             # just refer to $IPV4_ADDRESS, due to perl bug reported in nesting qr//s. :(
264             #
265 40         2208 use constant IP_ADDRESS => qr/
266             (?:
267             \b(?<!:) # ensure no "::" IPv6 marker before this one
268             # plain IPv4, as above
269             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
270             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
271             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
272             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\b
273             |
274             # IPv6 addresses
275             # don't use \b here, it hits on :'s
276             (?:IPv6: # with optional prefix
277             | (?<![a-f0-9:])
278             )
279             (?:
280             # IPv4 mapped in IPv6
281             # note the colon after the 12th byte in each here
282             (?:
283             # first 6 (12 bytes) non-zero
284             (?:[a-f0-9]{1,4}:){6}
285             |
286             # leading zeros omitted (note {0,5} not {1,5})
287             ::(?:[a-f0-9]{1,4}:){0,5}
288             |
289             # trailing zeros (in the first 6) omitted
290             (?:[a-f0-9]{1,4}:){1,5}:
291             |
292             # 0000 in second up to (including) fifth omitted
293             [a-f0-9]{1,4}::(?:[a-f0-9]{1,4}:){1,4}
294             |
295             # 0000 in third up to (including) fifth omitted
296             (?:[a-f0-9]{1,4}:){2}:(?:[a-f0-9]{1,4}:){1,3}
297             |
298             # 0000 in fourth up to (including) fifth omitted
299             (?:[a-f0-9]{1,4}:){3}:(?:[a-f0-9]{1,4}:){1,2}
300             |
301             # 0000 in fifth omitted
302             (?:[a-f0-9]{1,4}:){4}:[a-f0-9]{1,4}:
303             )
304             # and the IPv4 address appended to all of the 12 bytes above
305             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
306             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
307             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
308             (?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d) # no \b, we check later
309              
310             | # or (separately) a pure IPv6 address
311              
312             # all 8 (16 bytes) of them present
313             (?:[a-f0-9]{1,4}:){7}[a-f0-9]{1,4}
314             |
315             # leading zeros omitted
316             :(?::[a-f0-9]{1,4}){1,7}
317             |
318             # trailing zeros omitted
319             (?:[a-f0-9]{1,4}:){1,7}:
320             |
321             # 0000 in second up to (including) seventh omitted
322             [a-f0-9]{1,4}:(?::[a-f0-9]{1,4}){1,6}
323             |
324             # 0000 in third up to (including) seventh omitted
325             (?:[a-f0-9]{1,4}:){2}(?::[a-f0-9]{1,4}){1,5}
326             |
327             # 0000 in fourth up to (including) seventh omitted
328             (?:[a-f0-9]{1,4}:){3}(?::[a-f0-9]{1,4}){1,4}
329             |
330             # 0000 in fifth up to (including) seventh omitted
331             (?:[a-f0-9]{1,4}:){4}(?::[a-f0-9]{1,4}){1,3}
332             |
333             # 0000 in sixth up to (including) seventh omitted
334             (?:[a-f0-9]{1,4}:){5}(?::[a-f0-9]{1,4}){1,2}
335             |
336             # 0000 in seventh omitted
337             (?:[a-f0-9]{1,4}:){6}:[a-f0-9]{1,4}
338             |
339             # :: (the unspecified address 0:0:0:0:0:0:0:0)
340             # dos: I don't expect to see this address in a header, and
341             # it may cause non-address strings to match, but we'll
342             # include it for now since it is valid
343             ::
344             )
345             (?![a-f0-9:])
346             )
347 40     40   302 /oxi;
  40         83  
348              
349             # ---------------------------------------------------------------------------
350              
351 40     40   246 use constant HARVEST_DNSBL_PRIORITY => 500;
  40         83  
  40         4552  
352              
353             # regular expression that matches message separators in The University of
354             # Washington's MBX mailbox format
355 40     40   283 use constant MBX_SEPARATOR => qr/^([\s\d]\d-[a-zA-Z]{3}-\d{4}\s\d{2}:\d{2}:\d{2}.*),(\d+);([\da-f]{12})-(\w{8})\r?$/;
  40         72  
  40         2220  
356             # $1 = datestamp (str)
357             # $2 = size of message in bytes (int)
358             # $3 = message status - binary (hex)
359             # $4 = message ID (hex)
360              
361             # ---------------------------------------------------------------------------
362             # values used for internal message representations
363              
364             # maximum byte length of lines in the body
365 40     40   268 use constant MAX_BODY_LINE_LENGTH => 2048;
  40         76  
  40         1955  
366             # maximum byte length of a header key
367 40     40   260 use constant MAX_HEADER_KEY_LENGTH => 256;
  40         78  
  40         2058  
368             # maximum byte length of a header value including continued lines
369 40     40   254 use constant MAX_HEADER_VALUE_LENGTH => 8192;
  40         77  
  40         1900  
370             # maximum byte length of entire header
371 40     40   225 use constant MAX_HEADER_LENGTH => 65536;
  40         91  
  40         1997  
372              
373             # maximum byte length of any given URI
374 40     40   270 use constant MAX_URI_LENGTH => 8192;
  40         81  
  40         8714  
375              
376             # used for meta rules and "if" conditionals in Conf::Parser
377 40         2006 use constant ARITH_EXPRESSION_LEXER => qr/(?:
378             [\-\+\d\.]+| # A Number
379             \w[\w\:]*| # Rule or Class Name
380             [\(\)]| # Parens
381             \|\|| # Boolean OR
382             \&\&| # Boolean AND
383             \^| # Boolean XOR
384             !(?!=)| # Boolean NOT
385             >=?| # GT or GE (greater-than, optionally "or equal")
386             <=?| # LT or LE (less-than, optionally "or equal")
387             ==| # EQ
388             !=| # NEQ
389             [\+\-\*\/]| # Mathematical Operator
390             [\?:] # ? : Operator
391 40     40   312 )/ox;
  40         70  
392              
393             # ArchiveIterator
394              
395             # if AI doesn't read in the message in the first pass to see if the received
396             # date makes the message useful or not, we need to mark it so that in the
397             # second pass (when the message is actually read + processed) the received
398             # date is calculated. this value signifies "unknown" from the first pass.
399 40     40   210 use constant AI_TIME_UNKNOWN => 0;
  40         72  
  40         6162  
400              
401             # Charsets which use capital letters heavily in their encoded representation.
402 40         2719 use constant CHARSETS_LIKELY_TO_FP_AS_CAPS => qr{[-_a-z0-9]*(?:
403             koi|jp|jis|euc|gb|big5|isoir|cp1251|windows-1251|georgianps|pt154|tis
404 40     40   274 )[-_a-z0-9]*}ix;
  40         84  
405              
406             # Allowed rulename format
407 40     40   225 use constant RULENAME_RE => qr([_a-zA-Z][_a-zA-Z0-9]{0,127});
  40         62  
  40         2544  
408             # Exact match
409 40     40   209 use constant IS_RULENAME => qr/^${\(RULENAME_RE)}$/;
  40         218  
  40         75  
  40         3571  
410              
411             # meta function rules_matching(), takes argument RULENAME_RE with glob *? characters
412 40     40   233 use constant META_RULES_MATCHING_RE => qr/(?<!_)\brules_matching\(\s*([_a-zA-Z*?][_a-zA-Z0-9*?]{0,127})\s*\)/;
  40         62  
  40         2368  
413              
414             1;