File Coverage

blib/lib/Mail/SpamAssassin/Locales.pm
Criterion Covered Total %
statement 12 40 30.0
branch 0 30 0.0
condition 0 3 0.0
subroutine 4 5 80.0
pod 0 1 0.0
total 16 79 20.2


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18              
19             use strict;
20 22     22   138 use Mail::SpamAssassin::Logger;
  22         50  
  22         624  
21 22     22   123 use warnings;
  22         49  
  22         1260  
22 22     22   148 # use bytes;
  22         47  
  22         652  
23             use re 'taint';
24 22     22   125  
  22         37  
  22         9238  
25             ###########################################################################
26              
27             # A mapping of known country codes to frequent charsets used therein.
28             # note that the ISO and CP charsets will already have been permitted,
29             # so only "unusual" charsets should be listed here.
30             #
31             # Country codes should be lowercase, charsets uppercase.
32             #
33             # A good listing is in /usr/share/config/charsets from KDE 2.2.1
34             #
35             our %charsets_for_locale = (
36              
37             # Japanese: Peter Evans writes: iso-2022-jp = rfc approved, rfc 1468, created
38             # by Jun Murai in 1993 back when he didn't have white hair! rfc approved.
39             # (rfc 2237) <-- by M$.
40             'ja' => 'EUCJP JISX020119760 JISX020819830 JISX020819900 JISX020819970 '.
41             'JISX021219900 JISX021320001 JISX021320002 SHIFT_JIS SHIFTJIS '.
42             'ISO2022JP SJIS JIS7 JISX0201 JISX0208 JISX0212',
43              
44             # Korea
45             'ko' => 'EUCKR KSC56011987',
46              
47             # Cyrillic: Andrew Vasilyev notes CP866 is common (bug 2278)
48             'ru' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
49             'ka' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
50             'tg' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
51             'be' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
52             'uk' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
53             'bg' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
54              
55             # Thai
56             'th' => 'TIS620',
57              
58             # Chinese (simplified and traditional). Peter Evans writes: new government
59             # mandated chinese encoding = gb18030, chinese mail is supposed to be
60             # iso-2022-cn (rfc 1922?)
61             'zh' => 'GB1988 GB2312 GB231219800 GB18030 GBK BIG5HKSCS BIG5 EUCTW ISO2022CN',
62              
63             # Chinese Traditional charsets only
64             'zh.big5' => 'BIG5HKSCS BIG5 EUCTW',
65              
66             # Chinese Simplified charsets only
67             'zh.gb2312' => 'GB1988 GB2312 GB231219800 GB18030 GBK ISO2022CN',
68             );
69              
70             ###########################################################################
71              
72             my ($cs, @locales) = @_;
73              
74 0     0 0   $cs = uc $cs; $cs =~ s/[^A-Z0-9]//g;
75             $cs =~ s/^3D//gs; # broken by quoted-printable
76 0           $cs =~ s/:.*$//gs; # trim off multiple charsets, just use 1st
  0            
77 0           dbg ("locales: is $cs ok for @locales?");
78 0            
79 0           study $cs; # study is a no-op since perl 5.16.0, eliminating related bugs
80             #warn "JMD $cs";
81 0            
82             # always OK (the net speaks mostly roman charsets)
83             return 1 if ($cs eq 'USASCII');
84             return 1 if ($cs eq 'ASCII');
85 0 0         return 1 if ($cs =~ /^ISO8859/);
86 0 0         return 1 if ($cs =~ /^ISO10646/);
87 0 0         return 1 if ($cs =~ /^UTF/);
88 0 0         return 1 if ($cs =~ /^UCS/);
89 0 0         return 1 if ($cs =~ /^CP125/);
90 0 0         return 1 if ($cs =~ /^WINDOWS/); # argh, Windows
91 0 0         return 1 if ($cs eq 'IBM852');
92 0 0         return 1 if ($cs =~ /^UNICODE11UTF[78]/); # wtf? never heard of it
93 0 0         return 1 if ($cs eq 'XUNKNOWN'); # added by sendmail when converting to 8bit
94 0 0         return 1 if ($cs eq 'ISO'); # Magellan, sending as 'charset=iso 8859-15'. grr
95 0 0          
96 0 0         foreach my $locale (@locales) {
97             if (!defined($locale) || $locale eq 'C') { $locale = 'en'; }
98 0           $locale =~ s/^([a-z][a-z]).*$/$1/; # zh_TW... => zh
99 0 0 0        
  0            
100 0           my $ok_for_loc = $charsets_for_locale{$locale};
101             next if (!defined $ok_for_loc);
102 0            
103 0 0         if ($ok_for_loc =~ /(?:^| )\Q${cs}\E(?:$| )/) {
104             return 1;
105 0 0         }
106 0           }
107              
108             return 0;
109             }
110 0            
111             1;