File Coverage

blib/lib/Mail/SpamAssassin/Locales.pm
Criterion Covered Total %
statement 9 35 25.7
branch 0 28 0.0
condition 0 3 0.0
subroutine 3 4 75.0
pod 0 1 0.0
total 12 71 16.9


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             package Mail::SpamAssassin::Locales;
19              
20 21     21   154 use strict;
  21         61  
  21         708  
21 21     21   133 use warnings;
  21         77  
  21         689  
22             # use bytes;
23 21     21   126 use re 'taint';
  21         71  
  21         9553  
24              
25             ###########################################################################
26              
27             # A mapping of known country codes to frequent charsets used therein.
28             # note that the ISO and CP charsets will already have been permitted,
29             # so only "unusual" charsets should be listed here.
30             #
31             # Country codes should be lowercase, charsets uppercase.
32             #
33             # A good listing is in /usr/share/config/charsets from KDE 2.2.1
34             #
35             our %charsets_for_locale = (
36              
37             # Japanese: Peter Evans writes: iso-2022-jp = rfc approved, rfc 1468, created
38             # by Jun Murai in 1993 back when he didn't have white hair! rfc approved.
39             # (rfc 2237) <-- by M$.
40             'ja' => 'EUCJP JISX020119760 JISX020819830 JISX020819900 JISX020819970 '.
41             'JISX021219900 JISX021320001 JISX021320002 SHIFT_JIS SHIFTJIS '.
42             'ISO2022JP SJIS JIS7 JISX0201 JISX0208 JISX0212',
43              
44             # Korea
45             'ko' => 'EUCKR KSC56011987',
46              
47             # Cyrillic: Andrew Vasilyev notes CP866 is common (bug 2278)
48             'ru' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
49             'ka' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
50             'tg' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
51             'be' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
52             'uk' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
53             'bg' => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866',
54              
55             # Thai
56             'th' => 'TIS620',
57              
58             # Chinese (simplified and traditional). Peter Evans writes: new government
59             # mandated chinese encoding = gb18030, chinese mail is supposed to be
60             # iso-2022-cn (rfc 1922?)
61             'zh' => 'GB1988 GB2312 GB231219800 GB18030 GBK BIG5HKSCS BIG5 EUCTW ISO2022CN',
62              
63             # Chinese Traditional charsets only
64             'zh.big5' => 'BIG5HKSCS BIG5 EUCTW',
65              
66             # Chinese Simplified charsets only
67             'zh.gb2312' => 'GB1988 GB2312 GB231219800 GB18030 GBK ISO2022CN',
68             );
69              
70             ###########################################################################
71              
72             sub is_charset_ok_for_locales {
73 0     0 0   my ($cs, @locales) = @_;
74              
75 0           $cs = uc $cs; $cs =~ s/[^A-Z0-9]//g;
  0            
76 0           $cs =~ s/^3D//gs; # broken by quoted-printable
77 0           $cs =~ s/:.*$//gs; # trim off multiple charsets, just use 1st
78              
79 0           study $cs; # study is a no-op since perl 5.16.0, eliminating related bugs
80             #warn "JMD $cs";
81              
82             # always OK (the net speaks mostly roman charsets)
83 0 0         return 1 if ($cs eq 'USASCII');
84 0 0         return 1 if ($cs =~ /^ISO8859/);
85 0 0         return 1 if ($cs =~ /^ISO10646/);
86 0 0         return 1 if ($cs =~ /^UTF/);
87 0 0         return 1 if ($cs =~ /^UCS/);
88 0 0         return 1 if ($cs =~ /^CP125/);
89 0 0         return 1 if ($cs =~ /^WINDOWS/); # argh, Windows
90 0 0         return 1 if ($cs eq 'IBM852');
91 0 0         return 1 if ($cs =~ /^UNICODE11UTF[78]/); # wtf? never heard of it
92 0 0         return 1 if ($cs eq 'XUNKNOWN'); # added by sendmail when converting to 8bit
93 0 0         return 1 if ($cs eq 'ISO'); # Magellan, sending as 'charset=iso 8859-15'. grr
94              
95 0           foreach my $locale (@locales) {
96 0 0 0       if (!defined($locale) || $locale eq 'C') { $locale = 'en'; }
  0            
97 0           $locale =~ s/^([a-z][a-z]).*$/$1/; # zh_TW... => zh
98              
99 0           my $ok_for_loc = $charsets_for_locale{$locale};
100 0 0         next if (!defined $ok_for_loc);
101              
102 0 0         if ($ok_for_loc =~ /(?:^| )\Q${cs}\E(?:$| )/) {
103 0           return 1;
104             }
105             }
106              
107 0           return 0;
108             }
109              
110             1;