File Coverage

blib/lib/HTML/Entities/Latin2.pm
Criterion Covered Total %
statement 28 29 96.5
branch 10 12 83.3
condition n/a
subroutine 5 5 100.0
pod 0 1 0.0
total 43 47 91.4


line stmt bran cond sub pod time code
package HTML::Entities::Latin2;

# Encode ISO-8859-2 (Latin-2) byte strings as HTML entities.
#
# Every non-ASCII Latin-2 byte (160..255) is replaced by a representation
# of the Unicode character it stands for.  ASCII bytes are passed through
# unchanged unless the caller lists them as "unsafe".
#
# Public functions (exported on request): encode, encode_entities.

use 5.006;
use strict;
use warnings;

require Exporter;
our @ISA       = qw(Exporter);
our @EXPORT_OK = qw(encode encode_entities);
our $VERSION   = '0.04';

# Maps a (lower-cased) scheme name to the column index used in the rows
# built by _entity_row().  Unknown or missing names fall back to column 0.
my %scheme_index_for = (
    decimal     => 0,
    number      => 0,
    numeric     => 0,
    'hex'       => 1,
    name        => 2,
    named       => 2,
    utf8        => 3,
    description => 4,
);

# Build one table row for a character.
#
# Parameters:
#   $code_point  - Unicode code point of the character (integer)
#   $entity_name - HTML entity name without the leading '&' / trailing ';'
#   $description - Unicode character name, returned by the 'description'
#                  scheme
#
# Returns an array reference laid out as
#   [ decimal entity, hex entity, named entity, literal character,
#     description ]
#
# All code-point-dependent columns are derived from the single $code_point
# argument so they can never disagree with each other.
sub _entity_row {
    my ($code_point, $entity_name, $description) = @_;

    return [
        sprintf('&#%d;',  $code_point),    # e.g. &#260;
        sprintf('&#x%X;', $code_point),    # e.g. &#x104; (uppercase, unpadded)
        '&' . $entity_name . ';',          # e.g. &Aogon;
        chr($code_point),                  # the character itself
        $description,
    ];
}

# ASCII characters that are encoded only when the caller names them in the
# $unsafe argument of encode().  Keyed by the literal character.
my %ascii_entities;
for my $spec (
    [ 0x22, 'quot', 'QUOTATION MARK' ],
    [ 0x26, 'amp',  'AMPERSAND' ],
    [ 0x27, 'apos', 'APOSTROPHE' ],
    [ 0x3C, 'lt',   'LESS-THAN SIGN' ],
    [ 0x3E, 'gt',   'GREATER-THAN SIGN' ],
) {
    $ascii_entities{ chr $spec->[0] } = _entity_row( @{$spec} );
}

# One row per Latin-2 byte value, in byte order: index 0 is chr(160) and
# index 95 is chr(255).  encode() indexes this table with (byte - 160).
my @char_map = map { _entity_row( @{$_} ) } (
    # chr(160) to chr(175)
    [ 0x00A0, 'nbsp',   'NO-BREAK SPACE' ],
    [ 0x0104, 'Aogon',  'LATIN CAPITAL LETTER A WITH OGONEK' ],
    [ 0x02D8, 'breve',  'BREVE' ],
    [ 0x0141, 'Lstrok', 'LATIN CAPITAL LETTER L WITH STROKE' ],
    [ 0x00A4, 'curren', 'CURRENCY SIGN' ],
    [ 0x013D, 'Lcaron', 'LATIN CAPITAL LETTER L WITH CARON' ],
    [ 0x015A, 'Sacute', 'LATIN CAPITAL LETTER S WITH ACUTE' ],
    [ 0x00A7, 'sect',   'SECTION SIGN' ],
    [ 0x00A8, 'uml',    'DIAERESIS' ],
    [ 0x0160, 'Scaron', 'LATIN CAPITAL LETTER S WITH CARON' ],
    [ 0x015E, 'Scedil', 'LATIN CAPITAL LETTER S WITH CEDILLA' ],
    [ 0x0164, 'Tcaron', 'LATIN CAPITAL LETTER T WITH CARON' ],
    [ 0x0179, 'Zacute', 'LATIN CAPITAL LETTER Z WITH ACUTE' ],
    [ 0x00AD, 'shy',    'SOFT HYPHEN' ],
    [ 0x017D, 'Zcaron', 'LATIN CAPITAL LETTER Z WITH CARON' ],
    [ 0x017B, 'Zdot',   'LATIN CAPITAL LETTER Z WITH DOT ABOVE' ],
    # chr(176) to chr(191)
    [ 0x00B0, 'deg',    'DEGREE SIGN' ],
    [ 0x0105, 'aogon',  'LATIN SMALL LETTER A WITH OGONEK' ],
    [ 0x02DB, 'ogon',   'OGONEK' ],
    [ 0x0142, 'lstrok', 'LATIN SMALL LETTER L WITH STROKE' ],
    [ 0x00B4, 'acute',  'ACUTE ACCENT' ],
    [ 0x013E, 'lcaron', 'LATIN SMALL LETTER L WITH CARON' ],
    [ 0x015B, 'sacute', 'LATIN SMALL LETTER S WITH ACUTE' ],
    [ 0x02C7, 'caron',  'CARON' ],
    [ 0x00B8, 'cedil',  'CEDILLA' ],
    [ 0x0161, 'scaron', 'LATIN SMALL LETTER S WITH CARON' ],
    [ 0x015F, 'scedil', 'LATIN SMALL LETTER S WITH CEDILLA' ],
    [ 0x0165, 'tcaron', 'LATIN SMALL LETTER T WITH CARON' ],
    [ 0x017A, 'zacute', 'LATIN SMALL LETTER Z WITH ACUTE' ],
    [ 0x02DD, 'dblac',  'DOUBLE ACUTE ACCENT' ],
    [ 0x017E, 'zcaron', 'LATIN SMALL LETTER Z WITH CARON' ],
    [ 0x017C, 'zdot',   'LATIN SMALL LETTER Z WITH DOT ABOVE' ],
    # chr(192) to chr(207)
    [ 0x0154, 'Racute', 'LATIN CAPITAL LETTER R WITH ACUTE' ],
    [ 0x00C1, 'Aacute', 'LATIN CAPITAL LETTER A WITH ACUTE' ],
    [ 0x00C2, 'Acirc',  'LATIN CAPITAL LETTER A WITH CIRCUMFLEX' ],
    [ 0x0102, 'Abreve', 'LATIN CAPITAL LETTER A WITH BREVE' ],
    [ 0x00C4, 'Auml',   'LATIN CAPITAL LETTER A WITH UMLAUT' ],
    [ 0x0139, 'Lacute', 'LATIN CAPITAL LETTER L WITH ACUTE' ],
    [ 0x0106, 'Cacute', 'LATIN CAPITAL LETTER C WITH ACUTE' ],
    [ 0x00C7, 'Ccedil', 'LATIN CAPITAL LETTER C WITH CEDILLA' ],
    [ 0x010C, 'Ccaron', 'LATIN CAPITAL LETTER C WITH CARON' ],
    [ 0x00C9, 'Eacute', 'LATIN CAPITAL LETTER E WITH ACUTE' ],
    [ 0x0118, 'Eogon',  'LATIN CAPITAL LETTER E WITH OGONEK' ],
    [ 0x00CB, 'Euml',   'LATIN CAPITAL LETTER E WITH UMLAUT' ],
    [ 0x011A, 'Ecaron', 'LATIN CAPITAL LETTER E WITH CARON' ],
    [ 0x00CD, 'Iacute', 'LATIN CAPITAL LETTER I WITH ACUTE' ],
    [ 0x00CE, 'Icirc',  'LATIN CAPITAL LETTER I WITH CIRCUMFLEX' ],
    [ 0x010E, 'Dcaron', 'LATIN CAPITAL LETTER D WITH CARON' ],
    # chr(208) to chr(223)
    [ 0x0110, 'Dstrok', 'LATIN CAPITAL LETTER D WITH STROKE' ],
    [ 0x0143, 'Nacute', 'LATIN CAPITAL LETTER N WITH ACUTE' ],
    [ 0x0147, 'Ncaron', 'LATIN CAPITAL LETTER N WITH CARON' ],
    [ 0x00D3, 'Oacute', 'LATIN CAPITAL LETTER O WITH ACUTE' ],
    [ 0x00D4, 'Ocirc',  'LATIN CAPITAL LETTER O WITH CIRCUMFLEX' ],
    [ 0x0150, 'Odblac', 'LATIN CAPITAL LETTER O WITH DOUBLE ACUTE' ],
    [ 0x00D6, 'Ouml',   'LATIN CAPITAL LETTER O WITH UMLAUT' ],
    [ 0x00D7, 'times',  'MULTIPLICATION SIGN' ],
    [ 0x0158, 'Rcaron', 'LATIN CAPITAL LETTER R WITH CARON' ],
    [ 0x016E, 'Uring',  'LATIN CAPITAL LETTER U WITH RING ABOVE' ],
    [ 0x00DA, 'Uacute', 'LATIN CAPITAL LETTER U WITH ACUTE' ],
    [ 0x0170, 'Udblac', 'LATIN CAPITAL LETTER U WITH DOUBLE ACUTE' ],
    [ 0x00DC, 'Uuml',   'LATIN CAPITAL LETTER U WITH UMLAUT' ],
    [ 0x00DD, 'Yacute', 'LATIN CAPITAL LETTER Y WITH ACUTE' ],
    [ 0x0162, 'Tcedil', 'LATIN CAPITAL LETTER T WITH CEDILLA' ],
    [ 0x00DF, 'szlig',  'LATIN SMALL LETTER SHARP S' ],
    # chr(224) to chr(239)
    [ 0x0155, 'racute', 'LATIN SMALL LETTER R WITH ACUTE' ],
    [ 0x00E1, 'aacute', 'LATIN SMALL LETTER A WITH ACUTE' ],
    [ 0x00E2, 'acirc',  'LATIN SMALL LETTER A WITH CIRCUMFLEX' ],
    [ 0x0103, 'abreve', 'LATIN SMALL LETTER A WITH BREVE' ],
    [ 0x00E4, 'auml',   'LATIN SMALL LETTER A WITH UMLAUT' ],
    [ 0x013A, 'lacute', 'LATIN SMALL LETTER L WITH ACUTE' ],
    [ 0x0107, 'cacute', 'LATIN SMALL LETTER C WITH ACUTE' ],
    [ 0x00E7, 'ccedil', 'LATIN SMALL LETTER C WITH CEDILLA' ],
    [ 0x010D, 'ccaron', 'LATIN SMALL LETTER C WITH CARON' ],
    [ 0x00E9, 'eacute', 'LATIN SMALL LETTER E WITH ACUTE' ],
    [ 0x0119, 'eogon',  'LATIN SMALL LETTER E WITH OGONEK' ],
    [ 0x00EB, 'euml',   'LATIN SMALL LETTER E WITH UMLAUT' ],
    [ 0x011B, 'ecaron', 'LATIN SMALL LETTER E WITH CARON' ],
    [ 0x00ED, 'iacute', 'LATIN SMALL LETTER I WITH ACUTE' ],
    [ 0x00EE, 'icirc',  'LATIN SMALL LETTER I WITH CIRCUMFLEX' ],
    [ 0x010F, 'dcaron', 'LATIN SMALL LETTER D WITH CARON' ],
    # chr(240) to chr(255)
    [ 0x0111, 'dstrok', 'LATIN SMALL LETTER D WITH STROKE' ],
    [ 0x0144, 'nacute', 'LATIN SMALL LETTER N WITH ACUTE' ],
    [ 0x0148, 'ncaron', 'LATIN SMALL LETTER N WITH CARON' ],
    [ 0x00F3, 'oacute', 'LATIN SMALL LETTER O WITH ACUTE' ],
    [ 0x00F4, 'ocirc',  'LATIN SMALL LETTER O WITH CIRCUMFLEX' ],
    [ 0x0151, 'odblac', 'LATIN SMALL LETTER O WITH DOUBLE ACUTE' ],
    [ 0x00F6, 'ouml',   'LATIN SMALL LETTER O WITH UMLAUT' ],
    [ 0x00F7, 'divide', 'DIVISION SIGN' ],
    [ 0x0159, 'rcaron', 'LATIN SMALL LETTER R WITH CARON' ],
    [ 0x016F, 'uring',  'LATIN SMALL LETTER U WITH RING ABOVE' ],
    [ 0x00FA, 'uacute', 'LATIN SMALL LETTER U WITH ACUTE' ],
    [ 0x0171, 'udblac', 'LATIN SMALL LETTER U WITH DOUBLE ACUTE' ],
    [ 0x00FC, 'uuml',   'LATIN SMALL LETTER U WITH UMLAUT' ],
    [ 0x00FD, 'yacute', 'LATIN SMALL LETTER Y WITH ACUTE' ],
    [ 0x0163, 'tcedil', 'LATIN SMALL LETTER T WITH CEDILLA' ],
    [ 0x02D9, 'dot',    'DOT ABOVE' ],
);

# encode($source_str, $scheme_name, $unsafe)
#
# Convert a Latin-2 byte string to text in which every byte in the range
# 160..255 is replaced according to the chosen scheme.
#
# Parameters:
#   $source_str  - string of ISO-8859-2 bytes to encode
#   $scheme_name - optional, case-insensitive; one of
#                    decimal | number | numeric  ->  &#260;
#                    hex                         ->  &#x104;
#                    name | named                ->  &Aogon;
#                    utf8                        ->  the Unicode character
#                    description                 ->  the Unicode name
#                  anything else (or undef) selects the decimal form
#   $unsafe      - optional string of ASCII characters that must also be
#                  encoded; only the characters " & ' < > are recognised,
#                  any other character in the string is ignored
#
# Returns the encoded string ('' when $source_str is undefined).
#
# Bytes 128..159 have no entry in the Latin-2 table: each one triggers a
# warning and is omitted from the result.
sub encode {
    my ($source_str, $scheme_name, $unsafe) = @_;

    return '' unless defined $source_str;

    # Look the scheme up only when one was given, so that an omitted
    # argument does not raise an "uninitialized value" warning in lc().
    my $scheme;
    $scheme = $scheme_index_for{ lc $scheme_name } if defined $scheme_name;
    $scheme = 0 unless defined $scheme;    # defaults to decimal entities

    # Rows for the ASCII characters the caller wants encoded, keyed by
    # byte value so the main loop can test them without calling chr().
    my %unsafe_row_for;
    if (defined $unsafe && length $unsafe) {
        for my $char (split //, $unsafe) {
            next unless exists $ascii_entities{$char};
            $unsafe_row_for{ ord $char } = $ascii_entities{$char};
        }
    }

    my $encoded = '';
    for my $char_val (unpack 'C*', $source_str) {
        if ($char_val < 128) {
            # ASCII, including DEL (127): copy through unless flagged unsafe.
            my $row = $unsafe_row_for{$char_val};
            $encoded .= defined $row ? $row->[$scheme] : chr $char_val;
        }
        elsif ($char_val >= 160) {
            $encoded .= $char_map[ $char_val - 160 ]->[$scheme];
        }
        else {
            warn 'character not in Latin-2 map, character code: ' . $char_val;
        }
    }

    return $encoded;
}

# encode_entities(...) - alternative name for encode(); takes the same
# arguments and returns the same result.
sub encode_entities {
    return encode(@_);
}

1;
157             __END__