| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Encode::ZapCP1252; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
205428
|
use strict; |
|
|
3
|
|
|
|
|
23
|
|
|
|
3
|
|
|
|
|
125
|
|
|
4
|
|
|
|
|
|
|
require Exporter; |
|
5
|
3
|
|
|
3
|
|
19
|
use vars qw($VERSION @ISA @EXPORT); |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
187
|
|
|
6
|
3
|
|
|
3
|
|
79
|
use 5.006_002; |
|
|
3
|
|
|
|
|
10
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
$VERSION = '0.40'; |
|
9
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
|
10
|
|
|
|
|
|
|
@EXPORT = qw(zap_cp1252 fix_cp1252); |
|
11
|
3
|
|
|
3
|
|
28
|
use constant PERL588 => $] >= 5.008_008; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
326
|
|
|
12
|
3
|
|
|
3
|
|
1757
|
use Encode (); |
|
|
3
|
|
|
|
|
31001
|
|
|
|
3
|
|
|
|
|
1413
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our %ascii_for = ( |
|
15
|
|
|
|
|
|
|
# https://en.wikipedia.org/wiki/Windows-1252 |
|
16
|
|
|
|
|
|
|
"\x80" => 'e', # EURO SIGN |
|
17
|
|
|
|
|
|
|
"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK |
|
18
|
|
|
|
|
|
|
"\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK |
|
19
|
|
|
|
|
|
|
"\x84" => ',,', # DOUBLE LOW-9 QUOTATION MARK |
|
20
|
|
|
|
|
|
|
"\x85" => '...', # HORIZONTAL ELLIPSIS |
|
21
|
|
|
|
|
|
|
"\x86" => '+', # DAGGER |
|
22
|
|
|
|
|
|
|
"\x87" => '++', # DOUBLE DAGGER |
|
23
|
|
|
|
|
|
|
"\x88" => '^', # MODIFIER LETTER CIRCUMFLEX ACCENT |
|
24
|
|
|
|
|
|
|
"\x89" => '%', # PER MILLE SIGN |
|
25
|
|
|
|
|
|
|
"\x8a" => 'S', # LATIN CAPITAL LETTER S WITH CARON |
|
26
|
|
|
|
|
|
|
"\x8b" => '<', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
27
|
|
|
|
|
|
|
"\x8c" => 'OE', # LATIN CAPITAL LIGATURE OE |
|
28
|
|
|
|
|
|
|
"\x8e" => 'Z', # LATIN CAPITAL LETTER Z WITH CARON |
|
29
|
|
|
|
|
|
|
"\x91" => "'", # LEFT SINGLE QUOTATION MARK |
|
30
|
|
|
|
|
|
|
"\x92" => "'", # RIGHT SINGLE QUOTATION MARK |
|
31
|
|
|
|
|
|
|
"\x93" => '"', # LEFT DOUBLE QUOTATION MARK |
|
32
|
|
|
|
|
|
|
"\x94" => '"', # RIGHT DOUBLE QUOTATION MARK |
|
33
|
|
|
|
|
|
|
"\x95" => '*', # BULLET |
|
34
|
|
|
|
|
|
|
"\x96" => '-', # EN DASH |
|
35
|
|
|
|
|
|
|
"\x97" => '--', # EM DASH |
|
36
|
|
|
|
|
|
|
"\x98" => '~', # SMALL TILDE |
|
37
|
|
|
|
|
|
|
"\x99" => '(tm)', # TRADE MARK SIGN |
|
38
|
|
|
|
|
|
|
"\x9a" => 's', # LATIN SMALL LETTER S WITH CARON |
|
39
|
|
|
|
|
|
|
"\x9b" => '>', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
40
|
|
|
|
|
|
|
"\x9c" => 'oe', # LATIN SMALL LIGATURE OE |
|
41
|
|
|
|
|
|
|
"\x9e" => 'z', # LATIN SMALL LETTER Z WITH CARON |
|
42
|
|
|
|
|
|
|
"\x9f" => 'Y', # LATIN CAPITAL LETTER Y WITH DIAERESIS |
|
43
|
|
|
|
|
|
|
); |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
our %utf8_for = ( |
|
46
|
|
|
|
|
|
|
# https://en.wikipedia.org/wiki/Windows-1252 |
|
47
|
|
|
|
|
|
|
"\x80" => '€', # EURO SIGN |
|
48
|
|
|
|
|
|
|
"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK |
|
49
|
|
|
|
|
|
|
"\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK |
|
50
|
|
|
|
|
|
|
"\x84" => '„', # DOUBLE LOW-9 QUOTATION MARK |
|
51
|
|
|
|
|
|
|
"\x85" => '…', # HORIZONTAL ELLIPSIS |
|
52
|
|
|
|
|
|
|
"\x86" => '†', # DAGGER |
|
53
|
|
|
|
|
|
|
"\x87" => '‡', # DOUBLE DAGGER |
|
54
|
|
|
|
|
|
|
"\x88" => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT |
|
55
|
|
|
|
|
|
|
"\x89" => '‰', # PER MILLE SIGN |
|
56
|
|
|
|
|
|
|
"\x8a" => 'Š', # LATIN CAPITAL LETTER S WITH CARON |
|
57
|
|
|
|
|
|
|
"\x8b" => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
58
|
|
|
|
|
|
|
"\x8c" => 'Œ', # LATIN CAPITAL LIGATURE OE |
|
59
|
|
|
|
|
|
|
"\x8e" => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON |
|
60
|
|
|
|
|
|
|
"\x91" => '‘', # LEFT SINGLE QUOTATION MARK |
|
61
|
|
|
|
|
|
|
"\x92" => '’', # RIGHT SINGLE QUOTATION MARK |
|
62
|
|
|
|
|
|
|
"\x93" => '“', # LEFT DOUBLE QUOTATION MARK |
|
63
|
|
|
|
|
|
|
"\x94" => '”', # RIGHT DOUBLE QUOTATION MARK |
|
64
|
|
|
|
|
|
|
"\x95" => '•', # BULLET |
|
65
|
|
|
|
|
|
|
"\x96" => '–', # EN DASH |
|
66
|
|
|
|
|
|
|
"\x97" => '—', # EM DASH |
|
67
|
|
|
|
|
|
|
"\x98" => '˜', # SMALL TILDE |
|
68
|
|
|
|
|
|
|
"\x99" => '™', # TRADE MARK SIGN |
|
69
|
|
|
|
|
|
|
"\x9a" => 'š', # LATIN SMALL LETTER S WITH CARON |
|
70
|
|
|
|
|
|
|
"\x9b" => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
71
|
|
|
|
|
|
|
"\x9c" => 'œ', # LATIN SMALL LIGATURE OE |
|
72
|
|
|
|
|
|
|
"\x9e" => 'ž', # LATIN SMALL LETTER Z WITH CARON |
|
73
|
|
|
|
|
|
|
"\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS |
|
74
|
|
|
|
|
|
|
); |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
my @utf8_skip = ( |
|
77
|
|
|
|
|
|
|
# This translates a utf-8-encoded byte into how many bytes the full utf8 |
|
78
|
|
|
|
|
|
|
# character occupies. Illegal start bytes have a negative count. |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# UTF-8 is a variable-length encoding. The 128 ASCII characters were very |
|
81
|
|
|
|
|
|
|
# deliberately set to be themselves, so UTF-8 would be backwards compatible |
|
82
|
|
|
|
|
|
|
# with 7-bit applications. Every other character has 2 - 13 bytes comprising |
|
83
|
|
|
|
|
|
|
# it. |
|
84
|
|
|
|
|
|
|
# |
|
85
|
|
|
|
|
|
|
# If the first bit of the first byte in a character is 0, it is one of those |
|
86
|
|
|
|
|
|
|
# 128 ASCII characters with length 1. |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Otherwise, the first bit is 1, and if the second bit is also one, this byte |
|
89
|
|
|
|
|
|
|
# starts the sequence of bytes that represent the character. The bytes C0-FF |
|
90
|
|
|
|
|
|
|
# have the characteristic that the first two bits are both one. The number of |
|
91
|
|
|
|
|
|
|
# bytes that form a character corresponds to the number of consecutive leading |
|
92
|
|
|
|
|
|
|
# bits that are all one in the start byte. In the case of FE, the first 7 |
|
93
|
|
|
|
|
|
|
# bits are one, so the number of bytes in the character it represents is 7. |
|
94
|
|
|
|
|
|
|
# FF is a special case, and Perl has arbitrarily set it to 13 instead of the |
|
95
|
|
|
|
|
|
|
# expected 8. |
|
96
|
|
|
|
|
|
|
# |
|
97
|
|
|
|
|
|
|
# The remaining bytes begin with '10', from 80..9F. They are called |
|
98
|
|
|
|
|
|
|
# continuation bytes, and a UTF-8 character is comprised of a start byte |
|
99
|
|
|
|
|
|
|
# indicating 'n' bytes total in it, then 'n-1' of these continuation bytes. |
|
100
|
|
|
|
|
|
|
# What the character is that each sequence represents is derived by shifting |
|
101
|
|
|
|
|
|
|
# and adding the other bits in the bytes. (C0 and C1 aren't actually legal |
|
102
|
|
|
|
|
|
|
# start bytes for security reasons that need not concern us here, hence are |
|
103
|
|
|
|
|
|
|
# marked as negative in the table below.) |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# 0 1 2 3 4 5 6 7 8 9 A B C D E F |
|
106
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0 |
|
107
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1 |
|
108
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2 |
|
109
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3 |
|
110
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4 |
|
111
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5 |
|
112
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6 |
|
113
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7 |
|
114
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8 |
|
115
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9 |
|
116
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A |
|
117
|
|
|
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B |
|
118
|
|
|
|
|
|
|
-1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C |
|
119
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D |
|
120
|
|
|
|
|
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E |
|
121
|
|
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F |
|
122
|
|
|
|
|
|
|
); |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
BEGIN { |
|
125
|
3
|
50
|
|
3
|
|
21
|
my $proto = $] >= 5.010000 ? '_' : '$'; |
|
126
|
3
|
|
|
10
|
0
|
219
|
eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }"; |
|
|
10
|
|
|
|
|
3300
|
|
|
|
10
|
|
|
|
|
34
|
|
|
127
|
3
|
|
|
11
|
0
|
1494
|
eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }"; |
|
|
11
|
|
|
|
|
4984
|
|
|
|
11
|
|
|
|
|
35
|
|
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# These are the bytes that CP1252 redefines |
|
131
|
|
|
|
|
|
|
my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/; |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _tweakit { |
|
134
|
21
|
|
|
21
|
|
45
|
my $table = shift; |
|
135
|
21
|
100
|
|
|
|
64
|
return unless defined $_[0]; |
|
136
|
19
|
100
|
|
|
|
57
|
local $_[0] = $_[0] if defined wantarray; |
|
137
|
19
|
|
|
|
|
57
|
my $is_utf8 = PERL588 && Encode::is_utf8($_[0]); |
|
138
|
19
|
|
100
|
|
|
66
|
my $valid_utf8 = $is_utf8 && utf8::valid($_[0]); |
|
139
|
19
|
100
|
|
|
|
51
|
if (!$is_utf8) { |
|
|
|
100
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Here is non-UTF-8. Change the 1252 characters to their UTF-8 |
|
142
|
|
|
|
|
|
|
# counterparts. These bytes are very rarely used in real world |
|
143
|
|
|
|
|
|
|
# applications, so their presence likely indicates that CP1252 was |
|
144
|
|
|
|
|
|
|
# meant. |
|
145
|
12
|
|
|
|
|
124
|
$_[0] =~ s/($cp1252_re)/$table->{$1}/gems; |
|
|
272
|
|
|
|
|
647
|
|
|
146
|
|
|
|
|
|
|
} elsif ($valid_utf8) { |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on |
|
149
|
|
|
|
|
|
|
# and the string is held as bytes. Change the 1252 characters to their |
|
150
|
|
|
|
|
|
|
# Unicode counterparts. |
|
151
|
5
|
|
|
|
|
50
|
$_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems; |
|
|
81
|
|
|
|
|
1324
|
|
|
152
|
|
|
|
|
|
|
} else { # Invalid UTF-8. Look for single-byte CP1252 gremlins |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Turn off the UTF-8 flag so that we can go through the string |
|
155
|
|
|
|
|
|
|
# byte-by-byte. |
|
156
|
2
|
|
|
|
|
7
|
Encode::_utf8_off($_[0]); |
|
157
|
|
|
|
|
|
|
|
|
158
|
2
|
|
|
|
|
3
|
my $i = 0; |
|
159
|
2
|
|
|
|
|
5
|
my $length = length $_[0]; |
|
160
|
2
|
|
|
|
|
3
|
my $fixed = ""; # The input after being fixed up by this loop |
|
161
|
2
|
|
|
|
|
6
|
while ($i < $length) { |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# Each time through the loop, we should here be ready to look at a |
|
164
|
|
|
|
|
|
|
# new character, and it's 0th byte is called a 'start byte' |
|
165
|
106
|
|
|
|
|
151
|
my $start_byte = substr($_[0], $i, 1); |
|
166
|
106
|
|
|
|
|
142
|
my $skip = $utf8_skip[ord $start_byte]; |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# The table is set up so that legal UTF-8 start bytes have a |
|
169
|
|
|
|
|
|
|
# positive byte length. Simply add all the bytes in the character |
|
170
|
|
|
|
|
|
|
# to the output, and go on to handle the next character in the |
|
171
|
|
|
|
|
|
|
# next loop iteration. |
|
172
|
106
|
100
|
|
|
|
166
|
if ($skip > 0) { |
|
173
|
52
|
|
|
|
|
73
|
$fixed .= substr($_[0], $i, $skip); |
|
174
|
52
|
|
|
|
|
62
|
$i += $skip; |
|
175
|
52
|
|
|
|
|
85
|
next; |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Here we have a byte that isn't a start byte in a position that |
|
179
|
|
|
|
|
|
|
# should oughta be a start byte. The whole point of this loop is |
|
180
|
|
|
|
|
|
|
# to find such bytes that are CP1252 ones and which were |
|
181
|
|
|
|
|
|
|
# incorrectly inserted by the upstream process into an otherwise |
|
182
|
|
|
|
|
|
|
# valid UTF-8 string. So, if we have such a one, change it into |
|
183
|
|
|
|
|
|
|
# its corresponding correct character. |
|
184
|
54
|
50
|
|
|
|
192
|
if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) { |
|
|
54
|
|
|
|
|
152
|
|
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# The correct character may be UTF-8 bytes. We treat them as |
|
187
|
|
|
|
|
|
|
# just a sequence of non-UTF-8 bytes, because that's what |
|
188
|
|
|
|
|
|
|
# $fixed has in it so far. After everything is consistently |
|
189
|
|
|
|
|
|
|
# added, we turn the UTF-8 flag back on before returning at |
|
190
|
|
|
|
|
|
|
# the end. |
|
191
|
54
|
|
|
|
|
114
|
Encode::_utf8_off($start_byte); |
|
192
|
54
|
|
|
|
|
78
|
$fixed .= $start_byte; |
|
193
|
54
|
|
|
|
|
64
|
$i++; |
|
194
|
54
|
|
|
|
|
113
|
next; |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# Here the byte isn't a CP1252 one. |
|
198
|
0
|
|
|
|
|
0
|
die "Unexpected continuation byte: %02x", ord $start_byte; |
|
199
|
|
|
|
|
|
|
} |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# $fixed now has everything properly in it, but set to return it in |
|
202
|
|
|
|
|
|
|
# $_[0], marked as UTF-8. |
|
203
|
2
|
|
|
|
|
5
|
$_[0] = $fixed; |
|
204
|
2
|
|
|
|
|
5
|
Encode::_utf8_on($_[0]); |
|
205
|
|
|
|
|
|
|
} |
|
206
|
19
|
100
|
|
|
|
175
|
return $_[0] if defined wantarray; |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
1; |
|
210
|
|
|
|
|
|
|
__END__ |