line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::GuessEncoding; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
33351
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
31
|
|
5
|
1
|
|
|
1
|
|
1742
|
use POSIX; |
|
1
|
|
|
|
|
10560
|
|
|
1
|
|
|
|
|
8
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Text::GuessEncoding - Convert Text from almost any encoding to ASCII or UTF8 |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.07 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.07'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
CAUTION: unfinished code. No objects created. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
Text::GuessEncoding gathers statistic about typical and invalid codes in |
25
|
|
|
|
|
|
|
both Latin1 and UTF-8. The concept of 'typical' is currently from an |
26
|
|
|
|
|
|
|
european point of view. |
27
|
|
|
|
|
|
|
Based on this statistics, methods to transform to Latin1 and UTF-8 are provided. These methods handle 'broken' input strings with mixed encodings well. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
The input string may or may not have its utf8 flag set correctly; the flag |
30
|
|
|
|
|
|
|
is ignored. The returned string has the utf8 flag always off, and contains |
31
|
|
|
|
|
|
|
no characters above codepoint 127 (which means it is inside the ASCII |
32
|
|
|
|
|
|
|
character set). If called in a list context, C returns the |
33
|
|
|
|
|
|
|
mapping table as a second value. This mapping table is a hash, using all |
34
|
|
|
|
|
|
|
recognized encodings as keys. (Any well-formed string should only have one |
35
|
|
|
|
|
|
|
encoding, but one can never be sure.) Value per encoding is an array ref, |
36
|
|
|
|
|
|
|
listing all the codepoints in the following form: |
37
|
|
|
|
|
|
|
C<[ [ $codepoint, $replacement_bytecount, [ $offset, ... ] ], ... ]> |
38
|
|
|
|
|
|
|
Offset positions refer to the output string, where byte counts are identical |
39
|
|
|
|
|
|
|
with character counts. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Example: |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
my $guess = new Text::GuessEncoding(); |
44
|
|
|
|
|
|
|
($ascii, $map) = $guess->to_ascii("J\x{fc}rgen \x{c3}\x{bc}\n"); |
45
|
|
|
|
|
|
|
# $ascii = 'Juergen ue'; |
46
|
|
|
|
|
|
|
# $map = { 'utf8' => [252, 2, [8]], 'latin1' => [252, 2, [1]] }; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
The input string contains both utf8 encoded u-umlaut glyph and a plain latin1 byte u-umlaut. |
49
|
|
|
|
|
|
|
The output string is never flagged as utf8. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
($utf8, $map) = $guess->to_utf8("J\x{fc}rgen \x{c3}\x{bc}\n"); |
52
|
|
|
|
|
|
|
# $utf8 = 'J\N{U+fc}rgen \N{U+fc}'; |
53
|
|
|
|
|
|
|
# $map = { 'utf8' => [7], 'latin1' => [1] }; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
C returns a simpler mapping table, as the string preserves more inforation. |
56
|
|
|
|
|
|
|
Note that the offsets differ from to_ascii(), as no multi-character rewriting takes place. |
57
|
|
|
|
|
|
|
The output string is always flagged as utf8. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
use Text::GuessEncoding; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
my $asciitext = Text::GuessEncoding::to_ascii($enctext); |
62
|
|
|
|
|
|
|
my ($asciitext,$mapping) = Text::GuessEncoding::to_ascii($enctext); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 EXPORT |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
C - create plain text in 7-bit ASCII encoding. |
67
|
|
|
|
|
|
|
C - return UTF-8 encoded text . |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 to_ascii |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
C is implemented in perl code as a post-processor of C. |
74
|
|
|
|
|
|
|
It examines C and constructs some useful ascii replacements from these. |
75
|
|
|
|
|
|
|
A number of frequently used codepoint values can be precompiled for speed. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=cut |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub to_ascii |
80
|
|
|
|
|
|
|
{ |
81
|
0
|
|
|
0
|
1
|
|
my ($text) = @_; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# run through the text, searching a byte with the high order bit set. |
84
|
|
|
|
|
|
|
# this then, is a non-ascii byte, and needs conversion. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# We distinguish two cases here: |
87
|
|
|
|
|
|
|
# $text might know that it is utf8, or $text might believe it is not. |
88
|
|
|
|
|
|
|
|
89
|
0
|
0
|
|
|
|
|
if (utf8::is_utf8($text)) |
90
|
|
|
|
|
|
|
{ |
91
|
0
|
|
|
|
|
|
warn "to_ascii() running on a utf8 string"; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
else |
94
|
|
|
|
|
|
|
{ |
95
|
1
|
|
|
1
|
|
6143
|
use Data::Dumper; |
|
1
|
|
|
|
|
15014
|
|
|
1
|
|
|
|
|
2634
|
|
96
|
|
|
|
|
|
|
## exmine the first 24 bytes to guess if this is a 16bit encoding |
97
|
0
|
|
|
|
|
|
warn "to_ascii() running on a non-utf8 string"; |
98
|
0
|
|
|
|
|
|
my @bytes = unpack("C24", $text); |
99
|
0
|
|
|
|
|
|
print Dumper \@bytes; |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
0
|
|
|
|
|
|
while ($text =~ m{[[:^ascii:]]}g) |
103
|
|
|
|
|
|
|
{ |
104
|
0
|
|
|
|
|
|
printf "non-ascii char at pos %d\n", pos($text); |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head2 sysread_tout(FILE, $len, $tout) |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
attempts to read $len bytes from FILE, with a select() timeout of $tout. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub sysread_tout |
115
|
|
|
|
|
|
|
{ |
116
|
0
|
|
|
0
|
1
|
|
my ($FILE, $len, $tout) = @_; |
117
|
0
|
|
|
|
|
|
my $r = ''; |
118
|
0
|
|
|
|
|
|
while ($len > 0) |
119
|
|
|
|
|
|
|
{ |
120
|
0
|
|
|
|
|
|
my $rout; |
121
|
0
|
|
|
|
|
|
my $rin = ''; |
122
|
0
|
|
|
|
|
|
vec($rin,fileno($FILE), 1) = 1; |
123
|
0
|
|
|
|
|
|
my ($n, $t) = select($rout = $rin, undef, undef, $tout); |
124
|
0
|
0
|
|
|
|
|
$tout = $t if defined $t; |
125
|
0
|
0
|
|
|
|
|
last unless $n; |
126
|
0
|
|
|
|
|
|
my $buf = ''; |
127
|
0
|
0
|
|
|
|
|
last if sysread($FILE, $buf, 1) <= 0; |
128
|
0
|
|
|
|
|
|
$r .= $buf; |
129
|
0
|
|
|
|
|
|
$len--; |
130
|
|
|
|
|
|
|
} |
131
|
0
|
|
|
|
|
|
return $r; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 $orig_mode = tty_raw(FILE) |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Uses POSIX::Termios to set the terminal FILE to raw mode. Returns the previous mode so |
137
|
|
|
|
|
|
|
that it can be restored with tty_set(). |
138
|
|
|
|
|
|
|
=cut |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub tty_raw |
141
|
|
|
|
|
|
|
{ |
142
|
0
|
|
|
0
|
1
|
|
my ($FILE) = @_; |
143
|
|
|
|
|
|
|
|
144
|
0
|
|
|
|
|
|
my $t = POSIX::Termios->new; |
145
|
0
|
|
|
|
|
|
my $o = POSIX::Termios->new; |
146
|
0
|
|
|
|
|
|
$t->getattr(fileno $FILE); |
147
|
0
|
|
|
|
|
|
$o->getattr(fileno $FILE); |
148
|
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
|
$t->setlflag(0); # -echo, -icanon |
150
|
0
|
|
|
|
|
|
$t->setcc(POSIX::VMIN, 1); |
151
|
0
|
|
|
|
|
|
$t->setcc(POSIX::VTIME, 0); |
152
|
0
|
|
|
|
|
|
tty_set($FILE, $t); |
153
|
0
|
|
|
|
|
|
return $o; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 tty_set(FILE, $tty_mode) |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Sets the FILE, which is assumed to be a terminal, to mode $tty_mode . |
160
|
|
|
|
|
|
|
=cut |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub tty_set |
163
|
|
|
|
|
|
|
{ |
164
|
0
|
|
|
0
|
1
|
|
my ($FILE, $t) = @_; |
165
|
0
|
0
|
|
|
|
|
$t->setattr(fileno $FILE, POSIX::TCSANOW) or die "TCSANOW failed: $!\n"; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 get_cursor_pos |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Sets tty to raw mode by calling tty_raw(), flushes STDIN, and sends ANSI code 'ESC [ 6 n' |
172
|
|
|
|
|
|
|
(DC6 aka Report cursor Position) and attempts to read the returned position with |
173
|
|
|
|
|
|
|
a 100msec timeout. |
174
|
|
|
|
|
|
|
The terminal is returned to the previous mode, and a hashref containing the keys x, y |
175
|
|
|
|
|
|
|
is returned. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=cut |
178
|
|
|
|
|
|
|
sub get_cursor_pos |
179
|
|
|
|
|
|
|
{ |
180
|
0
|
|
|
0
|
1
|
|
my ($hint) = @_; |
181
|
|
|
|
|
|
|
# 1 may be an ansi term? |
182
|
|
|
|
|
|
|
# testing device status report 6, as seen in vttest. |
183
|
0
|
|
|
|
|
|
my $t = tty_raw(\*STDIN); |
184
|
|
|
|
|
|
|
|
185
|
0
|
|
|
|
|
|
while (length(sysread_tout(\*STDIN, 1, 0.1))) { } |
186
|
|
|
|
|
|
|
|
187
|
0
|
|
|
|
|
|
syswrite(\*STDOUT, "\33[6n", 4); |
188
|
0
|
|
|
|
|
|
my $r = sysread_tout(\*STDIN, 10, 0.1); |
189
|
0
|
|
|
|
|
|
tty_set(\*STDIN, $t); |
190
|
0
|
0
|
|
|
|
|
return { x => $2 - 1, y => $1 - 1, hint => 'DC6' } if $r =~ m{^\33\[(\d+);(\d+)R}; |
191
|
0
|
|
|
|
|
|
return undef; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 probe_tty |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Prints a cariage return (no linefeed), to move the cursor to a defined column |
198
|
|
|
|
|
|
|
position. |
199
|
|
|
|
|
|
|
Prints a few test characters to STDOUT and calls get_cursor_pos() to check how |
200
|
|
|
|
|
|
|
the terminal reacts upon each, e.g. by (not) advancing the cursor position |
201
|
|
|
|
|
|
|
one or multiple positions. |
202
|
|
|
|
|
|
|
Then restores the cursor to the carriage return position. |
203
|
|
|
|
|
|
|
=cut |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub probe_tty |
206
|
|
|
|
|
|
|
{ |
207
|
|
|
|
|
|
|
# |
208
|
|
|
|
|
|
|
# we can use STDIN and STDERR. |
209
|
|
|
|
|
|
|
# 0) first, see, if the terminal can report cursor positions. |
210
|
0
|
|
|
0
|
1
|
|
syswrite(STDOUT, "\r", 1); |
211
|
0
|
|
|
|
|
|
my $o = get_cursor_pos(); |
212
|
0
|
0
|
|
|
|
|
print ", x=$o->{x}\n" if $::verbose > 1; |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# - if not, abort. |
215
|
0
|
0
|
|
|
|
|
die "get_cursor_pos failed.\n" unless defined $o; |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# - if it can, store the current position |
218
|
0
|
0
|
|
|
|
|
if ($o->{x} != 0) |
219
|
|
|
|
|
|
|
{ |
220
|
0
|
0
|
|
|
|
|
warn "strace (or other) output interferes or\n" if $o->{x} >= 20; |
221
|
0
|
|
|
|
|
|
die "carriage return does not work.\n"; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# 1) write a single byte ascii character, 'X' and check, |
225
|
|
|
|
|
|
|
# if it advances by one. |
226
|
0
|
|
|
|
|
|
syswrite(STDOUT, "\rX", 2); |
227
|
0
|
|
|
|
|
|
my $p = get_cursor_pos($o->{hint}); |
228
|
0
|
0
|
|
|
|
|
print ", x=$p->{x}\n" if $::verbose; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# - If not, it is probably in microsoft-multibyte encoding, |
232
|
|
|
|
|
|
|
# and requires '\0' prefixing. check this, report and abort. |
233
|
0
|
0
|
|
|
|
|
die "multi-byte mode" if $p->{x} != 1; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# 2)Then try non-ascii characters, e.g. a-umlaut. |
236
|
|
|
|
|
|
|
# 2a) send its latin1 code, and see what happens, |
237
|
0
|
|
|
|
|
|
syswrite(STDOUT, "\r1\34434", 5); # 1, a-umlaut-latin1, 3, 4 |
238
|
0
|
|
|
|
|
|
$p = get_cursor_pos($o->{hint}); |
239
|
0
|
0
|
|
|
|
|
print ", x=$p->{x}\n" if $::verbose; |
240
|
0
|
0
|
|
|
|
|
die "no report" unless defined $p; |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# - no advance indicates that the terminal is not in latin1 mode |
243
|
|
|
|
|
|
|
# or a lousy font is used. |
244
|
|
|
|
|
|
|
# - advance by 2 indicates a defect in the tty-emulator. |
245
|
0
|
0
|
0
|
|
|
|
die "latin1 a-umlaut caused confusion." if $p->{x} > 4 or $p->{x} < 2; |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# in utf8, our \344 consumes another char, thus the '3' is not printed. |
248
|
|
|
|
|
|
|
# we don't know what the font does then. |
249
|
0
|
0
|
0
|
|
|
|
my $maybe = 'utf8' if $p->{x} == 2 or $p->{x} == 3; |
250
|
0
|
0
|
|
|
|
|
$maybe = 'latin1' if $p->{x} == 4; |
251
|
0
|
0
|
|
|
|
|
print "maybe $maybe\n" if $::verbose; |
252
|
|
|
|
|
|
|
# - advance by 1 says nothing, may be latin1. |
253
|
|
|
|
|
|
|
# 2b) send its utf8 code. |
254
|
|
|
|
|
|
|
|
255
|
0
|
|
|
|
|
|
syswrite(STDOUT, "\r1\303\24434", 6); # 1, a-umlaut-utf8, 3, 4 |
256
|
0
|
|
|
|
|
|
$p = get_cursor_pos($o->{hint}); |
257
|
0
|
0
|
|
|
|
|
print ", x=$p->{x}\n" if $::verbose; |
258
|
|
|
|
|
|
|
|
259
|
0
|
0
|
|
|
|
|
die "no report" unless defined $p; |
260
|
|
|
|
|
|
|
# - no advance indicates that a lousy font is used. |
261
|
|
|
|
|
|
|
# - advance by one indicates that the terminal is in utf8 mode. |
262
|
|
|
|
|
|
|
# - advance by two indicates that the terminal is in latin1 mode. |
263
|
|
|
|
|
|
|
|
264
|
0
|
0
|
|
|
|
|
syswrite(STDOUT, "\r \r", 8) unless $::verbose; # clear scratch area |
265
|
|
|
|
|
|
|
|
266
|
0
|
0
|
|
|
|
|
if ($p->{x} == 4) |
267
|
|
|
|
|
|
|
{ |
268
|
0
|
0
|
|
|
|
|
return 'utf8' if $maybe eq 'utf8'; |
269
|
0
|
|
|
|
|
|
return 'possibly utf8'; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
0
|
0
|
|
|
|
|
return 'latin1' if $maybe eq 'latin1'; |
273
|
0
|
|
|
|
|
|
return 'possibly latin1'; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head2 probe_file |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
C contains all the material, from which we should build C. |
280
|
|
|
|
|
|
|
=cut |
281
|
|
|
|
|
|
|
## |
282
|
|
|
|
|
|
|
## if utf8_valid is positive, then it can only be utf-8. |
283
|
|
|
|
|
|
|
## (if also utf8_invalid and/or latin1_typ are positive, then it is a mixture) |
284
|
|
|
|
|
|
|
## if only utf8_invalid or latin1_typ are positive, then it is latin1. |
285
|
|
|
|
|
|
|
## if all 3 are zero, it is plain ascii. |
286
|
|
|
|
|
|
|
## |
287
|
|
|
|
|
|
|
## FIXME: should take an optional length parameter to limit runtime. |
288
|
|
|
|
|
|
|
## |
289
|
|
|
|
|
|
|
sub probe_file |
290
|
|
|
|
|
|
|
{ |
291
|
0
|
|
|
0
|
1
|
|
my ($fd, $name) = @_; |
292
|
|
|
|
|
|
|
# print "probing $name\n" if $::verbose; |
293
|
|
|
|
|
|
|
|
294
|
0
|
|
|
|
|
|
my %typ_latin = map { $_ => 1 } qw(169 171 174 176 177 178 179 181 |
|
0
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
185 187 191 192 193 194 195 196 197 199 200 201 202 203 204 205 206 207 208 209 |
296
|
|
|
|
|
|
|
210 211 212 213 214 215 216 217 218 219 220 |
297
|
|
|
|
|
|
|
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 |
298
|
|
|
|
|
|
|
246 249 250 251 252 253 189 164); |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# when running incremental, $fd is probably not seekable. |
302
|
|
|
|
|
|
|
# so we need to buffer characters to be re-read after a lookahead. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
# http://de.wikipedia.org/wiki/UTF-8#Kodierung |
305
|
|
|
|
|
|
|
|
306
|
0
|
|
|
|
|
|
my $utf8_valid = 0; # parser happy. |
307
|
0
|
|
|
|
|
|
my $utf8_invalid = 0; # something wrong. |
308
|
0
|
|
|
|
|
|
my $latin1_typ = 0; # valid chars in 128..255 range followed by a ascii byte |
309
|
0
|
|
|
|
|
|
my $ascii = 0; # char in 10..127 range |
310
|
0
|
|
|
|
|
|
my $utf8_size = 0; # how many bytes belong to this utf-8 char. |
311
|
0
|
|
|
|
|
|
my $utf8_len = 0; # how many more bytes belong to this utf-8 char. |
312
|
0
|
|
|
|
|
|
my $utf8_start = 0; # ord of utf_8 start char. |
313
|
|
|
|
|
|
|
|
314
|
0
|
|
|
|
|
|
while (defined(my $c = getc($fd))) |
315
|
|
|
|
|
|
|
{ |
316
|
0
|
|
|
|
|
|
my $v = ord($c); |
317
|
0
|
0
|
0
|
|
|
|
if ($utf8_len) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
318
|
|
|
|
|
|
|
{ |
319
|
0
|
0
|
|
|
|
|
if (($v & 0xc0) == 0x80) # 10xx xxxx |
320
|
|
|
|
|
|
|
{ |
321
|
|
|
|
|
|
|
# printf "0 %02x\n", $v; |
322
|
0
|
0
|
|
|
|
|
unless (--$utf8_len) |
323
|
|
|
|
|
|
|
{ |
324
|
0
|
|
|
|
|
|
$utf8_valid++; |
325
|
0
|
|
|
|
|
|
$utf8_size = 0; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
else |
329
|
|
|
|
|
|
|
{ |
330
|
|
|
|
|
|
|
# printf "0x %02x %02x '$c' $utf8_size-$utf8_len\n", $utf8_start, $v; |
331
|
0
|
0
|
0
|
|
|
|
if (($utf8_size - $utf8_len) == 1 and $typ_latin{$utf8_start}) |
332
|
|
|
|
|
|
|
{ |
333
|
0
|
0
|
0
|
|
|
|
if ($v > 7 && $v < 128) |
|
|
0
|
|
|
|
|
|
334
|
|
|
|
|
|
|
{ |
335
|
0
|
|
|
|
|
|
$latin1_typ++; |
336
|
0
|
|
|
|
|
|
$ascii++; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
elsif ($typ_latin{$v}) |
339
|
|
|
|
|
|
|
{ |
340
|
0
|
|
|
|
|
|
$latin1_typ += 2; |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
else |
343
|
|
|
|
|
|
|
{ |
344
|
0
|
|
|
|
|
|
$utf8_invalid++; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
else |
348
|
|
|
|
|
|
|
{ |
349
|
0
|
|
|
|
|
|
$utf8_invalid++; |
350
|
|
|
|
|
|
|
} |
351
|
0
|
|
|
|
|
|
$utf8_len = $utf8_size = $utf8_start = 0; |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
} |
354
|
|
|
|
|
|
|
elsif ($v > 7 && $v < 128) |
355
|
|
|
|
|
|
|
{ |
356
|
0
|
|
|
|
|
|
$ascii++; |
357
|
0
|
|
|
|
|
|
next; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
elsif (($v & 0xe0) == 0xc0) # 110x xxxx |
360
|
|
|
|
|
|
|
{ |
361
|
0
|
|
|
|
|
|
$utf8_start = $v; |
362
|
0
|
|
|
|
|
|
$utf8_size = 2; |
363
|
0
|
|
|
|
|
|
$utf8_len = 1; |
364
|
|
|
|
|
|
|
# printf "1 %02x\n", $v; |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
elsif (($v & 0xf0) == 0xe0) # 1110 xxxx |
367
|
|
|
|
|
|
|
{ |
368
|
0
|
|
|
|
|
|
$utf8_start = $v; |
369
|
0
|
|
|
|
|
|
$utf8_size = 3; |
370
|
0
|
|
|
|
|
|
$utf8_len = 2; |
371
|
|
|
|
|
|
|
# printf "2 %02x\n", $v; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
elsif (($v & 0xf8) == 0xf0) # 1111 0xxx |
374
|
|
|
|
|
|
|
{ |
375
|
0
|
|
|
|
|
|
$utf8_start = $v; |
376
|
0
|
|
|
|
|
|
$utf8_size = 4; |
377
|
0
|
|
|
|
|
|
$utf8_len = 3; |
378
|
|
|
|
|
|
|
# printf "3 %02x\n", $v; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
elsif ($typ_latin{$v}) |
381
|
|
|
|
|
|
|
{ |
382
|
0
|
|
|
|
|
|
$latin1_typ++; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
else |
385
|
|
|
|
|
|
|
{ |
386
|
0
|
|
|
|
|
|
$utf8_invalid++; |
387
|
|
|
|
|
|
|
# printf "x %02x\n", $v; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
} |
390
|
0
|
|
|
|
|
|
print "$name: utf8_valid=$utf8_valid utf8_invalid=$utf8_invalid latin1_typ=$latin1_typ ascii=$ascii\n"; |
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=head2 utf8toascii |
394
|
|
|
|
|
|
|
convert a well formed utf8 file into ascii transcript. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
#! /usr/bin/perl -w -T |
397
|
|
|
|
|
|
|
# |
398
|
|
|
|
|
|
|
# utf8toascii -- convert a well formed utf8 file into ascii transcript. |
399
|
|
|
|
|
|
|
# |
400
|
|
|
|
|
|
|
# 2010-01-09, jw -- initial draft. |
401
|
|
|
|
|
|
|
# |
402
|
|
|
|
|
|
|
# perl -e 'use Encode; use charnames ":full"; map { print charnames::viacode($_). "\n" } unpack "W*", Encode::decode_utf8("J\x{c3}\x{bc}rgen fl\x{ef}\x{ac}\x{82} ü\x{c2}\x{a8}\n")' |
403
|
|
|
|
|
|
|
# |
404
|
|
|
|
|
|
|
#use charnames ":full"; |
405
|
|
|
|
|
|
|
# |
406
|
|
|
|
|
|
|
#my $ofd = \*STDOUT; |
407
|
|
|
|
|
|
|
#my $ifd = \*STDIN; |
408
|
|
|
|
|
|
|
# |
409
|
|
|
|
|
|
|
#print STDERR utf8toascii($ifd, $ofd); |
410
|
|
|
|
|
|
|
#exit 0; |
411
|
|
|
|
|
|
|
############# |
412
|
|
|
|
|
|
|
=cut |
413
|
|
|
|
|
|
|
sub utf8toascii |
414
|
|
|
|
|
|
|
{ |
415
|
0
|
|
|
0
|
1
|
|
my ($ifd, $ofd) = @_; |
416
|
0
|
|
|
|
|
|
my $msg = ''; |
417
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
|
binmode($ifd, ":utf8"); |
419
|
|
|
|
|
|
|
|
420
|
0
|
|
|
|
|
|
my %ascii_map = |
421
|
|
|
|
|
|
|
( |
422
|
|
|
|
|
|
|
'RIGHT DOUBLE QUOTATION MARK' => "``", |
423
|
|
|
|
|
|
|
'LEFT DOUBLE QUOTATION MARK' => "''", |
424
|
|
|
|
|
|
|
'RIGHT SINGLE QUOTATION MARK' => "`", |
425
|
|
|
|
|
|
|
'LEFT SINGLE QUOTATION MARK' => "'", |
426
|
|
|
|
|
|
|
'MODIFIER LETTER TURNED COMMA' => "'", |
427
|
|
|
|
|
|
|
'MODIFIER LETTER VERTICAL LINE' => "|", |
428
|
|
|
|
|
|
|
'RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK' => ">>", |
429
|
|
|
|
|
|
|
'LEFT-POINTING DOUBLE ANGLE QUOTATION MARK' => "<<", |
430
|
|
|
|
|
|
|
'DOUBLE ACUTE ACCENT' => '"', |
431
|
|
|
|
|
|
|
'ACUTE ACCENT' => "'", |
432
|
|
|
|
|
|
|
'DOUBLE LOW-9 QUOTATION MARK' => ',,', |
433
|
|
|
|
|
|
|
'HORIZONTAL ELLIPSIS' => '...', |
434
|
|
|
|
|
|
|
'LATIN SMALL LETTER A WITH DIAERESIS' => "ae", |
435
|
|
|
|
|
|
|
'LATIN SMALL LETTER O WITH DIAERESIS' => "oe", |
436
|
|
|
|
|
|
|
'LATIN SMALL LETTER U WITH DIAERESIS' => "ue", |
437
|
|
|
|
|
|
|
'LATIN CAPITAL LETTER A WITH DIAERESIS' => "Ae", |
438
|
|
|
|
|
|
|
'LATIN CAPITAL LETTER O WITH DIAERESIS' => "Oe", |
439
|
|
|
|
|
|
|
'LATIN CAPITAL LETTER U WITH DIAERESIS' => "Ue", |
440
|
|
|
|
|
|
|
'LATIN SMALL LETTER SHARP S' => "ss", |
441
|
|
|
|
|
|
|
'LATIN SMALL LIGATURE FL' => "fl", |
442
|
|
|
|
|
|
|
'LATIN SMALL LIGATURE FI' => "fi", |
443
|
|
|
|
|
|
|
'LATIN SMALL LIGATURE FF' => "ff", |
444
|
|
|
|
|
|
|
'LATIN SMALL LIGATURE FFI' => "ffi", |
445
|
|
|
|
|
|
|
'LATIN SMALL LIGATURE FFL' => "ffl", |
446
|
|
|
|
|
|
|
'NON-BREAKING HYPHEN' => "-", |
447
|
|
|
|
|
|
|
'SOFT HYPHEN' => "-", |
448
|
|
|
|
|
|
|
'EN DASH' => "-", |
449
|
|
|
|
|
|
|
'EM DASH' => "--", |
450
|
|
|
|
|
|
|
'SECTION SIGN' => "#", |
451
|
|
|
|
|
|
|
'DIVISION SIGN' => "/", |
452
|
|
|
|
|
|
|
'WHITE BULLET' => "^", |
453
|
|
|
|
|
|
|
'DEGREE SIGN' => "^", |
454
|
|
|
|
|
|
|
'DAGGER' => "+", |
455
|
|
|
|
|
|
|
'DOUBLE DAGGER' => "(++)", |
456
|
|
|
|
|
|
|
'FEMININE ORDINAL INDICATOR' => "(x)", |
457
|
|
|
|
|
|
|
'BLACK DIAMOND SUIT' => "(*)", |
458
|
|
|
|
|
|
|
'BULLET' => "*", |
459
|
|
|
|
|
|
|
'ASTERISK OPERATOR' => "*", |
460
|
|
|
|
|
|
|
'MULTIPLICATION SIGN' => "*", |
461
|
|
|
|
|
|
|
'INCREMENT' => "-", |
462
|
|
|
|
|
|
|
'COPYRIGHT SIGN' => "(c)", |
463
|
|
|
|
|
|
|
'REGISTERED SIGN' => "(R)", |
464
|
|
|
|
|
|
|
'TRADE MARK SIGN' => "(TM)", |
465
|
|
|
|
|
|
|
'BLACK FOUR POINTED STAR' => "+", |
466
|
|
|
|
|
|
|
'NARROW NO-BREAK SPACE' => " ", |
467
|
|
|
|
|
|
|
'NO-BREAK SPACE' => " ", |
468
|
|
|
|
|
|
|
'FIGURE SPACE' => " ", |
469
|
|
|
|
|
|
|
'EM SPACE' => " ", |
470
|
|
|
|
|
|
|
'DIE FACE-6' => "[6]", |
471
|
|
|
|
|
|
|
'DIE FACE-5' => "[5]", |
472
|
|
|
|
|
|
|
'DIE FACE-4' => "[4]", |
473
|
|
|
|
|
|
|
'DIE FACE-3' => "[3]", |
474
|
|
|
|
|
|
|
'DIE FACE-2' => "[2]", |
475
|
|
|
|
|
|
|
'DIE FACE-1' => "[1]", |
476
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT ONE' => "(1)", |
477
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT TWO' => "(2)", |
478
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT THREE'=> "(3)", |
479
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT FOUR' => "(4)", |
480
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT FIVE' => "(5)", |
481
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT SIX' => "(6)", |
482
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT SEVEN'=> "(7)", |
483
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT EIGHT'=> "(8)", |
484
|
|
|
|
|
|
|
'DINGBAT NEGATIVE CIRCLED DIGIT NINE' => "(9)", |
485
|
|
|
|
|
|
|
'WHITE SQUARE' => "[ ]", |
486
|
|
|
|
|
|
|
'UPWARDS ARROW' => "^", |
487
|
|
|
|
|
|
|
'DRAFTING POINT RIGHTWARDS ARROW' => "(->)", |
488
|
|
|
|
|
|
|
'VULGAR FRACTION ONE HALF' => "(1/2)", |
489
|
|
|
|
|
|
|
'VULGAR FRACTION THREE QUARTERS' => "(3/4)", |
490
|
|
|
|
|
|
|
); |
491
|
|
|
|
|
|
|
|
492
|
0
|
|
|
|
|
|
while (defined(my $ch = getc($ifd))) |
493
|
|
|
|
|
|
|
{ |
494
|
0
|
|
|
|
|
|
my $v = ord($ch); |
495
|
0
|
0
|
|
|
|
|
if ($v < 128) |
496
|
|
|
|
|
|
|
{ |
497
|
0
|
|
|
|
|
|
print $ofd $ch; |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
else |
500
|
|
|
|
|
|
|
{ |
501
|
0
|
|
|
|
|
|
my $name = charnames::viacode($v); |
502
|
0
|
0
|
|
|
|
|
if (defined(my $a = $ascii_map{$name})) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
503
|
|
|
|
|
|
|
{ |
504
|
0
|
|
|
|
|
|
print $ofd $a; |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
elsif ($name =~ m{^LATIN (SMALL )?LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w)}) |
507
|
|
|
|
|
|
|
{ |
508
|
0
|
|
|
|
|
|
print $ofd lc $3; |
509
|
|
|
|
|
|
|
} |
510
|
|
|
|
|
|
|
elsif ($name =~ m{^LATIN CAPITAL LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w)}) |
511
|
|
|
|
|
|
|
{ |
512
|
0
|
|
|
|
|
|
print $ofd uc $2; |
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
elsif ($name =~ m{^(ARABIC|GREEK) (SMALL )?LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w+)$}) |
515
|
|
|
|
|
|
|
{ |
516
|
0
|
|
|
|
|
|
print $ofd lc "-$4-"; |
517
|
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
elsif ($name =~ m{^(ARABIC|GREEK) CAPITAL LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w+)$}) |
519
|
|
|
|
|
|
|
{ |
520
|
0
|
|
|
|
|
|
print $ofd uc "-$3-"; |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
else |
523
|
|
|
|
|
|
|
{ |
524
|
0
|
|
|
|
|
|
printf $ofd "[[%x='$name']]", $v; |
525
|
0
|
|
|
|
|
|
$msg .= sprintf "unknown %x='$name'\n", $v; |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
} |
529
|
0
|
|
|
|
|
|
return $msg; |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head1 AUTHOR |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
Juergen Weigert, C<< >> |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head1 BUGS |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through |
540
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll |
541
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
=head1 SUPPORT |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
perldoc Text::GuessEncoding |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
You can also look for information at: |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=over 4 |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
L |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
L |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
=item * CPAN Ratings |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
L |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
=item * Search CPAN |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
L |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=back |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Copyright 2010 Juergen Weigert. |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
584
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
585
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=cut |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
1; # End of Text::GuessEncoding |