File Coverage

blib/lib/Text/GuessEncoding.pm

Criterion	Covered	Total	%
statement	12	140	8.5
branch	0	82	0.0
condition	0	15	0.0
subroutine	4	12	33.3
pod	8	8	100.0
total	24	257	9.3

line	stmt	bran	cond	sub	pod	time	code
1							package Text::GuessEncoding;
2
3	1			1		33351	use warnings;
	1					2
	1					31
4	1			1		6	use strict;
	1					1
	1					31
5	1			1		1742	use POSIX;
	1					10560
	1					8
6
7							=head1 NAME
8
9							Text::GuessEncoding - Convert Text from almost any encoding to ASCII or UTF8
10
11							=head1 VERSION
12
13							Version 0.07
14
15							=cut
16
17							our $VERSION = '0.07';
18
19
20							=head1 SYNOPSIS
21
22							CAUTION: unfinished code. No objects created.
23
24							Text::GuessEncoding gathers statistic about typical and invalid codes in
25							both Latin1 and UTF-8. The concept of 'typical' is currently from an
26							european point of view.
27							Based on this statistics, methods to transform to Latin1 and UTF-8 are provided. These methods handle 'broken' input strings with mixed encodings well.
28
29							The input string may or may not have its utf8 flag set correctly; the flag
30							is ignored. The returned string has the utf8 flag always off, and contains
31							no characters above codepoint 127 (which means it is inside the ASCII
32							character set). If called in a list context, C returns the
33							mapping table as a second value. This mapping table is a hash, using all
34							recognized encodings as keys. (Any well-formed string should only have one
35							encoding, but one can never be sure.) Value per encoding is an array ref,
36							listing all the codepoints in the following form:
37							C<[ [ $codepoint, $replacement_bytecount, [ $offset, ... ] ], ... ]>
38							Offset positions refer to the output string, where byte counts are identical
39							with character counts.
40
41							Example:
42
43							my $guess = new Text::GuessEncoding();
44							($ascii, $map) = $guess->to_ascii("J\x{fc}rgen \x{c3}\x{bc}\n");
45							# $ascii = 'Juergen ue';
46							# $map = { 'utf8' => [252, 2, [8]], 'latin1' => [252, 2, [1]] };
47
48							The input string contains both utf8 encoded u-umlaut glyph and a plain latin1 byte u-umlaut.
49							The output string is never flagged as utf8.
50
51							($utf8, $map) = $guess->to_utf8("J\x{fc}rgen \x{c3}\x{bc}\n");
52							# $utf8 = 'J\N{U+fc}rgen \N{U+fc}';
53							# $map = { 'utf8' => [7], 'latin1' => [1] };
54
55							C returns a simpler mapping table, as the string preserves more inforation.
56							Note that the offsets differ from to_ascii(), as no multi-character rewriting takes place.
57							The output string is always flagged as utf8.
58
59							use Text::GuessEncoding;
60
61							my $asciitext = Text::GuessEncoding::to_ascii($enctext);
62							my ($asciitext,$mapping) = Text::GuessEncoding::to_ascii($enctext);
63
64							=head1 EXPORT
65
66							C - create plain text in 7-bit ASCII encoding.
67							C - return UTF-8 encoded text .
68
69							=head1 SUBROUTINES/METHODS
70
71							=head2 to_ascii
72
73							C is implemented in perl code as a post-processor of C.
74							It examines C and constructs some useful ascii replacements from these.
75							A number of frequently used codepoint values can be precompiled for speed.
76
77							=cut
78
79							sub to_ascii
80							{
81	0			0	1		my ($text) = @_;
82
83							# run through the text, searching a byte with the high order bit set.
84							# this then, is a non-ascii byte, and needs conversion.
85
86							# We distinguish two cases here:
87							# $text might know that it is utf8, or $text might believe it is not.
88
89	0	0					if (utf8::is_utf8($text))
90							{
91	0						warn "to_ascii() running on a utf8 string";
92							}
93							else
94							{
95	1			1		6143	use Data::Dumper;
	1					15014
	1					2634
96							## exmine the first 24 bytes to guess if this is a 16bit encoding
97	0						warn "to_ascii() running on a non-utf8 string";
98	0						my @bytes = unpack("C24", $text);
99	0						print Dumper \@bytes;
100							}
101
102	0						while ($text =~ m{[[:^ascii:]]}g)
103							{
104	0						printf "non-ascii char at pos %d\n", pos($text);
105							}
106							}
107
108							=head2 sysread_tout(FILE, $len, $tout)
109
110							attempts to read $len bytes from FILE, with a select() timeout of $tout.
111
112							=cut
113
114							sub sysread_tout
115							{
116	0			0	1		my ($FILE, $len, $tout) = @_;
117	0						my $r = '';
118	0						while ($len > 0)
119							{
120	0						my $rout;
121	0						my $rin = '';
122	0						vec($rin,fileno($FILE), 1) = 1;
123	0						my ($n, $t) = select($rout = $rin, undef, undef, $tout);
124	0	0					$tout = $t if defined $t;
125	0	0					last unless $n;
126	0						my $buf = '';
127	0	0					last if sysread($FILE, $buf, 1) <= 0;
128	0						$r .= $buf;
129	0						$len--;
130							}
131	0						return $r;
132							}
133
134							=head2 $orig_mode = tty_raw(FILE)
135
136							Uses POSIX::Termios to set the terminal FILE to raw mode. Returns the previous mode so
137							that it can be restored with tty_set().
138							=cut
139
140							sub tty_raw
141							{
142	0			0	1		my ($FILE) = @_;
143
144	0						my $t = POSIX::Termios->new;
145	0						my $o = POSIX::Termios->new;
146	0						$t->getattr(fileno $FILE);
147	0						$o->getattr(fileno $FILE);
148
149	0						$t->setlflag(0); # -echo, -icanon
150	0						$t->setcc(POSIX::VMIN, 1);
151	0						$t->setcc(POSIX::VTIME, 0);
152	0						tty_set($FILE, $t);
153	0						return $o;
154							}
155
156
157							=head2 tty_set(FILE, $tty_mode)
158
159							Sets the FILE, which is assumed to be a terminal, to mode $tty_mode .
160							=cut
161
162							sub tty_set
163							{
164	0			0	1		my ($FILE, $t) = @_;
165	0	0					$t->setattr(fileno $FILE, POSIX::TCSANOW) or die "TCSANOW failed: $!\n";
166							}
167
168
169							=head2 get_cursor_pos
170
171							Sets tty to raw mode by calling tty_raw(), flushes STDIN, and sends ANSI code 'ESC [ 6 n'
172							(DC6 aka Report cursor Position) and attempts to read the returned position with
173							a 100msec timeout.
174							The terminal is returned to the previous mode, and a hashref containing the keys x, y
175							is returned.
176
177							=cut
178							sub get_cursor_pos
179							{
180	0			0	1		my ($hint) = @_;
181							# 1 may be an ansi term?
182							# testing device status report 6, as seen in vttest.
183	0						my $t = tty_raw(\*STDIN);
184
185	0						while (length(sysread_tout(\*STDIN, 1, 0.1))) { }
186
187	0						syswrite(\*STDOUT, "\33[6n", 4);
188	0						my $r = sysread_tout(\*STDIN, 10, 0.1);
189	0						tty_set(\*STDIN, $t);
190	0	0					return { x => $2 - 1, y => $1 - 1, hint => 'DC6' } if $r =~ m{^\33\[(\d+);(\d+)R};
191	0						return undef;
192							}
193
194
195							=head2 probe_tty
196
197							Prints a cariage return (no linefeed), to move the cursor to a defined column
198							position.
199							Prints a few test characters to STDOUT and calls get_cursor_pos() to check how
200							the terminal reacts upon each, e.g. by (not) advancing the cursor position
201							one or multiple positions.
202							Then restores the cursor to the carriage return position.
203							=cut
204
205							sub probe_tty
206							{
207							#
208							# we can use STDIN and STDERR.
209							# 0) first, see, if the terminal can report cursor positions.
210	0			0	1		syswrite(STDOUT, "\r", 1);
211	0						my $o = get_cursor_pos();
212	0	0					print ", x=$o->{x}\n" if $::verbose > 1;
213
214							# - if not, abort.
215	0	0					die "get_cursor_pos failed.\n" unless defined $o;
216
217							# - if it can, store the current position
218	0	0					if ($o->{x} != 0)
219							{
220	0	0					warn "strace (or other) output interferes or\n" if $o->{x} >= 20;
221	0						die "carriage return does not work.\n";
222							}
223
224							# 1) write a single byte ascii character, 'X' and check,
225							# if it advances by one.
226	0						syswrite(STDOUT, "\rX", 2);
227	0						my $p = get_cursor_pos($o->{hint});
228	0	0					print ", x=$p->{x}\n" if $::verbose;
229
230
231							# - If not, it is probably in microsoft-multibyte encoding,
232							# and requires '\0' prefixing. check this, report and abort.
233	0	0					die "multi-byte mode" if $p->{x} != 1;
234
235							# 2)Then try non-ascii characters, e.g. a-umlaut.
236							# 2a) send its latin1 code, and see what happens,
237	0						syswrite(STDOUT, "\r1\34434", 5); # 1, a-umlaut-latin1, 3, 4
238	0						$p = get_cursor_pos($o->{hint});
239	0	0					print ", x=$p->{x}\n" if $::verbose;
240	0	0					die "no report" unless defined $p;
241
242							# - no advance indicates that the terminal is not in latin1 mode
243							# or a lousy font is used.
244							# - advance by 2 indicates a defect in the tty-emulator.
245	0	0	0				die "latin1 a-umlaut caused confusion." if $p->{x} > 4 or $p->{x} < 2;
246
247							# in utf8, our \344 consumes another char, thus the '3' is not printed.
248							# we don't know what the font does then.
249	0	0	0				my $maybe = 'utf8' if $p->{x} == 2 or $p->{x} == 3;
250	0	0					$maybe = 'latin1' if $p->{x} == 4;
251	0	0					print "maybe $maybe\n" if $::verbose;
252							# - advance by 1 says nothing, may be latin1.
253							# 2b) send its utf8 code.
254
255	0						syswrite(STDOUT, "\r1\303\24434", 6); # 1, a-umlaut-utf8, 3, 4
256	0						$p = get_cursor_pos($o->{hint});
257	0	0					print ", x=$p->{x}\n" if $::verbose;
258
259	0	0					die "no report" unless defined $p;
260							# - no advance indicates that a lousy font is used.
261							# - advance by one indicates that the terminal is in utf8 mode.
262							# - advance by two indicates that the terminal is in latin1 mode.
263
264	0	0					syswrite(STDOUT, "\r \r", 8) unless $::verbose; # clear scratch area
265
266	0	0					if ($p->{x} == 4)
267							{
268	0	0					return 'utf8' if $maybe eq 'utf8';
269	0						return 'possibly utf8';
270							}
271
272	0	0					return 'latin1' if $maybe eq 'latin1';
273	0						return 'possibly latin1';
274							}
275
276
277							=head2 probe_file
278
279							C contains all the material, from which we should build C.
280							=cut
281							##
282							## if utf8_valid is positive, then it can only be utf-8.
283							## (if also utf8_invalid and/or latin1_typ are positive, then it is a mixture)
284							## if only utf8_invalid or latin1_typ are positive, then it is latin1.
285							## if all 3 are zero, it is plain ascii.
286							##
287							## FIXME: should take an optional length parameter to limit runtime.
288							##
289							sub probe_file
290							{
291	0			0	1		my ($fd, $name) = @_;
292							# print "probing $name\n" if $::verbose;
293
294	0						my %typ_latin = map { $_ => 1 } qw(169 171 174 176 177 178 179 181
	0
295							185 187 191 192 193 194 195 196 197 199 200 201 202 203 204 205 206 207 208 209
296							210 211 212 213 214 215 216 217 218 219 220
297							223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
298							246 249 250 251 252 253 189 164);
299
300
301							# when running incremental, $fd is probably not seekable.
302							# so we need to buffer characters to be re-read after a lookahead.
303
304							# http://de.wikipedia.org/wiki/UTF-8#Kodierung
305
306	0						my $utf8_valid = 0; # parser happy.
307	0						my $utf8_invalid = 0; # something wrong.
308	0						my $latin1_typ = 0; # valid chars in 128..255 range followed by a ascii byte
309	0						my $ascii = 0; # char in 10..127 range
310	0						my $utf8_size = 0; # how many bytes belong to this utf-8 char.
311	0						my $utf8_len = 0; # how many more bytes belong to this utf-8 char.
312	0						my $utf8_start = 0; # ord of utf_8 start char.
313
314	0						while (defined(my $c = getc($fd)))
315							{
316	0						my $v = ord($c);
317	0	0	0				if ($utf8_len)
		0
		0
		0
		0
		0
318							{
319	0	0					if (($v & 0xc0) == 0x80) # 10xx xxxx
320							{
321							# printf "0 %02x\n", $v;
322	0	0					unless (--$utf8_len)
323							{
324	0						$utf8_valid++;
325	0						$utf8_size = 0;
326							}
327							}
328							else
329							{
330							# printf "0x %02x %02x '$c' $utf8_size-$utf8_len\n", $utf8_start, $v;
331	0	0	0				if (($utf8_size - $utf8_len) == 1 and $typ_latin{$utf8_start})
332							{
333	0	0	0				if ($v > 7 && $v < 128)
		0
334							{
335	0						$latin1_typ++;
336	0						$ascii++;
337							}
338							elsif ($typ_latin{$v})
339							{
340	0						$latin1_typ += 2;
341							}
342							else
343							{
344	0						$utf8_invalid++;
345							}
346							}
347							else
348							{
349	0						$utf8_invalid++;
350							}
351	0						$utf8_len = $utf8_size = $utf8_start = 0;
352							}
353							}
354							elsif ($v > 7 && $v < 128)
355							{
356	0						$ascii++;
357	0						next;
358							}
359							elsif (($v & 0xe0) == 0xc0) # 110x xxxx
360							{
361	0						$utf8_start = $v;
362	0						$utf8_size = 2;
363	0						$utf8_len = 1;
364							# printf "1 %02x\n", $v;
365							}
366							elsif (($v & 0xf0) == 0xe0) # 1110 xxxx
367							{
368	0						$utf8_start = $v;
369	0						$utf8_size = 3;
370	0						$utf8_len = 2;
371							# printf "2 %02x\n", $v;
372							}
373							elsif (($v & 0xf8) == 0xf0) # 1111 0xxx
374							{
375	0						$utf8_start = $v;
376	0						$utf8_size = 4;
377	0						$utf8_len = 3;
378							# printf "3 %02x\n", $v;
379							}
380							elsif ($typ_latin{$v})
381							{
382	0						$latin1_typ++;
383							}
384							else
385							{
386	0						$utf8_invalid++;
387							# printf "x %02x\n", $v;
388							}
389							}
390	0						print "$name: utf8_valid=$utf8_valid utf8_invalid=$utf8_invalid latin1_typ=$latin1_typ ascii=$ascii\n";
391							}
392
393							=head2 utf8toascii
394							convert a well formed utf8 file into ascii transcript.
395
396							#! /usr/bin/perl -w -T
397							#
398							# utf8toascii -- convert a well formed utf8 file into ascii transcript.
399							#
400							# 2010-01-09, jw -- initial draft.
401							#
402							# perl -e 'use Encode; use charnames ":full"; map { print charnames::viacode($_). "\n" } unpack "W*", Encode::decode_utf8("J\x{c3}\x{bc}rgen fl\x{ef}\x{ac}\x{82} �\x{c2}\x{a8}\n")'
403							#
404							#use charnames ":full";
405							#
406							#my $ofd = \*STDOUT;
407							#my $ifd = \*STDIN;
408							#
409							#print STDERR utf8toascii($ifd, $ofd);
410							#exit 0;
411							#############
412							=cut
413							sub utf8toascii
414							{
415	0			0	1		my ($ifd, $ofd) = @_;
416	0						my $msg = '';
417
418	0						binmode($ifd, ":utf8");
419
420	0						my %ascii_map =
421							(
422							'RIGHT DOUBLE QUOTATION MARK' => "``",
423							'LEFT DOUBLE QUOTATION MARK' => "''",
424							'RIGHT SINGLE QUOTATION MARK' => "`",
425							'LEFT SINGLE QUOTATION MARK' => "'",
426							'MODIFIER LETTER TURNED COMMA' => "'",
427							'MODIFIER LETTER VERTICAL LINE' => "\|",
428							'RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK' => ">>",
429							'LEFT-POINTING DOUBLE ANGLE QUOTATION MARK' => "<<",
430							'DOUBLE ACUTE ACCENT' => '"',
431							'ACUTE ACCENT' => "'",
432							'DOUBLE LOW-9 QUOTATION MARK' => ',,',
433							'HORIZONTAL ELLIPSIS' => '...',
434							'LATIN SMALL LETTER A WITH DIAERESIS' => "ae",
435							'LATIN SMALL LETTER O WITH DIAERESIS' => "oe",
436							'LATIN SMALL LETTER U WITH DIAERESIS' => "ue",
437							'LATIN CAPITAL LETTER A WITH DIAERESIS' => "Ae",
438							'LATIN CAPITAL LETTER O WITH DIAERESIS' => "Oe",
439							'LATIN CAPITAL LETTER U WITH DIAERESIS' => "Ue",
440							'LATIN SMALL LETTER SHARP S' => "ss",
441							'LATIN SMALL LIGATURE FL' => "fl",
442							'LATIN SMALL LIGATURE FI' => "fi",
443							'LATIN SMALL LIGATURE FF' => "ff",
444							'LATIN SMALL LIGATURE FFI' => "ffi",
445							'LATIN SMALL LIGATURE FFL' => "ffl",
446							'NON-BREAKING HYPHEN' => "-",
447							'SOFT HYPHEN' => "-",
448							'EN DASH' => "-",
449							'EM DASH' => "--",
450							'SECTION SIGN' => "#",
451							'DIVISION SIGN' => "/",
452							'WHITE BULLET' => "^",
453							'DEGREE SIGN' => "^",
454							'DAGGER' => "+",
455							'DOUBLE DAGGER' => "(++)",
456							'FEMININE ORDINAL INDICATOR' => "(x)",
457							'BLACK DIAMOND SUIT' => "(*)",
458							'BULLET' => "*",
459							'ASTERISK OPERATOR' => "*",
460							'MULTIPLICATION SIGN' => "*",
461							'INCREMENT' => "-",
462							'COPYRIGHT SIGN' => "(c)",
463							'REGISTERED SIGN' => "(R)",
464							'TRADE MARK SIGN' => "(TM)",
465							'BLACK FOUR POINTED STAR' => "+",
466							'NARROW NO-BREAK SPACE' => " ",
467							'NO-BREAK SPACE' => " ",
468							'FIGURE SPACE' => " ",
469							'EM SPACE' => " ",
470							'DIE FACE-6' => "[6]",
471							'DIE FACE-5' => "[5]",
472							'DIE FACE-4' => "[4]",
473							'DIE FACE-3' => "[3]",
474							'DIE FACE-2' => "[2]",
475							'DIE FACE-1' => "[1]",
476							'DINGBAT NEGATIVE CIRCLED DIGIT ONE' => "(1)",
477							'DINGBAT NEGATIVE CIRCLED DIGIT TWO' => "(2)",
478							'DINGBAT NEGATIVE CIRCLED DIGIT THREE'=> "(3)",
479							'DINGBAT NEGATIVE CIRCLED DIGIT FOUR' => "(4)",
480							'DINGBAT NEGATIVE CIRCLED DIGIT FIVE' => "(5)",
481							'DINGBAT NEGATIVE CIRCLED DIGIT SIX' => "(6)",
482							'DINGBAT NEGATIVE CIRCLED DIGIT SEVEN'=> "(7)",
483							'DINGBAT NEGATIVE CIRCLED DIGIT EIGHT'=> "(8)",
484							'DINGBAT NEGATIVE CIRCLED DIGIT NINE' => "(9)",
485							'WHITE SQUARE' => "[ ]",
486							'UPWARDS ARROW' => "^",
487							'DRAFTING POINT RIGHTWARDS ARROW' => "(->)",
488							'VULGAR FRACTION ONE HALF' => "(1/2)",
489							'VULGAR FRACTION THREE QUARTERS' => "(3/4)",
490							);
491
492	0						while (defined(my $ch = getc($ifd)))
493							{
494	0						my $v = ord($ch);
495	0	0					if ($v < 128)
496							{
497	0						print $ofd $ch;
498							}
499							else
500							{
501	0						my $name = charnames::viacode($v);
502	0	0					if (defined(my $a = $ascii_map{$name}))
		0
		0
		0
		0
503							{
504	0						print $ofd $a;
505							}
506							elsif ($name =~ m{^LATIN (SMALL )?LETTER (FINAL \|SMALL CAPITAL \|INVERTED )*(\w)})
507							{
508	0						print $ofd lc $3;
509							}
510							elsif ($name =~ m{^LATIN CAPITAL LETTER (FINAL \|SMALL CAPITAL \|INVERTED )*(\w)})
511							{
512	0						print $ofd uc $2;
513							}
514							elsif ($name =~ m{^(ARABIC\|GREEK) (SMALL )?LETTER (FINAL \|SMALL CAPITAL \|INVERTED )*(\w+)$})
515							{
516	0						print $ofd lc "-$4-";
517							}
518							elsif ($name =~ m{^(ARABIC\|GREEK) CAPITAL LETTER (FINAL \|SMALL CAPITAL \|INVERTED )*(\w+)$})
519							{
520	0						print $ofd uc "-$3-";
521							}
522							else
523							{
524	0						printf $ofd "[[%x='$name']]", $v;
525	0						$msg .= sprintf "unknown %x='$name'\n", $v;
526							}
527							}
528							}
529	0						return $msg;
530							}
531
532
533							=head1 AUTHOR
534
535							Juergen Weigert, C<< >>
536
537							=head1 BUGS
538
539							Please report any bugs or feature requests to C, or through
540							the web interface at L. I will be notified, and then you'll
541							automatically be notified of progress on your bug as I make changes.
542
543
544
545
546							=head1 SUPPORT
547
548							You can find documentation for this module with the perldoc command.
549
550							perldoc Text::GuessEncoding
551
552
553							You can also look for information at:
554
555							=over 4
556
557							=item * RT: CPAN's request tracker
558
559							L
560
561							=item * AnnoCPAN: Annotated CPAN documentation
562
563							L
564
565							=item * CPAN Ratings
566
567							L
568
569							=item * Search CPAN
570
571							L
572
573							=back
574
575
576							=head1 ACKNOWLEDGEMENTS
577
578
579							=head1 LICENSE AND COPYRIGHT
580
581							Copyright 2010 Juergen Weigert.
582
583							This program is free software; you can redistribute it and/or modify it
584							under the terms of either: the GNU General Public License as published
585							by the Free Software Foundation; or the Artistic License.
586
587							See http://dev.perl.org/licenses/ for more information.
588
589
590							=cut
591
592							1; # End of Text::GuessEncoding