File Coverage

blib/lib/Text/GuessEncoding.pm
Criterion Covered Total %
statement 12 140 8.5
branch 0 82 0.0
condition 0 15 0.0
subroutine 4 12 33.3
pod 8 8 100.0
total 24 257 9.3


line stmt bran cond sub pod time code
1             package Text::GuessEncoding;
2              
3 1     1   33351 use warnings;
  1         2  
  1         31  
4 1     1   6 use strict;
  1         1  
  1         31  
5 1     1   1742 use POSIX;
  1         10560  
  1         8  
6              
7             =head1 NAME
8              
9             Text::GuessEncoding - Convert Text from almost any encoding to ASCII or UTF8
10              
11             =head1 VERSION
12              
13             Version 0.07
14              
15             =cut
16              
17             our $VERSION = '0.07';
18              
19              
20             =head1 SYNOPSIS
21              
22             CAUTION: unfinished code. No objects created.
23              
24             Text::GuessEncoding gathers statistic about typical and invalid codes in
25             both Latin1 and UTF-8. The concept of 'typical' is currently from an
26             european point of view.
27             Based on this statistics, methods to transform to Latin1 and UTF-8 are provided. These methods handle 'broken' input strings with mixed encodings well.
28              
29             The input string may or may not have its utf8 flag set correctly; the flag
30             is ignored. The returned string has the utf8 flag always off, and contains
31             no characters above codepoint 127 (which means it is inside the ASCII
32             character set). If called in a list context, C returns the
33             mapping table as a second value. This mapping table is a hash, using all
34             recognized encodings as keys. (Any well-formed string should only have one
35             encoding, but one can never be sure.) Value per encoding is an array ref,
36             listing all the codepoints in the following form:
37             C<[ [ $codepoint, $replacement_bytecount, [ $offset, ... ] ], ... ]>
38             Offset positions refer to the output string, where byte counts are identical
39             with character counts.
40              
41             Example:
42            
43             my $guess = new Text::GuessEncoding();
44             ($ascii, $map) = $guess->to_ascii("J\x{fc}rgen \x{c3}\x{bc}\n");
45             # $ascii = 'Juergen ue';
46             # $map = { 'utf8' => [252, 2, [8]], 'latin1' => [252, 2, [1]] };
47              
48             The input string contains both utf8 encoded u-umlaut glyph and a plain latin1 byte u-umlaut.
49             The output string is never flagged as utf8.
50              
51             ($utf8, $map) = $guess->to_utf8("J\x{fc}rgen \x{c3}\x{bc}\n");
52             # $utf8 = 'J\N{U+fc}rgen \N{U+fc}';
53             # $map = { 'utf8' => [7], 'latin1' => [1] };
54            
55             C returns a simpler mapping table, as the string preserves more inforation.
56             Note that the offsets differ from to_ascii(), as no multi-character rewriting takes place.
57             The output string is always flagged as utf8.
58              
59             use Text::GuessEncoding;
60              
61             my $asciitext = Text::GuessEncoding::to_ascii($enctext);
62             my ($asciitext,$mapping) = Text::GuessEncoding::to_ascii($enctext);
63              
64             =head1 EXPORT
65              
66             C - create plain text in 7-bit ASCII encoding.
67             C - return UTF-8 encoded text .
68              
69             =head1 SUBROUTINES/METHODS
70              
71             =head2 to_ascii
72              
73             C is implemented in perl code as a post-processor of C.
74             It examines C and constructs some useful ascii replacements from these.
75             A number of frequently used codepoint values can be precompiled for speed.
76              
77             =cut
78              
79             sub to_ascii
80             {
81 0     0 1   my ($text) = @_;
82              
83             # run through the text, searching a byte with the high order bit set.
84             # this then, is a non-ascii byte, and needs conversion.
85              
86             # We distinguish two cases here:
87             # $text might know that it is utf8, or $text might believe it is not.
88              
89 0 0         if (utf8::is_utf8($text))
90             {
91 0           warn "to_ascii() running on a utf8 string";
92             }
93             else
94             {
95 1     1   6143 use Data::Dumper;
  1         15014  
  1         2634  
96             ## exmine the first 24 bytes to guess if this is a 16bit encoding
97 0           warn "to_ascii() running on a non-utf8 string";
98 0           my @bytes = unpack("C24", $text);
99 0           print Dumper \@bytes;
100             }
101              
102 0           while ($text =~ m{[[:^ascii:]]}g)
103             {
104 0           printf "non-ascii char at pos %d\n", pos($text);
105             }
106             }
107              
108             =head2 sysread_tout(FILE, $len, $tout)
109              
110             attempts to read $len bytes from FILE, with a select() timeout of $tout.
111              
112             =cut
113              
114             sub sysread_tout
115             {
116 0     0 1   my ($FILE, $len, $tout) = @_;
117 0           my $r = '';
118 0           while ($len > 0)
119             {
120 0           my $rout;
121 0           my $rin = '';
122 0           vec($rin,fileno($FILE), 1) = 1;
123 0           my ($n, $t) = select($rout = $rin, undef, undef, $tout);
124 0 0         $tout = $t if defined $t;
125 0 0         last unless $n;
126 0           my $buf = '';
127 0 0         last if sysread($FILE, $buf, 1) <= 0;
128 0           $r .= $buf;
129 0           $len--;
130             }
131 0           return $r;
132             }
133              
134             =head2 $orig_mode = tty_raw(FILE)
135              
136             Uses POSIX::Termios to set the terminal FILE to raw mode. Returns the previous mode so
137             that it can be restored with tty_set().
138             =cut
139              
140             sub tty_raw
141             {
142 0     0 1   my ($FILE) = @_;
143              
144 0           my $t = POSIX::Termios->new;
145 0           my $o = POSIX::Termios->new;
146 0           $t->getattr(fileno $FILE);
147 0           $o->getattr(fileno $FILE);
148              
149 0           $t->setlflag(0); # -echo, -icanon
150 0           $t->setcc(POSIX::VMIN, 1);
151 0           $t->setcc(POSIX::VTIME, 0);
152 0           tty_set($FILE, $t);
153 0           return $o;
154             }
155              
156              
157             =head2 tty_set(FILE, $tty_mode)
158              
159             Sets the FILE, which is assumed to be a terminal, to mode $tty_mode .
160             =cut
161              
162             sub tty_set
163             {
164 0     0 1   my ($FILE, $t) = @_;
165 0 0         $t->setattr(fileno $FILE, POSIX::TCSANOW) or die "TCSANOW failed: $!\n";
166             }
167              
168              
169             =head2 get_cursor_pos
170              
171             Sets tty to raw mode by calling tty_raw(), flushes STDIN, and sends ANSI code 'ESC [ 6 n'
172             (DC6 aka Report cursor Position) and attempts to read the returned position with
173             a 100msec timeout.
174             The terminal is returned to the previous mode, and a hashref containing the keys x, y
175             is returned.
176              
177             =cut
178             sub get_cursor_pos
179             {
180 0     0 1   my ($hint) = @_;
181             # 1 may be an ansi term?
182             # testing device status report 6, as seen in vttest.
183 0           my $t = tty_raw(\*STDIN);
184              
185 0           while (length(sysread_tout(\*STDIN, 1, 0.1))) { }
186              
187 0           syswrite(\*STDOUT, "\33[6n", 4);
188 0           my $r = sysread_tout(\*STDIN, 10, 0.1);
189 0           tty_set(\*STDIN, $t);
190 0 0         return { x => $2 - 1, y => $1 - 1, hint => 'DC6' } if $r =~ m{^\33\[(\d+);(\d+)R};
191 0           return undef;
192             }
193              
194              
195             =head2 probe_tty
196              
197             Prints a cariage return (no linefeed), to move the cursor to a defined column
198             position.
199             Prints a few test characters to STDOUT and calls get_cursor_pos() to check how
200             the terminal reacts upon each, e.g. by (not) advancing the cursor position
201             one or multiple positions.
202             Then restores the cursor to the carriage return position.
203             =cut
204              
205             sub probe_tty
206             {
207             #
208             # we can use STDIN and STDERR.
209             # 0) first, see, if the terminal can report cursor positions.
210 0     0 1   syswrite(STDOUT, "\r", 1);
211 0           my $o = get_cursor_pos();
212 0 0         print ", x=$o->{x}\n" if $::verbose > 1;
213              
214             # - if not, abort.
215 0 0         die "get_cursor_pos failed.\n" unless defined $o;
216              
217             # - if it can, store the current position
218 0 0         if ($o->{x} != 0)
219             {
220 0 0         warn "strace (or other) output interferes or\n" if $o->{x} >= 20;
221 0           die "carriage return does not work.\n";
222             }
223              
224             # 1) write a single byte ascii character, 'X' and check,
225             # if it advances by one.
226 0           syswrite(STDOUT, "\rX", 2);
227 0           my $p = get_cursor_pos($o->{hint});
228 0 0         print ", x=$p->{x}\n" if $::verbose;
229            
230              
231             # - If not, it is probably in microsoft-multibyte encoding,
232             # and requires '\0' prefixing. check this, report and abort.
233 0 0         die "multi-byte mode" if $p->{x} != 1;
234              
235             # 2)Then try non-ascii characters, e.g. a-umlaut.
236             # 2a) send its latin1 code, and see what happens,
237 0           syswrite(STDOUT, "\r1\34434", 5); # 1, a-umlaut-latin1, 3, 4
238 0           $p = get_cursor_pos($o->{hint});
239 0 0         print ", x=$p->{x}\n" if $::verbose;
240 0 0         die "no report" unless defined $p;
241              
242             # - no advance indicates that the terminal is not in latin1 mode
243             # or a lousy font is used.
244             # - advance by 2 indicates a defect in the tty-emulator.
245 0 0 0       die "latin1 a-umlaut caused confusion." if $p->{x} > 4 or $p->{x} < 2;
246              
247             # in utf8, our \344 consumes another char, thus the '3' is not printed.
248             # we don't know what the font does then.
249 0 0 0       my $maybe = 'utf8' if $p->{x} == 2 or $p->{x} == 3;
250 0 0         $maybe = 'latin1' if $p->{x} == 4;
251 0 0         print "maybe $maybe\n" if $::verbose;
252             # - advance by 1 says nothing, may be latin1.
253             # 2b) send its utf8 code.
254              
255 0           syswrite(STDOUT, "\r1\303\24434", 6); # 1, a-umlaut-utf8, 3, 4
256 0           $p = get_cursor_pos($o->{hint});
257 0 0         print ", x=$p->{x}\n" if $::verbose;
258              
259 0 0         die "no report" unless defined $p;
260             # - no advance indicates that a lousy font is used.
261             # - advance by one indicates that the terminal is in utf8 mode.
262             # - advance by two indicates that the terminal is in latin1 mode.
263              
264 0 0         syswrite(STDOUT, "\r \r", 8) unless $::verbose; # clear scratch area
265            
266 0 0         if ($p->{x} == 4)
267             {
268 0 0         return 'utf8' if $maybe eq 'utf8';
269 0           return 'possibly utf8';
270             }
271            
272 0 0         return 'latin1' if $maybe eq 'latin1';
273 0           return 'possibly latin1';
274             }
275              
276              
277             =head2 probe_file
278              
279             C contains all the material, from which we should build C.
280             =cut
281             ##
282             ## if utf8_valid is positive, then it can only be utf-8.
283             ## (if also utf8_invalid and/or latin1_typ are positive, then it is a mixture)
284             ## if only utf8_invalid or latin1_typ are positive, then it is latin1.
285             ## if all 3 are zero, it is plain ascii.
286             ##
287             ## FIXME: should take an optional length parameter to limit runtime.
288             ##
289             sub probe_file
290             {
291 0     0 1   my ($fd, $name) = @_;
292             # print "probing $name\n" if $::verbose;
293              
294 0           my %typ_latin = map { $_ => 1 } qw(169 171 174 176 177 178 179 181
  0            
295             185 187 191 192 193 194 195 196 197 199 200 201 202 203 204 205 206 207 208 209
296             210 211 212 213 214 215 216 217 218 219 220
297             223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
298             246 249 250 251 252 253 189 164);
299              
300              
301             # when running incremental, $fd is probably not seekable.
302             # so we need to buffer characters to be re-read after a lookahead.
303              
304             # http://de.wikipedia.org/wiki/UTF-8#Kodierung
305              
306 0           my $utf8_valid = 0; # parser happy.
307 0           my $utf8_invalid = 0; # something wrong.
308 0           my $latin1_typ = 0; # valid chars in 128..255 range followed by a ascii byte
309 0           my $ascii = 0; # char in 10..127 range
310 0           my $utf8_size = 0; # how many bytes belong to this utf-8 char.
311 0           my $utf8_len = 0; # how many more bytes belong to this utf-8 char.
312 0           my $utf8_start = 0; # ord of utf_8 start char.
313              
314 0           while (defined(my $c = getc($fd)))
315             {
316 0           my $v = ord($c);
317 0 0 0       if ($utf8_len)
    0          
    0          
    0          
    0          
    0          
318             {
319 0 0         if (($v & 0xc0) == 0x80) # 10xx xxxx
320             {
321             # printf "0 %02x\n", $v;
322 0 0         unless (--$utf8_len)
323             {
324 0           $utf8_valid++;
325 0           $utf8_size = 0;
326             }
327             }
328             else
329             {
330             # printf "0x %02x %02x '$c' $utf8_size-$utf8_len\n", $utf8_start, $v;
331 0 0 0       if (($utf8_size - $utf8_len) == 1 and $typ_latin{$utf8_start})
332             {
333 0 0 0       if ($v > 7 && $v < 128)
    0          
334             {
335 0           $latin1_typ++;
336 0           $ascii++;
337             }
338             elsif ($typ_latin{$v})
339             {
340 0           $latin1_typ += 2;
341             }
342             else
343             {
344 0           $utf8_invalid++;
345             }
346             }
347             else
348             {
349 0           $utf8_invalid++;
350             }
351 0           $utf8_len = $utf8_size = $utf8_start = 0;
352             }
353             }
354             elsif ($v > 7 && $v < 128)
355             {
356 0           $ascii++;
357 0           next;
358             }
359             elsif (($v & 0xe0) == 0xc0) # 110x xxxx
360             {
361 0           $utf8_start = $v;
362 0           $utf8_size = 2;
363 0           $utf8_len = 1;
364             # printf "1 %02x\n", $v;
365             }
366             elsif (($v & 0xf0) == 0xe0) # 1110 xxxx
367             {
368 0           $utf8_start = $v;
369 0           $utf8_size = 3;
370 0           $utf8_len = 2;
371             # printf "2 %02x\n", $v;
372             }
373             elsif (($v & 0xf8) == 0xf0) # 1111 0xxx
374             {
375 0           $utf8_start = $v;
376 0           $utf8_size = 4;
377 0           $utf8_len = 3;
378             # printf "3 %02x\n", $v;
379             }
380             elsif ($typ_latin{$v})
381             {
382 0           $latin1_typ++;
383             }
384             else
385             {
386 0           $utf8_invalid++;
387             # printf "x %02x\n", $v;
388             }
389             }
390 0           print "$name: utf8_valid=$utf8_valid utf8_invalid=$utf8_invalid latin1_typ=$latin1_typ ascii=$ascii\n";
391             }
392              
393             =head2 utf8toascii
394             convert a well formed utf8 file into ascii transcript.
395              
396             #! /usr/bin/perl -w -T
397             #
398             # utf8toascii -- convert a well formed utf8 file into ascii transcript.
399             #
400             # 2010-01-09, jw -- initial draft.
401             #
402             # perl -e 'use Encode; use charnames ":full"; map { print charnames::viacode($_). "\n" } unpack "W*", Encode::decode_utf8("J\x{c3}\x{bc}rgen fl\x{ef}\x{ac}\x{82} ü\x{c2}\x{a8}\n")'
403             #
404             #use charnames ":full";
405             #
406             #my $ofd = \*STDOUT;
407             #my $ifd = \*STDIN;
408             #
409             #print STDERR utf8toascii($ifd, $ofd);
410             #exit 0;
411             #############
412             =cut
413             sub utf8toascii
414             {
415 0     0 1   my ($ifd, $ofd) = @_;
416 0           my $msg = '';
417              
418 0           binmode($ifd, ":utf8");
419              
420 0           my %ascii_map =
421             (
422             'RIGHT DOUBLE QUOTATION MARK' => "``",
423             'LEFT DOUBLE QUOTATION MARK' => "''",
424             'RIGHT SINGLE QUOTATION MARK' => "`",
425             'LEFT SINGLE QUOTATION MARK' => "'",
426             'MODIFIER LETTER TURNED COMMA' => "'",
427             'MODIFIER LETTER VERTICAL LINE' => "|",
428             'RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK' => ">>",
429             'LEFT-POINTING DOUBLE ANGLE QUOTATION MARK' => "<<",
430             'DOUBLE ACUTE ACCENT' => '"',
431             'ACUTE ACCENT' => "'",
432             'DOUBLE LOW-9 QUOTATION MARK' => ',,',
433             'HORIZONTAL ELLIPSIS' => '...',
434             'LATIN SMALL LETTER A WITH DIAERESIS' => "ae",
435             'LATIN SMALL LETTER O WITH DIAERESIS' => "oe",
436             'LATIN SMALL LETTER U WITH DIAERESIS' => "ue",
437             'LATIN CAPITAL LETTER A WITH DIAERESIS' => "Ae",
438             'LATIN CAPITAL LETTER O WITH DIAERESIS' => "Oe",
439             'LATIN CAPITAL LETTER U WITH DIAERESIS' => "Ue",
440             'LATIN SMALL LETTER SHARP S' => "ss",
441             'LATIN SMALL LIGATURE FL' => "fl",
442             'LATIN SMALL LIGATURE FI' => "fi",
443             'LATIN SMALL LIGATURE FF' => "ff",
444             'LATIN SMALL LIGATURE FFI' => "ffi",
445             'LATIN SMALL LIGATURE FFL' => "ffl",
446             'NON-BREAKING HYPHEN' => "-",
447             'SOFT HYPHEN' => "-",
448             'EN DASH' => "-",
449             'EM DASH' => "--",
450             'SECTION SIGN' => "#",
451             'DIVISION SIGN' => "/",
452             'WHITE BULLET' => "^",
453             'DEGREE SIGN' => "^",
454             'DAGGER' => "+",
455             'DOUBLE DAGGER' => "(++)",
456             'FEMININE ORDINAL INDICATOR' => "(x)",
457             'BLACK DIAMOND SUIT' => "(*)",
458             'BULLET' => "*",
459             'ASTERISK OPERATOR' => "*",
460             'MULTIPLICATION SIGN' => "*",
461             'INCREMENT' => "-",
462             'COPYRIGHT SIGN' => "(c)",
463             'REGISTERED SIGN' => "(R)",
464             'TRADE MARK SIGN' => "(TM)",
465             'BLACK FOUR POINTED STAR' => "+",
466             'NARROW NO-BREAK SPACE' => " ",
467             'NO-BREAK SPACE' => " ",
468             'FIGURE SPACE' => " ",
469             'EM SPACE' => " ",
470             'DIE FACE-6' => "[6]",
471             'DIE FACE-5' => "[5]",
472             'DIE FACE-4' => "[4]",
473             'DIE FACE-3' => "[3]",
474             'DIE FACE-2' => "[2]",
475             'DIE FACE-1' => "[1]",
476             'DINGBAT NEGATIVE CIRCLED DIGIT ONE' => "(1)",
477             'DINGBAT NEGATIVE CIRCLED DIGIT TWO' => "(2)",
478             'DINGBAT NEGATIVE CIRCLED DIGIT THREE'=> "(3)",
479             'DINGBAT NEGATIVE CIRCLED DIGIT FOUR' => "(4)",
480             'DINGBAT NEGATIVE CIRCLED DIGIT FIVE' => "(5)",
481             'DINGBAT NEGATIVE CIRCLED DIGIT SIX' => "(6)",
482             'DINGBAT NEGATIVE CIRCLED DIGIT SEVEN'=> "(7)",
483             'DINGBAT NEGATIVE CIRCLED DIGIT EIGHT'=> "(8)",
484             'DINGBAT NEGATIVE CIRCLED DIGIT NINE' => "(9)",
485             'WHITE SQUARE' => "[ ]",
486             'UPWARDS ARROW' => "^",
487             'DRAFTING POINT RIGHTWARDS ARROW' => "(->)",
488             'VULGAR FRACTION ONE HALF' => "(1/2)",
489             'VULGAR FRACTION THREE QUARTERS' => "(3/4)",
490             );
491              
492 0           while (defined(my $ch = getc($ifd)))
493             {
494 0           my $v = ord($ch);
495 0 0         if ($v < 128)
496             {
497 0           print $ofd $ch;
498             }
499             else
500             {
501 0           my $name = charnames::viacode($v);
502 0 0         if (defined(my $a = $ascii_map{$name}))
    0          
    0          
    0          
    0          
503             {
504 0           print $ofd $a;
505             }
506             elsif ($name =~ m{^LATIN (SMALL )?LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w)})
507             {
508 0           print $ofd lc $3;
509             }
510             elsif ($name =~ m{^LATIN CAPITAL LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w)})
511             {
512 0           print $ofd uc $2;
513             }
514             elsif ($name =~ m{^(ARABIC|GREEK) (SMALL )?LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w+)$})
515             {
516 0           print $ofd lc "-$4-";
517             }
518             elsif ($name =~ m{^(ARABIC|GREEK) CAPITAL LETTER (FINAL |SMALL CAPITAL |INVERTED )*(\w+)$})
519             {
520 0           print $ofd uc "-$3-";
521             }
522             else
523             {
524 0           printf $ofd "[[%x='$name']]", $v;
525 0           $msg .= sprintf "unknown %x='$name'\n", $v;
526             }
527             }
528             }
529 0           return $msg;
530             }
531              
532              
533             =head1 AUTHOR
534              
535             Juergen Weigert, C<< >>
536              
537             =head1 BUGS
538              
539             Please report any bugs or feature requests to C, or through
540             the web interface at L. I will be notified, and then you'll
541             automatically be notified of progress on your bug as I make changes.
542              
543              
544              
545              
546             =head1 SUPPORT
547              
548             You can find documentation for this module with the perldoc command.
549              
550             perldoc Text::GuessEncoding
551              
552              
553             You can also look for information at:
554              
555             =over 4
556              
557             =item * RT: CPAN's request tracker
558              
559             L
560              
561             =item * AnnoCPAN: Annotated CPAN documentation
562              
563             L
564              
565             =item * CPAN Ratings
566              
567             L
568              
569             =item * Search CPAN
570              
571             L
572              
573             =back
574              
575              
576             =head1 ACKNOWLEDGEMENTS
577              
578              
579             =head1 LICENSE AND COPYRIGHT
580              
581             Copyright 2010 Juergen Weigert.
582              
583             This program is free software; you can redistribute it and/or modify it
584             under the terms of either: the GNU General Public License as published
585             by the Free Software Foundation; or the Artistic License.
586              
587             See http://dev.perl.org/licenses/ for more information.
588              
589              
590             =cut
591              
592             1; # End of Text::GuessEncoding