File Coverage

blib/lib/Text/Unidecode.pm
Criterion Covered Total %
statement 52 65 80.0
branch 19 34 55.8
condition 1 2 50.0
subroutine 11 12 91.6
pod 1 5 20.0
total 84 118 71.1


line stmt bran cond sub pod time code
1             ;;;;# -*-coding:utf-8;-*- µ ← col73
2              
3             require 5;
4 11     11   74800 use 5.8.0;
  11         39  
5             package Text::Unidecode;
6             $Last_Modified =' Time-stamp: "2015-10-21 06:18:37 MDT sburke@cpan.org"';
7 11     11   2731 use utf8;
  11         45  
  11         69  
8 11     11   281 use strict;
  11         31  
  11         256  
9 11     11   8764 use integer; # vroom vroom!
  11         116  
  11         57  
10 11     11   363 use vars qw($VERSION @ISA @EXPORT @Char $UNKNOWN $NULLMAP $TABLE_SIZE $Last_Modified);
  11         19  
  11         1732  
11             $VERSION = '1.26';
12             require Exporter;
13             @ISA = ('Exporter');
14             @EXPORT = ('unidecode');
15              
16 11 50   11   1943 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
17             $UNKNOWN = '[?] ';
18             $TABLE_SIZE = 256;
19             $NULLMAP = [( $UNKNOWN ) x $TABLE_SIZE]; # for blocks we can't load
20              
21             #--------------------------------------------------------------------------
22             {
23             my $x = join '', "\x00" .. "\x7F";
24             die "the 7-bit purity test fails!" unless $x eq unidecode($x);
25             }
26              
27             #--------------------------------------------------------------------------
28              
29             sub unidecode {
30             # Destructive in void context -- in other contexts, nondestructive.
31              
32 684 50   684 1 34215 unless(@_) { # Sanity: Nothing coming in!
33 0 0       0 return() if wantarray;
34 0         0 return '';
35             }
36              
37 684 50       1411 if( defined wantarray ) {
38             # We're in list or scalar context (i.e., just not void context.)
39             # So make @_'s items no longer be aliases.
40 684         2803 @_ = map $_, @_;
41             } else {
42             # Otherwise (if we're in void context), then just let @_ stay
43             # aliases, and alter their elements IN-PLACE!
44             }
45              
46 684         1545 foreach my $n (@_) {
47 684 50       1595 next unless defined $n;
48              
49             # Shut up potentially fatal warnings about UTF-16 surrogate
50             # characters when running under perl -w
51             # This is per https://rt.cpan.org/Ticket/Display.html?id=97456
52 11     11   60 no warnings 'utf8';
  11         20  
  11         8367  
53              
54 684 100       3534 $n =~ s~([^\x00-\x7f])~${$Char[ord($1)>>8]||t($1)}[ord($1)&255]~egs;
  702         878  
  702         3180  
55             }
56             # That means:
57             # Replace character 0xABCD with $Char[0xAB][0xCD], loading
58             # the table 0xAB as needed.
59             #
60             #======================================================================
61             #
62             # Yes, that's dense code. It's the warp core!
63             # Here is an expansion into pseudocode... as best as I can manage it...
64             #
65             # $character = $1;
66             # $charnum = ord($character);
67             # $charnum_lowbits = $charnum & 255;
68             # $charnum_highbits = $charnum >> 8;
69             #
70             # $table_ref = $Char->[$charnum_highbits];
71             #
72             # if($table_ref) {
73             # # As expected, we got the arrayref for this table.
74             # } else {
75             # # Uhoh, we couldn't find the arrayref for this table.
76             # # So we call t($character).
77             # # It loads a table. Namely, it does:
78             # Load_Table_For( $charnum_highbits );
79             # # ...which does magic, and puts something in
80             # # $Char->[$charnum_highbits],
81             # # so NOW we actually CAN do:
82             # $table_ref = $Char->[$charnum_highbits];
83             # }
84             #
85             # $for_this_char
86             # = $table_ref->[ $charnum_lowbits ];
87             #
88             # # Although the syntax we actually use is the odd
89             # but COMPLETE EQUIVALENT to this syntax:
90             #
91             # $for_this_char
92             # = ${ $table_ref }[ $charnum_lowbits ];
93             #
94             # and $for_this_char is the replacement text for this
95             # character, in:
96             # $n =~ s~(char)~replacement~egs
97             #
98             # (And why did I use s~x~y~ instead of s/x/y/ ?
99             # It's all the same for Perl: perldoc perlretut says:
100             # As with the match "m//" operator, "s///" can
101             # use other delimiters, such as "s!!!" and "s{}{}",
102             # I didn't do it for sake of obscurity. I think it's just to
103             # keep my editor's syntax highlighter from crashing,
104             # which was a problem with s/// when the insides are as gory
105             # as we have here.
106              
107 684 50       1642 return unless defined wantarray; # void context
108 684 100       2177 return @_ if wantarray; # normal list context -- return the copies
109             # Else normal scalar context:
110 436 50       2733 return $_[0] if @_ == 1;
111 0         0 return join '', @_; # rarer fallthru: a list in, but a scalar out.
112             }
113              
114             #======================================================================
115              
116             sub make_placeholder_map {
117 134     134 0 6310 return [( $UNKNOWN ) x $TABLE_SIZE ];
118             }
119             sub make_placeholder_map_nulls {
120 0     0 0 0 return [( "" ) x $TABLE_SIZE ];
121             }
122              
123             #======================================================================
124              
125             sub t { # "t" is for "t"able.
126             # Load (and return) a char table for this character
127             # this should get called only once per table per session.
128 538     538 0 1123 my $bank = ord($_[0]) >> 8;
129 538 50       1191 return $Char[$bank] if $Char[$bank];
130            
131 538         987 load_bank($bank);
132            
133             # Now see how that fared...
134              
135 538 50 50     2364 if(ref($Char[$bank] || '') ne 'ARRAY') {
136 0         0 DEBUG > 1 and print
137             " Loading failed for bank $bank (err $@). Using null map.\n";
138 0         0 return $Char[$bank] = $NULLMAP;
139             }
140              
141              
142 538         677 DEBUG > 1 and print " Loading succeeded.\n";
143 538         927 my $cb = $Char[$bank];
144              
145             # Sanity-check it:
146 538 100       1439 if(@$cb == $TABLE_SIZE) {
147             # As expected. Fallthru.
148              
149             } else {
150 100 50       309 if(@$cb > $TABLE_SIZE) {
    50          
151 0         0 DEBUG and print "Bank $bank is too large-- it has ", scalar @$cb,
152             " entries in it. Pruning.\n";
153 0         0 splice @$cb, $TABLE_SIZE;
154             # That two-argument form splices everything off into nowhere,
155             # starting with the first overage character.
156              
157             } elsif( @$cb < $TABLE_SIZE) {
158 100         108 DEBUG and print "Bank $bank is too small-- it has ", scalar @$cb,
159             " entries in it. Now padding it.\n";
160 100 50       235 if(@$cb == 0) {
161 0         0 DEBUG and print " (Yes, ZERO entries!)\n";
162             }
163 100         566 push @$cb,
164             ( $UNKNOWN ) x ( $TABLE_SIZE - @$cb)
165             # i.e., however many items, times the deficit
166             ;
167             # And fallthru...
168              
169             } else {
170 0         0 die "UNREACHABLE CODE HERE (INSANE)";
171             }
172             }
173              
174             # Check for undefness in block:
175              
176 538         1516 for(my $i = 0; $i < $TABLE_SIZE; ++$i) {
177 137728 50       405509 unless(defined $cb->[$i]) {
178 0         0 DEBUG and printf "Undef at position %d in block x%02x\n",
179             $i, $bank;
180 0         0 $cb->[$i] = '';
181             }
182             }
183              
184 538         4739 return $Char[$bank];
185             }
186              
187             #-----------------------------------------------------------------------
188              
189             our $eval_loaded_okay;
190              
191             sub load_bank {
192              
193             # This is in its own sub, for sake of sweeping the scary thing
194             # (namely, a call to eval) under the rug.
195             # I.e., to paraphrase what Larry Wall once said to me: if
196             # you're going to do something odd, maybe you should do it
197             # in private.
198              
199 538     538 0 900 my($banknum) = @_; # just as an integer value
200              
201 538         613 DEBUG and printf
202             "# Eval-loading %s::x%02x ...\n";
203              
204 538         735 $eval_loaded_okay = 0;
205 538         1921 my $code =
206             sprintf( "require %s::x%02x; \$eval_loaded_okay = 1;\n",
207             __PACKAGE__,
208             $banknum);
209              
210             {
211 538         695 local $SIG{'__DIE__'};
  538         2056  
212 538         41708 eval($code);
213             }
214              
215 538 50       2821 return 1 if $eval_loaded_okay;
216 0           return 0;
217             }
218              
219             #======================================================================
220              
221             1;
222             __END__