File Coverage

blib/lib/Text/Unidecode.pm
Criterion Covered Total %
statement 48 68 70.5
branch 15 36 41.6
condition 1 2 50.0
subroutine 11 12 91.6
pod 1 5 20.0
total 76 123 61.7


line stmt bran cond sub pod time code
1             ;;;;# -*-coding:utf-8;-*- µ ← col73
2              
3             require 5;
4 11     11   89424 use 5.8.0;
  11         74  
5             package Text::Unidecode;
6             $Last_Modified =' Time-stamp: "2016-11-26 05:01:56 MST"';
7 11     11   2723 use utf8;
  11         83  
  11         64  
8 11     11   402 use strict;
  11         28  
  11         372  
9 11     11   8654 use integer; # vroom vroom!
  11         130  
  11         63  
10 11         2610 use vars qw($VERSION @ISA @EXPORT @Char $UNKNOWN $NULLMAP $TABLE_SIZE $Last_Modified
11             $Note_Broken_Tables %Broken_Table_Size %Broken_Table_Copy
12 11     11   636 );
  11         47  
13             $VERSION = '1.30';
14             require Exporter;
15             @ISA = ('Exporter');
16             @EXPORT = ('unidecode');
17             $Note_Broken_Tables = 0;
18 11 50   11   2574 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
19             $UNKNOWN = '[?] ';
20             $TABLE_SIZE = 256;
21             $NULLMAP = [( $UNKNOWN ) x $TABLE_SIZE]; # for blocks we can't load
22              
23             #--------------------------------------------------------------------------
24             {
25             my $x = join '', "\x00" .. "\x7F";
26             die "the 7-bit purity test fails!" unless $x eq unidecode($x);
27             }
28              
29             #--------------------------------------------------------------------------
30              
31             sub unidecode {
32             # Destructive in void context -- in other contexts, nondestructive.
33              
34 685 50   685 1 30287 unless(@_) { # Sanity: Nothing coming in!
35 0 0       0 return() if wantarray;
36 0         0 return '';
37             }
38              
39 685 50       1378 if( defined wantarray ) {
40             # We're in list or scalar context (i.e., just not void context.)
41             # So make @_'s items no longer be aliases.
42 685         2756 @_ = map $_, @_;
43             } else {
44             # Otherwise (if we're in void context), then just let @_ stay
45             # aliases, and alter their elements IN-PLACE!
46             }
47              
48 685         1469 foreach my $n (@_) {
49 685 50       1485 next unless defined $n;
50              
51             # Shut up potentially fatal warnings about UTF-16 surrogate
52             # characters when running under perl -w
53             # This is per https://rt.cpan.org/Ticket/Display.html?id=97456
54 11     11   72 no warnings 'utf8';
  11         30  
  11         10298  
55              
56 685 100       4254 $n =~ s~([^\x00-\x7f])~${$Char[ord($1)>>8]||t($1)}[ord($1)&255]~egs;
  702         808  
  702         3437  
57             }
58             # That means:
59             # Replace character 0xABCD with $Char[0xAB][0xCD], loading
60             # the table 0xAB as needed.
61             #
62             #======================================================================
63             #
64             # Yes, that's dense code. It's the warp core!
65             # Here is an expansion into pseudocode... as best as I can manage it...
66             #
67             # $character = $1;
68             # $charnum = ord($character);
69             # $charnum_lowbits = $charnum & 255;
70             # $charnum_highbits = $charnum >> 8;
71             #
72             # $table_ref = $Char->[$charnum_highbits];
73             #
74             # if($table_ref) {
75             # # As expected, we got the arrayref for this table.
76             # } else {
77             # # Uhoh, we couldn't find the arrayref for this table.
78             # # So we call t($character).
79             # # It loads a table. Namely, it does:
80             # Load_Table_For( $charnum_highbits );
81             # # ...which does magic, and puts something in
82             # # $Char->[$charnum_highbits],
83             # # so NOW we actually CAN do:
84             # $table_ref = $Char->[$charnum_highbits];
85             # }
86             #
87             # $for_this_char
88             # = $table_ref->[ $charnum_lowbits ];
89             #
90             # # Although the syntax we actually use is the odd
91             # but COMPLETE EQUIVALENT to this syntax:
92             #
93             # $for_this_char
94             # = ${ $table_ref }[ $charnum_lowbits ];
95             #
96             # and $for_this_char is the replacement text for this
97             # character, in:
98             # $n =~ s~(char)~replacement~egs
99             #
100             # (And why did I use s~x~y~ instead of s/x/y/ ?
101             # It's all the same for Perl: perldoc perlretut says:
102             # As with the match "m//" operator, "s///" can
103             # use other delimiters, such as "s!!!" and "s{}{}",
104             # I didn't do it for sake of obscurity. I think it's just to
105             # keep my editor's syntax highlighter from crashing,
106             # which was a problem with s/// when the insides are as gory
107             # as we have here.
108              
109 685 50       1674 return unless defined wantarray; # void context
110 685 100       2146 return @_ if wantarray; # normal list context -- return the copies
111             # Else normal scalar context:
112 437 50       3415 return $_[0] if @_ == 1;
113 0         0 return join '', @_; # rarer fallthru: a list in, but a scalar out.
114             }
115              
116             #======================================================================
117              
118             sub make_placeholder_map {
119 134     134 0 7892 return [( $UNKNOWN ) x $TABLE_SIZE ];
120             }
121             sub make_placeholder_map_nulls {
122 0     0 0 0 return [( "" ) x $TABLE_SIZE ];
123             }
124              
125             #======================================================================
126              
127             sub t { # "t" is for "t"able.
128             # Load (and return) a char table for this character
129             # this should get called only once per table per session.
130 538     538 0 887 my $bank = ord($_[0]) >> 8;
131 538 50       1253 return $Char[$bank] if $Char[$bank];
132            
133 538         841 load_bank($bank);
134            
135             # Now see how that fared...
136              
137 538 50 50     2510 if(ref($Char[$bank] || '') ne 'ARRAY') {
138 0         0 DEBUG > 1 and print
139             " Loading failed for bank $bank (err $@). Using null map.\n";
140 0         0 return $Char[$bank] = $NULLMAP;
141             }
142              
143              
144 538         528 DEBUG > 1 and print " Loading succeeded.\n";
145 538         782 my $cb = $Char[$bank];
146              
147             # Sanity-check it:
148 538 50       1599 if(@$cb == $TABLE_SIZE) {
149             # As expected. Fallthru.
150              
151             } else {
152 0 0       0 if($Note_Broken_Tables) {
153 0         0 $Broken_Table_Size{$bank} = scalar @$cb;
154 0         0 $Broken_Table_Copy{$bank} = [ @$cb ];
155             }
156              
157 0 0       0 if(@$cb > $TABLE_SIZE) {
    0          
158 0         0 DEBUG and print "Bank $bank is too large-- it has ", scalar @$cb,
159             " entries in it. Pruning.\n";
160 0         0 splice @$cb, $TABLE_SIZE;
161             # That two-argument form splices everything off into nowhere,
162             # starting with the first overage character.
163              
164             } elsif( @$cb < $TABLE_SIZE) {
165 0         0 DEBUG and print "Bank $bank is too small-- it has ", scalar @$cb,
166             " entries in it. Now padding it.\n";
167 0 0       0 if(@$cb == 0) {
168 0         0 DEBUG and print " (Yes, ZERO entries!)\n";
169             }
170 0         0 push @$cb,
171             ( $UNKNOWN ) x ( $TABLE_SIZE - @$cb)
172             # i.e., however many items, times the deficit
173             ;
174             # And fallthru...
175              
176             } else {
177 0         0 die "UNREACHABLE CODE HERE (INSANE)";
178             }
179             }
180              
181             # Check for undefness in block:
182              
183 538         1404 for(my $i = 0; $i < $TABLE_SIZE; ++$i) {
184 137728 50       314842 unless(defined $cb->[$i]) {
185 0         0 DEBUG and printf "Undef at position %d in block x%02x\n",
186             $i, $bank;
187 0         0 $cb->[$i] = '';
188             }
189             }
190              
191 538         5765 return $Char[$bank];
192             }
193              
194             #-----------------------------------------------------------------------
195              
196             our $eval_loaded_okay;
197              
198             sub load_bank {
199              
200             # This is in its own sub, for sake of sweeping the scary thing
201             # (namely, a call to eval) under the rug.
202             # I.e., to paraphrase what Larry Wall once said to me: if
203             # you're going to do something odd, maybe you should do it
204             # in private.
205              
206 538     538 0 836 my($banknum) = @_; # just as an integer value
207              
208 538         432 DEBUG and printf
209             "# Eval-loading %s::x%02x ...\n";
210              
211 538         574 $eval_loaded_okay = 0;
212 538         1961 my $code =
213             sprintf( "require %s::x%02x; \$eval_loaded_okay = 1;\n",
214             __PACKAGE__,
215             $banknum);
216              
217             {
218 538         549 local $SIG{'__DIE__'};
  538         2082  
219 538         47875 eval($code);
220             }
221              
222 538 50       2435 return 1 if $eval_loaded_okay;
223 0           return 0;
224             }
225              
226             #======================================================================
227              
228             1;
229             __END__