File Coverage

blib/lib/Search/Tools/Transliterate.pm
Criterion Covered Total %
statement 67 71 94.3
branch 25 38 65.7
condition n/a
subroutine 11 11 100.0
pod 3 3 100.0
total 106 123 86.1


line stmt bran cond sub pod time code
1             package Search::Tools::Transliterate;
2 2     2   154692 use Moo;
  2         25453  
  2         10  
3             extends 'Search::Tools::Object';
4 2     2   4163 use Search::Tools::UTF8;
  2         10  
  2         298  
5 2     2   24 use Carp;
  2         32  
  2         141  
6 2     2   30 use Encode;
  2         6  
  2         204  
7 2     2   1480 use Encoding::FixLatin qw( fix_latin );
  2         4346  
  2         142  
8 2     2   26 use Data::Dump qw( dump );
  2         5  
  2         2293  
9              
10             has 'ebit' => ( is => 'rw', default => sub {1} );
11             has 'map' => ( is => 'ro' );
12              
13             our $VERSION = '1.007';
14              
15             =pod
16              
17             =head1 NAME
18              
19             Search::Tools::Transliterate - transliterations of UTF-8 chars
20              
21             =head1 SYNOPSIS
22              
23             my $tr = Search::Tools::Transliterate->new();
24             print $tr->convert( 'some string of utf8 chars' );
25            
26             =head1 DESCRIPTION
27              
28             Search::Tools::Transliterate transliterates UTF-8 characters
29             to single-byte equivalents. It is based on the transmap project
30             by Markus Kuhn http://www.cl.cam.ac.uk/~mgk25/.
31              
32             B All the I encoding check methods that existed in this class prior
33             to version 0.05 were moved to Search::Tools::UTF8 and refactored as functions,
34             many using XS for speed improvements.
35              
36              
37             =head1 METHODS
38              
39             =head2 new
40              
41             Create new instance. Takes the following optional parameters:
42              
43             =over
44              
45             =item map
46              
47             Customize the character mapping. Should be a hashref. See map() method.
48              
49             =item ebit
50              
51             Allow convert() to use full native 8bit characters for transliterating,
52             rather than only 7bit ASCII. The default is true (1). Set to 0 to disable.
53             B This must be set in new(). Changing via the accessor
54             after new() will have no effect on map().
55              
56             =back
57              
58             =head2 BUILD
59              
60             Called internally by new().
61              
62             =head2 map
63              
64             Access the transliteration character map. Example:
65              
66             use Search::Tools::Transliterate;
67             my $tr = Search::Tools::Transliterate->new;
68             $tr->map->{mychar} = 'my transliteration';
69             print $tr->convert('mychar'); # prints 'my transliteration'
70              
71             NOTE: The map() method is an accessor only. You can not pass in a new map.
72              
73              
74             =head2 convert( I )
75              
76             Returns UTF-8 I converted with all single bytes, transliterated according
77             to %Map. Will croak if I is not valid UTF-8, so if in doubt, check first with
78             is_valid_utf8() in Search::Tools::UTF8.
79              
80             =head2 convert1252( I )
81              
82             Returns UTF-8 I converted to all single byte characters,
83             transliterated with convert() and the Windows 1252 characters in the range
84             B<0x80> and B<0x9f> inclusive.
85              
86             The 1252 codepoints are converted first to
87             their UTF-8 counterparts per
88             http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
89             using Encoding::FixLatin::fix_latin() and then
90             I is run through convert().
91              
92             Note that I is checked with the looks_like_cp1252() function from
93             Search::Tools::UTF8 before calling fix_latin().
94              
95             =head1 BUGS
96              
97             You might consider the whole attempt as a bug. It's really an attempt to
98             accomodate applications that don't support Unicode. Perhaps we shouldn't even
99             try. But for things like curly quotes and other 'smart' punctuation, it's often
100             helpful to render the UTF-8 character as B rather than just letting
101             a character without a direct translation slip into the ether.
102              
103             That said, if a character has no mapping (and there are plenty that do not)
104             a single space will be used.
105              
106             =head1 AUTHOR
107              
108             Peter Karman C<< >>
109              
110             Originally based on the HTML::HiLiter regular expression building code,
111             by the same author, copyright 2004 by Cray Inc.
112              
113             Thanks to Atomic Learning C
114             for sponsoring the development of some of these modules.
115              
116             =head1 BUGS
117              
118             Please report any bugs or feature requests to C, or through
119             the web interface at L.
120             I will be notified, and then you'll
121             automatically be notified of progress on your bug as I make changes.
122              
123             =head1 SUPPORT
124              
125             You can find documentation for this module with the perldoc command.
126              
127             perldoc Search::Tools
128              
129              
130             You can also look for information at:
131              
132             =over 4
133              
134             =item * RT: CPAN's request tracker
135              
136             L
137              
138             =item * AnnoCPAN: Annotated CPAN documentation
139              
140             L
141              
142             =item * CPAN Ratings
143              
144             L
145              
146             =item * Search CPAN
147              
148             L
149              
150             =back
151              
152             =head1 COPYRIGHT
153              
154             Copyright 2006-2010 by Peter Karman.
155              
156             This package is free software; you can redistribute it and/or modify it under the
157             same terms as Perl itself.
158              
159             =head1 SEE ALSO
160              
161             Search::Tools::UTF8, Unicode::Map, Encode, Test::utf8, Encoding::FixLatin
162              
163             =cut
164              
165             # must memoize the first time since if we call new()
166             # more than once, has already been iterated over
167             # and _init_map() will end up returning empty hash.
168             my %MAP;
169              
170             sub _init_map {
171 4     4   8 my $self = shift;
172              
173 4 100       1112 return {%MAP} if %MAP;
174              
175 2         11 while () {
176 1680         2958 chomp;
177 1680 50       4677 next unless m/^
178 1680         6794 my ( $from, $to ) = (m/^()\ (.+)$/);
179 1680 50       3645 if ( !defined $to ) {
180 0         0 warn "Undefined mapping for $_\n";
181 0         0 next;
182             }
183 1680         3629 my @o = split( /;/, $to );
184 1680         3279 $MAP{ _Utag_to_chr($from) } = _Utag_to_chr( $o[0] );
185             }
186              
187 2         1498 return {%MAP};
188             }
189              
190             sub _Utag_to_chr {
191 3360     3360   5222 my $t = shift;
192              
193             # cruft
194 3360         11441 $t =~ s/[<>"]+//g;
195              
196 3360         10751 $t =~ s,U([0-9A-F]+),chr( hex($1) ),oge;
  4528         13780  
197 3360         14009 return $t;
198             }
199              
200             sub BUILD {
201 4     4 1 62 my $self = shift;
202              
203 4         14 my $map = $self->_init_map;
204              
205             # add/override 8bit chars
206 4 100       45 if ( $self->ebit ) {
207 1 50       44 $self->debug and warn "ebit on\n";
208 1         23 for ( 128 .. 255 ) {
209 128         220 my $c = chr($_);
210 128 50       2053 $self->debug and warn "chr $_ -> $c\n";
211 128         869 $map->{$c} = $c;
212             }
213             }
214              
215 4 50       15 if ( $self->{map} ) {
216 0         0 $map->{$_} = $self->{map}->{$_} for keys %{ $self->{map} };
  0         0  
217             }
218 4         38 $self->{map} = $map;
219             }
220              
221             # benchmark shows this is 244% faster than previous version.
222             sub convert {
223 13     13 1 96 my ( $self, $buf ) = @_;
224 13         25 my $newbuf = '';
225              
226             # don't bother unless we have non-ascii bytes
227 13 100       85 return $buf if is_ascii($buf);
228              
229             # make sure we've got valid UTF-8 to start with
230 12 100       42 unless ( is_valid_utf8($buf) ) {
231 1         6 my $badbyte = find_bad_utf8($buf);
232 1         9 croak "bad UTF-8 byte(s) at $badbyte [ " . dump($buf) . " ]";
233             }
234              
235             # an alternate algorithm. no idea if it is faster.
236             # it depends on Perl's utf8 char matching (.)
237             # which should work if locale is correct, afaik.
238 11         37 my $map = $self->map;
239              
240 11 50       356 $self->debug and warn "converting $buf\n";
241 11         135 while ( $buf =~ m/(.)/gso ) {
242 351         707 my $char = $1;
243 351 50       5922 $self->debug and warn "$char\n";
244 351 100       2486 if ( is_ascii($char) ) {
    100          
245 156 50       2484 $self->debug and warn "$char is_ascii\n";
246 156         1193 $newbuf .= $char;
247             }
248             elsif ( !exists $map->{$char} ) {
249 57 50       925 $self->debug and warn "$char not in map\n";
250 57         414 $newbuf .= ' ';
251             }
252             else {
253 138 50       2378 $self->debug and warn "transliterate $char => $map->{$char}\n";
254 138         1135 $newbuf .= $map->{$char};
255             }
256              
257             }
258              
259 11         67 return $newbuf;
260             }
261              
262             sub convert1252 {
263 2     2 1 6 my ( $self, $buf ) = @_;
264              
265             # don't bother unless we have non-ascii bytes
266 2 50       10 return $buf if is_ascii($buf);
267              
268 2 50       52 $self->debug and warn "converting $buf\n";
269 2 50       20 my $newbuf = looks_like_cp1252($buf) ? fix_latin($buf) : $buf;
270 2         2124 return $self->convert($newbuf);
271             }
272              
273             1;
274              
275             # map taken directly from
276             # http://www.cl.cam.ac.uk/~mgk25/download/transtab.tar.gz
277             # by Markus Kuhn
278              
279             __DATA__