File Coverage

blib/lib/Text/Transliterator/Unaccent.pm
Criterion Covered Total %
statement 38 59 64.4
branch 6 18 33.3
condition 6 12 50.0
subroutine 7 14 50.0
pod 3 3 100.0
total 60 106 56.6


line stmt bran cond sub pod time code
1             package Text::Transliterator::Unaccent;
2              
3 2     2   70981 use warnings;
  2         11  
  2         66  
4 2     2   11 use strict;
  2         4  
  2         68  
5              
6             our $VERSION = "1.04";
7              
8 2     2   411 use Text::Transliterator;
  2         5  
  2         68  
9 2     2   2215 use Unicode::UCD qw(charinfo charscript charblock);
  2         109791  
  2         182  
10 2     2   22 use Unicode::Normalize qw();
  2         4  
  2         543  
11              
12             sub char_map {
13 1     1 1 2 my $class = shift;
14              
15 1         3 my @all_ranges;
16 1         2 my $ignore_wide = 0;
17 1         2 my $ignore_upper = 0;
18 1         2 my $ignore_lower = 0;
19              
20             # decode arguments to get character ranges and boolean flags
21 1         7 while (my ($kind, $arg) = splice(@_, 0, 2)) {
22 0         0 my $ranges;
23              
24             my $todo = {
25 0 0   0   0 script => sub { $ranges = charscript($arg)
26             or die "$arg is not a valid Unicode script" },
27 0 0   0   0 block => sub { $ranges = charblock($arg)
28             or die "$arg is not a valid Unicode block" },
29 0     0   0 ranges => sub { $ranges = $arg },
30 0     0   0 wide => sub { $ignore_wide = !$arg },
31 0     0   0 upper => sub { $ignore_upper = !$arg },
32 0     0   0 lower => sub { $ignore_lower = !$arg },
33 0         0 };
34 0 0       0 my $coderef = $todo->{$kind}
35             or die "invalid argument: $kind";
36 0         0 $coderef->();
37 0 0       0 push @all_ranges, @$ranges if $ranges;
38             }
39              
40             # default
41 1 50       3 @all_ranges = @{charscript('Latin')} if !@all_ranges;
  1         6  
42              
43             # build the map
44 1         17813 my %map;
45 1         4 foreach my $range (@all_ranges) {
46 31         62 my ($start, $end) = @$range;
47              
48             # iterate over characters in range
49             CHAR:
50 31         54 for my $c ($start .. $end) {
51              
52             # maybe drop that char under some conditions
53 1350 50 33     2332 last CHAR if $ignore_wide and $c > 255;
54 1350 50 33     2297 next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/;
55 1350 50 33     2275 next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/;
56              
57             # get canonical decomposition (if any)
58 1350         2225 my $canon = Unicode::Normalize::getCanon($c);
59              
60             # store into map
61 1350 100 100     3058 if ($canon && length($canon) > 1) {
62             # the unaccented char is the the base (first char) of the decomposition
63 498         858 my $base = substr $canon, 0, 1;
64 498         1431 $map{chr($c)} = $base,
65             }
66             }
67             }
68              
69 1         12 return \%map;
70             }
71              
72             sub char_map_descr {
73 0     0 1 0 my $class = shift;
74              
75 0         0 my $map = $class->char_map(@_);
76              
77 0         0 my $txt = "";
78             # while (my ($k, $v) = each %$map) {
79 0         0 foreach my $k (sort {$a cmp $b} keys %$map) {
  0         0  
80 0         0 my $v = $map->{$k};
81 0         0 my $accented = ord($k);
82 0         0 my $base = ord($v);
83             $txt .= sprintf "U+%04x %-55s => U+%04x %s\n",
84             $accented,
85             charinfo($accented)->{name},
86             $base,
87 0         0 charinfo($base)->{name};
88             }
89 0         0 return $txt;
90             }
91              
92             sub new {
93 1     1 1 888 my $class = shift;
94 1         3 my $map = $class->char_map(@_);
95 1         12 return Text::Transliterator->new($map)
96             }
97              
98             1; # End of Text::Transliterator::Unaccent
99              
100              
101             __END__