File Coverage

blib/lib/Unicode/Diacritic/Strip.pm
Criterion Covered Total %
statement 65 68 95.5
branch 15 18 83.3
condition 1 3 33.3
subroutine 10 10 100.0
pod 3 4 75.0
total 94 103 91.2


line stmt bran cond sub pod time code
1             package Unicode::Diacritic::Strip;
2 5     5   347704 use warnings;
  5         54  
  5         215  
3 5     5   33 use strict;
  5         11  
  5         104  
4 5     5   1904 use utf8;
  5         50  
  5         37  
5             require Exporter;
6 5     5   213 use base qw(Exporter);
  5         12  
  5         1049  
7             our @EXPORT_OK = qw/strip_diacritics strip_alphabet fast_strip/;
8             our %EXPORT_TAGS = (all => \@EXPORT_OK);
9             our $VERSION = '0.11';
10 5     5   5068 use Unicode::UCD 'charinfo';
  5         264536  
  5         430  
11 5     5   2742 use Encode 'decode_utf8';
  5         49012  
  5         4451  
12              
13             sub strip_diacritics
14             {
15 3     3 1 2421 my ($diacritics_text) = @_;
16 3 100       26 if ($diacritics_text !~ /[^\x{01}-\x{80}]/) {
17             # All the characters in this text are ASCII, and so there are
18             # no diacritics.
19 1         3 return $diacritics_text;
20             }
21 2         22 my @characters = split //, $diacritics_text;
22 2         8 for my $character (@characters) {
23             # Leave non-word characters unaltered.
24 34 100       161 if ($character =~ /\W/) {
25 1         3 next;
26             }
27 33         84 my $decomposed = decompose ($character);
28 33 100       139 if ($character ne $decomposed) {
29 28         88 $character = $decomposed;
30             }
31             }
32 2         16 my $stripped_text = join '', @characters;
33 2         17 return $stripped_text;
34             }
35              
36             sub decompose
37             {
38 109     109 0 223 my ($character) = @_;
39             # Get the Unicode::UCD decomposition.
40 109         305 my $charinfo = charinfo (ord $character);
41 109         855156 my $decomposition = $charinfo->{decomposition};
42             # Give up if there is no decomposition for $character
43 109 100       294 if (! $decomposition) {
44 71         413 return $character;
45             }
46             # Get the first character of the decomposition
47 38         215 my @decomposition_chars = split /\s+/, $decomposition;
48 38         100 $character = chr hex $decomposition_chars[0];
49             # A character may have multiple decompositions, so repeat this
50             # process until there are none left.
51 38         130 return decompose ($character);
52             }
53              
54             sub strip_alphabet
55             {
56 1     1 1 1874 my ($diacritics_text, %options) = @_;
57 1         4 my %swaps;
58 1 50 33     13 if (! defined $diacritics_text || length ($diacritics_text) == 0) {
59 0         0 return ($diacritics_text, {});
60             }
61 1         59 my @characters = split //, $diacritics_text;
62 1         5 my %alphabet;
63 1         4 for my $c (@characters) {
64 295         515 $alphabet{$c} = 1;
65             }
66 1         18 my @c = keys %alphabet;
67              
68 1         5 for my $character (@c) {
69             # Reject non-word characters
70 44 100       173 if ($character !~ /\w/) {
71 6 50       14 if ($options{verbose}) {
72 0         0 print "Not altering non-word character '$character'.\n";
73             }
74 6         14 next;
75             }
76 38         113 my $decomposed = decompose ($character, %options);
77 38 100       118 if ($character ne $decomposed) {
78 8         20 my $boo = "$decomposed baba";
79 8         25 $swaps{$character} = $boo;
80 8         45 $swaps{$character} =~ s/ baba$//;
81             }
82             }
83              
84             # Make the version of the text with all the diacritics removed.
85              
86 1         3 my $stripped_text = $diacritics_text;
87 1         7 for my $k (keys %swaps) {
88 8 50       23 if ($options{verbose}) {
89 0         0 printf "Swapping $k for $swaps{$k} (%X).\n", ord ($swaps{$k});
90             }
91 8         103 $stripped_text =~ s/$k/$swaps{$k}/g;
92             }
93 1         59 return ($stripped_text, \%swaps);
94             }
95              
96             sub fast_strip
97             {
98 250     250 1 144192 my ($word) = @_;
99             # Expand ligatures.
100 250         734 $word =~ s/œ/oe/g;
101             # Thorn is "th".
102 250         834 $word =~ s/Þ|þ/th/g;
103             # Remove all diacritics
104 250         1157 $word =~ tr/ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĨĩĪīĬĭĮįİĴĵĶķĹĺĻļĽľŁłŃńŅņŇňŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžƠơƯưǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǞǟǠǡǦǧǨǩǪǫǬǭǰǴǵǸǹǺǻȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȞȟȦȧȨȩȪȫȬȭȮȯȰȱȲȳøØḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗṘṙṚṛṜṝṞṟṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕẖẗẘẙẚẛẜẝẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ/AAAAAACEEEEIIIINOOOOOUUUUYaaaaaaceeeeiiiinooooouuuuyyAaAaAaCcCcCcCcDdEeEeEeEeEeGgGgGgGgHhIiIiIiIiIJjKkLlLlLlLlNnNnNnOoOoOoRrRrRrSsSsSsSsTtTtUuUuUuUuUuUuWwYyYZzZzZzOoUuAaIiOoUuUuUuUuUuAaAaGgKkOoOojGgNnAaAaAaEeEeIiIiOoOoRrRrUuUuSsTtHhAaEeOoOoOoOoYyoOAaBbBbBbCcDdDdDdDdDdEeEeEeEeEeFfGgHhHhHhHhHhIiIiKkKkKkLlLlLlLlMmMmMmNnNnNnNnOoOoOoOoPpPpRrRrRrRrSsSsSsSsSsTtTtTtTtUuUuUuUuUuVvVvWwWwWwWwWwXxXxYyZzZzZzhtwyafffAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy/;
105 250         611 return $word;
106             }
107              
108             1;