File Coverage

lib/Text/WagnerFischer/Armenian.pm
Criterion Covered Total %
statement 81 93 87.1
branch 26 32 81.2
condition 6 6 100.0
subroutine 9 9 100.0
pod 2 2 100.0
total 124 142 87.3


line stmt bran cond sub pod time code
1             package Text::WagnerFischer::Armenian;
2              
3             =encoding utf8
4              
5             =head1 NAME
6              
7             Text::WagnerFischer::Armenian - a variation on Text::WagnerFischer for Armenian-language strings
8              
9             =head1 SYNOPSIS
10              
11             use Text::WagnerFischer::Armenian qw( distance );
12             use utf8; # for the Armenian characters in the source code
13              
14             print distance("ձեռն", "ձեռան") . "\n";
15             # "dzerrn -> dzerran"; prints 1
16             print distance("ձեռն", "ձերն") . "\n";
17             # "dzerrn -> dzern"; prints 0.5
18             print distance("կինք", "կին") . "\n";
19             # "kin" -> "kink'"; prints 0.5
20             my @words = qw( զօրսն Զորս զզօրսն );
21             my @distances = distance( "զօրս", @words );
22             print "@distances\n";
23             # "zors" -> "zorsn, Zors, zzorsn"
24             # prints "0.5 0.25 1"
25              
26             # Change the cost of a letter case mismatch to 1
27             my $edit_values = [ 0, 1, 1, 1, 0.5, 0.5, 0.5 ],
28             print distance( $edit_values, "ձեռն", "Ձեռն" ) . "\n";
29             # "dzerrn" -> "DZerrn"; prints 1
30              
31             =head1 DESCRIPTION
32              
33             This module implements the Wagner-Fischer distance algorithm modified
34             for Armenian strings. The Armenian language has a number of
35             single-letter prefixes and suffixes which, while not changing the
36             basic meaning of the word, function as definite articles,
37             prepositions, or grammatical markers. These changes, and letter
38             substitutions that represent vocalic equivalence, should be counted as
39             a smaller edit distance than a change that is a normal character
40             substitution.
41              
42             The Armenian weight function recognizes four extra edit types:
43              
44             / a: x = y (cost for letter match)
45             | b: x = - or y = - (cost for letter insertion/deletion)
46             w( x, y ) = | c: x != y (cost for letter mismatch)
47             | d: x = X (cost for case mismatch)
48             | e: x ~ y (cost for letter vocalic equivalence)
49             | f: x = (z|y|ts) && y = - (or vice versa)
50             | (cost for grammatic prefix)
51             | g: x = (n|k'|s|d) && y = - (or vice versa)
52             \ (cost for grammatic suffix)
53              
54              
55             =cut
56              
57 1     1   1770 use strict;
  1         1  
  1         28  
58 1     1   4 use warnings;
  1         1  
  1         21  
59 1     1   3 no warnings 'redefine';
  1         1  
  1         24  
60 1     1   3 use Exporter 'import';
  1         1  
  1         25  
61 1     1   401 use Text::WagnerFischer;
  1         562  
  1         34  
62 1     1   4 use utf8;
  1         1  
  1         3  
63              
64             my( %VocalicEquivalence, @Prefixes, @Suffixes, $REFC );
65              
66             our $VERSION = "0.04";
67             our @EXPORT_OK = qw( &distance &am_lc );
68              
69             # Set new default costs:
70             #
71             # WagnerFischer : equal, insert/delete, mismatch,
72             # LetterCaseEquiv : same word, case mismatch
73             # VocalicEquiv : letter that changed with pronunciation shift
74             # PrefixAddDrop : same word, one has prefix e.g. preposition form "y-"
75             # SuffixAddDrop : same word, one has suffix e.g. definite article "-n"
76             $REFC = [ 0, 1, 1, 0.25, 0.5, 0.5, 0.5 ]; # mid-word: no pre/suffix
77              
78             %VocalicEquivalence = (
79             'բ' => [ 'պ' ],
80             'գ' => [ 'ք', 'կ' ],
81             'դ' => [ 'տ' ],
82             'ե' => [ 'է' ],
83             'է' => [ 'ե' ],
84             'թ' => [ 'տ' ],
85             'լ' => [ 'ղ' ],
86             'կ' => [ 'գ', 'ք' ],
87             'ղ' => [ 'լ' ],
88             'յ' => [ '՛' ], # Only in manuscripts
89             'ո' => [ 'օ' ],
90             'պ' => [ 'բ', 'փ' ],
91             'ռ' => [ 'ր' ],
92             'վ' => [ 'ւ' ],
93             'տ' => [ 'դ', 'թ'],
94             'ր' => [ 'ռ' ],
95             'ւ' => [ 'վ' ],
96             'փ' => [ 'պ', 'ֆ' ],
97             'ք' => [ 'գ', 'կ' ],
98             'օ' => [ 'ո' ],
99             'ֆ' => [ 'փ' ],
100             '՛' => [ 'յ' ], # Only in manuscripts
101             );
102              
103             @Prefixes = qw( զ ց յ );
104             @Suffixes = qw( ն ս դ ք );
105              
106             sub _am_weight
107             {
108 1858     1858   2919 my ($x,$y,$refc)=@_;
109              
110 1858 100       4337 if ($x eq $y) {
    100          
111             # Simple case: exact match.
112 104         414 return $refc->[0];
113             } elsif( am_lc( $x ) eq am_lc( $y ) ) {
114             # Almost as simple: case difference.
115 2         11 return $refc->[3]; # Vocalic equivalence.
116             }
117              
118             # Got this far? We have to play games with prefixes, suffixes,
119             # similar-letter substitution, and the like.
120              
121             # Downcase both of them.
122 1752         3194 $x = am_lc( $x );
123 1752         2599 $y = am_lc( $y );
124              
125 1752 100 100     6716 if ( ($x eq '-') or ($y eq '-') ) {
126             # Are we dealing with a prefix or a suffix?
127             # print STDERR "x is $x; y is $y;\n";
128 1316 100       65452 if( grep( /(\Q$x\E|\Q$y\E)/, @Prefixes ) > 0 ) {
    100          
129 108         512 return $refc->[5];
130             } elsif( grep( /(\Q$x\E|\Q$y\E)/, @Suffixes ) > 0 ) {
131 130         670 return $refc->[6];
132             } else {
133             # Normal insert/delete
134 1078         5029 return $refc->[1];
135             }
136             } else {
137 436 100       846 if( exists( $VocalicEquivalence{$x} ) ) {
138             # Same word, vocalic shift?
139             # N.B. This will mistakenly give less weight to a few genuinely
140             # different words, e.g. the verbs "գամ" vs. "կամ". I can live with that.
141 156         171 my @equivs = @{$VocalicEquivalence{$x}};
  156         411  
142 156 100       2154 my $val = grep (/$y/, @equivs ) ? $refc->[4] : $refc->[2];
143 156         732 return $val;
144             } else {
145 280         1252 return $refc->[2];
146             }
147             }
148             }
149              
150             # Annoyingly, I need to copy this whole damn thing because I need to change
151             # the refc mid-stream.
152              
153             =head1 SUBROUTINES
154              
155             =over
156              
157             =item B( \@editweight, $string1, $string2, [ .. $stringN ] );
158              
159             =item B( $string1, $string2, [ .. $stringN ] );
160              
161             The main exported function of this module. Takes a list of two or
162             more strings and returns the edit distance between the first string
163             and each of the others. The "edit_distances" array is an optional
164             first argument, with which users may override the default edit
165             penalties, as described above.
166              
167             =cut
168              
169             sub distance {
170 26     26 1 90 my ($refcarg,$s,@t)=@_;
171              
172             # The refc values are as documented above:
173             # 0. x,x; 1. x,''; 2. x,y; 3. x,X; 4. d,t; 5. x,zx; 6. x,xn
174             # 6 only applies at beginnings of words, and 7 only applies at
175             # ends.
176              
177 26         48 my $refc = [];
178 26 100       94 if (!@t) {
    50          
179             # Two args...
180 13 50       34 if (ref($refcarg) ne "ARRAY") {
181             # the first of which is a string...
182 13 50       24 if (ref($s) ne "ARRAY") {
183             # ...and the second of which is a string.
184             # Use default refc set.
185 13         45 $t[0]=$s;
186 13         19 $s=$refcarg;
187 13         54 push( @$refc, @$REFC );
188             } else {
189             # ...one of which is an array. Croak.
190 0         0 require Carp;
191 0         0 Carp::croak("Text::WagnerFischer: second string is needed");
192             }
193             } else {
194             # one refc, and one string. Croak.
195 0         0 require Carp;
196 0         0 Carp::croak("Text::WagnerFischer: second string is needed");
197             }
198             } elsif (ref($refcarg) ne "ARRAY") {
199             # Three or more args, all strings.
200             # Use default refc set.
201 0         0 unshift @t,$s;
202 0         0 $s=$refcarg;
203 0         0 push( @$refc, @$REFC );
204             } else {
205             # A refc array and (presumably) some strings.
206             # Copy the passed array into our own array, because
207             # we are going to mutate our copy.
208 13         49 push( @$refc, @$refcarg );
209             }
210            
211             # Set up the refc arrays in three different formats - one for word
212             # beginnings, one for word ends, and one for everything else.
213 26         55 my( $refc_start, $refc_end ) = ( [], [] );
214 26         76 push( @$refc_start, @$refc );
215             # Count suffixes as normal add/del.
216 26         49 $refc_start->[6] = $refc->[1];
217 26         62 push( @$refc_end, @$refc );
218 26         34 $refc_end->[5] = $refc->[1];
219              
220             # Now alter our main refc, which should no longer
221             # care about prefixes or suffixes.
222 26         39 $refc->[5] = $refc->[1];
223 26         36 $refc->[6] = $refc->[1];
224            
225              
226             # binmode STDERR, ":utf8"; # for debugging
227             # Start the real string comparison.
228 26         55 my $n=length($s);
229 26         29 my @result;
230            
231 26         52 foreach my $t (@t) {
232            
233 26         26 my @d;
234            
235 26         37 my $m=length($t);
236 26 50       68 if(!$n) {push @result,$m*$refc->[1];next}
  0         0  
  0         0  
237 26 50       55 if(!$m) {push @result,$n*$refc->[1];next}
  0         0  
  0         0  
238            
239 26         56 $d[0][0]=0;
240            
241             # Populate the zero row.
242             # Cannot assume that blank vs. 1st letter is "add". Might
243             # be "prefix."
244 26         101 my $f_i = substr($s,0,1);
245 26         60 foreach my $i (1 .. $n) {$d[$i][0]=$i*&_am_weight('-',$f_i,$refc_start);}
  116         211  
246 26         601 my $f_j = substr($t,0,1);
247 26         57 foreach my $j (1 .. $m) {$d[0][$j]=$j*&_am_weight($f_j,'-',$refc_start);}
  116         201  
248            
249 26         49 foreach my $i (1 .. $n) {
250 116         1027 my $s_i=substr($s,$i-1,1);
251 116         214 foreach my $j (1 .. $m) {
252             # Switch to suffix refc if we are to end of either word.
253 542 100 100     4795 $refc = $refc_end if( $i == $n || $j == $m );
254 542         1194 my $t_i=substr($t,$j-1,1);
255            
256 542         1190 $d[$i][$j]=Text::WagnerFischer::_min($d[$i-1][$j]+_am_weight($s_i,'-',$refc),
257             $d[$i][$j-1]+_am_weight('-',$t_i,$refc),
258             $d[$i-1][$j-1]+_am_weight($s_i,$t_i,$refc));
259             }
260             }
261            
262 26         266 my $r = $d[$n][$m];
263             ## Round up to get an integer result.
264             ## On second thought, don't.
265             # if( $r - int( $r ) > 0 ) {
266             # $r = int( $r ) + 1;
267             # }
268              
269 26         133 push @result, $r;
270              
271             ## Debugging statements
272             # print "\nARRAY for $s / $t\n";
273             # foreach my $arr ( @d ) {
274             # print join( " ", @$arr ) . "\n"
275             # }
276             }
277 26 50       58 if (wantarray) {return @result} else {return $result[0]}
  0         0  
  26         231  
278             }
279            
280              
281             =item B( $char )
282              
283             A small utility function, useful for Armenian text. Returns the
284             lowercase version of the character passed in.
285              
286             =back
287              
288             =cut
289              
290             sub am_lc {
291 7012     7012 1 7089 my $char = shift;
292             # Is it in the uppercase Armenian range?
293 7012 100       16452 if( $char =~ /[\x{531}-\x{556}]/ ) {
294 158         338 my $codepoint = unpack( "U", $char );
295 158         174 $codepoint += 48;
296 158         377 $char = pack( "U", $codepoint );
297             }
298 7012         13303 return $char;
299             }
300              
301             =head1 LIMITATIONS
302              
303             There are many cases of Armenian word equivalence that are not
304             perfectly handled by this; it is meant to be a rough heuristic for
305             comparing transcriptions of handwriting. In particular, multi-letter
306             suffixes, and some orthographic equivalence e.g "o" -> "aw", are not
307             handled at all.
308              
309             =head1 LICENSE
310              
311             This package is free software and is provided "as is" without express
312             or implied warranty. You can redistribute it and/or modify it under
313             the same terms as Perl itself.
314              
315             =head1 AUTHOR
316              
317             Tara L Andrews, L
318              
319             =cut
320              
321             1;