File Coverage

lib/Text/WagnerFischer/Armenian.pm
Criterion Covered Total %
statement 81 93 87.1
branch 26 32 81.2
condition 6 6 100.0
subroutine 9 9 100.0
pod 2 2 100.0
total 124 142 87.3


line stmt bran cond sub pod time code
1             package Text::WagnerFischer::Armenian;
2              
3             =head1 NAME
4              
5             Text::WagnerFischer::Armenian - a variation on Text::WagnerFischer for Armenian-language strings
6              
7             =head1 SYNOPSIS
8              
9             use Text::WagnerFischer::Armenian qw( distance );
10             use utf8; # for the Armenian characters in the source code
11              
12             print distance("ձեռն", "ձեռան") . "\n";
13             # "dzerrn -> dzerran"; prints 1
14             print distance("ձեռն", "ձերն") . "\n";
15             # "dzerrn -> dzern"; prints 0.5
16             print distance("կինք", "կին") . "\n";
17             # "kin" -> "kink'"; prints 0.5
18             my @words = qw( զօրսն Զորս զզօրսն );
19             my @distances = distance( "զօրս", @words );
20             print "@distances\n";
21             # "zors" -> "zorsn, Zors, zzorsn"
22             # prints "0.5 0.25 1"
23              
24             # Change the cost of a letter case mismatch to 1
25             my $edit_values = [ 0, 1, 1, 1, 0.5, 0.5, 0.5 ],
26             print distance( $edit_values, "ձեռն", "Ձեռն" ) . "\n";
27             # "dzerrn" -> "DZerrn"; prints 1
28              
29             =head1 DESCRIPTION
30              
31             This module implements the Wagner-Fischer distance algorithm modified
32             for Armenian strings. The Armenian language has a number of
33             single-letter prefixes and suffixes which, while not changing the
34             basic meaning of the word, function as definite articles,
35             prepositions, or grammatical markers. These changes, and letter
36             substitutions that represent vocalic equivalence, should be counted as
37             a smaller edit distance than a change that is a normal character
38             substitution.
39              
40             The Armenian weight function recognizes four extra edit types:
41              
42             / a: x = y (cost for letter match)
43             | b: x = - or y = - (cost for letter insertion/deletion)
44             w( x, y ) = | c: x != y (cost for letter mismatch)
45             | d: x = X (cost for case mismatch)
46             | e: x ~ y (cost for letter vocalic equivalence)
47             | f: x = (z|y|ts) && y = - (or vice versa)
48             | (cost for grammatic prefix)
49             | g: x = (n|k'|s|d) && y = - (or vice versa)
50             \ (cost for grammatic suffix)
51              
52              
53             =cut
54              
55 1     1   4673 use strict;
  1         3  
  1         90  
56 1     1   5 use warnings;
  1         2  
  1         34  
57 1     1   4 no warnings 'redefine';
  1         2  
  1         39  
58 1     1   5 use Exporter 'import';
  1         1  
  1         34  
59 1     1   807 use Text::WagnerFischer;
  1         827  
  1         38  
60 1     1   6 use utf8;
  1         1  
  1         5  
61              
62             my( %VocalicEquivalence, @Prefixes, @Suffixes, $REFC );
63              
64             our $VERSION = "0.03";
65             our @EXPORT_OK = qw( &distance &am_lc );
66              
67             # Set new default costs:
68             #
69             # WagnerFischer : equal, insert/delete, mismatch,
70             # LetterCaseEquiv : same word, case mismatch
71             # VocalicEquiv : letter that changed with pronunciation shift
72             # PrefixAddDrop : same word, one has prefix e.g. preposition form "y-"
73             # SuffixAddDrop : same word, one has suffix e.g. definite article "-n"
74             $REFC = [ 0, 1, 1, 0.25, 0.5, 0.5, 0.5 ]; # mid-word: no pre/suffix
75              
76             %VocalicEquivalence = (
77             'բ' => [ 'պ' ],
78             'գ' => [ 'ք', 'կ' ],
79             'դ' => [ 'տ' ],
80             'ե' => [ 'է' ],
81             'է' => [ 'ե' ],
82             'թ' => [ 'տ' ],
83             'լ' => [ 'ղ' ],
84             'կ' => [ 'գ', 'ք' ],
85             'ղ' => [ 'լ' ],
86             'յ' => [ '՛' ], # Only in manuscripts
87             'ո' => [ 'օ' ],
88             'պ' => [ 'բ', 'փ' ],
89             'ռ' => [ 'ր' ],
90             'վ' => [ 'ւ' ],
91             'տ' => [ 'դ', 'թ'],
92             'ր' => [ 'ռ' ],
93             'ւ' => [ 'վ' ],
94             'փ' => [ 'պ', 'ֆ' ],
95             'ք' => [ 'գ', 'կ' ],
96             'օ' => [ 'ո' ],
97             'ֆ' => [ 'փ' ],
98             '՛' => [ 'յ' ], # Only in manuscripts
99             );
100              
101             @Prefixes = qw( զ ց յ );
102             @Suffixes = qw( ն ս դ ք );
103              
104             sub _am_weight
105             {
106 1858     1858   3416 my ($x,$y,$refc)=@_;
107              
108 1858 100       4378 if ($x eq $y) {
    100          
109             # Simple case: exact match.
110 104         404 return $refc->[0];
111             } elsif( am_lc( $x ) eq am_lc( $y ) ) {
112             # Almost as simple: case difference.
113 2         12 return $refc->[3]; # Vocalic equivalence.
114             }
115              
116             # Got this far? We have to play games with prefixes, suffixes,
117             # similar-letter substitution, and the like.
118              
119             # Downcase both of them.
120 1752         3233 $x = am_lc( $x );
121 1752         2964 $y = am_lc( $y );
122              
123 1752 100 100     6568 if ( ($x eq '-') or ($y eq '-') ) {
124             # Are we dealing with a prefix or a suffix?
125             # print STDERR "x is $x; y is $y;\n";
126 1316 100       75509 if( grep( /(\Q$x\E|\Q$y\E)/, @Prefixes ) > 0 ) {
    100          
127 108         518 return $refc->[5];
128             } elsif( grep( /(\Q$x\E|\Q$y\E)/, @Suffixes ) > 0 ) {
129 130         715 return $refc->[6];
130             } else {
131             # Normal insert/delete
132 1078         4194 return $refc->[1];
133             }
134             } else {
135 436 100       892 if( exists( $VocalicEquivalence{$x} ) ) {
136             # Same word, vocalic shift?
137             # N.B. This will mistakenly give less weight to a few genuinely
138             # different words, e.g. the verbs "գամ" vs. "կամ". I can live with that.
139 156         164 my @equivs = @{$VocalicEquivalence{$x}};
  156         495  
140 156 100       4120 my $val = grep (/$y/, @equivs ) ? $refc->[4] : $refc->[2];
141 156         766 return $val;
142             } else {
143 280         1137 return $refc->[2];
144             }
145             }
146             }
147              
148             # Annoyingly, I need to copy this whole damn thing because I need to change
149             # the refc mid-stream.
150              
151             =head1 SUBROUTINES
152              
153             =over
154              
155             =item B( \@editweight, $string1, $string2, [ .. $stringN ] );
156              
157             =item B( $string1, $string2, [ .. $stringN ] );
158              
159             The main exported function of this module. Takes a list of two or
160             more strings and returns the edit distance between the first string
161             and each of the others. The "edit_distances" array is an optional
162             first argument, with which users may override the default edit
163             penalties, as described above.
164              
165             =cut
166              
167             sub distance {
168 26     26 1 99 my ($refcarg,$s,@t)=@_;
169              
170             # The refc values are as documented above:
171             # 0. x,x; 1. x,''; 2. x,y; 3. x,X; 4. d,t; 5. x,zx; 6. x,xn
172             # 6 only applies at beginnings of words, and 7 only applies at
173             # ends.
174              
175 26         65 my $refc = [];
176 26 100       106 if (!@t) {
    50          
177             # Two args...
178 13 50       39 if (ref($refcarg) ne "ARRAY") {
179             # the first of which is a string...
180 13 50       32 if (ref($s) ne "ARRAY") {
181             # ...and the second of which is a string.
182             # Use default refc set.
183 13         20 $t[0]=$s;
184 13         21 $s=$refcarg;
185 13         52 push( @$refc, @$REFC );
186             } else {
187             # ...one of which is an array. Croak.
188 0         0 require Carp;
189 0         0 Carp::croak("Text::WagnerFischer: second string is needed");
190             }
191             } else {
192             # one refc, and one string. Croak.
193 0         0 require Carp;
194 0         0 Carp::croak("Text::WagnerFischer: second string is needed");
195             }
196             } elsif (ref($refcarg) ne "ARRAY") {
197             # Three or more args, all strings.
198             # Use default refc set.
199 0         0 unshift @t,$s;
200 0         0 $s=$refcarg;
201 0         0 push( @$refc, @$REFC );
202             } else {
203             # A refc array and (presumably) some strings.
204             # Copy the passed array into our own array, because
205             # we are going to mutate our copy.
206 13         61 push( @$refc, @$refcarg );
207             }
208            
209             # Set up the refc arrays in three different formats - one for word
210             # beginnings, one for word ends, and one for everything else.
211 26         66 my( $refc_start, $refc_end ) = ( [], [] );
212 26         79 push( @$refc_start, @$refc );
213             # Count suffixes as normal add/del.
214 26         53 $refc_start->[6] = $refc->[1];
215 26         74 push( @$refc_end, @$refc );
216 26         48 $refc_end->[5] = $refc->[1];
217              
218             # Now alter our main refc, which should no longer
219             # care about prefixes or suffixes.
220 26         47 $refc->[5] = $refc->[1];
221 26         41 $refc->[6] = $refc->[1];
222            
223              
224             # binmode STDERR, ":utf8"; # for debugging
225             # Start the real string comparison.
226 26         52 my $n=length($s);
227 26         33 my @result;
228            
229 26         45 foreach my $t (@t) {
230            
231 26         28 my @d;
232            
233 26         39 my $m=length($t);
234 26 50       62 if(!$n) {push @result,$m*$refc->[1];next}
  0         0  
  0         0  
235 26 50       49 if(!$m) {push @result,$n*$refc->[1];next}
  0         0  
  0         0  
236            
237 26         100 $d[0][0]=0;
238            
239             # Populate the zero row.
240             # Cannot assume that blank vs. 1st letter is "add". Might
241             # be "prefix."
242 26         70 my $f_i = substr($s,0,1);
243 26         63 foreach my $i (1 .. $n) {$d[$i][0]=$i*&_am_weight('-',$f_i,$refc_start);}
  116         224  
244 26         615 my $f_j = substr($t,0,1);
245 26         54 foreach my $j (1 .. $m) {$d[0][$j]=$j*&_am_weight($f_j,'-',$refc_start);}
  116         207  
246            
247 26         60 foreach my $i (1 .. $n) {
248 116         965 my $s_i=substr($s,$i-1,1);
249 116         300 foreach my $j (1 .. $m) {
250             # Switch to suffix refc if we are to end of either word.
251 542 100 100     7011 $refc = $refc_end if( $i == $n || $j == $m );
252 542         1240 my $t_i=substr($t,$j-1,1);
253            
254 542         1181 $d[$i][$j]=Text::WagnerFischer::_min($d[$i-1][$j]+_am_weight($s_i,'-',$refc),
255             $d[$i][$j-1]+_am_weight('-',$t_i,$refc),
256             $d[$i-1][$j-1]+_am_weight($s_i,$t_i,$refc));
257             }
258             }
259            
260 26         261 my $r = $d[$n][$m];
261             ## Round up to get an integer result.
262             ## On second thought, don't.
263             # if( $r - int( $r ) > 0 ) {
264             # $r = int( $r ) + 1;
265             # }
266              
267 26         166 push @result, $r;
268              
269             ## Debugging statements
270             # print "\nARRAY for $s / $t\n";
271             # foreach my $arr ( @d ) {
272             # print join( " ", @$arr ) . "\n"
273             # }
274             }
275 26 50       60 if (wantarray) {return @result} else {return $result[0]}
  0         0  
  26         253  
276             }
277            
278              
279             =item B( $char )
280              
281             A small utility function, useful for Armenian text. Returns the
282             lowercase version of the character passed in.
283              
284             =back
285              
286             =cut
287              
288             sub am_lc {
289 7012     7012 1 10748 my $char = shift;
290             # Is it in the uppercase Armenian range?
291 7012 100       19338 if( $char =~ /[\x{531}-\x{556}]/ ) {
292 158         346 my $codepoint = unpack( "U", $char );
293 158         205 $codepoint += 48;
294 158         418 $char = pack( "U", $codepoint );
295             }
296 7012         20878 return $char;
297             }
298              
299             =head1 LIMITATIONS
300              
301             There are many cases of Armenian word equivalence that are not
302             perfectly handled by this; it is meant to be a rough heuristic for
303             comparing transcriptions of handwriting. In particular, multi-letter
304             suffixes, and some orthographic equivalence e.g "o" -> "aw", are not
305             handled at all.
306              
307             =head1 LICENSE
308              
309             This package is free software and is provided "as is" without express
310             or implied warranty. You can redistribute it and/or modify it under
311             the same terms as Perl itself.
312              
313             =head1 AUTHOR
314              
315             Tara L Andrews, L
316              
317             =cut
318              
319             1;