File Coverage

lib/Text/WagnerFischer/Armenian.pm

Criterion	Covered	Total	%
statement	81	93	87.1
branch	26	32	81.2
condition	6	6	100.0
subroutine	9	9	100.0
pod	2	2	100.0
total	124	142	87.3

line	stmt	bran	cond	sub	pod	time	code
1							package Text::WagnerFischer::Armenian;
2
3							=encoding utf8
4
5							=head1 NAME
6
7							Text::WagnerFischer::Armenian - a variation on Text::WagnerFischer for Armenian-language strings
8
9							=head1 SYNOPSIS
10
11							use Text::WagnerFischer::Armenian qw( distance );
12							use utf8; # for the Armenian characters in the source code
13
14							print distance("ձեռն", "ձեռան") . "\n";
15							# "dzerrn -> dzerran"; prints 1
16							print distance("ձեռն", "ձերն") . "\n";
17							# "dzerrn -> dzern"; prints 0.5
18							print distance("կինք", "կին") . "\n";
19							# "kin" -> "kink'"; prints 0.5
20							my @words = qw( զօրսն Զորս զզօրսն );
21							my @distances = distance( "զօրս", @words );
22							print "@distances\n";
23							# "zors" -> "zorsn, Zors, zzorsn"
24							# prints "0.5 0.25 1"
25
26							# Change the cost of a letter case mismatch to 1
27							my $edit_values = [ 0, 1, 1, 1, 0.5, 0.5, 0.5 ],
28							print distance( $edit_values, "ձեռն", "Ձեռն" ) . "\n";
29							# "dzerrn" -> "DZerrn"; prints 1
30
31							=head1 DESCRIPTION
32
33							This module implements the Wagner-Fischer distance algorithm modified
34							for Armenian strings. The Armenian language has a number of
35							single-letter prefixes and suffixes which, while not changing the
36							basic meaning of the word, function as definite articles,
37							prepositions, or grammatical markers. These changes, and letter
38							substitutions that represent vocalic equivalence, should be counted as
39							a smaller edit distance than a change that is a normal character
40							substitution.
41
42							The Armenian weight function recognizes four extra edit types:
43
44							/ a: x = y (cost for letter match)
45							\| b: x = - or y = - (cost for letter insertion/deletion)
46							w( x, y ) = \| c: x != y (cost for letter mismatch)
47							\| d: x = X (cost for case mismatch)
48							\| e: x ~ y (cost for letter vocalic equivalence)
49							\| f: x = (z\|y\|ts) && y = - (or vice versa)
50							\| (cost for grammatic prefix)
51							\| g: x = (n\|k'\|s\|d) && y = - (or vice versa)
52							\ (cost for grammatic suffix)
53
54
55							=cut
56
57	1			1		1770	use strict;
	1					1
	1					28
58	1			1		4	use warnings;
	1					1
	1					21
59	1			1		3	no warnings 'redefine';
	1					1
	1					24
60	1			1		3	use Exporter 'import';
	1					1
	1					25
61	1			1		401	use Text::WagnerFischer;
	1					562
	1					34
62	1			1		4	use utf8;
	1					1
	1					3
63
64							my( %VocalicEquivalence, @Prefixes, @Suffixes, $REFC );
65
66							our $VERSION = "0.04";
67							our @EXPORT_OK = qw( &distance &am_lc );
68
69							# Set new default costs:
70							#
71							# WagnerFischer : equal, insert/delete, mismatch,
72							# LetterCaseEquiv : same word, case mismatch
73							# VocalicEquiv : letter that changed with pronunciation shift
74							# PrefixAddDrop : same word, one has prefix e.g. preposition form "y-"
75							# SuffixAddDrop : same word, one has suffix e.g. definite article "-n"
76							$REFC = [ 0, 1, 1, 0.25, 0.5, 0.5, 0.5 ]; # mid-word: no pre/suffix
77
78							%VocalicEquivalence = (
79							'բ' => [ 'պ' ],
80							'գ' => [ 'ք', 'կ' ],
81							'դ' => [ 'տ' ],
82							'ե' => [ 'է' ],
83							'է' => [ 'ե' ],
84							'թ' => [ 'տ' ],
85							'լ' => [ 'ղ' ],
86							'կ' => [ 'գ', 'ք' ],
87							'ղ' => [ 'լ' ],
88							'յ' => [ '՛' ], # Only in manuscripts
89							'ո' => [ 'օ' ],
90							'պ' => [ 'բ', 'փ' ],
91							'ռ' => [ 'ր' ],
92							'վ' => [ 'ւ' ],
93							'տ' => [ 'դ', 'թ'],
94							'ր' => [ 'ռ' ],
95							'ւ' => [ 'վ' ],
96							'փ' => [ 'պ', 'ֆ' ],
97							'ք' => [ 'գ', 'կ' ],
98							'օ' => [ 'ո' ],
99							'ֆ' => [ 'փ' ],
100							'՛' => [ 'յ' ], # Only in manuscripts
101							);
102
103							@Prefixes = qw( զ ց յ );
104							@Suffixes = qw( ն ս դ ք );
105
106							sub _am_weight
107							{
108	1858			1858		2919	my ($x,$y,$refc)=@_;
109
110	1858	100				4337	if ($x eq $y) {
		100
111							# Simple case: exact match.
112	104					414	return $refc->[0];
113							} elsif( am_lc( $x ) eq am_lc( $y ) ) {
114							# Almost as simple: case difference.
115	2					11	return $refc->[3]; # Vocalic equivalence.
116							}
117
118							# Got this far? We have to play games with prefixes, suffixes,
119							# similar-letter substitution, and the like.
120
121							# Downcase both of them.
122	1752					3194	$x = am_lc( $x );
123	1752					2599	$y = am_lc( $y );
124
125	1752	100	100			6716	if ( ($x eq '-') or ($y eq '-') ) {
126							# Are we dealing with a prefix or a suffix?
127							# print STDERR "x is $x; y is $y;\n";
128	1316	100				65452	if( grep( /(\Q$x\E\|\Q$y\E)/, @Prefixes ) > 0 ) {
		100
129	108					512	return $refc->[5];
130							} elsif( grep( /(\Q$x\E\|\Q$y\E)/, @Suffixes ) > 0 ) {
131	130					670	return $refc->[6];
132							} else {
133							# Normal insert/delete
134	1078					5029	return $refc->[1];
135							}
136							} else {
137	436	100				846	if( exists( $VocalicEquivalence{$x} ) ) {
138							# Same word, vocalic shift?
139							# N.B. This will mistakenly give less weight to a few genuinely
140							# different words, e.g. the verbs "գամ" vs. "կամ". I can live with that.
141	156					171	my @equivs = @{$VocalicEquivalence{$x}};
	156					411
142	156	100				2154	my $val = grep (/$y/, @equivs ) ? $refc->[4] : $refc->[2];
143	156					732	return $val;
144							} else {
145	280					1252	return $refc->[2];
146							}
147							}
148							}
149
150							# Annoyingly, I need to copy this whole damn thing because I need to change
151							# the refc mid-stream.
152
153							=head1 SUBROUTINES
154
155							=over
156
157							=item B( \@editweight, $string1, $string2, [ .. $stringN ] );
158
159							=item B( $string1, $string2, [ .. $stringN ] );
160
161							The main exported function of this module. Takes a list of two or
162							more strings and returns the edit distance between the first string
163							and each of the others. The "edit_distances" array is an optional
164							first argument, with which users may override the default edit
165							penalties, as described above.
166
167							=cut
168
169							sub distance {
170	26			26	1	90	my ($refcarg,$s,@t)=@_;
171
172							# The refc values are as documented above:
173							# 0. x,x; 1. x,''; 2. x,y; 3. x,X; 4. d,t; 5. x,zx; 6. x,xn
174							# 6 only applies at beginnings of words, and 7 only applies at
175							# ends.
176
177	26					48	my $refc = [];
178	26	100				94	if (!@t) {
		50
179							# Two args...
180	13	50				34	if (ref($refcarg) ne "ARRAY") {
181							# the first of which is a string...
182	13	50				24	if (ref($s) ne "ARRAY") {
183							# ...and the second of which is a string.
184							# Use default refc set.
185	13					45	$t[0]=$s;
186	13					19	$s=$refcarg;
187	13					54	push( @$refc, @$REFC );
188							} else {
189							# ...one of which is an array. Croak.
190	0					0	require Carp;
191	0					0	Carp::croak("Text::WagnerFischer: second string is needed");
192							}
193							} else {
194							# one refc, and one string. Croak.
195	0					0	require Carp;
196	0					0	Carp::croak("Text::WagnerFischer: second string is needed");
197							}
198							} elsif (ref($refcarg) ne "ARRAY") {
199							# Three or more args, all strings.
200							# Use default refc set.
201	0					0	unshift @t,$s;
202	0					0	$s=$refcarg;
203	0					0	push( @$refc, @$REFC );
204							} else {
205							# A refc array and (presumably) some strings.
206							# Copy the passed array into our own array, because
207							# we are going to mutate our copy.
208	13					49	push( @$refc, @$refcarg );
209							}
210
211							# Set up the refc arrays in three different formats - one for word
212							# beginnings, one for word ends, and one for everything else.
213	26					55	my( $refc_start, $refc_end ) = ( [], [] );
214	26					76	push( @$refc_start, @$refc );
215							# Count suffixes as normal add/del.
216	26					49	$refc_start->[6] = $refc->[1];
217	26					62	push( @$refc_end, @$refc );
218	26					34	$refc_end->[5] = $refc->[1];
219
220							# Now alter our main refc, which should no longer
221							# care about prefixes or suffixes.
222	26					39	$refc->[5] = $refc->[1];
223	26					36	$refc->[6] = $refc->[1];
224
225
226							# binmode STDERR, ":utf8"; # for debugging
227							# Start the real string comparison.
228	26					55	my $n=length($s);
229	26					29	my @result;
230
231	26					52	foreach my $t (@t) {
232
233	26					26	my @d;
234
235	26					37	my $m=length($t);
236	26	50				68	if(!$n) {push @result,$m*$refc->[1];next}
	0					0
	0					0
237	26	50				55	if(!$m) {push @result,$n*$refc->[1];next}
	0					0
	0					0
238
239	26					56	$d[0][0]=0;
240
241							# Populate the zero row.
242							# Cannot assume that blank vs. 1st letter is "add". Might
243							# be "prefix."
244	26					101	my $f_i = substr($s,0,1);
245	26					60	foreach my $i (1 .. $n) {$d[$i][0]=$i*&_am_weight('-',$f_i,$refc_start);}
	116					211
246	26					601	my $f_j = substr($t,0,1);
247	26					57	foreach my $j (1 .. $m) {$d[0][$j]=$j*&_am_weight($f_j,'-',$refc_start);}
	116					201
248
249	26					49	foreach my $i (1 .. $n) {
250	116					1027	my $s_i=substr($s,$i-1,1);
251	116					214	foreach my $j (1 .. $m) {
252							# Switch to suffix refc if we are to end of either word.
253	542	100	100			4795	$refc = $refc_end if( $i == $n \|\| $j == $m );
254	542					1194	my $t_i=substr($t,$j-1,1);
255
256	542					1190	$d[$i][$j]=Text::WagnerFischer::_min($d[$i-1][$j]+_am_weight($s_i,'-',$refc),
257							$d[$i][$j-1]+_am_weight('-',$t_i,$refc),
258							$d[$i-1][$j-1]+_am_weight($s_i,$t_i,$refc));
259							}
260							}
261
262	26					266	my $r = $d[$n][$m];
263							## Round up to get an integer result.
264							## On second thought, don't.
265							# if( $r - int( $r ) > 0 ) {
266							# $r = int( $r ) + 1;
267							# }
268
269	26					133	push @result, $r;
270
271							## Debugging statements
272							# print "\nARRAY for $s / $t\n";
273							# foreach my $arr ( @d ) {
274							# print join( " ", @$arr ) . "\n"
275							# }
276							}
277	26	50				58	if (wantarray) {return @result} else {return $result[0]}
	0					0
	26					231
278							}
279
280
281							=item B( $char )
282
283							A small utility function, useful for Armenian text. Returns the
284							lowercase version of the character passed in.
285
286							=back
287
288							=cut
289
290							sub am_lc {
291	7012			7012	1	7089	my $char = shift;
292							# Is it in the uppercase Armenian range?
293	7012	100				16452	if( $char =~ /[\x{531}-\x{556}]/ ) {
294	158					338	my $codepoint = unpack( "U", $char );
295	158					174	$codepoint += 48;
296	158					377	$char = pack( "U", $codepoint );
297							}
298	7012					13303	return $char;
299							}
300
301							=head1 LIMITATIONS
302
303							There are many cases of Armenian word equivalence that are not
304							perfectly handled by this; it is meant to be a rough heuristic for
305							comparing transcriptions of handwriting. In particular, multi-letter
306							suffixes, and some orthographic equivalence e.g "o" -> "aw", are not
307							handled at all.
308
309							=head1 LICENSE
310
311							This package is free software and is provided "as is" without express
312							or implied warranty. You can redistribute it and/or modify it under
313							the same terms as Perl itself.
314
315							=head1 AUTHOR
316
317							Tara L Andrews, L
318
319							=cut
320
321							1;