File Coverage

lib/Text/WagnerFischer/Armenian.pm

Criterion	Covered	Total	%
statement	81	93	87.1
branch	26	32	81.2
condition	6	6	100.0
subroutine	9	9	100.0
pod	2	2	100.0
total	124	142	87.3

line	stmt	bran	cond	sub	pod	time	code
1							package Text::WagnerFischer::Armenian;
2
3							=head1 NAME
4
5							Text::WagnerFischer::Armenian - a variation on Text::WagnerFischer for Armenian-language strings
6
7							=head1 SYNOPSIS
8
9							use Text::WagnerFischer::Armenian qw( distance );
10							use utf8; # for the Armenian characters in the source code
11
12							print distance("ձեռն", "ձեռան") . "\n";
13							# "dzerrn -> dzerran"; prints 1
14							print distance("ձեռն", "ձերն") . "\n";
15							# "dzerrn -> dzern"; prints 0.5
16							print distance("կինք", "կին") . "\n";
17							# "kin" -> "kink'"; prints 0.5
18							my @words = qw( զօրսն Զորս զզօրսն );
19							my @distances = distance( "զօրս", @words );
20							print "@distances\n";
21							# "zors" -> "zorsn, Zors, zzorsn"
22							# prints "0.5 0.25 1"
23
24							# Change the cost of a letter case mismatch to 1
25							my $edit_values = [ 0, 1, 1, 1, 0.5, 0.5, 0.5 ],
26							print distance( $edit_values, "ձեռն", "Ձեռն" ) . "\n";
27							# "dzerrn" -> "DZerrn"; prints 1
28
29							=head1 DESCRIPTION
30
31							This module implements the Wagner-Fischer distance algorithm modified
32							for Armenian strings. The Armenian language has a number of
33							single-letter prefixes and suffixes which, while not changing the
34							basic meaning of the word, function as definite articles,
35							prepositions, or grammatical markers. These changes, and letter
36							substitutions that represent vocalic equivalence, should be counted as
37							a smaller edit distance than a change that is a normal character
38							substitution.
39
40							The Armenian weight function recognizes four extra edit types:
41
42							/ a: x = y (cost for letter match)
43							\| b: x = - or y = - (cost for letter insertion/deletion)
44							w( x, y ) = \| c: x != y (cost for letter mismatch)
45							\| d: x = X (cost for case mismatch)
46							\| e: x ~ y (cost for letter vocalic equivalence)
47							\| f: x = (z\|y\|ts) && y = - (or vice versa)
48							\| (cost for grammatic prefix)
49							\| g: x = (n\|k'\|s\|d) && y = - (or vice versa)
50							\ (cost for grammatic suffix)
51
52
53							=cut
54
55	1			1		4673	use strict;
	1					3
	1					90
56	1			1		5	use warnings;
	1					2
	1					34
57	1			1		4	no warnings 'redefine';
	1					2
	1					39
58	1			1		5	use Exporter 'import';
	1					1
	1					34
59	1			1		807	use Text::WagnerFischer;
	1					827
	1					38
60	1			1		6	use utf8;
	1					1
	1					5
61
62							my( %VocalicEquivalence, @Prefixes, @Suffixes, $REFC );
63
64							our $VERSION = "0.03";
65							our @EXPORT_OK = qw( &distance &am_lc );
66
67							# Set new default costs:
68							#
69							# WagnerFischer : equal, insert/delete, mismatch,
70							# LetterCaseEquiv : same word, case mismatch
71							# VocalicEquiv : letter that changed with pronunciation shift
72							# PrefixAddDrop : same word, one has prefix e.g. preposition form "y-"
73							# SuffixAddDrop : same word, one has suffix e.g. definite article "-n"
74							$REFC = [ 0, 1, 1, 0.25, 0.5, 0.5, 0.5 ]; # mid-word: no pre/suffix
75
76							%VocalicEquivalence = (
77							'բ' => [ 'պ' ],
78							'գ' => [ 'ք', 'կ' ],
79							'դ' => [ 'տ' ],
80							'ե' => [ 'է' ],
81							'է' => [ 'ե' ],
82							'թ' => [ 'տ' ],
83							'լ' => [ 'ղ' ],
84							'կ' => [ 'գ', 'ք' ],
85							'ղ' => [ 'լ' ],
86							'յ' => [ '՛' ], # Only in manuscripts
87							'ո' => [ 'օ' ],
88							'պ' => [ 'բ', 'փ' ],
89							'ռ' => [ 'ր' ],
90							'վ' => [ 'ւ' ],
91							'տ' => [ 'դ', 'թ'],
92							'ր' => [ 'ռ' ],
93							'ւ' => [ 'վ' ],
94							'փ' => [ 'պ', 'ֆ' ],
95							'ք' => [ 'գ', 'կ' ],
96							'օ' => [ 'ո' ],
97							'ֆ' => [ 'փ' ],
98							'՛' => [ 'յ' ], # Only in manuscripts
99							);
100
101							@Prefixes = qw( զ ց յ );
102							@Suffixes = qw( ն ս դ ք );
103
104							sub _am_weight
105							{
106	1858			1858		3416	my ($x,$y,$refc)=@_;
107
108	1858	100				4378	if ($x eq $y) {
		100
109							# Simple case: exact match.
110	104					404	return $refc->[0];
111							} elsif( am_lc( $x ) eq am_lc( $y ) ) {
112							# Almost as simple: case difference.
113	2					12	return $refc->[3]; # Vocalic equivalence.
114							}
115
116							# Got this far? We have to play games with prefixes, suffixes,
117							# similar-letter substitution, and the like.
118
119							# Downcase both of them.
120	1752					3233	$x = am_lc( $x );
121	1752					2964	$y = am_lc( $y );
122
123	1752	100	100			6568	if ( ($x eq '-') or ($y eq '-') ) {
124							# Are we dealing with a prefix or a suffix?
125							# print STDERR "x is $x; y is $y;\n";
126	1316	100				75509	if( grep( /(\Q$x\E\|\Q$y\E)/, @Prefixes ) > 0 ) {
		100
127	108					518	return $refc->[5];
128							} elsif( grep( /(\Q$x\E\|\Q$y\E)/, @Suffixes ) > 0 ) {
129	130					715	return $refc->[6];
130							} else {
131							# Normal insert/delete
132	1078					4194	return $refc->[1];
133							}
134							} else {
135	436	100				892	if( exists( $VocalicEquivalence{$x} ) ) {
136							# Same word, vocalic shift?
137							# N.B. This will mistakenly give less weight to a few genuinely
138							# different words, e.g. the verbs "գամ" vs. "կամ". I can live with that.
139	156					164	my @equivs = @{$VocalicEquivalence{$x}};
	156					495
140	156	100				4120	my $val = grep (/$y/, @equivs ) ? $refc->[4] : $refc->[2];
141	156					766	return $val;
142							} else {
143	280					1137	return $refc->[2];
144							}
145							}
146							}
147
148							# Annoyingly, I need to copy this whole damn thing because I need to change
149							# the refc mid-stream.
150
151							=head1 SUBROUTINES
152
153							=over
154
155							=item B( \@editweight, $string1, $string2, [ .. $stringN ] );
156
157							=item B( $string1, $string2, [ .. $stringN ] );
158
159							The main exported function of this module. Takes a list of two or
160							more strings and returns the edit distance between the first string
161							and each of the others. The "edit_distances" array is an optional
162							first argument, with which users may override the default edit
163							penalties, as described above.
164
165							=cut
166
167							sub distance {
168	26			26	1	99	my ($refcarg,$s,@t)=@_;
169
170							# The refc values are as documented above:
171							# 0. x,x; 1. x,''; 2. x,y; 3. x,X; 4. d,t; 5. x,zx; 6. x,xn
172							# 6 only applies at beginnings of words, and 7 only applies at
173							# ends.
174
175	26					65	my $refc = [];
176	26	100				106	if (!@t) {
		50
177							# Two args...
178	13	50				39	if (ref($refcarg) ne "ARRAY") {
179							# the first of which is a string...
180	13	50				32	if (ref($s) ne "ARRAY") {
181							# ...and the second of which is a string.
182							# Use default refc set.
183	13					20	$t[0]=$s;
184	13					21	$s=$refcarg;
185	13					52	push( @$refc, @$REFC );
186							} else {
187							# ...one of which is an array. Croak.
188	0					0	require Carp;
189	0					0	Carp::croak("Text::WagnerFischer: second string is needed");
190							}
191							} else {
192							# one refc, and one string. Croak.
193	0					0	require Carp;
194	0					0	Carp::croak("Text::WagnerFischer: second string is needed");
195							}
196							} elsif (ref($refcarg) ne "ARRAY") {
197							# Three or more args, all strings.
198							# Use default refc set.
199	0					0	unshift @t,$s;
200	0					0	$s=$refcarg;
201	0					0	push( @$refc, @$REFC );
202							} else {
203							# A refc array and (presumably) some strings.
204							# Copy the passed array into our own array, because
205							# we are going to mutate our copy.
206	13					61	push( @$refc, @$refcarg );
207							}
208
209							# Set up the refc arrays in three different formats - one for word
210							# beginnings, one for word ends, and one for everything else.
211	26					66	my( $refc_start, $refc_end ) = ( [], [] );
212	26					79	push( @$refc_start, @$refc );
213							# Count suffixes as normal add/del.
214	26					53	$refc_start->[6] = $refc->[1];
215	26					74	push( @$refc_end, @$refc );
216	26					48	$refc_end->[5] = $refc->[1];
217
218							# Now alter our main refc, which should no longer
219							# care about prefixes or suffixes.
220	26					47	$refc->[5] = $refc->[1];
221	26					41	$refc->[6] = $refc->[1];
222
223
224							# binmode STDERR, ":utf8"; # for debugging
225							# Start the real string comparison.
226	26					52	my $n=length($s);
227	26					33	my @result;
228
229	26					45	foreach my $t (@t) {
230
231	26					28	my @d;
232
233	26					39	my $m=length($t);
234	26	50				62	if(!$n) {push @result,$m*$refc->[1];next}
	0					0
	0					0
235	26	50				49	if(!$m) {push @result,$n*$refc->[1];next}
	0					0
	0					0
236
237	26					100	$d[0][0]=0;
238
239							# Populate the zero row.
240							# Cannot assume that blank vs. 1st letter is "add". Might
241							# be "prefix."
242	26					70	my $f_i = substr($s,0,1);
243	26					63	foreach my $i (1 .. $n) {$d[$i][0]=$i*&_am_weight('-',$f_i,$refc_start);}
	116					224
244	26					615	my $f_j = substr($t,0,1);
245	26					54	foreach my $j (1 .. $m) {$d[0][$j]=$j*&_am_weight($f_j,'-',$refc_start);}
	116					207
246
247	26					60	foreach my $i (1 .. $n) {
248	116					965	my $s_i=substr($s,$i-1,1);
249	116					300	foreach my $j (1 .. $m) {
250							# Switch to suffix refc if we are to end of either word.
251	542	100	100			7011	$refc = $refc_end if( $i == $n \|\| $j == $m );
252	542					1240	my $t_i=substr($t,$j-1,1);
253
254	542					1181	$d[$i][$j]=Text::WagnerFischer::_min($d[$i-1][$j]+_am_weight($s_i,'-',$refc),
255							$d[$i][$j-1]+_am_weight('-',$t_i,$refc),
256							$d[$i-1][$j-1]+_am_weight($s_i,$t_i,$refc));
257							}
258							}
259
260	26					261	my $r = $d[$n][$m];
261							## Round up to get an integer result.
262							## On second thought, don't.
263							# if( $r - int( $r ) > 0 ) {
264							# $r = int( $r ) + 1;
265							# }
266
267	26					166	push @result, $r;
268
269							## Debugging statements
270							# print "\nARRAY for $s / $t\n";
271							# foreach my $arr ( @d ) {
272							# print join( " ", @$arr ) . "\n"
273							# }
274							}
275	26	50				60	if (wantarray) {return @result} else {return $result[0]}
	0					0
	26					253
276							}
277
278
279							=item B( $char )
280
281							A small utility function, useful for Armenian text. Returns the
282							lowercase version of the character passed in.
283
284							=back
285
286							=cut
287
288							sub am_lc {
289	7012			7012	1	10748	my $char = shift;
290							# Is it in the uppercase Armenian range?
291	7012	100				19338	if( $char =~ /[\x{531}-\x{556}]/ ) {
292	158					346	my $codepoint = unpack( "U", $char );
293	158					205	$codepoint += 48;
294	158					418	$char = pack( "U", $codepoint );
295							}
296	7012					20878	return $char;
297							}
298
299							=head1 LIMITATIONS
300
301							There are many cases of Armenian word equivalence that are not
302							perfectly handled by this; it is meant to be a rough heuristic for
303							comparing transcriptions of handwriting. In particular, multi-letter
304							suffixes, and some orthographic equivalence e.g "o" -> "aw", are not
305							handled at all.
306
307							=head1 LICENSE
308
309							This package is free software and is provided "as is" without express
310							or implied warranty. You can redistribute it and/or modify it under
311							the same terms as Perl itself.
312
313							=head1 AUTHOR
314
315							Tara L Andrews, L
316
317							=cut
318
319							1;