File Coverage

blib/lib/Lingua/Orthon.pm
Criterion Covered Total %
statement 68 78 87.1
branch 22 34 64.7
condition 3 6 50.0
subroutine 13 16 81.2
pod 8 8 100.0
total 114 142 80.2


line stmt bran cond sub pod time code
1             package Lingua::Orthon;
2 2     2   48679 use Math::MatrixReal;
  2         74194  
  2         145  
3 2     2   2096 use List::AllUtils qw(mesh);
  2         6196  
  2         197  
4 2     2   38 use 5.006;
  2         13  
  2         72  
5 2     2   12 use strict;
  2         4  
  2         60  
6 2     2   11 use warnings;
  2         4  
  2         2794  
7             our $VERSION = '0.02';
8            
9             =pod
10            
11             =head1 NAME
12            
13             Lingua-Orthon - Various measures of orthographic relatedness between two letter strings
14            
15             =head1 VERSION
16            
17             Version 0.02
18            
19             =head1 SYNOPSIS
20            
21             use Lingua::Orthon 0.02;
22             my $ortho = Lingua::Orthon->new();
23             my $n = $ortho->index_identical('BANG', 'BARN');
24            
25             =head1 DESCRIPTION
26            
27             Lingua-Orthon - Various measures of orthographic relatedness between two letter strings
28            
29             =head1 METHODS
30            
31             =head2 new
32            
33             my $ortho = Lingua::Orthon->new();
34            
35             Constructs/returns class object for accessing and passing params to other methods.
36            
37             =cut
38            
39             sub new {
40 1     1 1 11 my ($class, %attribs) = @_;
41 1         2 my $self = {};
42 1         3 bless $self, $class;
43             # $self->{'_proddat'} = _productivity_dat();
44             # $self->{'_frqdat'} = _frequency_dat();
45 1         7 my @lets = ('A'..'Z');
46 1         3 my @nums = (0 .. 25);
47 1         36 $self->{'_alphahash'} = { mesh(@lets, @nums ) };
48 1         19 return $self;
49             }
50            
51             =head3 are_orthons
52            
53             Returns 0 or 1 if the two strings qualify as 1-mismatch (Coltheart-type) orthons: same size, and only one discrepancy by substitution (no additions, deletions or transpositions).
54            
55             =cut
56            
57             sub are_orthons {
58 0     0 1 0 my $self = shift;
59 0 0       0 return 0 if length($_[0]) != length($_[1]);
60 0 0       0 return index_identical($self, $_[0], $_[1]) == length($_[0]) - 1 ? 1 : 0;
61             }
62            
63             =head3 index_identical
64            
65             $val = $orthon->index_identical($w1, $w2);
66            
67             Returns the number of letters that are both identical and in the same serial position. So for BANG and BARN, 2 would be returned, for B and A, ignoring the common N as it is the third letter in BANG, the fourth letter in BARN, and so not in the same serial position across these two words.
68            
69             =cut
70            
71             sub index_identical { # "Coltheart" orthons
72 4     4 1 1317 my $self = shift;
73            
74 4         7 my ($w1, $w2, $n, $i, $j) = (shift, shift, 0); # BENCHMARK: ~10%-25% faster than by list and separate decs
75 4         13 for ($i = 0; $i < length($w1); $i++) {
76 18 100 100     90 $n++ if substr($w1, $i, 1) eq (substr($w2, $i, 1) or last); # BENCHMARK: ~10%-20% faster than or by ||
77             # run the length of the second word and see how many common letters anyway, and how many index positions apart:
78             #my @cmn = ();
79             #for ($j = 0; $i < length($w2); $j++) {
80             # if ( substr($w1, $i, 1) eq (substr($w2, $j, 1) or last) ) {
81             #
82             # }
83             #}
84             }
85 4         11 return $n;
86             }
87            
88             =head3 hdist
89            
90             $val = $orthon->hdist('String1', 'String2');
91            
92             Return the Hamming Distance between two letter strings.
93            
94             =cut
95            
96             sub hdist {
97 1     1 1 492 shift(@_);
98             #String length is assumed to be equal
99 1         6 return ($_[0] ^ $_[1]) =~ tr/\001-\255//; # thanks to: http://www.perlmonks.org/?node_id=500235
100             #return length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] );
101             }
102            
103             =head3 ldist
104            
105             $val = $orthon->ldist('String1', 'String2');
106            
107             Return the Levenshtein Distance between two letter strings.
108            
109             =cut
110            
111             sub ldist {
112 3     3 1 1033 my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
113            
114 3         4 my ($identity_cost, $addition_cost, $deletion_cost, $substitution_cost, $permutation_cost) = (0, 1, 1, 1, 1);
115 3         8 my $refc = [$identity_cost, $deletion_cost, $addition_cost, $substitution_cost];
116            
117 3 50       7 return 0 if $w1 eq $w2; # Zero for total equality
118            
119             # The length of the other string if no length to this one:
120 3         5 my $n1 = length($w1);
121 3         4 my $n2 = length($w2);
122 3 50       7 return $addition_cost * $n2 if !$n1; # $w2 is a complete addition to $w1
123 3 50       7 return $deletion_cost * $n1 if !$n2; # $w2 is a complete deletion of $w1
124            
125 3         8 my $d_matrix = _load_matrix_weighted($self, $w1, $w2, $n1, $n2, $refc);
126 3 50       13 return ref $d_matrix ? $d_matrix->element($n1+1, $n2+1) : $d_matrix;
127            
128             # add 1 to every element in the matrix: passed the element, the row index and the column index, i
129             #$matrix = $matrix->each ( sub { (shift) + 1 } );
130             #$new_matrix = $matrix->each_diag( \&function );
131            
132             }
133            
134             =head3 len_maxseq
135            
136             $val = $orthon->len_maxseq('String1', 'String2');
137            
138             Return the length of the longest common subsequence between two letter strings. This subsequence is found by the C method in L.
139            
140             =cut
141            
142             sub len_maxseq {
143 1     1 1 325 my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
144 1         922 require String::LCSS_XS;
145 1         652 my @lcs = String::LCSS_XS::lcss($w1, $w2);
146 1   50     6 return length($lcs[0]) || 0;
147             }
148            
149             =head3 unique_abbrevs
150            
151             $val = $orthon->unique_abbrevs('String1', 'String2');
152            
153             Return the number of unique abbreviations that can be made between two letter strings. This subsequence is found by the C method in L.
154            
155             =cut
156            
157             sub unique_abbrevs {
158 0     0 1 0 my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
159 0         0 require Text::Abbrev;
160 0         0 my $href = Text::Abbrev::abbrev($w1, $w2);
161 0   0     0 return scalar(keys(%{$href})) || 0;
162             }
163            
164             =head3 myers_ukkonen
165            
166             $val = $orthon->myers_ukkonen('String1', 'String2');
167            
168             Return the Myers-Ukkonen distance between two letter strings, as found by the C method in L.
169            
170             =cut
171            
172             sub myers_ukkonen {
173 0     0 1 0 my ($self, $w1, $w2) = @_; # e.g. TAP, TAR
174 0         0 require String::Similarity;
175 0         0 return String::Similarity::similarity($w1, $w2);
176             }
177            
178             # private methods
179            
180             sub _load_matrix_weighted { # without early out-clauses:
181 3     3   5 my ($self, $w1, $w2, $n1, $n2, $refc) = @_;
182            
183 3         16 my $d_matrix = new Math::MatrixReal($n1+1, $n2+1);
184            
185 3         143 $d_matrix->assign(1, 1, 0);
186            
187 3         118 foreach (1 .. $n1) {$d_matrix->assign($_+1,1, $_*$refc->[1]);}
  20         256  
188 3         35 foreach (1 .. $n2) {$d_matrix->assign(1,$_+1,$_*$refc->[1]);}
  20         203  
189            
190 3         34 for my $i (1 .. $n1) {
191 20         230 my $w1_i = substr($w1, $i-1, 1);
192             #print "before one inner row:\n", $d_matrix, "\n";
193 20         28 for my $j(1 .. $n2) {
194 134         2124 my $w2_i = substr($w2, $j-1, 1);
195 134         542 $d_matrix->assign($i+1, $j+1, # starts at column 2 row 2
196             &_min(
197             $d_matrix->element($i, $j+1) + _weight($self, $w1_i,'-' . $w2_i, $refc), # deletion
198             $d_matrix->element($i+1, $j) + _weight($self, '-' . $w1_i, $w2_i, $refc), # addition
199             $d_matrix->element($i, $j) + _weight($self, $w1_i, $w2_i, $refc) # substitution
200             )
201             );
202             }
203             #print "after one inner row:\n", $d_matrix, "\n";
204             }
205             #print "final:\n", $d_matrix, "\n";
206 3         39 return $d_matrix;
207             }
208            
209             sub _weight {
210             #the cost function
211 402     402   3859 my ($self, $w1_i, $w2_i, $refc) = @_;
212 402         586 my $type = '_frqdat';
213 402         576 my $apply = 0;
214            
215 402 100       1655 if ($w1_i eq $w2_i) { #print "identity $w1_i $w2_i\n";
    100          
    100          
216 17 50       45 return $apply ? ($refc->[0]+1) * $self->{$type}->[$self->{'_alphahash'}->{$w1_i}] : 0; # cost for letter match
217             }
218             elsif ($w2_i =~ s/^\-//) {#print "deletion $w1_i $w2_i\n";
219             # inversion of free-production for the lost letter: loss of the more familiar = less competition; makes the cost less, so "closer" distance
220 134 50       195 my $prod = $apply ? $self->{$type}->[$self->{'_alphahash'}->{$w2_i}] : 1;
221 134         500 return $refc->[1] * 1/$prod; # cost for deletion
222             }
223             elsif ($w1_i =~ s/^\-//) { #print "addition $w1_i $w2_i\n";
224 134 50       179 my $prod = $apply ? $self->{$type}->[$self->{'_alphahash'}->{$w1_i}] : 1;
225 134         3729 return $refc->[2] * 1/$prod; # cost for addition
226             }
227             else { #print "mismatch $w1_i $w2_i\n";# $w1_i ne $w2_i
228 117 50       150 my $prod = $apply ? $self->{$type}->[$self->{'_alphahash'}->{$w2_i}] / $self->{$type}->[$self->{'_alphahash'}->{$w1_i}] : 1;
229 117         514 return $refc->[3] * $prod; # cost for letter mismatch
230             }
231             }
232            
233             sub _min {
234 134 100   134   663 return $_[0] < $_[1]
    100          
    100          
235             ? $_[0] < $_[2] ? $_[0] : $_[2]
236             : $_[1] < $_[2] ? $_[1] : $_[2];
237             }
238            
239             =head1 AUTHOR
240            
241             Roderick Garton, C<< >>
242            
243             =head1 BUGS
244            
245             Please report any bugs or feature requests to C, or through
246             the web interface at L. I will be notified, and then you'll
247             automatically be notified of progress on your bug as I make changes.
248            
249             =head1 SUPPORT
250            
251             You can find documentation for this module with the perldoc command.
252            
253             perldoc Lingua::Orthon
254            
255            
256             You can also look for information at:
257            
258             =over 4
259            
260             =item * RT: CPAN's request tracker (report bugs here)
261            
262             L
263            
264             =item * AnnoCPAN: Annotated CPAN documentation
265            
266             L
267            
268             =item * CPAN Ratings
269            
270             L
271            
272             =item * Search CPAN
273            
274             L
275            
276             =back
277            
278             =head1 LICENSE AND COPYRIGHT
279            
280             Copyright 2011-2012 Roderick Garton.
281            
282             This program is free software; you can redistribute it and/or modify it
283             under the terms of either: the GNU General Public License as published
284             by the Free Software Foundation; or the Artistic License.
285            
286             See http://dev.perl.org/licenses/ for more information.
287            
288             =cut
289            
290             __DATA__