line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Orthon;
|
2
|
7
|
|
|
7
|
|
395360
|
use 5.006;
|
|
7
|
|
|
|
|
70
|
|
3
|
7
|
|
|
7
|
|
44
|
use strict;
|
|
7
|
|
|
|
|
16
|
|
|
7
|
|
|
|
|
129
|
|
4
|
7
|
|
|
7
|
|
25
|
use warnings FATAL => 'all';
|
|
7
|
|
|
|
|
8
|
|
|
7
|
|
|
|
|
195
|
|
5
|
7
|
|
|
7
|
|
188
|
use Carp qw(croak);
|
|
7
|
|
|
|
|
12
|
|
|
7
|
|
|
|
|
335
|
|
6
|
7
|
|
|
7
|
|
2728
|
use List::AllUtils qw(any);
|
|
7
|
|
|
|
|
79667
|
|
|
7
|
|
|
|
|
433
|
|
7
|
7
|
|
|
7
|
|
2625
|
use Number::Misc qw(is_numeric);
|
|
7
|
|
|
|
|
6833
|
|
|
7
|
|
|
|
|
334
|
|
8
|
7
|
|
|
7
|
|
2580
|
use Statistics::Lite qw(mean);
|
|
7
|
|
|
|
|
8263
|
|
|
7
|
|
|
|
|
377
|
|
9
|
7
|
|
|
7
|
|
2915
|
use String::Util qw(hascontent nocontent);
|
|
7
|
|
|
|
|
30037
|
|
|
7
|
|
|
|
|
419
|
|
10
|
7
|
|
|
7
|
|
4097
|
use Unicode::Collate;
|
|
7
|
|
|
|
|
46837
|
|
|
7
|
|
|
|
|
8462
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
$Lingua::Orthon::VERSION = '0.03';
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=pod
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=encoding CP-1252
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Lingua-Orthon - Orthographic similarity of string to one or more others by Coltheart's N and related measures
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 VERSION
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This is documentation for B of Lingua::Orthon.
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Lingua::Orthon 0.03;
|
29
|
|
|
|
|
|
|
my $orthon = Lingua::Orthon->new();
|
30
|
|
|
|
|
|
|
my $bool = $orthon->are_orthons('BANG', 'BARN'); # 0
|
31
|
|
|
|
|
|
|
$bool = $orthon->are_orthons('BANG', 'BONG'); # 1
|
32
|
|
|
|
|
|
|
my $idx = $orthon->index_diff('BANK', 'BARK'); # 2
|
33
|
|
|
|
|
|
|
my $count = $orthon->index_identical('BANG', 'BARN'); # 2
|
34
|
|
|
|
|
|
|
my (@diff) = $orthon->char_diff('BANG', 'BONG'); # (qw/A O/)
|
35
|
|
|
|
|
|
|
$count = $orthon->onc(
|
36
|
|
|
|
|
|
|
test => 'BANG',
|
37
|
|
|
|
|
|
|
sample => [qw/BAND COCO BING RANG BONG SONG/]); # 4
|
38
|
|
|
|
|
|
|
my $aref = $orthon->list_orthons(
|
39
|
|
|
|
|
|
|
test => 'BANG',
|
40
|
|
|
|
|
|
|
sample => [qw/BAND COCO BING RANG BONG SONG/]); # BAND, BING, RANG, BONG
|
41
|
|
|
|
|
|
|
$count = $orthon->levenshtein('BANG', 'BARN'); # 2
|
42
|
|
|
|
|
|
|
my $float = $orthon->old(
|
43
|
|
|
|
|
|
|
test => 'BANG',
|
44
|
|
|
|
|
|
|
sample => [qw/BAND COCO BING RANG BONG SONG/]); # ~= 1.67
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Lingua-Orthon provides measures of similarity of character strings based on their orthographic identity, as relevant to psycholinguistic research. Case- and mark-sensitivity for determining character equality can be controlled. Wraps to Levenshtein Distance methods, extended to the OLD-20 metric, are provided for convenience of comparison. No methods are explicitly exported; all methods are called in the object-oriented way.
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=head2 new
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
my $ortho = Lingua::Orthon->new();
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Constructs/returns class object for accessing other methods.
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
Optionally, set the argument B to an integer value ranging from 0 to 3 to control case- and mark-sensitivity. See L.
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub new {
|
63
|
9
|
|
|
9
|
1
|
1673
|
my ( $class, %args ) = @_;
|
64
|
9
|
|
|
|
|
23
|
my $self = {};
|
65
|
9
|
|
|
|
|
20
|
bless $self, $class;
|
66
|
9
|
|
|
|
|
38
|
$self->set_eq( match_level => $args{'match_level'} );
|
67
|
9
|
|
|
|
|
1115
|
return $self;
|
68
|
|
|
|
|
|
|
}
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head2 are_orthons
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
$bool = $orthon->are_orthons('String1', 'String2');
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Returns 0 or 1 (Coltheart's Boolean) if two given strings are orthographic neighbours by a 1-mismatch I: i.e., the strings are of the same size (are equal in character count) and there is only one discrepancy between them by a single substitution of a character in the same ordinal position (no additions, deletions or transpositions). So I and I are orthons by this measure (they differ only in the final letter), but I and I are not (the letter I is an I to I via I, or a I from I to I).
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
I: If two identical letter strings are given (I, I), they are defined as I being orthons: the number of index identical characters must be at least one less than the length of the string(s).
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
I: By default, identity is defined case-insensitively; e.g., I and I, and I and I are orthons. However, if B has been set (in L or L) to a higher level than 1 (or as undef or 0), then case is respected; e.g., I and I are orthons, but I and I are NOT orthons (they involve substituting both the Is and the second letters (I and I) ... but I and I, or I and I, are orthons. (This usefully applies to putting L|Lingua::Orthon/onc, coltheart_n> (the sum of single-substitution orthons a string has within a lexicon) to questions of the featural versus lexical basis of neighbourhood effects).
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
See Coltheart et al. (1977) (in L). The measure is computationally simple and economical, relative to other measures, such as based on a wider array of edit-types (e.g., Levenshtein Distance), that, while having greater explanatory power (Yarkoni et al., 2008), can tax resources on the order of days to effectively compute for a single string relative to a humanly memorable corpus.
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=cut
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub are_orthons {
|
85
|
13
|
|
|
13
|
1
|
2836
|
my ( $self, $w1, $w2 ) = @_;
|
86
|
13
|
|
|
|
|
32
|
return _are_orthons( $w1, $w2, $self->{'_EQ'} );
|
87
|
|
|
|
|
|
|
}
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head2 index_identical
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
$count = $orthon->index_identical('String1', 'String2');
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Returns a count: the number of letters that are identical and in the same serial position among two given letter-strings.
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
For example, given I and I, 2 is returned for the two index-identical letters, I and I; I is in both strings, but it is ignored as it is the third letter in I but the fourth letter in I, and so not in the same serial position across the two words.
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub index_identical {
|
100
|
6
|
|
|
6
|
1
|
1418
|
my ( $self, $w1, $w2 ) = @_;
|
101
|
6
|
|
|
|
|
14
|
return _index_identical( $w1, $w2, $self->{'_EQ'} );
|
102
|
|
|
|
|
|
|
}
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 index_diff
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
$posint = $orthon->index_diff('String1', 'String2');
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Assuming the two strings are single-substitution orthons, returns the single index (anchored at zero) at which their letters differ. So if the two strings are "bring" and "being", the returned value is 1.
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=cut
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub index_diff {
|
113
|
2
|
|
|
2
|
1
|
425
|
my ( $self, $w1, $w2 ) = @_;
|
114
|
2
|
|
|
|
|
9
|
my $idx = 0;
|
115
|
2
|
|
|
|
|
5
|
for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
|
116
|
7
|
100
|
|
|
|
701
|
if ( not $self->{'_EQ'}->( map { substr $_, $i, 1 } ( $w1, $w2 ) ) ) {
|
|
14
|
|
|
|
|
24
|
|
117
|
1
|
|
|
|
|
157
|
$idx = $i;
|
118
|
1
|
|
|
|
|
3
|
last;
|
119
|
|
|
|
|
|
|
}
|
120
|
|
|
|
|
|
|
}
|
121
|
2
|
|
|
|
|
128
|
return $idx;
|
122
|
|
|
|
|
|
|
}
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head2 char_diff
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
@ari = $orthon->char_diff('String1', 'String2');
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Returns a list of the first two characters (letters) that, reading from left to right, differ between two given strings. If the strings are single-substitution orthons, these are the characters that make them so. So if the two strings are "bring" and "being", the returned list is ('r', 'e') - the order of these characters in the returned list respecting the order of the given strings. The search across the strings terminates as soon there is a mismatch; otherwise, it continues only for as long as the length of the shortest string.
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
The identity match (or mismatch) is sensitive to the setting of the equality function per case and marks; see L.
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=cut
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sub char_diff {
|
135
|
4
|
|
|
4
|
1
|
926
|
my ( $self, $w1, $w2 ) = @_;
|
136
|
4
|
|
|
|
|
8
|
my @ds = ();
|
137
|
4
|
|
|
|
|
8
|
for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
|
138
|
13
|
|
|
|
|
1492
|
my @tmp = map { substr $_, $i, 1 } ( $w1, $w2 );
|
|
26
|
|
|
|
|
56
|
|
139
|
13
|
100
|
|
|
|
23
|
if ( not $self->{'_EQ'}->(@tmp) ) {
|
140
|
2
|
|
|
|
|
264
|
@ds = @tmp;
|
141
|
2
|
|
|
|
|
4
|
last;
|
142
|
|
|
|
|
|
|
}
|
143
|
|
|
|
|
|
|
}
|
144
|
4
|
|
|
|
|
282
|
return @ds;
|
145
|
|
|
|
|
|
|
}
|
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head2 onc, coltheart_n
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
$int = $orthon->onc(test => CHARSTR, sample => AREF);
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Returns the I (ONC), a.k.a. Coltheart's I: the number of single-letter substitution orthons a particular string has with respect to a list of strings (or "lexicon") (Coltheart et al., 1977). So I has two orthons (I and I) in the list (I, I, I and I).
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub onc {
|
156
|
1
|
|
|
1
|
1
|
12
|
my ( $self, %args ) = @_;
|
157
|
|
|
|
|
|
|
my $test_str =
|
158
|
|
|
|
|
|
|
hascontent( $args{'test'} )
|
159
|
1
|
50
|
|
|
|
5
|
? $args{'test'}
|
160
|
|
|
|
|
|
|
: croak 'Need a single character string to test for orthons';
|
161
|
|
|
|
|
|
|
my $sample_aref =
|
162
|
|
|
|
|
|
|
ref $args{'sample'}
|
163
|
1
|
50
|
|
|
|
20
|
? $args{'sample'}
|
164
|
|
|
|
|
|
|
: croak
|
165
|
|
|
|
|
|
|
'Need a list (aref) of character-strings to sample for orthon listing';
|
166
|
1
|
|
|
|
|
2
|
my $count = 0;
|
167
|
1
|
|
|
|
|
2
|
for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
|
|
1
|
|
|
|
|
4
|
|
168
|
6
|
100
|
|
|
|
11
|
if ( _are_orthons( $test_str, $sample_aref->[$_], $self->{'_EQ'} ) ) {
|
169
|
4
|
|
|
|
|
7
|
$count++;
|
170
|
|
|
|
|
|
|
}
|
171
|
|
|
|
|
|
|
}
|
172
|
1
|
|
|
|
|
4
|
return $count;
|
173
|
|
|
|
|
|
|
}
|
174
|
|
|
|
|
|
|
*coltheart_n = \&index_indentical;
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head2 list_orthons
|
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
$aref = $orthon->list_orthons(test => CHARSTR, sample => AREF);
|
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
Returns a reference to an array of single-substitution orthographic neighbours of a given B character-string that are among a given list of B character-strings. The referenced is to an empty array if no orthons are found. The order of items in the returned array follows that in which they appear in the B.
|
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=cut
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
sub list_orthons {
|
185
|
1
|
|
|
1
|
1
|
463
|
my ( $self, %args ) = @_;
|
186
|
|
|
|
|
|
|
my $test_str =
|
187
|
|
|
|
|
|
|
hascontent( $args{'test'} )
|
188
|
1
|
50
|
|
|
|
4
|
? $args{'test'}
|
189
|
|
|
|
|
|
|
: croak 'Need a single character string to test for orthons';
|
190
|
|
|
|
|
|
|
my $sample_aref =
|
191
|
|
|
|
|
|
|
ref $args{'sample'}
|
192
|
1
|
50
|
|
|
|
13
|
? $args{'sample'}
|
193
|
|
|
|
|
|
|
: croak
|
194
|
|
|
|
|
|
|
'Need a list (aref) of character-strings to sample for orthon listing';
|
195
|
1
|
|
|
|
|
2
|
my @orthon_list = ();
|
196
|
1
|
|
|
|
|
2
|
for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
|
|
1
|
|
|
|
|
3
|
|
197
|
6
|
100
|
|
|
|
10
|
if ( _are_orthons( $test_str, $sample_aref->[$_], $self->{'_EQ'} ) ) {
|
198
|
4
|
|
|
|
|
8
|
push @orthon_list, $sample_aref->[$_];
|
199
|
|
|
|
|
|
|
}
|
200
|
|
|
|
|
|
|
}
|
201
|
1
|
|
|
|
|
3
|
return \@orthon_list;
|
202
|
|
|
|
|
|
|
}
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=head2 ldist, levenshtein
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
$count = $orthon->ldist('String1', 'String2'); # minimal, strings will be lower-cased
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Returns the Levenshtein Distance between two given letter strings, wrapping to various Perl module's that more or less implement the Levenshtein algorithm for efficiency and case-sensitivity. Specifically, if the match level has been set at 1 (to ignore case and diacritics), the method uses L (which offers "ignoring diacritics"); otherwise, it uses L to ignore case but not marks (given present limitations of this module). The required case- and mark-sensitivity are set in the L or L methods. By default, the match is made case- and mark-Isensitively (by canned Perl L).
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=cut
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
sub ldist {
|
213
|
135
|
|
|
135
|
1
|
2902
|
my ( $self, $w1, $w2 ) = @_;
|
214
|
135
|
|
|
|
|
150
|
my $ldist;
|
215
|
135
|
100
|
|
|
|
233
|
if ( $self->{'_MATCH_LEVEL'} == 1 ) {
|
216
|
91
|
|
|
|
|
1378
|
require Text::Levenshtein;
|
217
|
91
|
|
|
|
|
1639
|
$ldist =
|
218
|
|
|
|
|
|
|
Text::Levenshtein::distance( $w1, $w2, { ignore_diacritics => 1 } )
|
219
|
|
|
|
|
|
|
; # also ignores case
|
220
|
|
|
|
|
|
|
}
|
221
|
|
|
|
|
|
|
else {
|
222
|
44
|
|
|
|
|
1133
|
require Text::Levenshtein::XS;
|
223
|
44
|
100
|
|
|
|
1085
|
if ( $self->{'_MATCH_LEVEL'} == 2 ) {
|
224
|
1
|
|
|
|
|
4
|
( $w1, $w2 ) = map { lc } ( $w1, $w2 ); # ignore case but not marks
|
|
2
|
|
|
|
|
6
|
|
225
|
|
|
|
|
|
|
}
|
226
|
44
|
|
|
|
|
68
|
$ldist = Text::Levenshtein::XS::distance( $w1, $w2 )
|
227
|
|
|
|
|
|
|
; # ignores nothing on its own
|
228
|
|
|
|
|
|
|
}
|
229
|
135
|
|
|
|
|
969881
|
return $ldist;
|
230
|
|
|
|
|
|
|
}
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=head2 old
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
$mean = $orthon->old(test => CHARSTR, sample => AREF, lim => INT);
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Returns the mean orthographic Levenshtein distance (OLD) of the smallest B such edit distances for a given B string to all the strings in a B list. Based on Yarkoni et al. (2008), where, with the value of B is set to 20, the measure substantially contributes to prediction of performance in word recognition tasks. Here, if B is not defined, not numeric, or greater than the size of the B, then it is set by default to the size of the sample.
|
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Levenshtein distance is calculated per the method L, wrapping to external modules with respect to the conditions of string equality set in L or L. Different settings lead to different speed of calculation. The slowest calculation (by far) occurs if B => 1 so that case- and mark-insensitive matching occurs; this relies on the pure Perl implementation in Text::Levenshtein with its argument B => 1. The fastest calculation (the default) occurs by setting B => 3, when exact characters are matched, e.g., I in the test-string and I in a sample-string at the same index across them are taken as unequal and so will count as a substitution. This relies on the C-implementation in Text::Levenshtein::XS. Ignore case but not marks with B => 2.
|
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=cut
|
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
sub old {
|
243
|
3
|
|
|
3
|
1
|
769
|
my ( $self, %args ) = @_;
|
244
|
|
|
|
|
|
|
my $test_str =
|
245
|
|
|
|
|
|
|
hascontent( $args{'test'} )
|
246
|
3
|
50
|
|
|
|
15
|
? $args{'test'}
|
247
|
|
|
|
|
|
|
: croak 'Need a single character string to calculate OLD';
|
248
|
|
|
|
|
|
|
my $sample_aref =
|
249
|
|
|
|
|
|
|
ref $args{'sample'}
|
250
|
3
|
50
|
|
|
|
57
|
? $args{'sample'}
|
251
|
|
|
|
|
|
|
: croak 'Need a list (aref) of character-strings to calculate OLD';
|
252
|
3
|
|
|
|
|
8
|
my @ldists = ();
|
253
|
3
|
|
|
|
|
4
|
for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
|
|
3
|
|
|
|
|
31
|
|
254
|
123
|
|
|
|
|
246
|
push @ldists, $self->ldist( $test_str, $sample_aref->[$_] );
|
255
|
|
|
|
|
|
|
}
|
256
|
|
|
|
|
|
|
my $lim =
|
257
|
|
|
|
|
|
|
( is_numeric( $args{'lim'} ) and $args{'lim'} <= scalar @ldists )
|
258
|
3
|
50
|
33
|
|
|
15
|
? $args{'lim'}
|
259
|
|
|
|
|
|
|
: scalar @ldists;
|
260
|
3
|
|
|
|
|
85
|
return mean( ( sort { $a <=> $b } @ldists )[ 0 .. int $lim - 1 ] )
|
|
380
|
|
|
|
|
359
|
|
261
|
|
|
|
|
|
|
; # mean of first/smallest $lim-th values
|
262
|
|
|
|
|
|
|
}
|
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=head2 set_eq
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
$orthon->set_eq(match_level => INT); # undef, 0, 1, 2 or 3
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
Sets the string-matching level used in the above methods. This is called implicitly in L when given a B, or with the default value of 0. This is adopted and slightly adapted from how L controls for case/diacritic-sensitive matching.
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=over 4
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=item match_level = undef, 0
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
Match with respect to case and diacritics: same as B<3> but simply by Perl's eq. So, e.g., I<éclair> and I would be taken as non-identical, just as would I and I.
|
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
This is the fastest option. The higher levels, as follow, use the C() function in L.
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=item match_level = 1
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Match ignoring case and diacritics: I to I involves 1 edit (from I to I only)
|
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=item match_level = 2
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
Match ignoring case but respect diacritics: "ber" to "BéZ" involves 2 edits (the "er" to "éZ")
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item match_level = 3
|
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Match with respect to case and diacritics: "ber" to "BéZ" involves 3 edits (of all its letters)
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=back
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
So, for example, if the test string is "abbé", it could be picked up as having the single-substitution orthographic neighbour "able" if the match level is 1, but not if it is 0, 2 or 3.
|
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=cut
|
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub set_eq {
|
297
|
12
|
|
|
12
|
1
|
846
|
my ( $self, %args ) = @_;
|
298
|
12
|
|
|
|
|
27
|
my $match_level_arg = $args{'match_level'};
|
299
|
12
|
50
|
33
|
|
|
63
|
if ( nocontent($match_level_arg) or $match_level_arg == 0 ) {
|
|
|
50
|
|
|
|
|
|
300
|
0
|
|
|
|
|
0
|
$self->{'_MATCH_LEVEL'} = 0;
|
301
|
0
|
|
|
0
|
|
0
|
$self->{'_EQ'} = sub { return $_[0] eq $_[1] };
|
|
0
|
|
|
|
|
0
|
|
302
|
|
|
|
|
|
|
}
|
303
|
22
|
|
|
22
|
|
383
|
elsif ( any { $match_level_arg == $_ } ( 1 .. 3 ) ) {
|
304
|
12
|
|
|
|
|
40
|
$self->{'_MATCH_LEVEL'} = $match_level_arg;
|
305
|
12
|
|
|
|
|
66
|
my $collator = Unicode::Collate->new(
|
306
|
|
|
|
|
|
|
normalization => undef,
|
307
|
|
|
|
|
|
|
level => $match_level_arg
|
308
|
|
|
|
|
|
|
);
|
309
|
|
|
|
|
|
|
$self->{'_EQ'} = sub {
|
310
|
134
|
|
|
134
|
|
260
|
return $collator->eq(@_);
|
311
|
12
|
|
|
|
|
440947
|
};
|
312
|
|
|
|
|
|
|
}
|
313
|
|
|
|
|
|
|
else {
|
314
|
0
|
|
|
|
|
0
|
croak "Invalid value '$match_level_arg' given as a match level";
|
315
|
|
|
|
|
|
|
}
|
316
|
12
|
|
|
|
|
64
|
return;
|
317
|
|
|
|
|
|
|
}
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# private methods
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
sub _smallest_len {
|
322
|
35
|
|
|
35
|
|
57
|
my @strs = @_;
|
323
|
35
|
|
|
|
|
62
|
return ( sort { $a <=> $b } map { length } @strs )[0];
|
|
35
|
|
|
|
|
117
|
|
|
70
|
|
|
|
|
155
|
|
324
|
|
|
|
|
|
|
}
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
sub _are_orthons {
|
327
|
25
|
|
|
25
|
|
39
|
my ( $w1, $w2, $eq_fn ) = @_;
|
328
|
25
|
100
|
|
|
|
48
|
return 0 if length $w1 != length $w2;
|
329
|
23
|
|
|
|
|
37
|
return _index_identical( $w1, $w2, $eq_fn ) == ( length $w1 ) - 1;
|
330
|
|
|
|
|
|
|
}
|
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
sub _index_identical {
|
333
|
29
|
|
|
29
|
|
41
|
my ( $w1, $w2, $eq_fn ) = @_;
|
334
|
29
|
|
|
|
|
32
|
my $count = 0;
|
335
|
29
|
|
|
|
|
62
|
for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
|
336
|
114
|
100
|
|
|
|
4851
|
if ( $eq_fn->( map { substr $_, $i, 1 } ( $w1, $w2 ) ) ) {
|
|
228
|
|
|
|
|
410
|
|
337
|
62
|
|
|
|
|
8479
|
$count++;
|
338
|
|
|
|
|
|
|
}
|
339
|
|
|
|
|
|
|
}
|
340
|
29
|
|
|
|
|
2011
|
return $count;
|
341
|
|
|
|
|
|
|
}
|
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head1 DIAGNOSTICS
|
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=over 4
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=item Invalid value '...' given as a match level
|
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
Argument B in new() or set_eq() needs to be an integer in range 0 .. 3, or undefined.
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=item Need a single character string to test for orthons
|
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Argument B for calculating ONC and OLD, and listing orthons, needs to be defined and not empty.
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=item Need a single character string to test for orthons
|
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
Argument B should reference an array of character-strings when calculating ONC and OLD, and listing orthons.
|
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=back
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head1 REFERENCES
|
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Coltheart, M., Davelaar, E., Jonasson, J. T., & Besner, D. (1977). Access to the internal lexicon. In S. Dornic (Ed.), I (Vol. 6, pp. 535-555). London, UK: Academic.
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
Yarkoni, T., Balota, D. A., & Yap, M. (2008). Moving beyond Coltheart's I: A new measure of orthographic similarity. I, I<15>, 971-979. doi: L<10.3758/PBR.15.5.971|http://dx.doi.org/10.3758/PBR.15.5.971>.
|
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
L
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
L
|
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
L
|
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
L
|
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
L
|
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
L
|
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
L
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=head1 AUTHOR
|
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
Roderick Garton, C<< >>
|
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=head1 SEE ALSO
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
L
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
L
|
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
L
|
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through
|
398
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=head1 SUPPORT
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command.
|
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
perldoc Lingua::Orthon
|
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
You can also look for information at:
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=over 4
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here)
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
L
|
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation
|
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
L
|
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=item * CPAN Ratings
|
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
L
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=item * Search CPAN
|
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
L
|
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=back
|
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
Copyright 2011-2018 Roderick Garton.
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
433
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published
|
434
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License.
|
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
See L for more information.
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=cut
|
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
1; # End of Lingua::Orthon
|